
edsnlp.pipelines.qualifiers

factories

base

Qualifier

Bases: BaseComponent

Base class for qualifier components, implementing the cue-matching logic shared by the qualifier pipelines (negation, family context, history, reported speech).

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

attr

spaCy's attribute to use for matching: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

**terms

Terms to look for.

TYPE: Dict[str, Optional[List[str]]]

Source code in edsnlp/pipelines/qualifiers/base.py
class Qualifier(BaseComponent):
    """
    Implements the NegEx algorithm.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    explain : bool
        Whether to keep track of cues for each entity.
    **terms : Dict[str, Optional[List[str]]]
        Terms to look for.
    """

    defaults = dict()

    def __init__(
        self,
        nlp: Language,
        attr: str,
        on_ents_only: bool,
        explain: bool,
        **terms: Dict[str, Optional[List[str]]],
    ):

        if attr.upper() == "NORM":
            check_normalizer(nlp)

        self.phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

        self.on_ents_only = on_ents_only
        self.explain = explain

    def get_defaults(
        self, **kwargs: Dict[str, Optional[List[str]]]
    ) -> Dict[str, List[str]]:
        """
        Merge terms with their defaults. Null keys are replaced with defaults.

        Returns
        -------
        Dict[str, List[str]]
            Merged dictionary
        """
        # Filter out empty keys
        kwargs = {k: v for k, v in kwargs.items() if v is not None}

        # Update defaults
        terms = self.defaults.copy()
        terms.update(kwargs)

        return terms

    def get_matches(self, doc: Doc) -> List[Span]:
        """
        Extract matches.

        Parameters
        ----------
        doc : Doc
            spaCy `Doc` object.

        Returns
        -------
        List[Span]
            List of detected spans
        """
        if self.on_ents_only:

            sents = set([ent.sent for ent in doc.ents])
            match_iterator = map(
                lambda sent: self.phrase_matcher(sent, as_spans=True), sents
            )

            matches = chain.from_iterable(match_iterator)

        else:
            matches = self.phrase_matcher(doc, as_spans=True)

        return list(matches)

    def __call__(self, doc: Doc) -> Doc:
        return self.process(doc)
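
The concrete qualifiers documented below (History, FamilyContext, Negation, ReportedSpeech) all follow the same pattern: declare a defaults dictionary of term lists, merge user-supplied terms through get_defaults, hand the result to this constructor, and implement process. A minimal sketch of that pattern follows; the class, its cue lists and its extension are illustrative and not part of the library.

from typing import List, Optional

from spacy.language import Language
from spacy.tokens import Doc, Span

from edsnlp.pipelines.qualifiers.base import Qualifier


class HypotheticalQualifier(Qualifier):

    # Bundled cue lists; None values provided by the user fall back to these.
    defaults = dict(cues=["suspicion de"], termination=["mais"])

    def __init__(
        self,
        nlp: Language,
        attr: str,
        cues: Optional[List[str]],
        termination: Optional[List[str]],
        on_ents_only: bool,
        explain: bool,
    ):
        terms = self.get_defaults(cues=cues, termination=termination)
        super().__init__(
            nlp=nlp, attr=attr, on_ents_only=on_ents_only, explain=explain, **terms
        )
        if not Span.has_extension("hypothetical_cue"):
            Span.set_extension("hypothetical_cue", default=False)

    def process(self, doc: Doc) -> Doc:
        # Qualify every entity sharing a sentence with a cue (simplified logic).
        cues = [m for m in self.get_matches(doc) if m.label_ == "cues"]
        for ent in doc.ents:
            ent._.hypothetical_cue = any(cue.sent == ent.sent for cue in cues)
        return doc
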
defaults = dict() class-attribute
phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr) instance-attribute
on_ents_only = on_ents_only instance-attribute
explain = explain instance-attribute
__init__(nlp, attr, on_ents_only, explain, **terms)
Source code in edsnlp/pipelines/qualifiers/base.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    on_ents_only: bool,
    explain: bool,
    **terms: Dict[str, Optional[List[str]]],
):

    if attr.upper() == "NORM":
        check_normalizer(nlp)

    self.phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
    self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

    self.on_ents_only = on_ents_only
    self.explain = explain
get_defaults(**kwargs)

Merge terms with their defaults. Null keys are replaced with defaults.

RETURNS DESCRIPTION
Dict[str, List[str]]

Merged dictionary

Source code in edsnlp/pipelines/qualifiers/base.py
def get_defaults(
    self, **kwargs: Dict[str, Optional[List[str]]]
) -> Dict[str, List[str]]:
    """
    Merge terms with their defaults. Null keys are replaced with defaults.

    Returns
    -------
    Dict[str, List[str]]
        Merged dictionary
    """
    # Filter out empty keys
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    # Update defaults
    terms = self.defaults.copy()
    terms.update(kwargs)

    return terms
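
A standalone illustration of the merge semantics, reproducing the three lines above with concrete values (the term lists here are illustrative): keys passed as None fall back to the class defaults, while explicit lists override them.

defaults = dict(history=["antécédents", "atcd"], termination=["mais", "et"])
kwargs = dict(history=None, termination=["toutefois"])

# Filter out empty keys, then overlay the remainder on the defaults.
kwargs = {k: v for k, v in kwargs.items() if v is not None}
terms = defaults.copy()
terms.update(kwargs)

print(terms)
# {'history': ['antécédents', 'atcd'], 'termination': ['toutefois']}
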
get_matches(doc)

Extract matches.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
List[Span]

List of detected spans

Source code in edsnlp/pipelines/qualifiers/base.py
def get_matches(self, doc: Doc) -> List[Span]:
    """
    Extract matches.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object.

    Returns
    -------
    List[Span]
        List of detected spans
    """
    if self.on_ents_only:

        sents = set([ent.sent for ent in doc.ents])
        match_iterator = map(
            lambda sent: self.phrase_matcher(sent, as_spans=True), sents
        )

        matches = chain.from_iterable(match_iterator)

    else:
        matches = self.phrase_matcher(doc, as_spans=True)

    return list(matches)
__call__(doc)
Source code in edsnlp/pipelines/qualifiers/base.py
def __call__(self, doc: Doc) -> Doc:
    return self.process(doc)

check_normalizer(nlp)

Source code in edsnlp/pipelines/qualifiers/base.py
def check_normalizer(nlp: Language) -> None:
    components = {name: component for name, component in nlp.pipeline}
    normalizer = components.get("normalizer")

    if normalizer and not normalizer.lowercase:
        logger.warning(
            "You have chosen the NORM attribute, but disabled lowercasing "
            "in your normalisation pipeline. "
            "This WILL hurt performance : you might want to use the "
            "LOWER attribute instead."
        )
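
For example, a pipeline containing a component named "normalizer" with lowercasing disabled triggers this warning when a qualifier matching on the NORM attribute is added. A sketch, assuming the normalizer factory exposes a lowercase flag; the pipe is registered under the name "normalizer", which is the key the check above looks up in the pipeline.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
# Assumed config key: lowercasing disabled in the normalisation pipeline.
nlp.add_pipe("eds.normalizer", name="normalizer", config=dict(lowercase=False))
# eds.negation matches on attr="NORM" by default, so the warning is logged.
nlp.add_pipe("eds.negation")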

history

patterns

history = ['antécédents', 'atcd', 'atcds', 'tacds', 'antécédent'] module-attribute

factory

DEFAULT_CONFIG = dict(attr='NORM', history=patterns.history, termination=termination, use_sections=False, explain=False, on_ents_only=True) module-attribute
create_component(nlp, name, history, termination, use_sections, attr, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/history/factory.py
@deprecated_factory("antecedents", "eds.history", default_config=DEFAULT_CONFIG)
@deprecated_factory("eds.antecedents", "eds.history", default_config=DEFAULT_CONFIG)
@deprecated_factory("history", "eds.history", default_config=DEFAULT_CONFIG)
@Language.factory("eds.history", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    history: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    attr: str,
    explain: str,
    on_ents_only: bool,
):
    return History(
        nlp,
        attr=attr,
        history=history,
        termination=termination,
        use_sections=use_sections,
        explain=explain,
        on_ents_only=on_ents_only,
    )
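
A typical way of adding the component, keeping the default configuration above and overriding a single key. In this sketch, eds.sentences, eds.normalizer and eds.matcher are only there to produce sentence boundaries and entities to qualify; the matcher terms are illustrative.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(infarctus=["infarctus"])))
nlp.add_pipe("eds.history", config=dict(use_sections=False))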

history

History

Bases: Qualifier

Implements a history detection algorithm.

The component looks for terms indicating history in the text.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

history

List of terms indicating medical history reference.

TYPE: Optional[List[str]]

termination

List of syntagma termination terms.

TYPE: Optional[List[str]]

use_sections

Whether to use the section pipeline to detect the medical history section.

TYPE: bool

attr

spaCy's attribute to use for matching: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]

explain

Whether to keep track of cues for each entity.

TYPE: bool

Source code in edsnlp/pipelines/qualifiers/history/history.py
class History(Qualifier):
    """
    Implements an history detection algorithm.

    The components looks for terms indicating history in the text.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    history : Optional[List[str]]
        List of terms indicating medical history reference.
    termination : Optional[List[str]]
        List of syntagme termination terms.
    use_sections : bool
        Whether to use section pipeline to detect medical history section.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionnary of regex patterns.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        history=history,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        history: Optional[List[str]],
        termination: Optional[List[str]],
        use_sections: bool,
        explain: bool,
        on_ents_only: bool,
    ):

        terms = self.get_defaults(
            history=history,
            termination=termination,
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.set_extensions()

        self.sections = use_sections and (
            "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
        )
        if use_sections and not self.sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `section` pipeline, but it was not set. "
                "Skipping that step."
            )

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("history"):
            Token.set_extension("history", default=False)

        if not Token.has_extension("antecedents"):
            Token.set_extension(
                "antecedents",
                getter=deprecated_getter_factory("antecedents", "history"),
            )

        if not Token.has_extension("antecedent"):
            Token.set_extension(
                "antecedent",
                getter=deprecated_getter_factory("antecedent", "history"),
            )

        if not Token.has_extension("history_"):
            Token.set_extension(
                "history_",
                getter=lambda token: "ATCD" if token._.history else "CURRENT",
            )

        if not Token.has_extension("antecedents_"):
            Token.set_extension(
                "antecedents_",
                getter=deprecated_getter_factory("antecedents_", "history_"),
            )

        if not Token.has_extension("antecedent_"):
            Token.set_extension(
                "antecedent_",
                getter=deprecated_getter_factory("antecedent_", "history_"),
            )

        if not Span.has_extension("history"):
            Span.set_extension("history", default=False)

        if not Span.has_extension("antecedents"):
            Span.set_extension(
                "antecedents",
                getter=deprecated_getter_factory("antecedents", "history"),
            )

        if not Span.has_extension("antecedent"):
            Span.set_extension(
                "antecedent",
                getter=deprecated_getter_factory("antecedent", "history"),
            )

        if not Span.has_extension("history_"):
            Span.set_extension(
                "history_",
                getter=lambda span: "ATCD" if span._.history else "CURRENT",
            )

        if not Span.has_extension("antecedents_"):
            Span.set_extension(
                "antecedents_",
                getter=deprecated_getter_factory("antecedents_", "history_"),
            )

        if not Span.has_extension("antecedent_"):
            Span.set_extension(
                "antecedent_",
                getter=deprecated_getter_factory("antecedent_", "history_"),
            )

        if not Span.has_extension("history_cues"):
            Span.set_extension("history_cues", default=[])

        if not Span.has_extension("antecedents_cues"):
            Span.set_extension(
                "antecedents_cues",
                getter=deprecated_getter_factory("antecedents_cues", "history_cues"),
            )

        if not Span.has_extension("antecedent_cues"):
            Span.set_extension(
                "antecedent_cues",
                getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
            )

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to history.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for history
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        sections = []

        if self.sections:
            sections = [
                Span(doc, section.start, section.end, label="ATCD")
                for section in doc.spans["sections"]
                if section.label_ == "antécédents"
            ]

        for start, end in boundaries:
            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

            if self.on_ents_only and not ents:
                continue

            cues = get_spans(sub_matches, "history")
            cues += sub_sections

            history = bool(cues)

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.history = history

            for ent in ents:
                ent._.history = ent._.history or history

                if self.explain:
                    ent._.history_cues += cues

                if not self.on_ents_only and ent._.history:
                    for token in ent:
                        token._.history = True

        return doc
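
Once such a pipeline has run, the qualification is exposed through the extensions declared in set_extensions. A hedged end-to-end sketch; the matcher terms and the expected output are illustrative.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(infarctus=["infarctus"])))
nlp.add_pipe("eds.history")

doc = nlp("Antécédents : infarctus du myocarde en 2010.")

for ent in doc.ents:
    # ent._.history is a boolean, ent._.history_ the "ATCD"/"CURRENT" label.
    print(ent.text, ent._.history, ent._.history_)
# Expected (illustrative): infarctus True ATCD
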
defaults = dict(history=history, termination=termination) class-attribute
sections = use_sections and ('eds.sections' in nlp.pipe_names or 'sections' in nlp.pipe_names) instance-attribute
__init__(nlp, attr, history, termination, use_sections, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/history/history.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    history: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    explain: bool,
    on_ents_only: bool,
):

    terms = self.get_defaults(
        history=history,
        termination=termination,
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.set_extensions()

    self.sections = use_sections and (
        "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
    )
    if use_sections and not self.sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `section` pipeline, but it was not set. "
            "Skipping that step."
        )
set_extensions()
Source code in edsnlp/pipelines/qualifiers/history/history.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("history"):
        Token.set_extension("history", default=False)

    if not Token.has_extension("antecedents"):
        Token.set_extension(
            "antecedents",
            getter=deprecated_getter_factory("antecedents", "history"),
        )

    if not Token.has_extension("antecedent"):
        Token.set_extension(
            "antecedent",
            getter=deprecated_getter_factory("antecedent", "history"),
        )

    if not Token.has_extension("history_"):
        Token.set_extension(
            "history_",
            getter=lambda token: "ATCD" if token._.history else "CURRENT",
        )

    if not Token.has_extension("antecedents_"):
        Token.set_extension(
            "antecedents_",
            getter=deprecated_getter_factory("antecedents_", "history_"),
        )

    if not Token.has_extension("antecedent_"):
        Token.set_extension(
            "antecedent_",
            getter=deprecated_getter_factory("antecedent_", "history_"),
        )

    if not Span.has_extension("history"):
        Span.set_extension("history", default=False)

    if not Span.has_extension("antecedents"):
        Span.set_extension(
            "antecedents",
            getter=deprecated_getter_factory("antecedents", "history"),
        )

    if not Span.has_extension("antecedent"):
        Span.set_extension(
            "antecedent",
            getter=deprecated_getter_factory("antecedent", "history"),
        )

    if not Span.has_extension("history_"):
        Span.set_extension(
            "history_",
            getter=lambda span: "ATCD" if span._.history else "CURRENT",
        )

    if not Span.has_extension("antecedents_"):
        Span.set_extension(
            "antecedents_",
            getter=deprecated_getter_factory("antecedents_", "history_"),
        )

    if not Span.has_extension("antecedent_"):
        Span.set_extension(
            "antecedent_",
            getter=deprecated_getter_factory("antecedent_", "history_"),
        )

    if not Span.has_extension("history_cues"):
        Span.set_extension("history_cues", default=[])

    if not Span.has_extension("antecedents_cues"):
        Span.set_extension(
            "antecedents_cues",
            getter=deprecated_getter_factory("antecedents_cues", "history_cues"),
        )

    if not Span.has_extension("antecedent_cues"):
        Span.set_extension(
            "antecedent_cues",
            getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
        )
process(doc)

Finds entities related to history.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for history

Source code in edsnlp/pipelines/qualifiers/history/history.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to history.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for history
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    sections = []

    if self.sections:
        sections = [
            Span(doc, section.start, section.end, label="ATCD")
            for section in doc.spans["sections"]
            if section.label_ == "antécédents"
        ]

    for start, end in boundaries:
        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

        if self.on_ents_only and not ents:
            continue

        cues = get_spans(sub_matches, "history")
        cues += sub_sections

        history = bool(cues)

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.history = history

        for ent in ents:
            ent._.history = ent._.history or history

            if self.explain:
                ent._.history_cues += cues

            if not self.on_ents_only and ent._.history:
                for token in ent:
                    token._.history = True

    return doc

family

patterns

family: List[str] = ['aïeul', 'aïeux', 'antécédent familial', 'antécédents familiaux', 'arrière-grand-mère', 'arrière-grand-père', 'arrière-grands-parents', 'cousin', 'cousine', 'cousines', 'cousins', 'enfant', 'enfants', 'épouse', 'époux', 'familial', 'familiale', 'familiales', 'familiaux', 'famille', 'fiancé', 'fiancée', 'fils', 'frère', 'frères', 'grand-mère', 'grand-père', 'grands-parents', 'maman', 'mari', 'mère', 'oncle', 'papa', 'parent', 'parents', 'père', 'soeur', 'sœur', 'sœurs', 'soeurs', 'tante'] module-attribute

family

FamilyContext

Bases: Qualifier

Implements a family context detection algorithm.

The component looks for terms indicating family references in the text.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

family

List of terms indicating family reference.

TYPE: Optional[List[str]]

terminations

List of termination terms, to separate syntagmas.

TYPE: Optional[List[str]]

attr

spaCy's attribute to use for matching: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]

explain

Whether to keep track of cues for each entity.

TYPE: bool

use_sections

Whether to use annotated sections (namely antécédents familiaux).

TYPE: bool, by default False

Source code in edsnlp/pipelines/qualifiers/family/family.py
class FamilyContext(Qualifier):
    """
    Implements a family context detection algorithm.

    The components looks for terms indicating family references in the text.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    family : Optional[List[str]]
        List of terms indicating family reference.
    terminations : Optional[List[str]]
        List of termination terms, to separate syntagmas.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionnary of regex patterns.
    explain : bool
        Whether to keep track of cues for each entity.
    use_sections : bool, by default `False`
        Whether to use annotated sections (namely `antécédents familiaux`).
    """

    defaults = dict(
        family=family,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        family: Optional[List[str]],
        termination: Optional[List[str]],
        use_sections: bool,
        explain: bool,
        on_ents_only: bool,
    ):

        terms = self.get_defaults(
            family=family,
            termination=termination,
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.set_extensions()

        self.sections = use_sections and (
            "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
        )
        if use_sections and not self.sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `section` pipeline, but it was not set. "
                "Skipping that step."
            )

    @staticmethod
    def set_extensions() -> None:
        if not Token.has_extension("family"):
            Token.set_extension("family", default=False)

        if not Token.has_extension("family_"):
            Token.set_extension(
                "family_",
                getter=lambda token: "FAMILY" if token._.family else "PATIENT",
            )

        if not Span.has_extension("family"):
            Span.set_extension("family", default=False)

        if not Span.has_extension("family_"):
            Span.set_extension(
                "family_",
                getter=lambda span: "FAMILY" if span._.family else "PATIENT",
            )

        if not Span.has_extension("family_cues"):
            Span.set_extension("family_cues", default=[])

        if not Doc.has_extension("family"):
            Doc.set_extension("family", default=[])

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to family context.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for context
        """
        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        sections = []

        if self.sections:
            sections = [
                Span(doc, section.start, section.end, label="FAMILY")
                for section in doc.spans["sections"]
                if section.label_ == "antécédents familiaux"
            ]

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

            if self.on_ents_only and not ents:
                continue

            cues = get_spans(sub_matches, "family")
            cues += sub_sections

            if not cues:
                continue

            family = bool(cues)

            if not family:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.family = True

            for ent in ents:
                ent._.family = True
                if self.explain:
                    ent._.family_cues += cues
                if not self.on_ents_only:
                    for token in ent:
                        token._.family = True

        return doc
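
As with the other qualifiers, the result is read from Span extensions. A minimal sketch, assuming an upstream matcher produces the entities to qualify; the terms and expected output are illustrative.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(diabete=["diabète"])))
nlp.add_pipe("eds.family")

doc = nlp("Le père du patient présente un diabète.")

for ent in doc.ents:
    # ent._.family is a boolean, ent._.family_ the "FAMILY"/"PATIENT" label.
    print(ent.text, ent._.family, ent._.family_)
# Expected (illustrative): diabète True FAMILY
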
defaults = dict(family=family, termination=termination) class-attribute
sections = use_sections and ('eds.sections' in nlp.pipe_names or 'sections' in nlp.pipe_names) instance-attribute
__init__(nlp, attr, family, termination, use_sections, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/family/family.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    family: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    explain: bool,
    on_ents_only: bool,
):

    terms = self.get_defaults(
        family=family,
        termination=termination,
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.set_extensions()

    self.sections = use_sections and (
        "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
    )
    if use_sections and not self.sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `section` pipeline, but it was not set. "
            "Skipping that step."
        )
set_extensions()
Source code in edsnlp/pipelines/qualifiers/family/family.py
@staticmethod
def set_extensions() -> None:
    if not Token.has_extension("family"):
        Token.set_extension("family", default=False)

    if not Token.has_extension("family_"):
        Token.set_extension(
            "family_",
            getter=lambda token: "FAMILY" if token._.family else "PATIENT",
        )

    if not Span.has_extension("family"):
        Span.set_extension("family", default=False)

    if not Span.has_extension("family_"):
        Span.set_extension(
            "family_",
            getter=lambda span: "FAMILY" if span._.family else "PATIENT",
        )

    if not Span.has_extension("family_cues"):
        Span.set_extension("family_cues", default=[])

    if not Doc.has_extension("family"):
        Doc.set_extension("family", default=[])
process(doc)

Finds entities related to family context.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for family context
Source code in edsnlp/pipelines/qualifiers/family/family.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to family context.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for context
    """
    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    sections = []

    if self.sections:
        sections = [
            Span(doc, section.start, section.end, label="FAMILY")
            for section in doc.spans["sections"]
            if section.label_ == "antécédents familiaux"
        ]

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

        if self.on_ents_only and not ents:
            continue

        cues = get_spans(sub_matches, "family")
        cues += sub_sections

        if not cues:
            continue

        family = bool(cues)

        if not family:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.family = True

        for ent in ents:
            ent._.family = True
            if self.explain:
                ent._.family_cues += cues
            if not self.on_ents_only:
                for token in ent:
                    token._.family = True

    return doc

factory

DEFAULT_CONFIG = dict(family=None, termination=None, attr='NORM', use_sections=False, explain=False, on_ents_only=True) module-attribute
create_component(nlp, name, family, termination, attr, explain, on_ents_only, use_sections)
Source code in edsnlp/pipelines/qualifiers/family/factory.py
@deprecated_factory("family", "eds.family", default_config=DEFAULT_CONFIG)
@Language.factory("eds.family", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    family: Optional[List[str]],
    termination: Optional[List[str]],
    attr: str,
    explain: bool,
    on_ents_only: bool,
    use_sections: bool,
):
    return FamilyContext(
        nlp,
        family=family,
        termination=termination,
        attr=attr,
        explain=explain,
        on_ents_only=on_ents_only,
        use_sections=use_sections,
    )
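
The factory keys mirror DEFAULT_CONFIG; every key left to None falls back to the bundled patterns through get_defaults. A sketch overriding only the cue list (the terms are illustrative):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe(
    "eds.family",
    config=dict(family=["mère", "père", "grand-mère"], attr="NORM"),
)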

negation

patterns

pseudo: List[str] = ['aucun changement', 'aucun doute', 'aucune hésitation', 'aucune diminution', 'ne permet pas d', 'ne permet pas de', "n'exclut pas", 'non négligeable', "pas d'amélioration", "pas d'augmentation", "pas d'autre", 'pas de changement', 'pas de diminution', 'pas de doute', 'pas exclu', 'pas exclue', 'pas exclues', 'pas exclus', 'pas immunisé', 'pas immunisée', 'pas immunisés', 'pas immunisées', 'sans amélioration', 'sans aucun doute', 'sans augmentation', 'sans certitude', 'sans changement', 'sans diminution', 'sans doute', 'sans être certain'] module-attribute
preceding: List[str] = ['à la place de', 'absence', 'absence de signe de', 'absence de', 'aucun signe de', 'aucun', 'aucune preuve', 'aucune', 'aucunes', 'aucuns', 'décline', 'décliné', 'dépourvu', 'dépourvue', 'dépourvues', 'dépourvus', 'disparition de', 'disparition des', 'excluent', 'exclut', 'impossibilité de', 'immunisé', 'immunisée', 'immunisés', 'immunisées', 'incompatible avec', 'incompatibles avec', 'jamais', 'ne manifestaient pas', 'ne manifestait pas', 'ne manifeste pas', 'ne manifestent pas', 'ne pas', 'ne présentaient pas', 'ne présentait pas', 'ne présente pas', 'ne présentent pas', 'ne ressemble pas', 'ne ressemblent pas', 'négatif pour', "n'est pas", "n'était pas", 'ni', 'niant', 'nie', 'nié', 'nullement', 'pas d', 'pas de cause de', 'pas de signe de', 'pas de signes de', 'pas de', 'pas nécessaire de', 'pas', "permet d'exclure", "plus d'aspect de", 'sans manifester de', 'sans présenter de', 'sans', 'symptôme atypique'] module-attribute
following: List[str] = [':0', ': 0', ':non', ': non', 'absent', 'absente', 'absentes', 'absents', 'dépourvu', 'dépourvue', 'dépourvues', 'dépourvus', 'disparaissent', 'disparait', 'est exclu', 'est exclue', 'immunisé', 'immunisée', 'immunisés', 'immunisées', 'impossible', 'improbable', 'négatif', 'négatifs', 'négative', 'négatives', 'négligeable', 'négligeables', 'nié', 'niée', 'non', 'pas nécessaire', 'peu probable', 'sont exclues', 'sont exclus'] module-attribute
verbs: List[str] = ['éliminer', 'exclure', 'interdire', 'nier', 'réfuter', 'rejeter'] module-attribute

negation

Negation

Bases: Qualifier

Implements the NegEx algorithm.

The component looks for five kinds of expressions in the text:

  • preceding negations, i.e. cues that precede a negated expression

  • following negations, i.e. cues that follow a negated expression

  • pseudo negations: expressions that contain a negation cue but are not negations (e.g. "pas de doute"/"no doubt")

  • negation verbs, i.e. verbs that indicate a negation

  • terminations, i.e. words that delimit propositions. The negation spans from the preceding cue to the termination.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

attr

spaCy's attribute to use.

TYPE: str

pseudo

List of pseudo negation terms.

TYPE: Optional[List[str]]

preceding

List of preceding negation terms.

TYPE: Optional[List[str]]

following

List of following negation terms.

TYPE: Optional[List[str]]

termination

List of termination terms.

TYPE: Optional[List[str]]

verbs

List of negation verbs.

TYPE: Optional[List[str]]

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

Source code in edsnlp/pipelines/qualifiers/negation/negation.py
class Negation(Qualifier):
    """
    Implements the NegEx algorithm.

    The component looks for five kinds of expressions in the text :

    - preceding negations, ie cues that precede a negated expression

    - following negations, ie cues that follow a negated expression

    - pseudo negations : contain a negation cue, but are not negations
      (eg "pas de doute"/"no doubt")

    - negation verbs, ie verbs that indicate a negation

    - terminations, ie words that delimit propositions.
      The negation spans from the preceding cue to the termination.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    attr : str
        spaCy's attribute to use
    pseudo : Optional[List[str]]
        List of pseudo negation terms.
    preceding : Optional[List[str]]
        List of preceding negation terms
    following : Optional[List[str]]
        List of following negation terms.
    termination : Optional[List[str]]
        List of termination terms.
    verbs : Optional[List[str]]
        List of negation verbs.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        pseudo=pseudo,
        verbs=verbs,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        termination: Optional[List[str]],
        verbs: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            termination=termination,
            verbs=verbs,
        )
        terms["verbs"] = self.load_verbs(terms["verbs"])

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.within_ents = within_ents
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("negation"):
            Token.set_extension("negation", default=False)

        if not Token.has_extension("negated"):
            Token.set_extension(
                "negated", getter=deprecated_getter_factory("negated", "negation")
            )

        if not Token.has_extension("negation_"):
            Token.set_extension(
                "negation_",
                getter=lambda token: "NEG" if token._.negation else "AFF",
            )

        if not Token.has_extension("polarity_"):
            Token.set_extension(
                "polarity_",
                getter=deprecated_getter_factory("polarity_", "negation_"),
            )

        if not Span.has_extension("negation"):
            Span.set_extension("negation", default=False)

        if not Span.has_extension("negated"):
            Span.set_extension(
                "negated", getter=deprecated_getter_factory("negated", "negation")
            )

        if not Span.has_extension("negation_cues"):
            Span.set_extension("negation_cues", default=[])

        if not Span.has_extension("negation_"):
            Span.set_extension(
                "negation_",
                getter=lambda span: "NEG" if span._.negation else "AFF",
            )

        if not Span.has_extension("polarity_"):
            Span.set_extension(
                "polarity_",
                getter=deprecated_getter_factory("polarity_", "negation_"),
            )

        if not Doc.has_extension("negations"):
            Doc.set_extension("negations", default=[])

    def load_verbs(self, verbs: List[str]) -> List[str]:
        """
        Conjugate negating verbs to specific tenses.

        Parameters
        ----------
        verbs: list of negating verbs to conjugate

        Returns
        -------
        list_neg_verbs: List of negating verbs conjugated to specific tenses.
        """

        neg_verbs = get_verbs(verbs)

        neg_verbs = neg_verbs.loc[
            ((neg_verbs["mode"] == "Indicatif") & (neg_verbs["tense"] == "Présent"))
            | (neg_verbs["tense"] == "Participe Présent")
            | (neg_verbs["tense"] == "Participe Passé")
        ]

        list_neg_verbs = list(neg_verbs["term"].unique())

        return list_neg_verbs

    def annotate_entity(
        self,
        ent: Span,
        sub_preceding: List[Span],
        sub_following: List[Span],
    ) -> None:
        """
        Annotate entities using preceding and following negations.

        Parameters
        ----------
        ent : Span
            Entity to annotate
        sub_preceding : List[Span]
            List of preceding negations cues
        sub_following : List[Span]
            List of following negations cues
        """
        if self.within_ents:
            cues = [m for m in sub_preceding if m.end <= ent.end]
            cues += [m for m in sub_following if m.start >= ent.start]
        else:
            cues = [m for m in sub_preceding if m.end <= ent.start]
            cues += [m for m in sub_following if m.start >= ent.end]

        negation = ent._.negation or bool(cues)

        ent._.negation = negation

        if self.explain and negation:
            ent._.negation_cues += cues

        if not self.on_ents_only and negation:
            for token in ent:
                token._.negation = True

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to negation.

        Parameters
        ----------
        doc: spaCy `Doc` object

        Returns
        -------
        doc: spaCy `Doc` object, annotated for negation
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            # Verbs precede negated content
            sub_preceding += get_spans(sub_matches, "verbs")

            if not sub_preceding + sub_following:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.negation = any(
                        m.end <= token.i for m in sub_preceding
                    ) or any(m.start > token.i for m in sub_following)

            for ent in ents:
                self.annotate_entity(
                    ent=ent,
                    sub_preceding=sub_preceding,
                    sub_following=sub_following,
                )

        return doc

    def __call__(self, doc: Doc) -> Doc:
        return self.process(doc)
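
A hedged end-to-end sketch showing the negation status and, with explain=True, the cues retained on each entity; the matcher terms, sentence and expected output are illustrative.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(fracture=["fracture"])))
nlp.add_pipe("eds.negation", config=dict(explain=True))

doc = nlp("Le scanner ne montre pas de fracture.")

for ent in doc.ents:
    # ent._.negation is a boolean, ent._.negation_ the "NEG"/"AFF" label,
    # and ent._.negation_cues the list of cue spans (since explain=True).
    print(ent.text, ent._.negation, ent._.negation_, ent._.negation_cues)
# Expected (illustrative): fracture True NEG [pas de]
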
defaults = dict(following=following, preceding=preceding, pseudo=pseudo, verbs=verbs, termination=termination) class-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, termination, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs=verbs,
    )
    terms["verbs"] = self.load_verbs(terms["verbs"])

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.within_ents = within_ents
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("negation"):
        Token.set_extension("negation", default=False)

    if not Token.has_extension("negated"):
        Token.set_extension(
            "negated", getter=deprecated_getter_factory("negated", "negation")
        )

    if not Token.has_extension("negation_"):
        Token.set_extension(
            "negation_",
            getter=lambda token: "NEG" if token._.negation else "AFF",
        )

    if not Token.has_extension("polarity_"):
        Token.set_extension(
            "polarity_",
            getter=deprecated_getter_factory("polarity_", "negation_"),
        )

    if not Span.has_extension("negation"):
        Span.set_extension("negation", default=False)

    if not Span.has_extension("negated"):
        Span.set_extension(
            "negated", getter=deprecated_getter_factory("negated", "negation")
        )

    if not Span.has_extension("negation_cues"):
        Span.set_extension("negation_cues", default=[])

    if not Span.has_extension("negation_"):
        Span.set_extension(
            "negation_",
            getter=lambda span: "NEG" if span._.negation else "AFF",
        )

    if not Span.has_extension("polarity_"):
        Span.set_extension(
            "polarity_",
            getter=deprecated_getter_factory("polarity_", "negation_"),
        )

    if not Doc.has_extension("negations"):
        Doc.set_extension("negations", default=[])
load_verbs(verbs)

Conjugate negating verbs to specific tenses.

PARAMETER DESCRIPTION
verbs

List of negating verbs to conjugate

TYPE: List[str]

RETURNS DESCRIPTION
list_neg_verbs

List of negating verbs conjugated to specific tenses
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def load_verbs(self, verbs: List[str]) -> List[str]:
    """
    Conjugate negating verbs to specific tenses.

    Parameters
    ----------
    verbs: list of negating verbs to conjugate

    Returns
    -------
    list_neg_verbs: List of negating verbs conjugated to specific tenses.
    """

    neg_verbs = get_verbs(verbs)

    neg_verbs = neg_verbs.loc[
        ((neg_verbs["mode"] == "Indicatif") & (neg_verbs["tense"] == "Présent"))
        | (neg_verbs["tense"] == "Participe Présent")
        | (neg_verbs["tense"] == "Participe Passé")
    ]

    list_neg_verbs = list(neg_verbs["term"].unique())

    return list_neg_verbs
annotate_entity(ent, sub_preceding, sub_following)

Annotate entities using preceding and following negations.

PARAMETER DESCRIPTION
ent

Entity to annotate

TYPE: Span

sub_preceding

List of preceding negations cues

TYPE: List[Span]

sub_following

List of following negations cues

TYPE: List[Span]

Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def annotate_entity(
    self,
    ent: Span,
    sub_preceding: List[Span],
    sub_following: List[Span],
) -> None:
    """
    Annotate entities using preceding and following negations.

    Parameters
    ----------
    ent : Span
        Entity to annotate
    sub_preceding : List[Span]
        List of preceding negations cues
    sub_following : List[Span]
        List of following negations cues
    """
    if self.within_ents:
        cues = [m for m in sub_preceding if m.end <= ent.end]
        cues += [m for m in sub_following if m.start >= ent.start]
    else:
        cues = [m for m in sub_preceding if m.end <= ent.start]
        cues += [m for m in sub_following if m.start >= ent.end]

    negation = ent._.negation or bool(cues)

    ent._.negation = negation

    if self.explain and negation:
        ent._.negation_cues += cues

    if not self.on_ents_only and negation:
        for token in ent:
            token._.negation = True
process(doc)

Finds entities related to negation.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for negation
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to negation.

    Parameters
    ----------
    doc: spaCy `Doc` object

    Returns
    -------
    doc: spaCy `Doc` object, annotated for negation
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        # Verbs precede negated content
        sub_preceding += get_spans(sub_matches, "verbs")

        if not sub_preceding + sub_following:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.negation = any(
                    m.end <= token.i for m in sub_preceding
                ) or any(m.start > token.i for m in sub_following)

        for ent in ents:
            self.annotate_entity(
                ent=ent,
                sub_preceding=sub_preceding,
                sub_following=sub_following,
            )

    return doc
__call__(doc)
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def __call__(self, doc: Doc) -> Doc:
    return self.process(doc)

factory

DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, termination=None, verbs=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, termination, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/negation/factory.py
@deprecated_factory("negation", "eds.negation", default_config=DEFAULT_CONFIG)
@Language.factory("eds.negation", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    return Negation(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs=verbs,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
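
As a hedged usage sketch (assuming a blank French pipeline and the standard eds.sentences, eds.normalizer and eds.matcher components shipped with EDS-NLP; the exact qualification depends on the default cue lists), the factory can be used as follows:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")   # sentence boundaries are needed to scope the cues
nlp.add_pipe("eds.normalizer")  # the default attr="NORM" relies on the normalizer
nlp.add_pipe(
    "eds.matcher",
    config=dict(terms=dict(douleur=["douleur thoracique"])),
)
nlp.add_pipe("eds.negation", config=dict(explain=True))

doc = nlp("Le patient ne rapporte pas de douleur thoracique.")
ent = doc.ents[0]

print(ent, ent._.negation)  # douleur thoracique True (with the default cue lists)
print(ent._.negation_cues)  # the matched cue spans, kept because explain=True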

reported_speech

patterns

verbs: List[str] = ['affirmer', 'ajouter', 'assurer', 'confirmer', 'demander', 'dire', 'déclarer', 'décrire', 'décrire', 'démontrer', 'expliquer', 'faire remarquer', 'indiquer', 'informer', 'insinuer', 'insister', 'jurer', 'nier', 'nier', 'noter', 'objecter', 'observer', 'parler', 'promettre', 'préciser', 'prétendre', 'prévenir', 'raconter', 'rappeler', 'rapporter', 'reconnaître', 'réfuter', 'répliquer', 'répondre', 'répéter', 'révéler', 'se plaindre', 'souhaiter', 'souligner', 'supplier', 'verbaliser', 'vouloir', 'vouloir'] module-attribute
following: List[str] = ["d'après le patient", "d'après la patiente"] module-attribute
preceding: List[str] = ['pas de critique de', 'crainte de', 'menace de', 'insiste sur le fait que', "d'après le patient", "d'après la patiente", 'peur de'] module-attribute
quotation: str = '(\\".+\\")|(\\«.+\\»)' module-attribute
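
The quotation pattern above is a plain regular expression. As an illustration (applied here with Python's re module directly, outside the pipeline), it captures both straight and angled quotes:

import re

# The module-level `quotation` pattern, reproduced for illustration.
quotation = '(\\".+\\")|(\\«.+\\»)'

text = 'Le patient déclare « je vais mieux » et ajoute "plus de douleur".'
print([m.group() for m in re.finditer(quotation, text)])
# ['« je vais mieux »', '"plus de douleur"']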

reported_speech

ReportedSpeech

Bases: Qualifier

Implements a reported speech detection algorithm.

The component looks for terms indicating patient statements, and for quotations, to detect patient speech.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

quotation

String gathering all quotation cues.

TYPE: str

verbs

List of reported speech verbs.

TYPE: List[str]

following

List of terms following a reported speech.

TYPE: List[str]

preceding

List of terms preceding a reported speech.

TYPE: List[str]

filter_matches

Whether to filter out overlapping matches.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr' we can also add a key for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
class ReportedSpeech(Qualifier):
    """
    Implements a reported speech detection algorithm.

    The component looks for terms indicating patient statements,
    and for quotations, to detect patient speech.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    quotation : str
        String gathering all quotation cues.
    verbs : List[str]
        List of reported speech verbs.
    following : List[str]
        List of terms following a reported speech.
    preceding : List[str]
        List of terms preceding a reported speech.
    filter_matches : bool
        Whether to filter out overlapping matches.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM",
        or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        verbs=verbs,
        quotation=quotation,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        quotation: Optional[List[str]],
        verbs: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            quotation=quotation,
            verbs=verbs,
        )
        terms["verbs"] = self.load_verbs(terms["verbs"])

        quotation = terms.pop("quotation")

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.regex_matcher = RegexMatcher(attr=attr)
        self.regex_matcher.build_patterns(dict(quotation=quotation))

        self.within_ents = within_ents

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("reported_speech"):
            Token.set_extension("reported_speech", default=False)

        if not Token.has_extension("reported_speech_"):
            Token.set_extension(
                "reported_speech_",
                getter=lambda token: "REPORTED"
                if token._.reported_speech
                else "DIRECT",
            )

        if not Span.has_extension("reported_speech"):
            Span.set_extension("reported_speech", default=False)

        if not Span.has_extension("reported_speech_"):
            Span.set_extension(
                "reported_speech_",
                getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
            )

        if not Span.has_extension("reported_speech_cues"):
            Span.set_extension("reported_speech_cues", default=[])

        if not Doc.has_extension("rspeechs"):
            Doc.set_extension("rspeechs", default=[])

    def load_verbs(self, verbs: List[str]) -> List[str]:
        """
        Conjugate reporting verbs to specific tenses (third person)

        Parameters
        ----------
        verbs: list of reporting verbs to conjugate

        Returns
        -------
        list_rep_verbs: List of reporting verbs conjugated to specific tenses.
        """

        rep_verbs = get_verbs(verbs)

        rep_verbs = rep_verbs.loc[
            (
                (rep_verbs["mode"] == "Indicatif")
                & (rep_verbs["tense"] == "Présent")
                & (rep_verbs["person"].isin(["3s", "3p"]))
            )
            | (rep_verbs["tense"] == "Participe Présent")
            | (rep_verbs["tense"] == "Participe Passé")
        ]

        list_rep_verbs = list(rep_verbs["term"].unique())

        return list_rep_verbs

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to reported speech.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for reported speech
        """

        matches = self.get_matches(doc)
        matches += list(self.regex_matcher(doc, as_spans=True))

        boundaries = self._boundaries(doc)

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            sub_verbs = get_spans(sub_matches, "verbs")
            sub_quotation = get_spans(sub_matches, "quotation")

            if not sub_preceding + sub_following + sub_verbs + sub_quotation:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.reported_speech = (
                        any(m.end <= token.i for m in sub_preceding + sub_verbs)
                        or any(m.start > token.i for m in sub_following)
                        or any(
                            ((m.start < token.i) & (m.end > token.i + 1))
                            for m in sub_quotation
                        )
                    )
            for ent in ents:

                if self.within_ents:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                    cues += [m for m in sub_following if m.start >= ent.start]
                else:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                    cues += [m for m in sub_following if m.start >= ent.end]

                cues += [
                    m
                    for m in sub_quotation
                    if (m.start < ent.start) & (m.end > ent.end)
                ]

                reported_speech = ent._.reported_speech or bool(cues)
                ent._.reported_speech = reported_speech

                if self.explain:
                    ent._.reported_speech_cues += cues

                if not self.on_ents_only and reported_speech:
                    for token in ent:
                        token._.reported_speech = True
        return doc
defaults = dict(following=following, preceding=preceding, verbs=verbs, quotation=quotation) class-attribute
regex_matcher = RegexMatcher(attr=attr) instance-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
    )
    terms["verbs"] = self.load_verbs(terms["verbs"])

    quotation = terms.pop("quotation")

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.regex_matcher = RegexMatcher(attr=attr)
    self.regex_matcher.build_patterns(dict(quotation=quotation))

    self.within_ents = within_ents

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("reported_speech"):
        Token.set_extension("reported_speech", default=False)

    if not Token.has_extension("reported_speech_"):
        Token.set_extension(
            "reported_speech_",
            getter=lambda token: "REPORTED"
            if token._.reported_speech
            else "DIRECT",
        )

    if not Span.has_extension("reported_speech"):
        Span.set_extension("reported_speech", default=False)

    if not Span.has_extension("reported_speech_"):
        Span.set_extension(
            "reported_speech_",
            getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
        )

    if not Span.has_extension("reported_speech_cues"):
        Span.set_extension("reported_speech_cues", default=[])

    if not Doc.has_extension("rspeechs"):
        Doc.set_extension("rspeechs", default=[])
load_verbs(verbs)

Conjugate reporting verbs to specific tenses (third person)

PARAMETER DESCRIPTION
verbs

List of reporting verbs to conjugate.

TYPE: List[str]

RETURNS DESCRIPTION
list_rep_verbs

List of reporting verbs conjugated to specific tenses.
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def load_verbs(self, verbs: List[str]) -> List[str]:
    """
    Conjugate reporting verbs to specific tenses (third person)

    Parameters
    ----------
    verbs: list of reporting verbs to conjugate

    Returns
    -------
    list_rep_verbs: List of reporting verbs conjugated to specific tenses.
    """

    rep_verbs = get_verbs(verbs)

    rep_verbs = rep_verbs.loc[
        (
            (rep_verbs["mode"] == "Indicatif")
            & (rep_verbs["tense"] == "Présent")
            & (rep_verbs["person"].isin(["3s", "3p"]))
        )
        | (rep_verbs["tense"] == "Participe Présent")
        | (rep_verbs["tense"] == "Participe Passé")
    ]

    list_rep_verbs = list(rep_verbs["term"].unique())

    return list_rep_verbs
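
The filtering above operates on the DataFrame of conjugated forms returned by get_verbs. The rows and column values in the sketch below are illustrative assumptions (the real ones come from EDS-NLP's conjugation resource); it only shows that third-person present indicative forms and participles are kept:

import pandas as pd

# Toy stand-in for the get_verbs() output; purely illustrative rows.
rep_verbs = pd.DataFrame(
    [
        dict(term="dit", mode="Indicatif", tense="Présent", person="3s"),
        dict(term="disons", mode="Indicatif", tense="Présent", person="1p"),
        dict(term="disant", mode="Participe", tense="Participe Présent", person=None),
        dict(term="dit", mode="Participe", tense="Participe Passé", person=None),
    ]
)

kept = rep_verbs.loc[
    (
        (rep_verbs["mode"] == "Indicatif")
        & (rep_verbs["tense"] == "Présent")
        & (rep_verbs["person"].isin(["3s", "3p"]))
    )
    | (rep_verbs["tense"] == "Participe Présent")
    | (rep_verbs["tense"] == "Participe Passé")
]

print(list(kept["term"].unique()))  # ['dit', 'disant']; 'disons' (1st person) is dropped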
process(doc)

Finds entities related to reported speech.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for reported speech.
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to reported speech.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for reported speech
    """

    matches = self.get_matches(doc)
    matches += list(self.regex_matcher(doc, as_spans=True))

    boundaries = self._boundaries(doc)

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        sub_verbs = get_spans(sub_matches, "verbs")
        sub_quotation = get_spans(sub_matches, "quotation")

        if not sub_preceding + sub_following + sub_verbs + sub_quotation:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.reported_speech = (
                    any(m.end <= token.i for m in sub_preceding + sub_verbs)
                    or any(m.start > token.i for m in sub_following)
                    or any(
                        ((m.start < token.i) & (m.end > token.i + 1))
                        for m in sub_quotation
                    )
                )
        for ent in ents:

            if self.within_ents:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                cues += [m for m in sub_following if m.start >= ent.start]
            else:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                cues += [m for m in sub_following if m.start >= ent.end]

            cues += [
                m
                for m in sub_quotation
                if (m.start < ent.start) & (m.end > ent.end)
            ]

            reported_speech = ent._.reported_speech or bool(cues)
            ent._.reported_speech = reported_speech

            if self.explain:
                ent._.reported_speech_cues += cues

            if not self.on_ents_only and reported_speech:
                for token in ent:
                    token._.reported_speech = True
    return doc

factory

DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, quotation=None, verbs=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/reported_speech/factory.py
@deprecated_factory("rspeech", "eds.reported_speech", default_config=DEFAULT_CONFIG)
@deprecated_factory(
    "reported_speech", "eds.reported_speech", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.reported_speech", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    return ReportedSpeech(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
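
A hedged usage sketch, analogous to the negation example above (the component and matcher names are the standard EDS-NLP factories; the exact output depends on the default verb and cue lists):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleurs"])))
nlp.add_pipe("eds.reported_speech")

doc = nlp("Le patient dit avoir des douleurs depuis trois jours.")
ent = doc.ents[0]

print(ent, ent._.reported_speech, ent._.reported_speech_)
# douleurs True REPORTED ("dit" is a third-person form of a default reporting verb)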

hypothesis

patterns

pseudo: List[str] = ['aucun doute', 'même si', 'pas de condition', 'pas de doute', 'sans aucun doute', 'sans condition', 'sans risque'] module-attribute
confirmation: List[str] = ['certain', 'certaine', 'certainement', 'certaines', 'certains', 'confirmer', 'évidemment', 'évident', 'évidente', 'montrer que', 'visiblement'] module-attribute
preceding: List[str] = ['à condition', 'à la condition que', 'à moins que', 'au cas où', 'conditionnellement', 'doute', 'en admettant que', 'en cas', 'en considérant que', 'en supposant que', 'éventuellement', 'faudrait', 'hypothèse', 'hypothèses', 'idée de', 'pas confirmer', 'pas sûr', 'pas sûre', 'peut correspondre', 'peut-être', 'peuvent correspondre', 'possible', 'possiblement', 'potentiel', 'potentielle', 'potentiellement', 'potentielles', 'potentiels', 'prédisposant à', 'probable', 'probablement', 'probables', "recherche d'", 'recherche de', 'recherche des', 'risque', 'sauf si', 'selon', 'si', "s'il", 'soit', 'sous condition', 'sous réserve', 'suspicion'] module-attribute
following: List[str] = ['?', 'envisagé', 'envisageable', 'envisageables', 'envisagées', 'envisagés', 'hypothétique', 'hypothétiquement', 'hypothétiques', 'pas certain', 'pas certaine', 'pas clair', 'pas claire', 'pas confirmé', 'pas confirmée', 'pas confirmées', 'pas confirmés', 'pas évident', 'pas évidente', 'pas sûr', 'pas sûre', 'possible', 'potentiel', 'potentielle', 'potentiels', 'probable', 'probables', ': \n', ':\n'] module-attribute
verbs_hyp: List[str] = ['douter', 'envisager', "s'apparenter", 'sembler', 'soupçonner', 'suggérer', 'suspecter'] module-attribute
verbs_eds: List[str] = ['abandonner', 'abolir', 'aborder', 'accepter', 'accidenter', 'accompagnemer', 'accompagner', 'acoller', 'acquérir', 'activer', 'actualiser', 'adapter', 'adhérer', 'adjuver', 'admettre', 'administrer', 'adopter', 'adresser', 'aggraver', 'agir', 'agréer', 'aider', 'aimer', 'alcooliser', 'alerter', 'alimenter', 'aller', 'allonger', 'alléger', 'alterner', 'altérer', 'amender', 'amener', 'améliorer', 'amyotrophier', 'améliorer', 'analyser', 'anesthésier', 'animer', 'annexer', 'annuler', 'anonymiser', 'anticiper', 'anticoaguler', 'apercevoir', 'aplatir', 'apparaître', 'appareiller', 'appeler', 'appliquer', 'apporter', 'apprendre', 'apprécier', 'appuyer', 'argumenter', 'arquer', 'arrêter', 'arriver', 'arrêter', 'articuler', 'aspirer', 'asseoir', 'assister', 'associer', 'assurer', 'assécher', 'attacher', 'atteindre', 'attendre', 'attribuer', 'augmenter', 'autonomiser', 'autoriser', 'avaler', 'avancer', 'avertir', 'avoir', 'avérer', 'aérer', 'baisser', 'ballonner', 'blesser', 'bloquer', 'boire', 'border', 'brancher', 'brûler', 'bénéficier', 'cadrer', 'calcifier', 'calculer', 'calmer', 'canaliser', 'capter', 'carencer', 'casser', 'centrer', 'cerner', 'certifier', 'changer', 'charger', 'chevaucher', 'choisir', 'chronomoduler', 'chuter', 'cicatriser', 'circoncire', 'circuler', 'classer', 'codéiner', 'coincer', 'colorer', 'combler', 'commander', 'commencer', 'communiquer', 'comparer', 'compliquer', 'compléter', 'comporter', 'comprendre', 'comprimer', 'concerner', 'conclure', 'condamner', 'conditionner', 'conduire', 'confiner', 'confirmer', 'confronter', 'congeler', 'conjoindre', 'conjuguer', 'connaître', 'connecter', 'conseiller', 'conserver', 'considérer', 'consommer', 'constater', 'constituer', 'consulter', 'contacter', 'contaminer', 'contenir', 'contentionner', 'continuer', 'contracter', 'contrarier', 'contribuer', 'contrôler', 'convaincre', 'convenir', 'convier', 'convoquer', 'copier', 'correspondre', 'corriger', 'corréler', 'coucher', 'coupler', 'couvrir', 'crapotter', 'creuser', 'croire', 'croiser', 'créer', 'crémer', 'crépiter', 'cumuler', 'curariser', 'céder', 'dater', 'demander', 'demeurer', 'destiner', 'devenir', 'devoir', 'diagnostiquer', 'dialyser', 'dicter', 'diffuser', 'différencier', 'différer', 'digérer', 'dilater', 'diluer', 'diminuer', 'diner', 'dire', 'diriger', 'discuter', 'disparaître', 'disposer', 'dissocier', 'disséminer', 'disséquer', 'distendre', 'distinguer', 'divorcer', 'documenter', 'donner', 'dorer', 'doser', 'doubler', 'durer', 'dyaliser', 'dyspner', 'débuter', 'décaler', 'déceler', 'décider', 'déclarer', 'déclencher', 'découvrir', 'décrire', 'décroître', 'décurariser', 'décéder', 'dédier', 'définir', 'dégrader', 'délivrer', 'dépasser', 'dépendre', 'déplacer', 'dépolir', 'déposer', 'dériver', 'dérouler', 'désappareiller', 'désigner', 'désinfecter', 'désorienter', 'détecter', 'déterminer', 'détruire', 'développer', 'dévouer', 'dîner', 'écraser', 'effacer', 'effectuer', 'effondrer', 'emboliser', 'emmener', 'empêcher', 'encadrer', 'encourager', 'endormir', 'endurer', 'enlever', 'enregistrer', 'entamer', 'entendre', 'entourer', 'entraîner', 'entreprendre', 'entrer', 'envahir', 'envisager', 'envoyer', 'espérer', 'essayer', 'estimer', 'être', 'examiner', 'excentrer', 'exciser', 'exclure', 'expirer', 'expliquer', 'explorer', 'exposer', 'exprimer', 'extérioriser', 'exécuter', 'faciliter', 'faire', 'fatiguer', 'favoriser', 'faxer', 'fermer', 'figurer', 'fixer', 'focaliser', 'foncer', 'former', 'fournir', 'fractionner', 'fragmenter', 'fuiter', 'fusionner', 
'garder', 'graver', 'guider', 'gérer', 'gêner', 'honorer', 'hopsitaliser', 'hospitaliser', 'hydrater', 'hyperartérialiser', 'hyperfixer', 'hypertrophier', 'hésiter', 'identifier', 'illustrer', 'immuniser', 'impacter', 'implanter', 'impliquer', 'importer', 'imposer', 'impregner', 'imprimer', 'inclure', 'indifferencier', 'indiquer', 'infecter', 'infertiliser', 'infiltrer', 'informer', 'inhaler', 'initier', 'injecter', 'inscrire', 'insister', 'installer', 'interdire', 'interpréter', 'interrompre', 'intervenir', 'intituler', 'introduire', 'intéragir', 'inverser', 'inviter', 'ioder', 'ioniser', 'irradier', 'itérativer', 'joindre', 'juger', 'justifier', 'laisser', 'laminer', 'lancer', 'latéraliser', 'laver', 'lever', 'lier', 'ligaturer', 'limiter', 'lire', 'localiser', 'loger', 'louper', 'luire', 'lutter', 'lyricer', 'lyser', 'maculer', 'macérer', 'maintenir', 'majorer', 'malaiser', 'manger', 'manifester', 'manipuler', 'manquer', 'marcher', 'marier', 'marmoner', 'marquer', 'masquer', 'masser', 'mater', 'mener', 'mesurer', 'meteoriser', 'mettre', 'mitiger', 'modifier', 'moduler', 'modérer', 'monter', 'montrer', 'motiver', 'moucheter', 'mouler', 'mourir', 'multiopéréer', 'munir', 'muter', 'médicaliser', 'météoriser', 'naître', 'normaliser', 'noter', 'nuire', 'numériser', 'nécessiter', 'négativer', 'objectiver', 'observer', 'obstruer', 'obtenir', 'occasionner', 'occuper', 'opposer', 'opérer', 'organiser', 'orienter', 'ouvrir', 'palper', 'parasiter', 'paraître', 'parcourir', 'parer', 'paresthésier', 'parfaire', 'partager', 'partir', 'parvenir', 'passer', 'penser', 'percevoir', 'perdre', 'perforer', 'permettre', 'persister', 'personnaliser', 'peser', 'pigmenter', 'piloter', 'placer', 'plaindre', 'planifier', 'plier', 'plonger', 'porter', 'poser', 'positionner', 'posséder', 'poursuivre', 'pousser', 'pouvoir', 'pratiquer', 'preciser', 'prendre', 'prescrire', 'prier', 'produire', 'programmer', 'prolonger', 'prononcer', 'proposer', 'prouver', 'provoquer', 'préciser', 'précéder', 'prédominer', 'préexister', 'préférer', 'prélever', 'préparer', 'présenter', 'préserver', 'prévenir', 'prévoir', 'puruler', 'pénétrer', 'radiofréquencer', 'ralentir', 'ramener', 'rappeler', 'rapporter', 'rapprocher', 'rassurer', 'rattacher', 'rattraper', 'realiser', 'recenser', 'recevoir', 'rechercher', 'recommander', 'reconnaître', 'reconsulter', 'recontacter', 'recontrôler', 'reconvoquer', 'recouvrir', 'recueillir', 'recuperer', 'redescendre', 'rediscuter', 'refaire', 'refouler', 'refuser', 'regarder', 'rehausser', 'relancer', 'relayer', 'relever', 'relire', 'relâcher', 'remanier', 'remarquer', 'remercier', 'remettre', 'remonter', 'remplacer', 'remplir', 'rencontrer', 'rendormir', 'rendre', 'renfermer', 'renforcer', 'renouveler', 'renseigner', 'rentrer', 'reparler', 'repasser', 'reporter', 'reprendre', 'represcrire', 'reproduire', 'reprogrammer', 'représenter', 'repérer', 'requérir', 'respecter', 'ressembler', 'ressentir', 'rester', 'restreindre', 'retarder', 'retenir', 'retirer', 'retrouver', 'revasculariser', 'revenir', 'reverticaliser', 'revoir', 'rompre', 'rouler', 'réadapter', 'réadmettre', 'réadresser', 'réaliser', 'récidiver', 'récupérer', 'rédiger', 'réduire', 'réessayer', 'réexpliquer', 'référer', 'régler', 'régresser', 'réhausser', 'réopérer', 'répartir', 'répondre', 'répéter', 'réserver', 'résorber', 'résoudre', 'réséquer', 'réveiller', 'révéler', 'réévaluer', 'rêver', 'sacrer', 'saisir', 'satisfaire', 'savoir', 'scanner', 'scolariser', 'sembler', 'sensibiliser', 'sentir', 'serrer', 'servir', 'sevrer', 'signaler', 
'signer', 'situer', 'siéger', 'soigner', 'sommeiller', 'sonder', 'sortir', 'souffler', 'souhaiter', 'soulager', 'soussigner', 'souvenir', 'spécialiser', 'stabiliser', 'statuer', 'stenter', 'stopper', 'stratifier', 'subir', 'substituer', 'sucrer', 'suggérer', 'suivre', 'supporter', 'supprimer', 'surajouter', 'surmonter', 'surveiller', 'survenir', 'suspecter', 'suspendre', 'suturer', 'synchroniser', 'systématiser', 'sécréter', 'sécuriser', 'sédater', 'séjourner', 'séparer', 'taire', 'taper', 'teinter', 'tendre', 'tenir', 'tenter', 'terminer', 'tester', 'thromboser', 'tirer', 'tiroir', 'tissulaire', 'titulariser', 'tolérer', 'tourner', 'tracer', 'trachéotomiser', 'traduire', 'traiter', 'transcrire', 'transférer', 'transmettre', 'transporter', 'trasnfixer', 'travailler', 'tronquer', 'trouver', 'téléphoner', 'ulcérer', 'uriner', 'utiliser', 'vacciner', 'valider', 'valoir', 'varier', 'vasculariser', 'venir', 'verifier', 'vieillir', 'viser', 'visualiser', 'vivre', 'voir', 'vouloir', 'vérifier', 'ébaucher', 'écarter', 'échographier', 'échoguider', 'échoir', 'échouer', 'éclairer', 'écraser', 'élargir', 'éliminer', 'émousser', 'épaissir', 'épargner', 'épuiser', 'épurer', 'équilibrer', 'établir', 'étager', 'étendre', 'étiqueter', 'étrangler', 'évaluer', 'éviter', 'évoluer', 'évoquer', 'être'] module-attribute

hypothesis

Hypothesis

Bases: Qualifier

Hypothesis detection with spaCy.

The component looks for five kinds of expressions in the text:

  • preceding hypothesis, i.e. cues that precede a hypothetical expression
  • following hypothesis, i.e. cues that follow a hypothetical expression
  • pseudo hypothesis: expressions that contain a hypothesis cue but are not hypotheses (e.g. "pas de doute"/"no doubt")
  • hypothetical verbs: verbs indicating hypothesis (e.g. "douter")
  • classic verbs conjugated to the conditional, thus indicating hypothesis
PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

pseudo

List of pseudo hypothesis cues.

TYPE: Optional[List[str]]

preceding

List of preceding hypothesis cues

TYPE: Optional[List[str]]

following

List of following hypothesis cues.

TYPE: Optional[List[str]]

verbs_hyp

List of hypothetic verbs.

TYPE: Optional[List[str]]

verbs_eds

List of mainstream verbs.

TYPE: Optional[List[str]]

filter_matches

Whether to filter out overlapping matches.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr' we can also add a key for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]

Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
class Hypothesis(Qualifier):
    """
    Hypothesis detection with spaCy.

    The component looks for five kinds of expressions in the text:

    - preceding hypothesis, i.e. cues that precede a hypothetical expression
    - following hypothesis, i.e. cues that follow a hypothetical expression
    - pseudo hypothesis: expressions that contain a hypothesis cue but are
      not hypotheses (e.g. "pas de doute"/"no doubt")
    - hypothetical verbs: verbs indicating hypothesis (e.g. "douter")
    - classic verbs conjugated to the conditional, thus indicating hypothesis

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    pseudo : Optional[List[str]]
        List of pseudo hypothesis cues.
    preceding : Optional[List[str]]
        List of preceding hypothesis cues
    following : Optional[List[str]]
        List of following hypothesis cues.
    verbs_hyp : Optional[List[str]]
        List of hypothetic verbs.
    verbs_eds : Optional[List[str]]
        List of mainstream verbs.
    filter_matches : bool
        Whether to filter out overlapping matches.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionary of regex patterns.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        pseudo=pseudo,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        termination: Optional[List[str]],
        verbs_eds: Optional[List[str]],
        verbs_hyp: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            termination=termination,
            verbs_eds=verbs_eds,
            verbs_hyp=verbs_hyp,
        )
        terms["verbs"] = self.load_verbs(
            verbs_hyp=terms.pop("verbs_hyp"),
            verbs_eds=terms.pop("verbs_eds"),
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.within_ents = within_ents
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Token.has_extension("hypothesis"):
            Token.set_extension("hypothesis", default=False)

        if not Token.has_extension("hypothesis_"):
            Token.set_extension(
                "hypothesis_",
                getter=lambda token: "HYP" if token._.hypothesis else "CERT",
            )

        if not Span.has_extension("hypothesis"):
            Span.set_extension("hypothesis", default=False)

        if not Span.has_extension("hypothesis_"):
            Span.set_extension(
                "hypothesis_",
                getter=lambda span: "HYP" if span._.hypothesis else "CERT",
            )

        if not Span.has_extension("hypothesis_cues"):
            Span.set_extension("hypothesis_cues", default=[])

        if not Doc.has_extension("hypothesis"):
            Doc.set_extension("hypothesis", default=[])

    def load_verbs(
        self,
        verbs_hyp: List[str],
        verbs_eds: List[str],
    ) -> List[str]:
        """
        Conjugate "classic" verbs to conditional, and add hypothesis
        verbs conjugated to all tenses.

        Parameters
        ----------
        verbs_hyp: List of verbs that specifically imply a hypothesis.
        verbs_eds: List of general verbs.

        Returns
        -------
        list of hypothesis verbs conjugated at all tenses and classic
        verbs conjugated to conditional.
        """

        classic_verbs = get_verbs(verbs_eds)
        classic_verbs = classic_verbs.loc[classic_verbs["mode"] == "Conditionnel"]
        list_classic_verbs = list(classic_verbs["term"].unique())

        hypo_verbs = get_verbs(verbs_hyp)
        list_hypo_verbs = list(hypo_verbs["term"].unique())

        return list_hypo_verbs + list_classic_verbs

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to hypothesis.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for hypothesis
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            sub_verbs = get_spans(sub_matches, "verbs")

            if not sub_preceding + sub_following + sub_verbs:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.hypothesis = any(
                        m.end <= token.i for m in sub_preceding + sub_verbs
                    ) or any(m.start > token.i for m in sub_following)

            for ent in ents:

                if self.within_ents:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                    cues += [m for m in sub_following if m.start >= ent.start]
                else:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                    cues += [m for m in sub_following if m.start >= ent.end]

                hypothesis = ent._.hypothesis or bool(cues)

                ent._.hypothesis = hypothesis

                if self.explain and hypothesis:
                    ent._.hypothesis_cues += cues

                if not self.on_ents_only and hypothesis:
                    for token in ent:
                        token._.hypothesis = True

        return doc
defaults = dict(following=following, preceding=preceding, pseudo=pseudo, termination=termination, verbs_eds=verbs_eds, verbs_hyp=verbs_hyp) class-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, termination, verbs_eds, verbs_hyp, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs_eds: Optional[List[str]],
    verbs_hyp: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
    )
    terms["verbs"] = self.load_verbs(
        verbs_hyp=terms.pop("verbs_hyp"),
        verbs_eds=terms.pop("verbs_eds"),
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.within_ents = within_ents
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
@staticmethod
def set_extensions() -> None:
    if not Token.has_extension("hypothesis"):
        Token.set_extension("hypothesis", default=False)

    if not Token.has_extension("hypothesis_"):
        Token.set_extension(
            "hypothesis_",
            getter=lambda token: "HYP" if token._.hypothesis else "CERT",
        )

    if not Span.has_extension("hypothesis"):
        Span.set_extension("hypothesis", default=False)

    if not Span.has_extension("hypothesis_"):
        Span.set_extension(
            "hypothesis_",
            getter=lambda span: "HYP" if span._.hypothesis else "CERT",
        )

    if not Span.has_extension("hypothesis_cues"):
        Span.set_extension("hypothesis_cues", default=[])

    if not Doc.has_extension("hypothesis"):
        Doc.set_extension("hypothesis", default=[])
load_verbs(verbs_hyp, verbs_eds)

Conjugate "classic" verbs to conditional, and add hypothesis verbs conjugated to all tenses.

PARAMETER DESCRIPTION
verbs_hyp

List of verbs that specifically imply a hypothesis.

TYPE: List[str]

verbs_eds

List of general verbs.

TYPE: List[str]

RETURNS DESCRIPTION
List of hypothesis verbs conjugated to all tenses and classic verbs conjugated to the conditional.
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def load_verbs(
    self,
    verbs_hyp: List[str],
    verbs_eds: List[str],
) -> List[str]:
    """
    Conjugate "classic" verbs to conditional, and add hypothesis
    verbs conjugated to all tenses.

    Parameters
    ----------
    verbs_hyp: List of verbs that specifically imply a hypothesis.
    verbs_eds: List of general verbs.

    Returns
    -------
    list of hypothesis verbs conjugated at all tenses and classic
    verbs conjugated to conditional.
    """

    classic_verbs = get_verbs(verbs_eds)
    classic_verbs = classic_verbs.loc[classic_verbs["mode"] == "Conditionnel"]
    list_classic_verbs = list(classic_verbs["term"].unique())

    hypo_verbs = get_verbs(verbs_hyp)
    list_hypo_verbs = list(hypo_verbs["term"].unique())

    return list_hypo_verbs + list_classic_verbs
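
As with the reported speech component, the sketch below uses an illustrative toy DataFrame (the actual rows come from EDS-NLP's conjugation resource) to show that "classic" verbs are only kept in the conditional mood, while hypothesis verbs are kept in all tenses:

import pandas as pd

# Toy stand-in for get_verbs() output; the values are assumptions for illustration.
classic_verbs = pd.DataFrame(
    [
        dict(term="pourrait", mode="Conditionnel"),
        dict(term="pourraient", mode="Conditionnel"),
        dict(term="peut", mode="Indicatif"),
    ]
)

kept = classic_verbs.loc[classic_verbs["mode"] == "Conditionnel"]
print(list(kept["term"].unique()))  # ['pourrait', 'pourraient']; 'peut' is dropped

# Hypothesis verbs (e.g. forms of "douter" or "suspecter") would be added
# without any mood or tense restriction.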
process(doc)

Finds entities related to hypothesis.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for hypothesis.
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to hypothesis.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for hypothesis
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        sub_verbs = get_spans(sub_matches, "verbs")

        if not sub_preceding + sub_following + sub_verbs:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.hypothesis = any(
                    m.end <= token.i for m in sub_preceding + sub_verbs
                ) or any(m.start > token.i for m in sub_following)

        for ent in ents:

            if self.within_ents:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                cues += [m for m in sub_following if m.start >= ent.start]
            else:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                cues += [m for m in sub_following if m.start >= ent.end]

            hypothesis = ent._.hypothesis or bool(cues)

            ent._.hypothesis = hypothesis

            if self.explain and hypothesis:
                ent._.hypothesis_cues += cues

            if not self.on_ents_only and hypothesis:
                for token in ent:
                    token._.hypothesis = True

    return doc

factory

DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, termination=None, verbs_hyp=None, verbs_eds=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, termination, verbs_eds, verbs_hyp, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/hypothesis/factory.py
@deprecated_factory("hypothesis", "eds.hypothesis", default_config=DEFAULT_CONFIG)
@Language.factory("eds.hypothesis", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs_eds: Optional[List[str]],
    verbs_hyp: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    return Hypothesis(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
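
A hedged usage sketch (same assumptions as the previous examples about the auxiliary components; "suspicion" appears in the default preceding cues listed above):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(pneumopathie=["pneumopathie"])))
nlp.add_pipe("eds.hypothesis", config=dict(explain=True))

doc = nlp("Suspicion de pneumopathie droite.")
ent = doc.ents[0]

print(ent, ent._.hypothesis, ent._.hypothesis_)  # pneumopathie True HYP
print(ent._.hypothesis_cues)                     # the matched cue spans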