
edsnlp

EDS-NLP

__version__ = '0.4.4' module-attribute

BASE_DIR = Path(__file__).parent module-attribute

conjugator

conjugate_verb(verb, conjugator)

Conjugates the verb using an instance of mlconjug3, and formats the results in a pandas DataFrame.

PARAMETER DESCRIPTION
verb

Verb to conjugate.

TYPE: str

conjugator

mlconjug3 instance for conjugating.

TYPE: mlconjug3.Conjugator

RETURNS DESCRIPTION
pd.DataFrame

Normalized dataframe containing all conjugated forms for the verb.

Source code in edsnlp/conjugator.py
def conjugate_verb(
    verb: str,
    conjugator: mlconjug3.Conjugator,
) -> pd.DataFrame:
    """
    Conjugates the verb using an instance of mlconjug3,
    and formats the results in a pandas `DataFrame`.

    Parameters
    ----------
    verb : str
        Verb to conjugate.
    conjugator : mlconjug3.Conjugator
        mlconjug3 instance for conjugating.

    Returns
    -------
    pd.DataFrame
        Normalized dataframe containing all conjugated forms
        for the verb.
    """

    df = pd.DataFrame(
        conjugator.conjugate(verb).iterate(),
        columns=["mode", "tense", "person", "term"],
    )

    df.term = df.term.fillna(df.person)
    df.loc[df.person == df.term, "person"] = None

    df.insert(0, "verb", verb)

    return df

conjugate(verbs, language='fr')

Conjugate a list of verbs.

PARAMETER DESCRIPTION
verbs

List of verbs to conjugate

TYPE: Union[str, List[str]]

language

Language to conjugate. Defaults to French (fr).

TYPE: str DEFAULT: 'fr'

RETURNS DESCRIPTION
pd.DataFrame

Dataframe containing the conjugations for the provided verbs. Columns: verb, mode, tense, person, term

Source code in edsnlp/conjugator.py
def conjugate(
    verbs: Union[str, List[str]],
    language: str = "fr",
) -> pd.DataFrame:
    """
    Conjugate a list of verbs.

    Parameters
    ----------
    verbs : Union[str, List[str]]
        List of verbs to conjugate
    language: str
        Language to conjugate. Defaults to French (`fr`).

    Returns
    -------
    pd.DataFrame
        Dataframe containing the conjugations for the provided verbs.
        Columns: `verb`, `mode`, `tense`, `person`, `term`
    """
    if isinstance(verbs, str):
        verbs = [verbs]

    conjugator = mlconjug3.Conjugator(language=language)

    df = pd.concat([conjugate_verb(verb, conjugator=conjugator) for verb in verbs])

    df = df.reset_index(drop=True)

    return df
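
Example: a minimal usage sketch, assuming the optional mlconjug3 dependency is installed (the import path mirrors the source location shown above).

from edsnlp.conjugator import conjugate

# Conjugate a single French verb; a list of verbs works as well.
df = conjugate("aimer", language="fr")

# One row per conjugated form, with columns: verb, mode, tense, person, term.
print(df.query('mode == "Indicatif" and tense == "Présent"'))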

get_conjugated_verbs(verbs, matches, language='fr')

Get a list of conjugated verbs.

PARAMETER DESCRIPTION
verbs

List of verbs to conjugate.

TYPE: Union[str, List[str]]

matches

List of dictionaries describing the mode/tense/person combinations to keep.

TYPE: Union[List[Dict[str, str]], Dict[str, str]]

language

Language to conjugate. Defaults to French (fr).

TYPE: str, optional DEFAULT: 'fr'

RETURNS DESCRIPTION
List[str]

List of terms to look for.

Examples:

>>> get_conjugated_verbs(
        "aimer",
        dict(mode="Indicatif", tense="Présent", person="1p"),
    )
['aimons']
Source code in edsnlp/conjugator.py
def get_conjugated_verbs(
    verbs: Union[str, List[str]],
    matches: Union[List[Dict[str, str]], Dict[str, str]],
    language: str = "fr",
) -> List[str]:
    """
    Get a list of conjugated verbs.

    Parameters
    ----------
    verbs : Union[str, List[str]]
        List of verbs to conjugate.
    matches : Union[List[Dict[str, str]], Dict[str, str]]
        List of dictionary describing the mode/tense/persons to keep.
    language : str, optional
        Language to conjugate, by default `"fr"` (French)

    Returns
    -------
    List[str]
        List of terms to look for.

    Examples
    --------
    >>> get_conjugated_verbs(
            "aimer",
            dict(mode="Indicatif", tense="Présent", person="1p"),
        )
    ['aimons']
    """

    if isinstance(matches, dict):
        matches = [matches]

    terms = []

    df = conjugate(
        verbs=verbs,
        language=language,
    )

    for match in matches:
        q = " & ".join([f'{k} == "{v}"' for k, v in match.items()])
        terms.extend(df.query(q).term.unique())

    return list(set(terms))

extensions

components

matchers

phrase

PatternDict = Dict[str, Union[str, Dict[str, str]]] module-attribute
EDSPhraseMatcher

Bases: object

PhraseMatcher that matches "over" excluded tokens.

PARAMETER DESCRIPTION
vocab

spaCy vocabulary to match on.

TYPE: Vocab

attr

Default attribute to match on, by default "TEXT". Can be overridden in the add method.

To match on a custom attribute, prepend the attribute name with _.

TYPE: str

ignore_excluded

Whether to ignore excluded tokens, by default True

TYPE: bool, optional

exclude_newlines

Whether to exclude new lines, by default False

TYPE: bool, optional

Source code in edsnlp/matchers/phrase.py
class EDSPhraseMatcher(object):
    """
    PhraseMatcher that matches "over" excluded tokens.

    Parameters
    ----------
    vocab : Vocab
        spaCy vocabulary to match on.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.

        To match on a custom attribute, prepend the attribute name with `_`.
    ignore_excluded : bool, optional
        Whether to ignore excluded tokens, by default True
    exclude_newlines : bool, optional
        Whether to exclude new lines, by default False
    """

    def __init__(
        self,
        vocab: Vocab,
        attr: str = "TEXT",
        ignore_excluded: bool = True,
        exclude_newlines: bool = False,
    ):
        self.matcher = Matcher(vocab, validate=True)
        self.attr = attr
        self.ignore_excluded = ignore_excluded

        self.exclusion_attribute = (
            "excluded_or_space" if exclude_newlines else "excluded"
        )

    @staticmethod
    def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
        if custom_attr:
            return getattr(token._, attr)
        else:
            attr = ATTRIBUTES.get(attr)
            return getattr(token, attr)

    def create_pattern(
        self,
        match_pattern: Doc,
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> List[PatternDict]:
        """
        Create a pattern

        Parameters
        ----------
        match_pattern : Doc
            A spaCy doc object, to use as match model.
        attr : str, optional
            Overwrite attribute to match on.
        ignore_excluded: bool, optional
            Whether to skip excluded tokens.

        Returns
        -------
        List[PatternDict]
            A spaCy rule-based pattern.
        """

        ignore_excluded = ignore_excluded or self.ignore_excluded

        attr = attr or self.attr
        custom_attr = attr.startswith("_")

        if custom_attr:
            attr = attr.lstrip("_").lower()

            pattern = []

            for token in match_pattern:
                pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
                if ignore_excluded and token.whitespace_:
                    # If the token is followed by a whitespace,
                    # we let it match on a pollution
                    pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

            return pattern
        else:
            pattern = []

            for token in match_pattern:
                pattern.append({attr: self.get_attr(token, attr, False)})
                if ignore_excluded and token.whitespace_:
                    # If the token is followed by a whitespace,
                    # we let it match on a pollution
                    pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

            return pattern

    def build_patterns(self, nlp: Language, terms: Patterns):
        """
        Build patterns and adds them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        nlp : Language
            The instance of the spaCy language class.
        terms : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """

        if not terms:
            terms = dict()

        for key, expressions in terms.items():
            if isinstance(expressions, dict):
                attr = expressions.get("attr")
                expressions = expressions.get("patterns")
            else:
                attr = None
            if isinstance(expressions, str):
                expressions = [expressions]
            patterns = list(nlp.pipe(expressions))
            self.add(key, patterns, attr)

    def add(
        self,
        key: str,
        patterns: List[Doc],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> None:
        """
        Add a pattern.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[str]
            List of patterns to add.
        attr : str, optional
            Overwrite the attribute to match on for this specific pattern.
        ignore_excluded : bool, optional
            Overwrite the parameter for this specific pattern.
        """

        patterns = [
            self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
            for pattern in patterns
        ]
        self.matcher.add(key, patterns)

    def remove(
        self,
        key: str,
    ) -> None:
        """
        Remove a pattern.

        Parameters
        ----------
        key : str
            key of the pattern to remove.

        Raises
        ------
        ValueError
            Should the key not be contained in the registry.
        """
        self.matcher.remove(key)

    def __len__(self):
        return len(self.matcher)

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
    ) -> Generator:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Whether to return matches as spans.

        Yields
        -------
        match: Span
            A match.
        """
        if len(self.matcher):
            for match in self.matcher(doclike, as_spans=as_spans):
                yield match
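
Example: a minimal usage sketch on a blank French pipeline. ignore_excluded is disabled here so that the sketch does not rely on the excluded custom attribute (typically set by EDS-NLP's normalisation components).

import spacy
from edsnlp.matchers.phrase import EDSPhraseMatcher

nlp = spacy.blank("fr")
doc = nlp("Le patient présente une pneumopathie à droite.")

matcher = EDSPhraseMatcher(nlp.vocab, attr="LOWER", ignore_excluded=False)
matcher.build_patterns(nlp, {"pneumonia": ["pneumopathie", "pneumonie"]})

for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)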
matcher = Matcher(vocab, validate=True) instance-attribute
attr = attr instance-attribute
ignore_excluded = ignore_excluded instance-attribute
exclusion_attribute = 'excluded_or_space' if exclude_newlines else 'excluded' instance-attribute
__init__(vocab, attr='TEXT', ignore_excluded=True, exclude_newlines=False)
Source code in edsnlp/matchers/phrase.py
def __init__(
    self,
    vocab: Vocab,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
    exclude_newlines: bool = False,
):
    self.matcher = Matcher(vocab, validate=True)
    self.attr = attr
    self.ignore_excluded = ignore_excluded

    self.exclusion_attribute = (
        "excluded_or_space" if exclude_newlines else "excluded"
    )
get_attr(token, attr, custom_attr=False)
Source code in edsnlp/matchers/phrase.py
@staticmethod
def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
    if custom_attr:
        return getattr(token._, attr)
    else:
        attr = ATTRIBUTES.get(attr)
        return getattr(token, attr)
create_pattern(match_pattern, attr=None, ignore_excluded=None)

Create a pattern

PARAMETER DESCRIPTION
match_pattern

A spaCy doc object, to use as match model.

TYPE: Doc

attr

Overwrite attribute to match on.

TYPE: str, optional DEFAULT: None

ignore_excluded

Whether to skip excluded tokens.

TYPE: Optional[bool] DEFAULT: None

RETURNS DESCRIPTION
List[PatternDict]

A spaCy rule-based pattern.

Source code in edsnlp/matchers/phrase.py
def create_pattern(
    self,
    match_pattern: Doc,
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> List[PatternDict]:
    """
    Create a pattern

    Parameters
    ----------
    match_pattern : Doc
        A spaCy doc object, to use as match model.
    attr : str, optional
        Overwrite attribute to match on.
    ignore_excluded: bool, optional
        Whether to skip excluded tokens.

    Returns
    -------
    List[PatternDict]
        A spaCy rule-based pattern.
    """

    ignore_excluded = ignore_excluded or self.ignore_excluded

    attr = attr or self.attr
    custom_attr = attr.startswith("_")

    if custom_attr:
        attr = attr.lstrip("_").lower()

        pattern = []

        for token in match_pattern:
            pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
            if ignore_excluded and token.whitespace_:
                # If the token is followed by a whitespace,
                # we let it match on a pollution
                pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

        return pattern
    else:
        pattern = []

        for token in match_pattern:
            pattern.append({attr: self.get_attr(token, attr, False)})
            if ignore_excluded and token.whitespace_:
                # If the token is followed by a whitespace,
                # we let it match on a pollution
                pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

        return pattern
build_patterns(nlp, terms)

Build patterns and add them for matching. Helper function for pipelines using this matcher.

PARAMETER DESCRIPTION
nlp

The instance of the spaCy language class.

TYPE: Language

terms

Dictionary of label/terms, or label/dictionary of terms/attribute.

TYPE: Patterns

Source code in edsnlp/matchers/phrase.py
def build_patterns(self, nlp: Language, terms: Patterns):
    """
    Build patterns and adds them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    nlp : Language
        The instance of the spaCy language class.
    terms : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """

    if not terms:
        terms = dict()

    for key, expressions in terms.items():
        if isinstance(expressions, dict):
            attr = expressions.get("attr")
            expressions = expressions.get("patterns")
        else:
            attr = None
        if isinstance(expressions, str):
            expressions = [expressions]
        patterns = list(nlp.pipe(expressions))
        self.add(key, patterns, attr)
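
The terms argument accepts, for each label, either a plain list of phrases or a dictionary that also overrides the matching attribute. A small sketch, reusing the nlp and matcher objects from the example above:

terms = {
    "diabetes": ["diabète", "diabétique"],
    "covid": dict(patterns=["covid", "covid-19"], attr="LOWER"),
}
matcher.build_patterns(nlp, terms)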
add(key, patterns, attr=None, ignore_excluded=None)

Add a pattern.

PARAMETER DESCRIPTION
key

Key of the new/updated pattern.

TYPE: str

patterns

List of patterns to add.

TYPE: List[str]

attr

Overwrite the attribute to match on for this specific pattern.

TYPE: str, optional DEFAULT: None

ignore_excluded

Overwrite the parameter for this specific pattern.

TYPE: bool, optional DEFAULT: None

Source code in edsnlp/matchers/phrase.py
def add(
    self,
    key: str,
    patterns: List[Doc],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> None:
    """
    Add a pattern.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[str]
        List of patterns to add.
    attr : str, optional
        Overwrite the attribute to match on for this specific pattern.
    ignore_excluded : bool, optional
        Overwrite the parameter for this specific pattern.
    """

    patterns = [
        self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
        for pattern in patterns
    ]
    self.matcher.add(key, patterns)
remove(key)

Remove a pattern.

PARAMETER DESCRIPTION
key

key of the pattern to remove.

TYPE: str

RAISES DESCRIPTION
ValueError

Should the key not be contained in the registry.

Source code in edsnlp/matchers/phrase.py
def remove(
    self,
    key: str,
) -> None:
    """
    Remove a pattern.

    Parameters
    ----------
    key : str
        key of the pattern to remove.

    Raises
    ------
    ValueError
        Should the key not be contained in the registry.
    """
    self.matcher.remove(key)
__len__()
Source code in edsnlp/matchers/phrase.py
def __len__(self):
    return len(self.matcher)
__call__(doclike, as_spans=False)

Performs matching. Yields matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object.

TYPE: Union[Doc, Span]

as_spans

Whether to return matches as spans.

DEFAULT: False

YIELDS DESCRIPTION
match

A match.

Source code in edsnlp/matchers/phrase.py
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
) -> Generator:
    """
    Performs matching. Yields matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Whether to return matches as spans.

    Yields
    -------
    match: Span
        A match.
    """
    if len(self.matcher):
        for match in self.matcher(doclike, as_spans=as_spans):
            yield match
get_normalized_variant(doclike)
Source code in edsnlp/matchers/phrase.py
def get_normalized_variant(doclike: Union[Span, Doc]) -> str:
    tokens = [t.text + t.whitespace_ for t in doclike if not t._.excluded]
    variant = "".join(tokens)
    variant = variant.rstrip(" ")
    variant = re.sub(r"\s+", " ", variant)
    return variant
phrase_matcher_factory(attr, ignore_excluded, exclude_newlines)
Source code in edsnlp/matchers/phrase.py
@registry.misc("edsnlp.factories.phrasematcher.v1")
def phrase_matcher_factory(
    attr: str,
    ignore_excluded: bool,
    exclude_newlines: bool,
):
    return partial(
        EDSPhraseMatcher,
        attr=attr,
        ignore_excluded=ignore_excluded,
        exclude_newlines=exclude_newlines,
    )

regex

RegexMatcher

Bases: object

Simple RegExp matcher.

PARAMETER DESCRIPTION
alignment_mode

How spans should be aligned with tokens. Possible values are "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span) and "expand" (span of all tokens at least partially covered by the character span). Defaults to "expand".

TYPE: str

attr

Default attribute to match on, by default "TEXT". Can be overridden in the add method.

TYPE: str

ignore_excluded

Whether to skip excluded tokens, by default False.

TYPE: bool

Source code in edsnlp/matchers/regex.py
class RegexMatcher(object):
    """
    Simple RegExp matcher.

    Parameters
    ----------
    alignment_mode : str
        How spans should be aligned with tokens.
        Possible values are `strict` (character indices must be aligned
        with token boundaries), "contract" (span of all tokens completely
        within the character span), "expand" (span of all tokens at least
        partially covered by the character span).
        Defaults to `expand`.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.
    ignore_excluded : bool
        Whether to skip exclusions
    """

    def __init__(
        self,
        alignment_mode: str = "expand",
        attr: str = "TEXT",
        ignore_excluded: bool = False,
    ):
        self.alignment_mode = alignment_mode
        self.regex = []

        self.default_attr = attr

        self.ignore_excluded = ignore_excluded

    def build_patterns(self, regex: Patterns):
        """
        Build patterns and adds them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        regex : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """
        if not regex:
            regex = dict()

        for key, patterns in regex.items():
            if isinstance(patterns, dict):
                attr = patterns.get("attr")
                alignment_mode = patterns.get("alignment_mode")
                patterns = patterns.get("regex")
            else:
                attr = None
                alignment_mode = None

            if isinstance(patterns, str):
                patterns = [patterns]

            self.add(
                key=key, patterns=patterns, attr=attr, alignment_mode=alignment_mode
            )

    def add(
        self,
        key: str,
        patterns: List[str],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
        alignment_mode: Optional[str] = None,
    ):
        """
        Add a pattern to the registry.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[str]
            List of patterns to add.
        attr : str, optional
            Attribute to use for matching.
            By default uses the `default_attr` attribute
        ignore_excluded : bool, optional
            Whether to skip excluded tokens during matching.
        alignment_mode : str, optional
            Overwrite alignment mode.
        """

        if attr is None:
            attr = self.default_attr

        if ignore_excluded is None:
            ignore_excluded = self.ignore_excluded

        if alignment_mode is None:
            alignment_mode = self.alignment_mode

        patterns = [compile_regex(pattern) for pattern in patterns]

        self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))

    def remove(
        self,
        key: str,
    ):
        """
        Remove a pattern for the registry.

        Parameters
        ----------
        key : str
            key of the pattern to remove.

        Raises
        ------
        ValueError
            If the key is not present in the registered patterns.
        """
        n = len(self.regex)
        self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
        if len(self.regex) == n:
            raise ValueError(f"`{key}` is not referenced in the matcher")

    def __len__(self):
        return len(set([regex[0] for regex in self.regex]))

    def match(
        self,
        doclike: Union[Doc, Span],
    ) -> Tuple[Span, re.Match]:
        """
        Iterates on the matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object to match on.

        Yields
        -------
        span:
            A match.
        """

        for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
            text = get_text(doclike, attr, ignore_excluded)

            for pattern in patterns:
                for match in pattern.finditer(text):
                    logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                    span = create_span(
                        doclike=doclike,
                        start_char=match.start(),
                        end_char=match.end(),
                        key=key,
                        attr=attr,
                        alignment_mode=alignment_mode,
                        ignore_excluded=ignore_excluded,
                    )

                    if span is None:
                        continue

                    yield span, match

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
        return_groupdict=False,
    ) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Returns matches as spans.

        Yields
        ------
        span:
            A match.
        groupdict:
            Additional information coming from the named patterns
            in the regular expression.
        """
        for span, match in self.match(doclike):
            if not as_spans:
                offset = doclike[0].i
                span = (span.label, span.start - offset, span.end - offset)
            if return_groupdict:
                yield span, match.groupdict()
            else:
                yield span
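
Example: a minimal usage sketch on a blank French pipeline.

import spacy
from edsnlp.matchers.regex import RegexMatcher

nlp = spacy.blank("fr")
doc = nlp("Poids : 76 kg, taille : 1m78.")

matcher = RegexMatcher(attr="TEXT", alignment_mode="expand")
matcher.add("weight", [r"\d+ ?kg"])

for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)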
alignment_mode = alignment_mode instance-attribute
regex = [] instance-attribute
default_attr = attr instance-attribute
ignore_excluded = ignore_excluded instance-attribute
__init__(alignment_mode='expand', attr='TEXT', ignore_excluded=False)
Source code in edsnlp/matchers/regex.py
def __init__(
    self,
    alignment_mode: str = "expand",
    attr: str = "TEXT",
    ignore_excluded: bool = False,
):
    self.alignment_mode = alignment_mode
    self.regex = []

    self.default_attr = attr

    self.ignore_excluded = ignore_excluded
build_patterns(regex)

Build patterns and add them for matching. Helper function for pipelines using this matcher.

PARAMETER DESCRIPTION
regex

Dictionary of label/terms, or label/dictionary of terms/attribute.

TYPE: Patterns

Source code in edsnlp/matchers/regex.py
def build_patterns(self, regex: Patterns):
    """
    Build patterns and adds them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    regex : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """
    if not regex:
        regex = dict()

    for key, patterns in regex.items():
        if isinstance(patterns, dict):
            attr = patterns.get("attr")
            alignment_mode = patterns.get("alignment_mode")
            patterns = patterns.get("regex")
        else:
            attr = None
            alignment_mode = None

        if isinstance(patterns, str):
            patterns = [patterns]

        self.add(
            key=key, patterns=patterns, attr=attr, alignment_mode=alignment_mode
        )
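
As with the phrase matcher, each label maps either to a plain list of regular expressions or to a dictionary that also overrides attr and alignment_mode. A small sketch, reusing the matcher from the example above:

regex = {
    "weight": [r"\d+ ?kg"],
    "height": dict(regex=[r"\d+m\d{2}"], attr="TEXT", alignment_mode="expand"),
}
matcher.build_patterns(regex)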
add(key, patterns, attr=None, ignore_excluded=None, alignment_mode=None)

Add a pattern to the registry.

PARAMETER DESCRIPTION
key

Key of the new/updated pattern.

TYPE: str

patterns

List of patterns to add.

TYPE: List[str]

attr

Attribute to use for matching. By default uses the default_attr attribute

TYPE: str, optional DEFAULT: None

ignore_excluded

Whether to skip excluded tokens during matching.

TYPE: bool, optional DEFAULT: None

alignment_mode

Overwrite alignment mode.

TYPE: str, optional DEFAULT: None

Source code in edsnlp/matchers/regex.py
def add(
    self,
    key: str,
    patterns: List[str],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
    alignment_mode: Optional[str] = None,
):
    """
    Add a pattern to the registry.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[str]
        List of patterns to add.
    attr : str, optional
        Attribute to use for matching.
        By default uses the `default_attr` attribute
    ignore_excluded : bool, optional
        Whether to skip excluded tokens during matching.
    alignment_mode : str, optional
        Overwrite alignment mode.
    """

    if attr is None:
        attr = self.default_attr

    if ignore_excluded is None:
        ignore_excluded = self.ignore_excluded

    if alignment_mode is None:
        alignment_mode = self.alignment_mode

    patterns = [compile_regex(pattern) for pattern in patterns]

    self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))
remove(key)

Remove a pattern from the registry.

PARAMETER DESCRIPTION
key

key of the pattern to remove.

TYPE: str

RAISES DESCRIPTION
ValueError

If the key is not present in the registered patterns.

Source code in edsnlp/matchers/regex.py
def remove(
    self,
    key: str,
):
    """
    Remove a pattern for the registry.

    Parameters
    ----------
    key : str
        key of the pattern to remove.

    Raises
    ------
    ValueError
        If the key is not present in the registered patterns.
    """
    n = len(self.regex)
    self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
    if len(self.regex) == n:
        raise ValueError(f"`{key}` is not referenced in the matcher")
__len__()
Source code in edsnlp/matchers/regex.py
def __len__(self):
    return len(set([regex[0] for regex in self.regex]))
match(doclike)

Iterates on the matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object to match on.

TYPE: Union[Doc, Span]

YIELDS DESCRIPTION
span

A match.

Source code in edsnlp/matchers/regex.py
def match(
    self,
    doclike: Union[Doc, Span],
) -> Tuple[Span, re.Match]:
    """
    Iterates on the matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object to match on.

    Yields
    -------
    span:
        A match.
    """

    for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
        text = get_text(doclike, attr, ignore_excluded)

        for pattern in patterns:
            for match in pattern.finditer(text):
                logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                span = create_span(
                    doclike=doclike,
                    start_char=match.start(),
                    end_char=match.end(),
                    key=key,
                    attr=attr,
                    alignment_mode=alignment_mode,
                    ignore_excluded=ignore_excluded,
                )

                if span is None:
                    continue

                yield span, match
__call__(doclike, as_spans=False, return_groupdict=False)

Performs matching. Yields matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object.

TYPE: Union[Doc, Span]

as_spans

Returns matches as spans.

DEFAULT: False

YIELDS DESCRIPTION
span

A match.

groupdict

Additional information coming from the named patterns in the regular expression.

Source code in edsnlp/matchers/regex.py
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
    return_groupdict=False,
) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
    """
    Performs matching. Yields matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Returns matches as spans.

    Yields
    ------
    span:
        A match.
    groupdict:
        Additional information coming from the named patterns
        in the regular expression.
    """
    for span, match in self.match(doclike):
        if not as_spans:
            offset = doclike[0].i
            span = (span.label, span.start - offset, span.end - offset)
        if return_groupdict:
            yield span, match.groupdict()
        else:
            yield span
get_first_included(doclike)
Source code in edsnlp/matchers/regex.py
@lru_cache(32)
def get_first_included(doclike: Union[Doc, Span]) -> Token:
    for token in doclike:
        if not token._.excluded:
            return token
    raise IndexError("The provided Span does not include any token")
create_span(doclike, start_char, end_char, key, attr, alignment_mode, ignore_excluded)

spaCy only allows the strict alignment mode for char_span on Span objects. This function circumvents that limitation.

PARAMETER DESCRIPTION
doclike

Doc or Span.

TYPE: Union[Doc, Span]

start_char

Character index within the Doc-like object.

TYPE: int

end_char

Character index of the end, within the Doc-like object.

TYPE: int

key

The key used to match.

TYPE: str

alignment_mode

The alignment mode.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

RETURNS DESCRIPTION
span

A span matched on the Doc-like object.

Source code in edsnlp/matchers/regex.py
def create_span(
    doclike: Union[Doc, Span],
    start_char: int,
    end_char: int,
    key: str,
    attr: str,
    alignment_mode: str,
    ignore_excluded: bool,
) -> Span:
    """
    spaCy only allows strict alignment mode for char_span on Spans.
    This method circumvents this.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        `Doc` or `Span`.
    start_char : int
        Character index within the Doc-like object.
    end_char : int
        Character index of the end, within the Doc-like object.
    key : str
        The key used to match.
    alignment_mode : str
        The alignment mode.
    ignore_excluded : bool
        Whether to skip excluded tokens.

    Returns
    -------
    span:
        A span matched on the Doc-like object.
    """

    doc = doclike if isinstance(doclike, Doc) else doclike.doc

    # Handle the simple case immediately
    if attr in {"TEXT", "LOWER"} and not ignore_excluded:
        off = doclike[0].idx
        return doc.char_span(
            start_char + off,
            end_char + off,
            label=key,
            alignment_mode=alignment_mode,
        )

    # If doclike is a Span, we need to get the clean
    # index of the first included token
    if ignore_excluded:
        original, clean = alignment(
            doc=doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        first_included = get_first_included(doclike)
        i = bisect_left(original, first_included.idx)
        first = clean[i]

    else:
        first = doclike[0].idx

    start_char = (
        first
        + start_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + start_char,
        )
    )

    end_char = (
        first
        + end_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + end_char,
        )
    )

    span = doc.char_span(
        start_char,
        end_char,
        label=key,
        alignment_mode=alignment_mode,
    )

    return span

utils

ListOrStr = Union[List[str], str] module-attribute
DictOrPattern = Union[Dict[str, ListOrStr], ListOrStr] module-attribute
Patterns = Dict[str, DictOrPattern] module-attribute
ATTRIBUTES = {'LOWER': 'lower_', 'TEXT': 'text', 'NORM': 'norm_', 'SHAPE': 'shape_'} module-attribute
offset
token_length(token, custom, attr)
Source code in edsnlp/matchers/utils/offset.py
def token_length(token: Token, custom: bool, attr: str):
    if custom:
        text = getattr(token._, attr)
    else:
        text = getattr(token, attr)
    return len(text)
alignment(doc, attr='TEXT', ignore_excluded=True)

Align different representations of a Doc or Span object.

PARAMETER DESCRIPTION
doc

spaCy Doc or Span object

TYPE: Doc

attr

Attribute to use, by default "TEXT"

TYPE: str, optional DEFAULT: 'TEXT'

ignore_excluded

Whether to remove excluded tokens, by default True

TYPE: bool, optional DEFAULT: True

RETURNS DESCRIPTION
Tuple[List[int], List[int]]

An alignment tuple: original and clean lists.

Source code in edsnlp/matchers/utils/offset.py
@lru_cache(maxsize=32)
def alignment(
    doc: Doc,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
) -> Tuple[List[int], List[int]]:
    """
    Align different representations of a `Doc` or `Span` object.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` or `Span` object
    attr : str, optional
        Attribute to use, by default `"TEXT"`
    ignore_excluded : bool, optional
        Whether to remove excluded tokens, by default True

    Returns
    -------
    Tuple[List[int], List[int]]
        An alignment tuple: original and clean lists.
    """
    assert isinstance(doc, Doc)

    attr = attr.upper()
    attr = ATTRIBUTES.get(attr, attr)

    custom = attr.startswith("_")

    if custom:
        attr = attr[1:].lower()

    # Define the length function
    length = partial(token_length, custom=custom, attr=attr)

    original = []
    clean = []

    cursor = 0

    for token in doc:

        if not ignore_excluded or not token._.excluded:

            # The token is not excluded, we add its extremities to the list
            original.append(token.idx)

            # We add the cursor
            clean.append(cursor)
            cursor += length(token)

            if token.whitespace_:
                cursor += 1

    return original, clean
offset(doc, attr, ignore_excluded, index)

Compute the offset between the original text and a given representation (defined by the pair attr, ignore_excluded).

The alignment itself is computed with the alignment helper.

PARAMETER DESCRIPTION
doc

The spaCy Doc object

TYPE: Doc

attr

The attribute used by the RegexMatcher (e.g. NORM)

TYPE: str

ignore_excluded

Whether the RegexMatcher ignores excluded tokens.

TYPE: bool

index

The index in the pre-processed text.

TYPE: int

RETURNS DESCRIPTION
int

The offset. To get the character index in the original document, just do: original = index + offset(doc, attr, ignore_excluded, index)

Source code in edsnlp/matchers/utils/offset.py
def offset(
    doc: Doc,
    attr: str,
    ignore_excluded: bool,
    index: int,
) -> int:
    """
    Compute offset between the original text and a given representation
    (defined by the couple `attr`, `ignore_excluded`).

    The alignment itself is computed with
    [`alignment`][edsnlp.matchers.utils.offset.alignment].

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object
    attr : str
        The attribute used by the [`RegexMatcher`][edsnlp.matchers.regex.RegexMatcher]
        (eg `NORM`)
    ignore_excluded : bool
        Whether the RegexMatcher ignores excluded tokens.
    index : int
        The index in the pre-processed text.

    Returns
    -------
    int
        The offset. To get the character index in the original document,
        just do: `#!python original = index + offset(doc, attr, ignore_excluded, index)`
    """
    original, clean = alignment(
        doc=doc,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    # We use bisect to efficiently find the correct rightmost-lower index
    i = bisect_left(clean, index)
    i = min(i, len(original) - 1)

    return original[i] - clean[i]
text
get_text(doclike, attr, ignore_excluded)

Get text using a custom attribute, possibly ignoring excluded tokens.

PARAMETER DESCRIPTION
doclike

Doc or Span to get text from.

TYPE: Union[Doc, Span]

attr

Attribute to use.

TYPE: str

ignore_excluded

Whether to skip excluded tokens, by default False

TYPE: bool

RETURNS DESCRIPTION
str

Extracted text.

Source code in edsnlp/matchers/utils/text.py
@lru_cache(32)
def get_text(
    doclike: Union[Doc, Span],
    attr: str,
    ignore_excluded: bool,
) -> str:
    """
    Get text using a custom attribute, possibly ignoring excluded tokens.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        Doc or Span to get text from.
    attr : str
        Attribute to use.
    ignore_excluded : bool
        Whether to skip excluded tokens, by default False

    Returns
    -------
    str
        Extracted text.
    """

    attr = attr.upper()

    if not ignore_excluded:
        if attr == "TEXT":
            return doclike.text
        elif attr == "LOWER":
            return doclike.text.lower()
        else:
            tokens = doclike
    else:
        tokens = [t for t in doclike if not t._.excluded]

    attr = ATTRIBUTES.get(attr, attr)

    if attr.startswith("_"):
        attr = attr[1:].lower()
        return "".join([getattr(t._, attr) + t.whitespace_ for t in tokens])
    else:
        return "".join([getattr(t, attr) + t.whitespace_ for t in tokens])

processing

helpers

DataFrames = None module-attribute
spec = importlib.util.find_spec(module.value) module-attribute
DataFrameModules

Bases: Enum

Source code in edsnlp/processing/helpers.py
class DataFrameModules(Enum):
    PANDAS = "pandas"
    PYSPARK = "pyspark.sql"
    KOALAS = "databricks.koalas"
PANDAS = 'pandas' class-attribute
PYSPARK = 'pyspark.sql' class-attribute
KOALAS = 'databricks.koalas' class-attribute
get_module(df)
Source code in edsnlp/processing/helpers.py
def get_module(df: DataFrames):
    for module in list(DataFrameModules):
        if df.__class__.__module__.startswith(module.value):
            return module
check_spacy_version_for_context()
Source code in edsnlp/processing/helpers.py
def check_spacy_version_for_context():  # pragma: no cover
    import spacy

    spacy_version = getattr(spacy, "__version__")
    if LooseVersion(spacy_version) < LooseVersion("3.2"):
        raise VersionConflict(
            "You provided a `context` argument, which only work with spacy>=3.2.\n"
            f"However, we found SpaCy version {spacy_version}.\n",
            "Please upgrade SpaCy ;)",
        )

simple

nlp = spacy.blank('fr') module-attribute
ExtensionSchema = Union[str, List[str], Dict[str, Any]] module-attribute
_df_to_spacy(note, nlp, context)

Takes a pandas DataFrame and returns a generator that can be used in nlp.pipe().

PARAMETER DESCRIPTION
note

A pandas DataFrame with at least note_text and note_id columns. A Doc object will be created for each line.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
generator

A generator whose items are of the form (text, context), with text being a string and context a dictionary.

Source code in edsnlp/processing/simple.py
def _df_to_spacy(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str],
):
    """
    Takes a pandas DataFrame and returns a generator that can be used in
    `nlp.pipe()`.

    Parameters
    ----------
    note: pd.DataFrame
        A pandas DataFrame with at least `note_text` and `note_id` columns.
        A `Doc` object will be created for each line.

    Returns
    -------
    generator:
        A generator whose items are of the form (text, context), with `text`
        being a string and `context` a dictionary
    """

    if context:
        check_spacy_version_for_context()

    kept_cols = ["note_text"] + context

    for col in kept_cols:
        if col not in note.columns:
            raise ValueError(f"No column named {repr(col)} found in df")

    def add_context(context_values):
        note_text = context_values.note_text
        doc = nlp.make_doc(note_text)
        for col in context:
            doc._.set(col, getattr(context_values, col))
        return doc

    yield from map(
        add_context,
        note[kept_cols].itertuples(),
    )
_flatten(list_of_lists)

Flatten a list of lists to a combined list.

Source code in edsnlp/processing/simple.py
def _flatten(list_of_lists: List[List[Any]]):
    """
    Flatten a list of lists to a combined list.
    """
    return [item for sublist in list_of_lists for item in sublist]
_pipe_generator(note, nlp, context=[], additional_spans='discarded', extensions=[], batch_size=50, progress_bar=True)
Source code in edsnlp/processing/simple.py
def _pipe_generator(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: ExtensionSchema = [],
    batch_size: int = 50,
    progress_bar: bool = True,
):

    if type(extensions) == str:
        extensions = [extensions]
    elif type(extensions) == dict:
        extensions = list(extensions.keys())

    if type(additional_spans) == str:
        additional_spans = [additional_spans]

    if "note_id" not in context:
        context.append("note_id")

    if not nlp.has_pipe("eds.context"):
        nlp.add_pipe("eds.context", first=True, config=dict(context=context))

    gen = _df_to_spacy(note, nlp, context)
    n_docs = len(note)
    pipeline = nlp.pipe(gen, batch_size=batch_size)

    for doc in tqdm(pipeline, total=n_docs, disable=not progress_bar):

        yield _full_schema(
            doc,
            additional_spans=additional_spans,
            extensions=extensions,
        )
_single_schema(ent, span_type='ents', extensions=[])
Source code in edsnlp/processing/simple.py
def _single_schema(
    ent: Span,
    span_type: str = "ents",
    extensions: List[str] = [],
):

    return {
        "note_id": ent.doc._.note_id,
        "lexical_variant": ent.text,
        "label": ent.label_,
        "span_type": span_type,
        "start": ent.start_char,
        "end": ent.end_char,
        **{extension: getattr(ent._, extension) for extension in extensions},
    }
_full_schema(doc, additional_spans=[], extensions=[])

Function used when parallelising tasks via joblib. Takes a Doc as input and returns a list of serializable objects.

Note

Parallelisation requires the output objects to be serializable: after splitting the task into separate jobs, intermediate results are stored in memory before being aggregated, hence the need for serializability. For instance, spaCy's spans aren't serializable since they are merely a view of the parent document.

Check the source code of this function for an example.

Source code in edsnlp/processing/simple.py
def _full_schema(
    doc: Doc,
    additional_spans: List[str] = [],
    extensions: List[str] = [],
):
    """
    Function used when Parallelising tasks via joblib.
    Takes a Doc as input, and returns a list of serializable objects

    !!! note

        The parallelisation needs for output objects to be **serializable**:
        after splitting the task into separate jobs, intermediate results
        are saved on memory before being aggregated, thus the need to be
        serializable. For instance, spaCy's spans aren't serializable since
        they are merely a *view* of the parent document.

        Check the source code of this function for an example.

    """

    results = []

    results.extend(
        [
            _single_schema(
                ent,
                extensions=extensions,
            )
            for ent in doc.ents
            if doc.ents
        ]
    )

    for span_type in additional_spans:
        results.extend(
            [
                _single_schema(
                    ent,
                    span_type=span_type,
                    extensions=extensions,
                )
                for ent in doc.spans[span_type]
                if doc.spans[span_type]
            ]
        )
    return results
pipe(note, nlp, context=[], additional_spans='discarded', extensions=[], batch_size=1000, progress_bar=True)

Function to apply a spaCy pipe to a pandas DataFrame of notes. For a large DataFrame, prefer the parallel version.

PARAMETER DESCRIPTION
note

A pandas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated SpaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

additional_spans

A name (or list of names) of SpanGroups to which the pipe should also be applied: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the date pipe populates doc.spans['dates'].

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Span extensions to add to the extracted results. For instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

batch_size

Batch size used by spaCy's pipe

TYPE: int, by default 1000 DEFAULT: 1000

progress_bar

Whether to display a progress bar or not

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
DataFrame

A pandas DataFrame with one line per extraction

Source code in edsnlp/processing/simple.py
def pipe(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: Union[List[str], str] = [],
    batch_size: int = 1000,
    progress_bar: bool = True,
):
    """
    Function to apply a spaCy pipe to a pandas DataFrame note
    For a large DataFrame, prefer the parallel version.

    Parameters
    ----------
    note : DataFrame
        A pandas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated SpaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of SpanGroups to which the pipe should also be applied:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `date` pipe populates doc.spans['dates']
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.
    batch_size : int, by default 1000
        Batch size used by spaCy's pipe
    progress_bar: bool, by default True
        Whether to display a progress bar or not

    Returns
    -------
    DataFrame
        A pandas DataFrame with one line per extraction
    """
    return pd.DataFrame(
        _flatten(
            _pipe_generator(
                note=note,
                nlp=nlp,
                context=context,
                additional_spans=additional_spans,
                extensions=extensions,
                batch_size=batch_size,
                progress_bar=progress_bar,
            )
        )
    )
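
Example: a minimal sketch on a toy DataFrame. The eds.matcher component, its terms config and the edsnlp.components import are assumptions not documented in this section.

import pandas as pd
import spacy

import edsnlp.components  # noqa: F401 -- assumed to register the eds.* factories
from edsnlp.processing.simple import pipe as simple_pipe

# Hypothetical toy corpus with the two required columns.
note = pd.DataFrame(
    {
        "note_id": [1, 2],
        "note_text": ["Le patient est diabétique.", "Pas d'antécédent notable."],
    }
)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.matcher", config=dict(terms={"diabetes": ["diabétique"]}))

# additional_spans=[] keeps the sketch independent of any extra span group.
df = simple_pipe(note, nlp, additional_spans=[], progress_bar=False)
# df has one row per extraction: note_id, lexical_variant, label, span_type, start, end.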

wrapper

pipe(note, nlp, n_jobs=-2, context=[], additional_spans='discarded', extensions=[], **kwargs)

Function to apply a spaCy pipe to a pandas, pyspark or koalas DataFrame.

PARAMETER DESCRIPTION
note

A pandas/pyspark/koalas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated SpaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

n_jobs

Only used when providing a Pandas DataFrame

  • n_jobs=1 corresponds to simple_pipe
  • n_jobs>1 corresponds to parallel_pipe with n_jobs parallel workers
  • n_jobs=-1 corresponds to parallel_pipe with the maximum number of workers
  • n_jobs=-2 corresponds to parallel_pipe with the maximum number of workers minus one

TYPE: int, by default -2 DEFAULT: -2

additional_spans

A name (or list of names) of SpanGroups to which the pipe should also be applied: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the date pipe populates doc.spans['dates'].

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Span extensions to add to the extracted results. For instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

kwargs

Additional parameters forwarded to the underlying implementation (simple, parallel or distributed pipe).

TYPE: Dict[str, Any]

RETURNS DESCRIPTION
DataFrame

A DataFrame with one line per extraction

Source code in edsnlp/processing/wrapper.py
def pipe(
    note: DataFrames,
    nlp: Language,
    n_jobs: int = -2,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: ExtensionSchema = [],
    **kwargs: Dict[str, Any],
) -> DataFrames:
    """
    Function to apply a spaCy pipe to a pandas or pyspark DataFrame


    Parameters
    ----------
    note : DataFrame
        A pandas/pyspark/koalas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated spaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    n_jobs : int, by default -2
        Only used when providing a Pandas DataFrame

        - `n_jobs=1` corresponds to `simple_pipe`
        - `n_jobs>1` corresponds to `parallel_pipe` with `n_jobs` parallel workers
        - `n_jobs=-1` corresponds to `parallel_pipe` with the maximum number of workers
        - `n_jobs=-2` corresponds to `parallel_pipe` with the maximum number of workers minus one
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of the SpanGroups on which to apply the pipe:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `dates` pipe populates `doc.spans['dates']`
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.
    kwargs : Dict[str, Any]
        Additional parameters depending on the `how` argument.

    Returns
    -------
    DataFrame
        A DataFrame with one line per extraction
    """

    module = get_module(note)

    if module == DataFrameModules.PANDAS:
        if n_jobs == 1:

            return simple_pipe(
                note=note,
                nlp=nlp,
                context=context,
                additional_spans=additional_spans,
                extensions=extensions,
                **kwargs,
            )

        else:

            return parallel_pipe(
                note=note,
                nlp=nlp,
                context=context,
                additional_spans=additional_spans,
                extensions=extensions,
                n_jobs=n_jobs,
                **kwargs,
            )

    if extensions and type(extensions) != dict:
        raise ValueError(
            """
            When using Spark or Koalas, you should provide extension names
            along with the extension type (as a dictionnary):
            `d[extension_name] = extension_type`
            """  # noqa W291
        )

    from .distributed import pipe as distributed_pipe

    return distributed_pipe(
        note=note,
        nlp=nlp,
        context=context,
        additional_spans=additional_spans,
        extensions=extensions,
        **kwargs,
    )

parallel

nlp = spacy.blank('fr') module-attribute
_define_nlp(new_nlp)

Set the global nlp variable. Doing it this way saves a non-negligible amount of time.

Source code in edsnlp/processing/parallel.py
def _define_nlp(new_nlp: Language):
    """
    Set the global nlp variable.
    Doing it this way saves a non-negligible amount of time.
    """
    global nlp
    nlp = new_nlp
_chunker(iterable, total_length, chunksize)

Takes an iterable and chunks it.

Source code in edsnlp/processing/parallel.py
def _chunker(
    iterable: Iterable,
    total_length: int,
    chunksize: int,
):
    """
    Takes an iterable and chunk it.
    """
    return (
        iterable[pos : pos + chunksize] for pos in range(0, total_length, chunksize)
    )
_process_chunk(note, **pipe_kwargs)
Source code in edsnlp/processing/parallel.py
def _process_chunk(note: pd.DataFrame, **pipe_kwargs):

    list_results = []

    for out in _pipe_generator(note, nlp, progress_bar=False, **pipe_kwargs):
        list_results += out

    return list_results
pipe(note, nlp, context=[], additional_spans='discarded', extensions=[], chunksize=100, n_jobs=-2, progress_bar=True, **pipe_kwargs)

Function to apply a spaCy pipe to a pandas DataFrame of notes, using multiprocessing

PARAMETER DESCRIPTION
note

A pandas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated spaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

additional_spans

A name (or list of names) of the SpanGroups on which to apply the pipe: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the dates pipe populates doc.spans['dates']

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Spans extensions to add to the extracted results: for instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

chunksize

Batch size used to split tasks

TYPE: int DEFAULT: 100

n_jobs

Maximum number of parallel jobs. The default value (-2) uses all available cores but one.

TYPE: int DEFAULT: -2

progress_bar

Whether to display a progress bar or not

TYPE: bool DEFAULT: True

**pipe_kwargs

Arguments exposed in processing.pipe_generator are also available here

DEFAULT: {}

RETURNS DESCRIPTION
DataFrame

A pandas DataFrame with one line per extraction
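Examples:

A hedged sketch of calling the multiprocessing variant directly (it is normally reached through the edsnlp.processing.pipe wrapper documented above); the import path is inferred from the source location, and the note and nlp objects are the same as in the wrapper example.

>>> from edsnlp.processing.parallel import pipe as parallel_pipe  # assumed import path
>>> note_nlp = parallel_pipe(
...     note,                       # pandas DataFrame with note_id and note_text columns
...     nlp,                        # spaCy pipeline
...     chunksize=100,              # 100 notes per task
...     n_jobs=-2,                  # all available cores but one
...     additional_spans="dates",
... )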

Source code in edsnlp/processing/parallel.py
def pipe(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: ExtensionSchema = [],
    chunksize: int = 100,
    n_jobs: int = -2,
    progress_bar: bool = True,
    **pipe_kwargs,
):
    """
    Function to apply a spaCy pipe to a pandas DataFrame note by using multiprocessing

    Parameters
    ----------
    note : DataFrame
        A pandas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated spaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of the SpanGroups on which to apply the pipe:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `dates` pipe populates `doc.spans['dates']`
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.
    chunksize: int, by default 100
        Batch size used to split tasks
    n_jobs: int, by default -2
        Max number of parallel jobs.
        The default value uses the maximum number of available cores.
    progress_bar: bool, by default True
        Whether to display a progress bar or not
    **pipe_kwargs:
        Arguments exposed in `processing.pipe_generator` are also available here

    Returns
    -------
    DataFrame
        A pandas DataFrame with one line per extraction
    """

    if context:
        check_spacy_version_for_context()

    # Setting the nlp variable
    _define_nlp(nlp)

    verbose = 10 if progress_bar else 0

    executor = Parallel(
        n_jobs, backend="multiprocessing", prefer="processes", verbose=verbose
    )
    executor.warn(f"Used nlp components: {nlp.component_names}")

    pipe_kwargs["additional_spans"] = additional_spans
    pipe_kwargs["extensions"] = extensions
    pipe_kwargs["context"] = context

    if verbose:
        executor.warn(f"{int(len(note)/chunksize)} tasks to complete")

    do = delayed(_process_chunk)

    tasks = (
        do(chunk, **pipe_kwargs)
        for chunk in _chunker(note, len(note), chunksize=chunksize)
    )
    result = executor(tasks)

    out = _flatten(result)

    return pd.DataFrame(out)

distributed

pyspark_type_finder(obj)

Returns (when possible) the PySpark type of any python object
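Examples:

A short sketch of building an extension schema for the distributed pipe; the extension name score_name is purely illustrative.

>>> string_type = pyspark_type_finder("some text")  # prints and returns the inferred type
>>> extensions = {"score_name": string_type}        # usable as the extensions dict of the distributed pipe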

Source code in edsnlp/processing/distributed.py
def pyspark_type_finder(obj):
    """
    Returns (when possible) the PySpark type of any python object
    """
    try:
        inferred_type = T._infer_type(obj)
        print(f"Inferred type is {repr(inferred_type)}")
        return inferred_type
    except TypeError:
        raise TypeError("Cannot infer type for this object.")
module_checker(func, *args, **kwargs)
Source code in edsnlp/processing/distributed.py
@decorator
def module_checker(
    func: Callable,
    *args,
    **kwargs,
) -> Any:

    args = list(args)
    note = args.pop(0)
    module = get_module(note)

    if module == DataFrameModules.PYSPARK:
        return func(note, *args, **kwargs)
    elif module == DataFrameModules.KOALAS:
        import databricks.koalas  # noqa F401

        note_spark = note.to_spark()
        note_nlp_spark = func(note_spark, *args, **kwargs)
        return note_nlp_spark.to_koalas()
pipe(note, nlp, context=[], additional_spans='discarded', extensions=[])

Function to apply a spaCy pipe to a pyspark or koalas DataFrame note

PARAMETER DESCRIPTION
note

A Pyspark or Koalas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated spaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

additional_spans

A name (or list of names) of the SpanGroups on which to apply the pipe: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the dates pipe populates doc.spans['dates']

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Spans extensions to add to the extracted results: for instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

RETURNS DESCRIPTION
DataFrame

A pyspark DataFrame with one line per extraction
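Examples:

A hedged sketch assuming an active Spark session and a pyspark DataFrame note_spark with note_id and note_text columns; the import path is inferred from the source location (the edsnlp.processing.pipe wrapper dispatches here automatically for pyspark and koalas inputs).

>>> from edsnlp.processing.distributed import pipe as distributed_pipe  # assumed import path
>>> note_nlp = distributed_pipe(
...     note_spark,
...     nlp,
...     additional_spans="dates",
...     extensions={},  # with Spark or Koalas, extensions must map names to pyspark types
... )
>>> result = note_nlp.toPandas()  # collect the result if it fits in memory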

Source code in edsnlp/processing/distributed.py
@module_checker
def pipe(
    note: DataFrames,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: List[Tuple[str, T.DataType]] = [],
) -> DataFrame:
    """
    Function to apply a spaCy pipe to a pyspark or koalas DataFrame note

    Parameters
    ----------
    note : DataFrame
        A Pyspark or Koalas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated spaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of the SpanGroups on which to apply the pipe:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `dates` pipe populates `doc.spans['dates']`
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.

    Returns
    -------
    DataFrame
        A pyspark DataFrame with one line per extraction
    """

    if context:
        check_spacy_version_for_context()

    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    sc = spark.sparkContext

    if not nlp.has_pipe("eds.context"):
        nlp.add_pipe("eds.context", first=True, config=dict(context=context))

    nlp_bc = sc.broadcast(nlp)

    def _udf_factory(
        additional_spans: Union[List[str], str] = "discarded",
        extensions: Dict[str, T.DataType] = dict(),
    ):

        schema = T.ArrayType(
            T.StructType(
                [
                    T.StructField("lexical_variant", T.StringType(), False),
                    T.StructField("label", T.StringType(), False),
                    T.StructField("span_type", T.StringType(), True),
                    T.StructField("start", T.IntegerType(), False),
                    T.StructField("end", T.IntegerType(), False),
                    *[
                        T.StructField(extension_name, extension_type, True)
                        for extension_name, extension_type in extensions.items()
                    ],
                ]
            )
        )

        def f(
            text,
            *context_values,
            additional_spans=additional_spans,
            extensions=extensions,
        ):

            if text is None:
                return []

            nlp = nlp_bc.value

            for _, pipe in nlp.pipeline:
                if isinstance(pipe, BaseComponent):
                    pipe.set_extensions()

            doc = nlp.make_doc(text)
            for context_name, context_value in zip(context, context_values):
                doc._.set(context_name, context_value)
            doc = nlp(doc)

            ents = []

            for ent in doc.ents:
                parsed_extensions = [
                    getattr(ent._, extension) for extension in extensions.keys()
                ]

                ents.append(
                    (
                        ent.text,
                        ent.label_,
                        "ents",
                        ent.start_char,
                        ent.end_char,
                        *parsed_extensions,
                    )
                )

            if additional_spans is None:
                return ents

            if type(additional_spans) == str:
                additional_spans = [additional_spans]

            for spans_name in additional_spans:

                for ent in doc.spans.get(spans_name, []):

                    parsed_extensions = [
                        getattr(ent._, extension) for extension in extensions.keys()
                    ]

                    ents.append(
                        (
                            ent.text,
                            ent.label_,
                            spans_name,
                            ent.start_char,
                            ent.end_char,
                            *parsed_extensions,
                        )
                    )

            return ents

        f_udf = F.udf(
            partial(
                f,
                additional_spans=additional_spans,
                extensions=extensions,
            ),
            schema,
        )

        return f_udf

    matcher = _udf_factory(
        additional_spans=additional_spans,
        extensions=extensions,
    )

    note_nlp = note.withColumn(
        "matches", matcher(F.col("note_text"), *[F.col(c) for c in context])
    )
    note_nlp = note_nlp.withColumn("matches", F.explode(note_nlp.matches))

    note_nlp = note_nlp.select("note_id", "matches.*")

    return note_nlp

utils

inclusion

check_inclusion(span, start, end)

Checks whether the span overlaps the boundaries.

PARAMETER DESCRIPTION
span

Span to check.

TYPE: Span

start

Start of the boundary

TYPE: int

end

End of the boundary

TYPE: int

RETURNS DESCRIPTION
bool

Whether the span overlaps the boundaries.
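Examples:

A small sketch using token indices on a blank French document.

>>> import spacy
>>> doc = spacy.blank("fr")("Le patient est admis pour une pneumopathie.")
>>> span = doc[2:4]
>>> check_inclusion(span, start=0, end=3)   # the span starts before the boundary ends
True
>>> check_inclusion(span, start=4, end=10)  # the span ends where the boundary starts
False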

Source code in edsnlp/utils/inclusion.py
def check_inclusion(span: Span, start: int, end: int) -> bool:
    """
    Checks whether the span overlaps the boundaries.

    Parameters
    ----------
    span : Span
        Span to check.
    start : int
        Start of the boundary
    end : int
        End of the boundary

    Returns
    -------
    bool
        Whether the span overlaps the boundaries.
    """

    if span.start >= end or span.end <= start:
        return False
    return True

filter

default_sort_key(span)

Returns the sort key for filtering spans.

PARAMETER DESCRIPTION
span

Span to sort.

TYPE: Span

RETURNS DESCRIPTION
key

Sort key.

TYPE: Tuple(int, int)

Source code in edsnlp/utils/filter.py
def default_sort_key(span: Span) -> Tuple[int, int]:
    """
    Returns the sort key for filtering spans.

    Parameters
    ----------
    span : Span
        Span to sort.

    Returns
    -------
    key : Tuple(int, int)
        Sort key.
    """
    return span.end - span.start, -span.start
filter_spans(spans, label_to_remove=None, return_discarded=False, sort_key=default_sort_key)

Re-definition of spaCy's filtering function, which returns discarded spans as well as filtered ones.

Can also accept a label_to_remove argument, useful for filtering out pseudo cues. If set, results can contain overlapping spans: only spans overlapping with excluded labels are removed. The main expected use case is for pseudo-cues.

The spaCy documentation states:

Filter a sequence of spans and remove duplicates or overlaps. Useful for creating named entities (where one token can only be part of one entity) or when merging spans with Retokenizer.merge. When spans overlap, the (first) longest span is preferred over shorter spans.

Filtering out spans

If the label_to_remove argument is supplied, it might be tempting to filter overlapping spans that are not part of a label to remove.

The reason we keep all other possibly overlapping labels is that in qualifier pipelines, the same cue can precede and follow a marked entity. Hence we need to keep every example.

PARAMETER DESCRIPTION
spans

Spans to filter.

TYPE: List[Span]

return_discarded

Whether to return discarded spans.

TYPE: bool DEFAULT: False

label_to_remove

Label to remove. If set, results can contain overlapping spans.

TYPE: str, optional DEFAULT: None

sort_key

Key to sorting spans before applying overlap conflict resolution. A span with a higher key will have precedence over another span. By default, the largest, leftmost spans are selected first.

TYPE: Callable[Span, Any], optional DEFAULT: default_sort_key

RETURNS DESCRIPTION
results

Filtered spans

TYPE: List[Span]

discarded

Discarded spans

TYPE: List[Span], optional
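Examples:

A small sketch with two overlapping spans: the longer one is kept, the shorter one is returned in the discarded list (the diabetes label is illustrative).

>>> import spacy
>>> from spacy.tokens import Span
>>> doc = spacy.blank("fr")("Patient diabétique de type 2")
>>> long_span = Span(doc, 1, 5, label="diabetes")
>>> short_span = Span(doc, 1, 2, label="diabetes")
>>> kept, discarded = filter_spans([short_span, long_span], return_discarded=True)
>>> kept
[diabétique de type 2]
>>> discarded
[diabétique]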

Source code in edsnlp/utils/filter.py
def filter_spans(
    spans: Iterable[Union["Span", Tuple["Span", Any]]],
    label_to_remove: Optional[str] = None,
    return_discarded: bool = False,
    sort_key: Callable[[Span], Any] = default_sort_key,
) -> Union[List["Span"], Tuple[List["Span"], List["Span"]]]:
    """
    Re-definition of spacy's filtering function, that returns discarded spans
    as well as filtered ones.

    Can also accept a `label_to_remove` argument, useful for filtering out
    pseudo cues. If set, `results` can contain overlapping spans: only
    spans overlapping with excluded labels are removed. The main expected
    use case is for pseudo-cues.

    !!! note ""

        The **spaCy documentation states**:

        > Filter a sequence of spans and remove duplicates or overlaps.
        > Useful for creating named entities (where one token can only
        > be part of one entity) or when merging spans with
        > `Retokenizer.merge`. When spans overlap, the (first)
        > longest span is preferred over shorter spans.

    !!! danger "Filtering out spans"

        If the `label_to_remove` argument is supplied, it might be tempting to
        filter overlapping spans that are not part of a label to remove.

        The reason we keep all other possibly overlapping labels is that in qualifier
        pipelines, the same cue can precede **and** follow a marked entity.
        Hence we need to keep every example.

    Parameters
    ----------
    spans : List[Span]
        Spans to filter.
    return_discarded : bool
        Whether to return discarded spans.
    label_to_remove : str, optional
        Label to remove. If set, results can contain overlapping spans.
    sort_key : Callable[Span, Any], optional
        Key to sorting spans before applying overlap conflict resolution.
        A span with a higher key will have precedence over another span.
        By default, the largest, leftmost spans are selected first.

    Returns
    -------
    results : List[Span]
        Filtered spans
    discarded : List[Span], optional
        Discarded spans
    """
    sorted_spans = sorted(spans, key=sort_key, reverse=True)
    result = []
    discarded = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            if label_to_remove is None or span.label_ != label_to_remove:
                result.append(span)
            if label_to_remove is None or span.label_ == label_to_remove:
                seen_tokens.update(range(span.start, span.end))
        elif label_to_remove is None or span.label_ != label_to_remove:
            discarded.append(span)

    result = sorted(result, key=lambda span: span.start)
    discarded = sorted(discarded, key=lambda span: span.start)

    if return_discarded:
        return result, discarded

    return result
consume_spans(spans, filter, second_chance=None)

Consume a list of spans according to a filter.

Warning

This method makes the hard hypothesis that:

  1. Spans are sorted.
  2. Spans are consumed in sequence and only once.

The second item is problematic for the way we treat long entities, hence the second_chance parameter, which lets entities be seen more than once.

PARAMETER DESCRIPTION
spans

List of spans to filter

TYPE: List of spans

filter

Filtering function. Should return True when the item is to be included.

TYPE: Callable

second_chance

Optional list of spans to include again (useful for long entities), by default None

TYPE: List of spans, optional DEFAULT: None

RETURNS DESCRIPTION
matches

List of spans consumed by the filter.

TYPE: List of spans

remainder

List of remaining spans in the original spans parameter.

TYPE: List of spans
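Examples:

A minimal sketch where spans starting before token 5 are consumed and the rest is left for a later pass.

>>> import spacy
>>> doc = spacy.blank("fr")("Le patient est admis pour une pneumopathie.")
>>> spans = [doc[1:2], doc[6:7]]
>>> matches, remainder = consume_spans(spans, filter=lambda s: s.start < 5)
>>> matches
[patient]
>>> remainder
[pneumopathie]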

Source code in edsnlp/utils/filter.py
def consume_spans(
    spans: List[Span],
    filter: Callable,
    second_chance: Optional[List[Span]] = None,
) -> Tuple[List[Span], List[Span]]:
    """
    Consume a list of span, according to a filter.

    !!! warning
        This method makes the hard hypothesis that:

        1. Spans are sorted.
        2. Spans are consumed in sequence and only once.

        The second item is problematic for the way we treat long entities,
        hence the `second_chance` parameter, which lets entities be seen
        more than once.

    Parameters
    ----------
    spans : List of spans
        List of spans to filter
    filter : Callable
        Filtering function. Should return True when the item is to be included.
    second_chance : List of spans, optional
        Optional list of spans to include again (useful for long entities),
        by default None

    Returns
    -------
    matches : List of spans
        List of spans consumed by the filter.
    remainder : List of spans
        List of remaining spans in the original `spans` parameter.
    """

    if not second_chance:
        second_chance = []
    else:
        second_chance = [m for m in second_chance if filter(m)]

    if not spans:
        return second_chance, []

    for i, span in enumerate(spans):
        if not filter(span):
            break
        else:
            i += 1

    matches = spans[:i]
    remainder = spans[i:]

    matches.extend(second_chance)

    return matches, remainder
get_spans(spans, label)

Extracts spans with a given label. Prefer using the label hash (an integer) for performance reasons.

PARAMETER DESCRIPTION
spans

List of spans to filter.

TYPE: List[Span]

label

Label to filter on.

TYPE: Union[int, str]

RETURNS DESCRIPTION
List[Span]

Filtered spans.
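Examples:

A small sketch filtering a list of spans on their string label (the labels are illustrative).

>>> import spacy
>>> from spacy.tokens import Span
>>> doc = spacy.blank("fr")("Le patient est admis pour une pneumopathie.")
>>> spans = [Span(doc, 1, 2, label="person"), Span(doc, 6, 7, label="disease")]
>>> get_spans(spans, "disease")
[pneumopathie]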

Source code in edsnlp/utils/filter.py
def get_spans(spans: List[Span], label: Union[int, str]) -> List[Span]:
    """
    Extracts spans with a given label.
    Prefer using hash label for performance reasons.

    Parameters
    ----------
    spans : List[Span]
        List of spans to filter.
    label : Union[int, str]
        Label to filter on.

    Returns
    -------
    List[Span]
        Filtered spans.
    """
    if isinstance(label, int):
        return [span for span in spans if span.label == label]
    else:
        return [span for span in spans if span.label_ == label]

resources

get_verbs(verbs=None, check_contains=True)

Extract verbs from the resources, as a pandas dataframe.

PARAMETER DESCRIPTION
verbs

List of verbs to keep. Returns all verbs by default.

TYPE: List[str], optional DEFAULT: None

check_contains

Whether to check that no verb is missing if a list of verbs was provided. By default True

TYPE: bool, optional DEFAULT: True

RETURNS DESCRIPTION
pd.DataFrame

DataFrame containing conjugated verbs.
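Examples:

A hedged sketch; whether a given infinitive is present depends on the bundled resources/verbs.csv file, hence check_contains=False.

>>> conjugated = get_verbs(["aimer"], check_contains=False)
>>> forms = set(conjugated.term)  # all conjugated forms found for "aimer"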

Source code in edsnlp/utils/resources.py
def get_verbs(
    verbs: Optional[List[str]] = None, check_contains: bool = True
) -> pd.DataFrame:
    """
    Extract verbs from the resources, as a pandas dataframe.

    Parameters
    ----------
    verbs : List[str], optional
        List of verbs to keep. Returns all verbs by default.
    check_contains : bool, optional
        Whether to check that no verb is missing if a list of verbs was provided.
        By default True

    Returns
    -------
    pd.DataFrame
        DataFrame containing conjugated verbs.
    """

    conjugated_verbs = pd.read_csv(BASE_DIR / "resources" / "verbs.csv")

    if not verbs:
        return conjugated_verbs

    verbs = set(verbs)

    selected_verbs = conjugated_verbs[conjugated_verbs.verb.isin(verbs)]

    if check_contains:
        assert len(verbs) == selected_verbs.verb.nunique(), "Some verbs are missing !"

    return selected_verbs

examples

entity_pattern = re.compile('(<ent[^<>]*>[^<>]+</ent>)') module-attribute
text_pattern = re.compile('<ent.*>(.+)</ent>') module-attribute
modifiers_pattern = re.compile('<ent\\s?(.*)>.+</ent>') module-attribute
Match

Bases: BaseModel

Source code in edsnlp/utils/examples.py
class Match(BaseModel):
    start_char: int
    end_char: int
    text: str
    modifiers: str
start_char: int = None class-attribute
end_char: int = None class-attribute
text: str = None class-attribute
modifiers: str = None class-attribute
Modifier

Bases: BaseModel

Source code in edsnlp/utils/examples.py
class Modifier(BaseModel):
    key: str
    value: Union[int, float, bool, str]
key: str = None class-attribute
value: Union[int, float, bool, str] = None class-attribute
Entity

Bases: BaseModel

Source code in edsnlp/utils/examples.py
class Entity(BaseModel):
    start_char: int
    end_char: int
    modifiers: List[Modifier]
start_char: int = None class-attribute
end_char: int = None class-attribute
modifiers: List[Modifier] = None class-attribute
find_matches(example)

Finds entities within the example.

PARAMETER DESCRIPTION
example

Example to process.

TYPE: str

RETURNS DESCRIPTION
List[re.Match]

List of matches for entities.

Source code in edsnlp/utils/examples.py
def find_matches(example: str) -> List[re.Match]:
    """
    Finds entities within the example.

    Parameters
    ----------
    example : str
        Example to process.

    Returns
    -------
    List[re.Match]
        List of matches for entities.
    """
    return list(entity_pattern.finditer(example))
parse_match(match)

Parse a regex match representing an entity.

PARAMETER DESCRIPTION
match

Match for an entity.

TYPE: re.Match

RETURNS DESCRIPTION
Match

Usable representation for the entity match.

Source code in edsnlp/utils/examples.py
def parse_match(match: re.Match) -> Match:
    """
    Parse a regex match representing an entity.

    Parameters
    ----------
    match : re.Match
        Match for an entity.

    Returns
    -------
    Match
        Usable representation for the entity match.
    """

    lexical_variant = match.group()
    start_char = match.start()
    end_char = match.end()

    text = text_pattern.findall(lexical_variant)[0]
    modifiers = modifiers_pattern.findall(lexical_variant)[0]

    m = Match(start_char=start_char, end_char=end_char, text=text, modifiers=modifiers)

    return m
parse_example(example)

Parses an example: finds entities and removes the tags.

PARAMETER DESCRIPTION
example

Example to process.

TYPE: str

RETURNS DESCRIPTION
Tuple[str, List[Entity]]

Cleaned text and extracted entities.
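Examples:

The negated=true attribute below is illustrative; any key=value pairs placed inside the <ent> tag are parsed into Modifier objects.

>>> text, entities = parse_example(
...     "Le patient est <ent negated=true>diabétique</ent>."
... )
>>> text
'Le patient est diabétique.'
>>> entities[0].start_char, entities[0].end_char
(15, 25)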

Source code in edsnlp/utils/examples.py
def parse_example(example: str) -> Tuple[str, List[Entity]]:
    """
    Parses an example : finds examples and removes the tags.

    Parameters
    ----------
    example : str
        Example to process.

    Returns
    -------
    Tuple[str, List[Entity]]
        Cleaned text and extracted entities.
    """

    matches = [parse_match(match) for match in find_matches(example=example)]
    text = ""
    entities = []

    cursor = 0

    for match in matches:

        text += example[cursor : match.start_char]
        start_char = len(text)
        text += match.text
        end_char = len(text)
        modifiers = [m.split("=") for m in match.modifiers.split()]

        cursor = match.end_char

        entity = Entity(
            start_char=start_char,
            end_char=end_char,
            modifiers=[Modifier(key=k, value=v) for k, v in modifiers],
        )

        entities.append(entity)

    text += example[cursor:]

    return text, entities

deprecation

deprecated_extension(name, new_name)
Source code in edsnlp/utils/deprecation.py
def deprecated_extension(name: str, new_name: str) -> None:
    msg = (
        f'The extension "{name}" is deprecated and will be '
        "removed in a future version. "
        f'Please use "{new_name}" instead.'
    )

    logger.warning(msg)
deprecated_getter_factory(name, new_name)
Source code in edsnlp/utils/deprecation.py
def deprecated_getter_factory(name: str, new_name: str) -> Callable:
    def getter(toklike: Union[Token, Span, Doc]) -> Any:

        n = f"{type(toklike).__name__}._.{name}"
        nn = f"{type(toklike).__name__}._.{new_name}"

        deprecated_extension(n, nn)

        return getattr(toklike._, new_name)

    return getter
deprecation(name, new_name=None)
Source code in edsnlp/utils/deprecation.py
def deprecation(name: str, new_name: Optional[str] = None):

    new_name = new_name or f"eds.{name}"

    msg = (
        f'Calling "{name}" directly is deprecated and '
        "will be removed in a future version. "
        f'Please use "{new_name}" instead.'
    )

    logger.warning(msg)
deprecated_factory(name, new_name=None, default_config=None, func=None)

Execute the Language.factory method on a modified factory function. The modification adds a deprecation warning.

PARAMETER DESCRIPTION
name

The deprecated name for the pipeline

TYPE: str

new_name

The new name for the pipeline, which should be used, by default None

TYPE: Optional[str], optional DEFAULT: None

default_config

The configuration that should be passed to Language.factory, by default None

TYPE: Optional[Dict[str, Any]], optional DEFAULT: None

func

The function to decorate, by default None

TYPE: Optional[Callable], optional DEFAULT: None

RETURNS DESCRIPTION
Callable
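Examples:

A hedged sketch of registering the same factory under a deprecated name and a new one; the component and both names are hypothetical.

>>> @deprecated_factory("my-component", "eds.my-component")  # hypothetical names
... def create_my_component(nlp, name):
...     return MyComponent()  # hypothetical component class
>>> # nlp.add_pipe("my-component") keeps working, but logs a deprecation warning
>>> # pointing users to "eds.my-component".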
Source code in edsnlp/utils/deprecation.py
def deprecated_factory(
    name: str,
    new_name: Optional[str] = None,
    default_config: Optional[Dict[str, Any]] = None,
    func: Optional[Callable] = None,
) -> Callable:
    """
    Execute the Language.factory method on a modified factory function.
    The modification adds a deprecation warning.

    Parameters
    ----------
    name : str
        The deprecated name for the pipeline
    new_name : Optional[str], optional
        The new name for the pipeline, which should be used, by default None
    default_config : Optional[Dict[str, Any]], optional
        The configuration that should be passed to Language.factory, by default None
    func : Optional[Callable], optional
        The function to decorate, by default None

    Returns
    -------
    Callable
    """

    if default_config is None:
        default_config = dict()

    wrapper = Language.factory(name, default_config=default_config)

    def wrap(factory):

        # Define decorator
        # We use micheles' decorator package to keep the same signature
        # See https://github.com/micheles/decorator/
        @decorator
        def decorate(
            f,
            *args,
            **kwargs,
        ):
            deprecation(name, new_name)
            return f(
                *args,
                **kwargs,
            )

        decorated = decorate(factory)

        wrapper(decorated)

        return factory

    if func is not None:
        return wrap(func)

    return wrap

regex

make_pattern(patterns, with_breaks=False, name=None)

Create OR pattern from a list of patterns.

PARAMETER DESCRIPTION
patterns

List of patterns to merge.

TYPE: List[str]

with_breaks

Whether to add word boundaries (\b) on each side, by default False

TYPE: bool, optional DEFAULT: False

name

Name of the group, using regex ?P<> directive.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
str

Merged pattern.
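Examples:

Patterns are sorted by decreasing length before being joined, so the longest alternative is tried first.

>>> make_pattern(["diabète", "diabétique"], with_breaks=True)
'\\b(diabétique|diabète)\\b'
>>> make_pattern(["lundi", "mardi"], name="day")
'(?P<day>lundi|mardi)'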

Source code in edsnlp/utils/regex.py
def make_pattern(
    patterns: List[str],
    with_breaks: bool = False,
    name: Optional[str] = None,
) -> str:
    r"""
    Create OR pattern from a list of patterns.

    Parameters
    ----------
    patterns : List[str]
        List of patterns to merge.
    with_breaks : bool, optional
        Whether to add breaks (`\b`) on each side, by default False
    name: str, optional
        Name of the group, using regex `?P<>` directive.

    Returns
    -------
    str
        Merged pattern.
    """

    if name:
        prefix = f"(?P<{name}>"
    else:
        prefix = "("

    # Sorting by length might be more efficient
    patterns.sort(key=len, reverse=True)

    pattern = prefix + "|".join(patterns) + ")"

    if with_breaks:
        pattern = r"\b" + pattern + r"\b"

    return pattern
compile_regex(reg)

This function tries to compile reg using the re module, and falls back to the more permissive regex module.

PARAMETER DESCRIPTION
reg

RETURNS DESCRIPTION
Union[re.Pattern, regex.Pattern]
Source code in edsnlp/utils/regex.py
def compile_regex(reg):
    """
    This function tries to compile `reg` using the `re` module, and
    falls back to the more permissive `regex` module.

    Parameters
    ----------
    reg: str

    Returns
    -------
    Union[re.Pattern, regex.Pattern]
    """
    try:
        return re.compile(reg)
    except re.error:
        try:
            return regex.compile(reg)
        except regex.error:
            raise Exception("Could not compile: {}".format(repr(reg)))

connectors

brat

BratConnector

Bases: object

Two-way connector with BRAT. Supports entities only.

PARAMETER DESCRIPTION
directory

Directory containing the BRAT files.

TYPE: str

n_jobs

Number of jobs for multiprocessing, by default 1

TYPE: int, optional
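Examples:

A hedged round-trip sketch; path/to/brat is a hypothetical directory of .txt/.ann pairs, and the Doc._.note_id extension is assumed to be registered (EDS-NLP components normally declare it).

>>> import spacy
>>> brat = BratConnector("path/to/brat")  # hypothetical BRAT directory
>>> nlp = spacy.blank("fr")
>>> docs = brat.brat2docs(nlp)            # .txt/.ann pairs -> spaCy docs
>>> brat.docs2brat(docs)                  # and back to BRAT files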

Source code in edsnlp/connectors/brat.py
class BratConnector(object):
    """
    Two-way connector with BRAT. Supports entities only.

    Parameters
    ----------
    directory : str
        Directory containing the BRAT files.
    n_jobs : int, optional
        Number of jobs for multiprocessing, by default 1
    """

    def __init__(self, directory: str, n_jobs: int = 1):
        self.directory = directory
        self.n_jobs = n_jobs

        os.makedirs(directory, exist_ok=True)

    def full_path(self, filename: str) -> str:
        return os.path.join(self.directory, filename)

    def read_file(self, filename: str) -> str:
        """
        Reads a file within the BRAT directory.

        Parameters
        ----------
        filename:
            The path to the file within the BRAT directory.

        Returns
        -------
        text:
            The text content of the file.
        """
        with open(self.full_path(filename), "r", encoding="utf-8") as f:
            return f.read()

    def read_texts(self) -> pd.DataFrame:
        """
        Reads all texts from the BRAT folder.

        Returns
        -------
        texts:
            DataFrame containing all texts in the BRAT directory.
        """
        files = os.listdir(self.directory)
        filenames = [f[:-4] for f in files if f.endswith(".txt")]

        assert filenames, f"BRAT directory {self.directory} is empty!"

        logger.info(
            f"The BRAT directory contains {len(filenames)} annotated documents."
        )

        texts = pd.DataFrame(dict(note_id=filenames))

        with tqdm(
            texts.note_id, ascii=True, ncols=100, desc="Text extraction"
        ) as iterator:
            texts["note_text"] = [
                self.read_file(note_id + ".txt") for note_id in iterator
            ]

        return texts

    def read_brat_annotation(self, note_id: Union[str, int]) -> pd.DataFrame:
        """
        Reads BRAT annotation inside the BRAT directory.

        Parameters
        ----------
        note_id:
            Note ID within the BRAT directory.

        Returns
        -------
        annotations:
            DataFrame containing the annotations for the given note.
        """
        filename = f"{note_id}.ann"
        annotations = read_brat_annotation(self.full_path(filename))
        return annotations

    def read_annotations(self, texts: pd.DataFrame) -> pd.DataFrame:
        dfs = []

        with tqdm(
            texts.note_id, ascii=True, ncols=100, desc="Annotation extraction"
        ) as iterator:
            dfs = Parallel(n_jobs=self.n_jobs)(
                delayed(self.read_brat_annotation)(note_id) for note_id in iterator
            )
            # for note_id in iterator:
            #     dfs.append(self.read_brat_annotation(note_id))

        annotations = pd.concat(dfs, keys=texts.note_id, names=["note_id"])

        annotations = annotations.droplevel(1).reset_index()

        return annotations

    def get_brat(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Reads texts and annotations, and returns two DataFrame objects.

        Returns
        -------
        texts:
            A DataFrame containing two fields, `note_id` and `note_text`
        annotations:
            A DataFrame containing the annotations.
        """

        texts = self.read_texts()
        annotations = self.read_annotations(texts)

        return texts, annotations

    def brat2docs(self, nlp: Language) -> List[Doc]:
        """
        Transforms a BRAT folder to a list of spaCy documents.

        Parameters
        ----------
        nlp:
            A spaCy pipeline.

        Returns
        -------
        docs:
            List of spaCy documents, with annotations in the `ents` attribute.
        """
        texts, annotations = self.get_brat()

        docs = []

        with tqdm(
            zip(
                texts.note_id,
                nlp.pipe(texts.note_text, batch_size=50, n_process=self.n_jobs),
            ),
            ascii=True,
            ncols=100,
            desc="spaCy conversion",
            total=len(texts),
        ) as iterator:
            for note_id, doc in iterator:

                doc._.note_id = note_id

                ann = annotations.query("note_id == @note_id")

                spans = []

                for _, row in ann.iterrows():
                    span = doc.char_span(
                        row.start,
                        row.end,
                        label=row.label,
                        alignment_mode="expand",
                    )
                    spans.append(span)

                doc.ents = filter_spans(spans)

                docs.append(doc)

        return docs

    def doc2brat(self, doc: Doc) -> None:
        """
        Writes a spaCy document to file in the BRAT directory.

        Parameters
        ----------
        doc:
            spaCy Doc object. The spans in `ents` will populate the `note_id.ann` file.
        """
        filename = str(doc._.note_id)

        with open(self.full_path(f"{filename}.txt"), "w", encoding="utf-8") as f:
            f.write(doc.text)

        annotations = pd.DataFrame.from_records(
            [
                dict(
                    label=ann.label_,
                    lexical_variant=ann.text,
                    start=ann.start_char,
                    end=ann.end_char,
                )
                for ann in doc.ents
            ]
        )

        if len(annotations) > 0:

            annotations["annot"] = (
                annotations.label
                + " "
                + annotations.start.astype(str)
                + " "
                + annotations.end.astype(str)
            )

            annotations["index"] = [f"T{i + 1}" for i in range(len(annotations))]

            annotations = annotations[["index", "annot", "lexical_variant"]]
            annotations.to_csv(
                self.full_path(f"{filename}.ann"),
                sep="\t",
                header=None,
                index=False,
                encoding="utf-8",
            )

        else:
            open(self.full_path(f"{filename}.ann"), "w", encoding="utf-8").close()

    def docs2brat(self, docs: List[Doc]) -> None:
        """
        Writes a list of spaCy documents to file.

        Parameters
        ----------
        docs:
            List of spaCy documents.
        """
        for doc in docs:
            self.doc2brat(doc)
directory = directory instance-attribute
n_jobs = n_jobs instance-attribute
__init__(directory, n_jobs=1)
Source code in edsnlp/connectors/brat.py
def __init__(self, directory: str, n_jobs: int = 1):
    self.directory = directory
    self.n_jobs = n_jobs

    os.makedirs(directory, exist_ok=True)
full_path(filename)
Source code in edsnlp/connectors/brat.py
def full_path(self, filename: str) -> str:
    return os.path.join(self.directory, filename)
read_file(filename)

Reads a file within the BRAT directory.

PARAMETER DESCRIPTION
filename

The path to the file within the BRAT directory.

TYPE: str

RETURNS DESCRIPTION
text

The text content of the file.

Source code in edsnlp/connectors/brat.py
def read_file(self, filename: str) -> str:
    """
    Reads a file within the BRAT directory.

    Parameters
    ----------
    filename:
        The path to the file within the BRAT directory.

    Returns
    -------
    text:
        The text content of the file.
    """
    with open(self.full_path(filename), "r", encoding="utf-8") as f:
        return f.read()
read_texts()

Reads all texts from the BRAT folder.

RETURNS DESCRIPTION
texts

DataFrame containing all texts in the BRAT directory.

Source code in edsnlp/connectors/brat.py
def read_texts(self) -> pd.DataFrame:
    """
    Reads all texts from the BRAT folder.

    Returns
    -------
    texts:
        DataFrame containing all texts in the BRAT directory.
    """
    files = os.listdir(self.directory)
    filenames = [f[:-4] for f in files if f.endswith(".txt")]

    assert filenames, f"BRAT directory {self.directory} is empty!"

    logger.info(
        f"The BRAT directory contains {len(filenames)} annotated documents."
    )

    texts = pd.DataFrame(dict(note_id=filenames))

    with tqdm(
        texts.note_id, ascii=True, ncols=100, desc="Text extraction"
    ) as iterator:
        texts["note_text"] = [
            self.read_file(note_id + ".txt") for note_id in iterator
        ]

    return texts
read_brat_annotation(note_id)

Reads BRAT annotation inside the BRAT directory.

PARAMETER DESCRIPTION
note_id

Note ID within the BRAT directory.

TYPE: Union[str, int]

RETURNS DESCRIPTION
annotations

DataFrame containing the annotations for the given note.

Source code in edsnlp/connectors/brat.py
def read_brat_annotation(self, note_id: Union[str, int]) -> pd.DataFrame:
    """
    Reads BRAT annotation inside the BRAT directory.

    Parameters
    ----------
    note_id:
        Note ID within the BRAT directory.

    Returns
    -------
    annotations:
        DataFrame containing the annotations for the given note.
    """
    filename = f"{note_id}.ann"
    annotations = read_brat_annotation(self.full_path(filename))
    return annotations
read_annotations(texts)
Source code in edsnlp/connectors/brat.py
def read_annotations(self, texts: pd.DataFrame) -> pd.DataFrame:
    dfs = []

    with tqdm(
        texts.note_id, ascii=True, ncols=100, desc="Annotation extraction"
    ) as iterator:
        dfs = Parallel(n_jobs=self.n_jobs)(
            delayed(self.read_brat_annotation)(note_id) for note_id in iterator
        )
        # for note_id in iterator:
        #     dfs.append(self.read_brat_annotation(note_id))

    annotations = pd.concat(dfs, keys=texts.note_id, names=["note_id"])

    annotations = annotations.droplevel(1).reset_index()

    return annotations
get_brat()

Reads texts and annotations, and returns two DataFrame objects.

RETURNS DESCRIPTION
texts

A DataFrame containing two fields, note_id and note_text

annotations

A DataFrame containing the annotations.

Source code in edsnlp/connectors/brat.py
def get_brat(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Reads texts and annotations, and returns two DataFrame objects.

    Returns
    -------
    texts:
        A DataFrame containing two fields, `note_id` and `note_text`
    annotations:
        A DataFrame containing the annotations.
    """

    texts = self.read_texts()
    annotations = self.read_annotations(texts)

    return texts, annotations
brat2docs(nlp)

Transforms a BRAT folder to a list of spaCy documents.

PARAMETER DESCRIPTION
nlp

A spaCy pipeline.

TYPE: Language

RETURNS DESCRIPTION
docs

List of spaCy documents, with annotations in the ents attribute.

Source code in edsnlp/connectors/brat.py
def brat2docs(self, nlp: Language) -> List[Doc]:
    """
    Transforms a BRAT folder to a list of spaCy documents.

    Parameters
    ----------
    nlp:
        A spaCy pipeline.

    Returns
    -------
    docs:
        List of spaCy documents, with annotations in the `ents` attribute.
    """
    texts, annotations = self.get_brat()

    docs = []

    with tqdm(
        zip(
            texts.note_id,
            nlp.pipe(texts.note_text, batch_size=50, n_process=self.n_jobs),
        ),
        ascii=True,
        ncols=100,
        desc="spaCy conversion",
        total=len(texts),
    ) as iterator:
        for note_id, doc in iterator:

            doc._.note_id = note_id

            ann = annotations.query("note_id == @note_id")

            spans = []

            for _, row in ann.iterrows():
                span = doc.char_span(
                    row.start,
                    row.end,
                    label=row.label,
                    alignment_mode="expand",
                )
                spans.append(span)

            doc.ents = filter_spans(spans)

            docs.append(doc)

    return docs
doc2brat(doc)

Writes a spaCy document to file in the BRAT directory.

PARAMETER DESCRIPTION
doc

spaCy Doc object. The spans in ents will populate the note_id.ann file.

TYPE: Doc

Source code in edsnlp/connectors/brat.py
def doc2brat(self, doc: Doc) -> None:
    """
    Writes a spaCy document to file in the BRAT directory.

    Parameters
    ----------
    doc:
        spaCy Doc object. The spans in `ents` will populate the `note_id.ann` file.
    """
    filename = str(doc._.note_id)

    with open(self.full_path(f"{filename}.txt"), "w", encoding="utf-8") as f:
        f.write(doc.text)

    annotations = pd.DataFrame.from_records(
        [
            dict(
                label=ann.label_,
                lexical_variant=ann.text,
                start=ann.start_char,
                end=ann.end_char,
            )
            for ann in doc.ents
        ]
    )

    if len(annotations) > 0:

        annotations["annot"] = (
            annotations.label
            + " "
            + annotations.start.astype(str)
            + " "
            + annotations.end.astype(str)
        )

        annotations["index"] = [f"T{i + 1}" for i in range(len(annotations))]

        annotations = annotations[["index", "annot", "lexical_variant"]]
        annotations.to_csv(
            self.full_path(f"{filename}.ann"),
            sep="\t",
            header=None,
            index=False,
            encoding="utf-8",
        )

    else:
        open(self.full_path(f"{filename}.ann"), "w", encoding="utf-8").close()
docs2brat(docs)

Writes a list of spaCy documents to file.

PARAMETER DESCRIPTION
docs

List of spaCy documents.

TYPE: List[Doc]

Source code in edsnlp/connectors/brat.py
def docs2brat(self, docs: List[Doc]) -> None:
    """
    Writes a list of spaCy documents to file.

    Parameters
    ----------
    docs:
        List of spaCy documents.
    """
    for doc in docs:
        self.doc2brat(doc)
read_brat_annotation(filename)

Reads a BRAT annotation file and returns a pandas DataFrame.

PARAMETER DESCRIPTION
filename

Path to the annotation file.

TYPE: str

RETURNS DESCRIPTION
annotations

DataFrame containing the annotations.

Source code in edsnlp/connectors/brat.py
def read_brat_annotation(filename: str) -> pd.DataFrame:
    """
    Read BRAT annotation file and returns a pandas DataFrame.

    Parameters
    ----------
    filename:
        Path to the annotation file.

    Returns
    -------
    annotations:
        DataFrame containing the annotations.
    """

    lines = []

    with open(filename, "r") as f:
        for line in f.readlines():
            lines.append(tuple(line.rstrip("\n").split("\t", 2)))

    if not lines or len(lines[0]) == 1:
        return pd.DataFrame(
            columns=["index", "start", "end", "label", "lexical_variant"]
        )

    annotations = pd.DataFrame(lines, columns=["index", "annot", "lexical_variant"])

    annotations["end"] = annotations.annot.str.split().str[-1]
    annotations["annot"] = annotations.annot.str.split(";").str[0]

    annotations["label"] = annotations.annot.str.split().str[:-2].str.join(" ")
    annotations["start"] = annotations.annot.str.split().str[-2]

    annotations[["start", "end"]] = annotations[["start", "end"]].astype(int)

    annotations = annotations.drop(columns=["annot"])

    return annotations

omop

OmopConnector

Bases: object

Two-way connector between OMOP-formatted pandas DataFrames (note and note_nlp tables) and spaCy documents.

PARAMETER DESCRIPTION
nlp

spaCy language object.

TYPE: Language

start_char

Name of the column containing the start character index of the entity, by default "start_char"

TYPE: str, optional

end_char

Name of the column containing the end character index of the entity, by default "end_char"

TYPE: str, optional
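Examples:

A hedged sketch; note and note_nlp are pandas DataFrames following the OMOP note and note_nlp schemas.

>>> import spacy
>>> connector = OmopConnector(spacy.blank("fr"))
>>> docs = connector.omop2docs(note, note_nlp)    # OMOP tables -> spaCy docs
>>> note, note_nlp = connector.docs2omop(docs)    # spaCy docs -> OMOP tables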

Source code in edsnlp/connectors/omop.py
class OmopConnector(object):
    """
    [summary]

    Parameters
    ----------
    nlp : Language
        spaCy language object.
    start_char : str, optional
        Name of the column containing the start character index of the entity,
        by default "start_char"
    end_char : str, optional
        Name of the column containing the end character index of the entity,
        by default "end_char"
    """

    def __init__(
        self,
        nlp: Language,
        start_char: str = "start_char",
        end_char: str = "end_char",
    ):

        self.start_char = start_char
        self.end_char = end_char

        self.nlp = nlp

    def preprocess(
        self, note: pd.DataFrame, note_nlp: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Preprocess the input OMOP tables: modification of the column names.

        Parameters
        ----------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.

        Returns
        -------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        """

        note_nlp = note_nlp.rename(
            columns={
                self.start_char: "start_char",
                self.end_char: "end_char",
            }
        )

        return note, note_nlp

    def postprocess(
        self, note: pd.DataFrame, note_nlp: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Postprocess the input OMOP tables: modification of the column names.

        Parameters
        ----------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.

        Returns
        -------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        """

        note_nlp = note_nlp.rename(
            columns={
                "start_char": self.start_char,
                "end_char": self.end_char,
            }
        )

        return note, note_nlp

    def omop2docs(
        self,
        note: pd.DataFrame,
        note_nlp: pd.DataFrame,
        extensions: Optional[List[str]] = None,
    ) -> List[Doc]:
        """
        Transforms OMOP tables to a list of spaCy documents.

        Parameters
        ----------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        extensions : Optional[List[str]], optional
            Extensions to keep, by default None

        Returns
        -------
        List[Doc]
            List of spaCy documents.
        """
        note, note_nlp = self.preprocess(note, note_nlp)
        return omop2docs(note, note_nlp, self.nlp, extensions)

    def docs2omop(
        self,
        docs: List[Doc],
        extensions: Optional[List[str]] = None,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Transforms a list of spaCy documents to a pair of OMOP tables.

        Parameters
        ----------
        docs : List[Doc]
            List of spaCy documents.
        extensions : Optional[List[str]], optional
            Extensions to keep, by default None

        Returns
        -------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        """
        note, note_nlp = docs2omop(docs, extensions=extensions)
        note, note_nlp = self.postprocess(note, note_nlp)
        return note, note_nlp
start_char = start_char instance-attribute
end_char = end_char instance-attribute
nlp = nlp instance-attribute
__init__(nlp, start_char='start_char', end_char='end_char')
Source code in edsnlp/connectors/omop.py
def __init__(
    self,
    nlp: Language,
    start_char: str = "start_char",
    end_char: str = "end_char",
):

    self.start_char = start_char
    self.end_char = end_char

    self.nlp = nlp
preprocess(note, note_nlp)

Preprocess the input OMOP tables: modification of the column names.

PARAMETER DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

Source code in edsnlp/connectors/omop.py
def preprocess(
    self, note: pd.DataFrame, note_nlp: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Preprocess the input OMOP tables: modification of the column names.

    Parameters
    ----------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.

    Returns
    -------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    """

    note_nlp = note_nlp.rename(
        columns={
            self.start_char: "start_char",
            self.end_char: "end_char",
        }
    )

    return note, note_nlp
postprocess(note, note_nlp)

Postprocess the input OMOP tables: modification of the column names.

PARAMETER DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

Source code in edsnlp/connectors/omop.py
def postprocess(
    self, note: pd.DataFrame, note_nlp: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Postprocess the input OMOP tables: modification of the column names.

    Parameters
    ----------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.

    Returns
    -------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    """

    note_nlp = note_nlp.rename(
        columns={
            "start_char": self.start_char,
            "end_char": self.end_char,
        }
    )

    return note, note_nlp
omop2docs(note, note_nlp, extensions=None)

Transforms OMOP tables to a list of spaCy documents.

PARAMETER DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
List[Doc]

List of spaCy documents.

Source code in edsnlp/connectors/omop.py
def omop2docs(
    self,
    note: pd.DataFrame,
    note_nlp: pd.DataFrame,
    extensions: Optional[List[str]] = None,
) -> List[Doc]:
    """
    Transforms OMOP tables to a list of spaCy documents.

    Parameters
    ----------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    List[Doc]
        List of spaCy documents.
    """
    note, note_nlp = self.preprocess(note, note_nlp)
    return omop2docs(note, note_nlp, self.nlp, extensions)
docs2omop(docs, extensions=None)

Transforms a list of spaCy documents to a pair of OMOP tables.

PARAMETER DESCRIPTION
docs

List of spaCy documents.

TYPE: List[Doc]

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

Source code in edsnlp/connectors/omop.py
def docs2omop(
    self,
    docs: List[Doc],
    extensions: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Transforms a list of spaCy documents to a pair of OMOP tables.

    Parameters
    ----------
    docs : List[Doc]
        List of spaCy documents.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    """
    note, note_nlp = docs2omop(docs, extensions=extensions)
    note, note_nlp = self.postprocess(note, note_nlp)
    return note, note_nlp
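A minimal usage sketch with made-up tables: the note table needs at least note_id and note_text, and note_nlp needs note_id, the offset columns and note_nlp_source_value. The extension registration below is defensive and may already be handled elsewhere in edsnlp.

import pandas as pd
import spacy
from spacy.tokens import Doc

from edsnlp.connectors.omop import OmopConnector  # assumed import path

# Defensive registration of the extensions the connector reads and writes
for ext in ("note_id", "note_datetime"):
    if not Doc.has_extension(ext):
        Doc.set_extension(ext, default=None)

note = pd.DataFrame(
    dict(note_id=[0], note_text=["Le patient se plaint de douleurs."], note_datetime=[None])
)
note_nlp = pd.DataFrame(
    dict(note_id=[0], start_char=[24], end_char=[32], note_nlp_source_value=["symptome"])
)

connector = OmopConnector(nlp=spacy.blank("fr"))
docs = connector.omop2docs(note, note_nlp)    # OMOP tables -> annotated spaCy docs
note2, note_nlp2 = connector.docs2omop(docs)  # and back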
omop2docs(note, note_nlp, nlp, extensions=None)

Transforms an OMOP-formatted pair of dataframes into a list of documents.

PARAMETER DESCRIPTION
note

The OMOP note table.

TYPE: pd.DataFrame

note_nlp

The OMOP note_nlp table

TYPE: pd.DataFrame

nlp

spaCy language object.

TYPE: Language

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
List[Doc]

List of spaCy documents

Source code in edsnlp/connectors/omop.py
def omop2docs(
    note: pd.DataFrame,
    note_nlp: pd.DataFrame,
    nlp: Language,
    extensions: Optional[List[str]] = None,
) -> List[Doc]:
    """
    Transforms an OMOP-formatted pair of dataframes into a list of documents.

    Parameters
    ----------
    note : pd.DataFrame
        The OMOP `note` table.
    note_nlp : pd.DataFrame
        The OMOP `note_nlp` table
    nlp : Language
        spaCy language object.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    List[Doc] :
        List of spaCy documents
    """

    note = note.copy()
    note_nlp = note_nlp.copy()

    extensions = extensions or []

    def row2ent(row):
        d = dict(
            start_char=row.start_char,
            end_char=row.end_char,
            label=row.get("note_nlp_source_value"),
            extensions={ext: row.get(ext) for ext in extensions},
        )

        return d

    # Create entities
    note_nlp["ents"] = note_nlp.apply(row2ent, axis=1)

    note_nlp = note_nlp.groupby("note_id", as_index=False)["ents"].agg(list)

    note = note.merge(note_nlp, on=["note_id"], how="left")

    # Generate documents
    note["doc"] = note.note_text.apply(nlp)

    # Process documents
    for _, row in note.iterrows():

        doc = row.doc
        doc._.note_id = row.note_id
        doc._.note_datetime = row.get("note_datetime")

        ents = []

        if not isinstance(row.ents, list):
            continue

        for ent in row.ents:

            span = doc.char_span(
                ent["start_char"],
                ent["end_char"],
                ent["label"],
                alignment_mode="expand",
            )

            for k, v in ent["extensions"].items():
                setattr(span._, k, v)

            ents.append(span)

            if span.label_ not in doc.spans:
                doc.spans[span.label_] = [span]
            else:
                doc.spans[span.label_].append(span)

        ents, discarded = filter_spans(ents, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

    return list(note.doc)
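To carry extra qualifiers, a note_nlp column with the same name as a registered Span extension can be forwarded through the extensions argument. A hedged sketch follows; the negation extension and column are made up.

import pandas as pd
import spacy
from spacy.tokens import Doc, Span

from edsnlp.connectors.omop import omop2docs  # assumed import path

# Hypothetical extension, mirrored by a note_nlp column of the same name
if not Span.has_extension("negation"):
    Span.set_extension("negation", default=None)
for ext in ("note_id", "note_datetime"):
    if not Doc.has_extension(ext):
        Doc.set_extension(ext, default=None)

note = pd.DataFrame(dict(note_id=[0], note_text=["Pas de douleur."]))
note_nlp = pd.DataFrame(
    dict(
        note_id=[0],
        start_char=[7],
        end_char=[14],
        note_nlp_source_value=["symptome"],
        negation=[True],
    )
)

docs = omop2docs(note, note_nlp, nlp=spacy.blank("fr"), extensions=["negation"])
ent = docs[0].ents[0]
print(ent.text, ent.label_, ent._.negation)  # douleur symptome True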
docs2omop(docs, extensions=None)

Transforms a list of spaCy docs to a pair of OMOP tables.

PARAMETER DESCRIPTION
docs

List of documents to transform.

TYPE: List[Doc]

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
Tuple[pd.DataFrame, pd.DataFrame]

Pair of OMOP tables (note and note_nlp)

Source code in edsnlp/connectors/omop.py
def docs2omop(
    docs: List[Doc],
    extensions: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Transforms a list of spaCy docs to a pair of OMOP tables.

    Parameters
    ----------
    docs : List[Doc]
        List of documents to transform.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Pair of OMOP tables (`note` and `note_nlp`)
    """

    df = pd.DataFrame(dict(doc=docs))

    df["note_text"] = df.doc.apply(lambda doc: doc.text)
    df["note_id"] = df.doc.apply(lambda doc: doc._.note_id)
    df["note_datetime"] = df.doc.apply(lambda doc: doc._.note_datetime)

    if df.note_id.isna().any():
        df["note_id"] = range(len(df))

    df["ents"] = df.doc.apply(lambda doc: list(doc.ents))
    df["ents"] += df.doc.apply(lambda doc: list(doc.spans["discarded"]))

    note = df[["note_id", "note_text", "note_datetime"]]

    df = df[["note_id", "ents"]].explode("ents")

    extensions = extensions or []

    def ent2dict(
        ent: Span,
    ) -> Dict[str, Any]:

        d = dict(
            start_char=ent.start_char,
            end_char=ent.end_char,
            note_nlp_source_value=ent.label_,
            lexical_variant=ent.text,
            # normalized_variant=ent._.normalized.text,
        )

        for ext in extensions:
            d[ext] = getattr(ent._, ext)

        return d

    df["ents"] = df.ents.apply(ent2dict)

    columns = [
        "start_char",
        "end_char",
        "note_nlp_source_value",
        "lexical_variant",
        # "normalized_variant",
    ]
    columns += extensions

    df[columns] = df.ents.apply(pd.Series)

    df["term_modifiers"] = ""

    for i, ext in enumerate(extensions):
        if i > 0:
            df.term_modifiers += ";"
        df.term_modifiers += ext + "=" + df[ext].astype(str)

    df["note_nlp_id"] = range(len(df))

    note_nlp = df[["note_nlp_id", "note_id"] + columns]

    return note, note_nlp
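Going the other way, any list of annotated documents can be flattened into OMOP tables, provided each document carries a note_id extension and a discarded span group (the eds.matcher component used below maintains the latter). A hedged sketch:

import spacy
from spacy.tokens import Doc

from edsnlp.connectors.omop import docs2omop  # assumed import path

# Defensive registration; missing note_id values are renumbered by docs2omop
for ext in ("note_id", "note_datetime"):
    if not Doc.has_extension(ext):
        Doc.set_extension(ext, default=None)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur", "douleurs"])))

docs = list(nlp.pipe(["Le patient a des douleurs.", "La douleur persiste."]))

note, note_nlp = docs2omop(docs)
# note:     one row per document (note_id, note_text, note_datetime)
# note_nlp: one row per entity   (note_nlp_id, note_id, start_char, end_char,
#                                 note_nlp_source_value, lexical_variant)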

labeltool

docs2labeltool(docs, extensions=None)

Returns a labeltool-ready dataframe from a list of annotated documents.

PARAMETER DESCRIPTION
docs

List of annotated spacy docs.

TYPE: List[Doc]

extensions

List of extensions to use by labeltool.

TYPE: Optional[List[str]] DEFAULT: None

RETURNS DESCRIPTION
df

DataFrame tailored for labeltool.

Source code in edsnlp/connectors/labeltool.py
def docs2labeltool(
    docs: List[Doc],
    extensions: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Returns a labeltool-ready dataframe from a list of annotated documents.

    Parameters
    ----------
    docs: list of spaCy Doc
        List of annotated spacy docs.
    extensions: list of extensions
        List of extensions to use by labeltool.

    Returns
    -------
    df: pd.DataFrame
        DataFrame tailored for labeltool.
    """

    if extensions is None:
        extensions = []

    entities = []

    for i, doc in enumerate(tqdm(docs, ascii=True, ncols=100)):
        for ent in doc.ents:
            d = dict(
                note_text=doc.text,
                offset_begin=ent.start_char,
                offset_end=ent.end_char,
                label_name=ent.label_,
                label_value=ent.text,
            )

            d["note_id"] = doc._.note_id or i

            for ext in extensions:
                d[ext] = getattr(ent._, ext)

            entities.append(d)

    df = pd.DataFrame.from_records(entities)

    columns = [
        "note_id",
        "note_text",
        "offset_begin",
        "offset_end",
        "label_name",
        "label_value",
    ]

    df = df[columns + extensions]

    return df
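As with the other connectors, a hedged usage sketch (the pipeline and texts are made up; note_id is registered defensively since the function falls back on the document index when it is empty):

import spacy
from spacy.tokens import Doc

from edsnlp.connectors.labeltool import docs2labeltool  # assumed import path

if not Doc.has_extension("note_id"):
    Doc.set_extension("note_id", default=None)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur", "douleurs"])))

docs = list(nlp.pipe(["Le patient a des douleurs.", "La douleur persiste."]))

df = docs2labeltool(docs)
# One row per entity: note_id, note_text, offset_begin, offset_end, label_name, label_value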

pipelines

terminations

termination: List[str] = ['et', 'bien que', 'même si', 'mais', 'or', 'alors que', 'sauf', 'cependant', 'pourtant', 'cause de', 'source de', 'hormis', 'car', 'parce que', 'pourtant', 'puisque', 'ni', 'en raison de', 'qui', 'que', 'ainsi que', 'avec', 'toutefois', 'en dehors', 'dans le cadre', 'du fait', '.', ',', ';', '...', '…', '(', ')', '"'] module-attribute

factories

base

BaseComponent

Bases: object

The BaseComponent adds a set_extensions method, called at the creation of the object.

It helps decouple the initialisation of the pipeline from the creation of extensions, and is particularly useful when distributing EDSNLP on a cluster, since the serialisation mechanism imposes that the extensions be reset.

Source code in edsnlp/pipelines/base.py
class BaseComponent(object):
    """
    The `BaseComponent` adds a `set_extensions` method,
    called at the creation of the object.

    It helps decouple the initialisation of the pipeline from
    the creation of extensions, and is particularly usefull when
    distributing EDSNLP on a cluster, since the serialisation mechanism
    imposes that the extensions be reset.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """
        Set `Doc`, `Span` and `Token` extensions.
        """
        pass

    def _boundaries(
        self, doc: Doc, terminations: Optional[List[Span]] = None
    ) -> List[Tuple[int, int]]:
        """
        Create sub-sentences based on sentences and terminations found in the text.

        Parameters
        ----------
        doc:
            spaCy Doc object
        terminations:
            List of termination spans found in the text.

        Returns
        -------
        boundaries:
            List of tuples with (start, end) of spans
        """

        if terminations is None:
            terminations = []

        sent_starts = [sent.start for sent in doc.sents]
        termination_starts = [t.start for t in terminations]

        starts = sent_starts + termination_starts + [len(doc)]

        # Remove duplicates
        starts = list(set(starts))

        # Sort starts
        starts.sort()

        boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]

        return boundaries
__init__(*args, **kwargs)
Source code in edsnlp/pipelines/base.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.set_extensions()
set_extensions()

Set Doc, Span and Token extensions.

Source code in edsnlp/pipelines/base.py
@staticmethod
def set_extensions() -> None:
    """
    Set `Doc`, `Span` and `Token` extensions.
    """
    pass
_boundaries(doc, terminations=None)

Create sub-sentences based on sentences and terminations found in the text.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

terminations

List of termination spans found in the text.

TYPE: Optional[List[Span]] DEFAULT: None

RETURNS DESCRIPTION
boundaries

List of tuples with (start, end) of spans

Source code in edsnlp/pipelines/base.py
def _boundaries(
    self, doc: Doc, terminations: Optional[List[Span]] = None
) -> List[Tuple[int, int]]:
    """
    Create sub-sentences based on sentences and terminations found in the text.

    Parameters
    ----------
    doc:
        spaCy Doc object
    terminations:
        List of termination spans found in the text.

    Returns
    -------
    boundaries:
        List of tuples with (start, end) of spans
    """

    if terminations is None:
        terminations = []

    sent_starts = [sent.start for sent in doc.sents]
    termination_starts = [t.start for t in terminations]

    starts = sent_starts + termination_starts + [len(doc)]

    # Remove duplicates
    starts = list(set(starts))

    # Sort starts
    starts.sort()

    boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]

    return boundaries
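To make the boundary construction concrete, here is the same logic on made-up token indices (no spaCy objects involved):

# Sentences start at tokens 0, 10 and 25, terminations at tokens 5 and 18,
# and the document counts 30 tokens.
sent_starts = [0, 10, 25]
termination_starts = [5, 18]
n_tokens = 30

starts = sorted(set(sent_starts + termination_starts + [n_tokens]))
boundaries = list(zip(starts[:-1], starts[1:]))
print(boundaries)  # [(0, 5), (5, 10), (10, 18), (18, 25), (25, 30)]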

core

sentences
terms
punctuation = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] module-attribute
sentences
SentenceSegmenter

Bases: object

Segments the Doc into sentences using a rule-based strategy, specific to AP-HP documents.

Applies the same rule-based pipeline as spaCy's sentencizer, and adds a simple rule on new lines: if a new line is followed by a capitalised word, then it is also an end of sentence.

DOCS: https://spacy.io/api/sentencizer

Arguments

punct_chars

Punctuation characters.

TYPE: Optional[List[str]]

use_endlines

Whether to use endlines prediction.

TYPE: bool

Source code in edsnlp/pipelines/core/sentences/sentences.py
class SentenceSegmenter(object):
    """
    Segments the Doc into sentences using a rule-based strategy,
    specific to AP-HP documents.

    Applies the same rule-based pipeline as spaCy's sentencizer,
    and adds a simple rule on new lines: if a new line is followed by a
    capitalised word, then it is also an end of sentence.

    DOCS: https://spacy.io/api/sentencizer

    Arguments
    ---------
    punct_chars : Optional[List[str]]
        Punctuation characters.
    use_endlines : bool
        Whether to use endlines prediction.
    """

    def __init__(
        self,
        punct_chars: Optional[List[str]],
        use_endlines: bool,
    ):

        if punct_chars is None:
            punct_chars = punctuation

        self.punct_chars = set(punct_chars)
        self.use_endlines = use_endlines

    def __call__(self, doc: Doc) -> Doc:
        """
        Segments the document in sentences.

        Arguments
        ---------
        doc:
            A spacy Doc object.

        Returns
        -------
        doc:
            A spaCy Doc object, annotated for sentences.
        """

        if not doc:
            return doc

        doc[0].sent_start = True

        seen_period = False
        seen_newline = False

        for i, token in enumerate(doc):
            is_in_punct_chars = token.text in self.punct_chars
            is_newline = token.is_space and "\n" in token.text

            if self.use_endlines:
                end_line = getattr(token._, "end_line", None)
                is_newline = is_newline and (end_line or end_line is None)

            token.sent_start = (
                i == 0
            )  # Sets the attribute to False by default for all other tokens
            if seen_period or seen_newline:
                if token.is_punct or is_in_punct_chars or is_newline:
                    continue
                if seen_period:
                    token.sent_start = True
                    seen_newline = False
                    seen_period = False
                else:
                    token.sent_start = token.shape_.startswith("Xx")
                    seen_newline = False
                    seen_period = False
            elif is_in_punct_chars:
                seen_period = True
            elif is_newline:
                seen_newline = True

        return doc
punct_chars = set(punct_chars) instance-attribute
use_endlines = use_endlines instance-attribute
__init__(punct_chars, use_endlines)
Source code in edsnlp/pipelines/core/sentences/sentences.py
def __init__(
    self,
    punct_chars: Optional[List[str]],
    use_endlines: bool,
):

    if punct_chars is None:
        punct_chars = punctuation

    self.punct_chars = set(punct_chars)
    self.use_endlines = use_endlines
__call__(doc)

Segments the document in sentences.

Arguments

doc

A spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

A spaCy Doc object, annotated for sentences.

Source code in edsnlp/pipelines/core/sentences/sentences.py
def __call__(self, doc: Doc) -> Doc:
    """
    Segments the document in sentences.

    Arguments
    ---------
    doc:
        A spacy Doc object.

    Returns
    -------
    doc:
        A spaCy Doc object, annotated for sentences.
    """

    if not doc:
        return doc

    doc[0].sent_start = True

    seen_period = False
    seen_newline = False

    for i, token in enumerate(doc):
        is_in_punct_chars = token.text in self.punct_chars
        is_newline = token.is_space and "\n" in token.text

        if self.use_endlines:
            end_line = getattr(token._, "end_line", None)
            is_newline = is_newline and (end_line or end_line is None)

        token.sent_start = (
            i == 0
        )  # Sets the attribute to False by default for all other tokens
        if seen_period or seen_newline:
            if token.is_punct or is_in_punct_chars or is_newline:
                continue
            if seen_period:
                token.sent_start = True
                seen_newline = False
                seen_period = False
            else:
                token.sent_start = token.shape_.startswith("Xx")
                seen_newline = False
                seen_period = False
        elif is_in_punct_chars:
            seen_period = True
        elif is_newline:
            seen_newline = True

    return doc
factory
DEFAULT_CONFIG = dict(punct_chars=None, use_endlines=True) module-attribute
create_component(nlp, name, punct_chars, use_endlines)
Source code in edsnlp/pipelines/core/sentences/factory.py
@deprecated_factory("sentences", "eds.sentences", default_config=DEFAULT_CONFIG)
@Language.factory("eds.sentences", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    punct_chars: Optional[List[str]],
    use_endlines: bool,
):
    return SentenceSegmenter(
        punct_chars=punct_chars,
        use_endlines=use_endlines,
    )
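A minimal sketch of adding the segmenter to a blank pipeline (the example text is made up):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")  # replaces the deprecated "sentences" factory name

doc = nlp("Examen clinique sans particularité\nLe patient est apyrétique.")
print([sent.text for sent in doc.sents])
# The newline followed by a capitalised word opens a new sentence.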
matcher
matcher
GenericMatcher

Bases: BaseComponent

Provides a generic matcher component.

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

terms

A dictionary of terms.

TYPE: Optional[Patterns]

regex

A dictionary of regular expressions.

TYPE: Optional[Patterns]

attr

The default attribute to use for matching. Can be overridden using the terms and regex configurations.

TYPE: str

filter_matches

Whether to filter out matches.

TYPE: bool

on_ents_only

Whether to look for matches around pre-extracted entities only.

TYPE: bool

ignore_excluded

Whether to skip excluded tokens (requires an upstream pipeline to mark excluded tokens).

TYPE: bool

Source code in edsnlp/pipelines/core/matcher/matcher.py
class GenericMatcher(BaseComponent):
    """
    Provides a generic matcher component.

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    terms : Optional[Patterns]
        A dictionary of terms.
    regex : Optional[Patterns]
        A dictionary of regular expressions.
    attr : str
        The default attribute to use for matching.
        Can be overridden using the `terms` and `regex` configurations.
    filter_matches : bool
        Whether to filter out matches.
    on_ents_only : bool
        Whether to look for matches around pre-extracted entities only.
    ignore_excluded : bool
        Whether to skip excluded tokens (requires an upstream
        pipeline to mark excluded tokens).
    """

    def __init__(
        self,
        nlp: Language,
        terms: Optional[Patterns],
        regex: Optional[Patterns],
        attr: str,
        ignore_excluded: bool,
    ):

        self.nlp = nlp

        self.attr = attr

        self.phrase_matcher = EDSPhraseMatcher(
            self.nlp.vocab,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )
        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
        self.regex_matcher.build_patterns(regex=regex)

        self.set_extensions()

    def process(self, doc: Doc) -> List[Span]:
        """
        Find matching spans in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object.

        Returns
        -------
        spans:
            List of Spans returned by the matchers.
        """

        matches = self.phrase_matcher(doc, as_spans=True)
        regex_matches = self.regex_matcher(doc, as_spans=True)

        spans = list(matches) + list(regex_matches)

        return spans

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """
        matches = self.process(doc)

        for span in matches:
            if span.label_ not in doc.spans:
                doc.spans[span.label_] = []
            doc.spans[span.label_].append(span)

        ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc
nlp = nlp instance-attribute
attr = attr instance-attribute
phrase_matcher = EDSPhraseMatcher(self.nlp.vocab, attr=attr, ignore_excluded=ignore_excluded) instance-attribute
regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute
__init__(nlp, terms, regex, attr, ignore_excluded)
Source code in edsnlp/pipelines/core/matcher/matcher.py
def __init__(
    self,
    nlp: Language,
    terms: Optional[Patterns],
    regex: Optional[Patterns],
    attr: str,
    ignore_excluded: bool,
):

    self.nlp = nlp

    self.attr = attr

    self.phrase_matcher = EDSPhraseMatcher(
        self.nlp.vocab,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
    self.regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
    self.regex_matcher.build_patterns(regex=regex)

    self.set_extensions()
process(doc)

Find matching spans in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
spans

List of Spans returned by the matchers.

Source code in edsnlp/pipelines/core/matcher/matcher.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find matching spans in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object.

    Returns
    -------
    spans:
        List of Spans returned by the matchers.
    """

    matches = self.phrase_matcher(doc, as_spans=True)
    regex_matches = self.regex_matcher(doc, as_spans=True)

    spans = list(matches) + list(regex_matches)

    return spans
__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/core/matcher/matcher.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """
    matches = self.process(doc)

    for span in matches:
        if span.label_ not in doc.spans:
            doc.spans[span.label_] = []
        doc.spans[span.label_].append(span)

    ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
factory
DEFAULT_CONFIG = dict(terms=None, regex=None, attr='TEXT', ignore_excluded=False) module-attribute
create_component(nlp, name, terms, attr, regex, ignore_excluded)
Source code in edsnlp/pipelines/core/matcher/factory.py
@deprecated_factory("matcher", "eds.matcher", default_config=DEFAULT_CONFIG)
@Language.factory("eds.matcher", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    terms: Optional[Dict[str, Union[str, List[str]]]],
    attr: Union[str, Dict[str, str]],
    regex: Optional[Dict[str, Union[str, List[str]]]],
    ignore_excluded: bool,
):
    assert not (terms is None and regex is None)

    if terms is None:
        terms = dict()
    if regex is None:
        regex = dict()

    return GenericMatcher(
        nlp,
        terms=terms,
        attr=attr,
        regex=regex,
        ignore_excluded=ignore_excluded,
    )
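A hedged usage sketch combining a term and a regular expression (labels, patterns and the example text are made up):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe(
    "eds.matcher",
    config=dict(
        terms=dict(diabete=["diabète", "diabétique"]),
        regex=dict(date=r"\d{2}/\d{2}/\d{4}"),
        attr="LOWER",
    ),
)

doc = nlp("Patient diabétique, vu le 21/09/2021.")
print([(ent.text, ent.label_) for ent in doc.ents])
# -> 'diabétique' labelled 'diabete' and '21/09/2021' labelled 'date'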
endlines
functional
_get_label(prediction)

Returns the label for the prediction PREDICTED_END_LINE

PARAMETER DESCRIPTION
prediction

value of PREDICTED_END_LINE

TYPE: bool

RETURNS DESCRIPTION
str

Label for PREDICTED_END_LINE

Source code in edsnlp/pipelines/core/endlines/functional.py
def _get_label(prediction: bool) -> str:
    """Returns the label for the prediction `PREDICTED_END_LINE`

    Parameters
    ----------
    prediction : bool
        value of `PREDICTED_END_LINE`

    Returns
    -------
    str
        Label for `PREDICTED_END_LINE`
    """
    if prediction:
        return "end_line"
    else:
        return "space"
get_dir_path(file)
Source code in edsnlp/pipelines/core/endlines/functional.py
def get_dir_path(file):
    path_file = os.path.dirname(os.path.realpath(file))
    return path_file
build_path(file, relative_path)

Function to build an absolute path.

PARAMETER DESCRIPTION
file

Main file from which the function is called (typically __file__).

relative_path

Relative path from the main file to the desired output.

RETURNS DESCRIPTION
path

Absolute path.
Source code in edsnlp/pipelines/core/endlines/functional.py
def build_path(file, relative_path):
    """
    Function to build an absolute path.

    Parameters
    ----------
    file: main file from which the function is called. It could be __file__
    relative_path: str,
        relative path from the main file to the desired output

    Returns
    -------
    path: absolute path
    """
    dir_path = get_dir_path(file)
    path = os.path.abspath(os.path.join(dir_path, relative_path))
    return path
_convert_series_to_array(s)

Converts pandas series of n elements to an array of shape (n,1).

PARAMETER DESCRIPTION
s

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/functional.py
def _convert_series_to_array(s: pd.Series) -> np.ndarray:
    """Converts pandas series of n elements to an array of shape (n,1).

    Parameters
    ----------
    s : pd.Series

    Returns
    -------
    np.ndarray
    """
    X = s.to_numpy().reshape(-1, 1).astype("O")  # .astype(np.int64)
    return X
endlines
EndLines

Bases: GenericMatcher

spaCy pipeline to detect whether a newline character should be considered a space (i.e. introduced by the PDF).

The pipeline will add the extension end_line to spans and tokens. The end_line attribute is a boolean or None, set to True if the pipeline predicts that the new line is an end line character. Otherwise, it is set to False if the new line is classified as a space. If no classification has been done over that token, it will remain None.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

end_lines_model

Path to a trained model. If None, the default model is used.

TYPE: Optional[Union[str, EndLinesModel]] DEFAULT: None

Source code in edsnlp/pipelines/core/endlines/endlines.py
class EndLines(GenericMatcher):
    """
    spaCy Pipeline to detect whether a newline character should
    be considered a space (i.e. introduced by the PDF).

    The pipeline will add the extension `end_line` to spans
    and tokens. The `end_line` attribute is a boolean or `None`,
    set to `True` if the pipeline predicts that the new line
    is an end line character. Otherwise, it is  set to `False`
    if the new line is classified as a space. If no classification
    has been done over that token, it will remain `None`.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.

    end_lines_model : Optional[Union[str, EndLinesModel]], by default None
        path to trained model. If None, it will use a default model
    """

    def __init__(
        self,
        nlp: Language,
        end_lines_model: Optional[Union[str, EndLinesModel]],
        **kwargs,
    ):

        super().__init__(
            nlp,
            terms=None,
            attr="TEXT",
            regex=dict(
                new_line=r"\n+",
            ),
            ignore_excluded=False,
            **kwargs,
        )

        if not Token.has_extension("end_line"):
            Token.set_extension("end_line", default=None)

        if not Span.has_extension("end_line"):
            Span.set_extension("end_line", default=None)

        self._read_model(end_lines_model)

    def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]):
        """
        Parameters
        ----------
        end_lines_model : Optional[Union[str, EndLinesModel]]

        Raises
        ------
        TypeError
        """
        if end_lines_model is None:
            path = build_path(__file__, "base_model.pkl")

            with open(path, "rb") as inp:
                self.model = pickle.load(inp)
        elif type(end_lines_model) == str:
            with open(end_lines_model, "rb") as inp:
                self.model = pickle.load(inp)
        elif type(end_lines_model) == EndLinesModel:
            self.model = end_lines_model
        else:
            raise TypeError(
                "type(`end_lines_model`) should be one of {None, str, EndLinesModel}"
            )

    @staticmethod
    def _spacy_compute_a3a4(token: Token) -> str:
        """Function to compute A3 and A4

        Parameters
        ----------
        token : Token

        Returns
        -------
        str
        """

        if token.is_upper:
            return "UPPER"

        elif token.shape_.startswith("Xx"):
            return "S_UPPER"

        elif token.shape_.startswith("x"):
            return "LOWER"

        elif (token.is_digit) & (
            (token.doc[max(token.i - 1, 0)].is_punct)
            | (token.doc[min(token.i + 1, len(token.doc) - 1)].is_punct)
        ):
            return "ENUMERATION"

        elif token.is_digit:
            return "DIGIT"

        elif (token.is_punct) & (token.text in [".", ";", "..", "..."]):
            return "STRONG_PUNCT"

        elif (token.is_punct) & (token.text not in [".", ";", "..", "..."]):
            return "SOFT_PUNCT"

        else:
            return "OTHER"

    @staticmethod
    def _compute_length(doc: Doc, start: int, end: int) -> int:
        """Compute length without spaces

        Parameters
        ----------
        doc : Doc
        start : int
        end : int

        Returns
        -------
        int
        """
        length = 0
        for t in doc[start:end]:
            length += len(t.text)

        return length

    def _get_df(self, doc: Doc, new_lines: List[Span]) -> pd.DataFrame:
        """Get a pandas DataFrame to call the classifier

        Parameters
        ----------
        doc : Doc
        new_lines : List[Span]

        Returns
        -------
        pd.DataFrame
        """

        data = []
        for i, span in enumerate(new_lines):
            start = span.start
            end = span.end

            max_index = len(doc) - 1
            a1_token = doc[max(start - 1, 0)]
            a2_token = doc[min(start + 1, max_index)]
            a1 = a1_token.orth
            a2 = a2_token.orth
            a3 = self._spacy_compute_a3a4(a1_token)
            a4 = self._spacy_compute_a3a4(a2_token)
            blank_line = "\n\n" in span.text

            if i > 0:
                start_previous = new_lines[i - 1].start + 1
            else:
                start_previous = 0

            length = self._compute_length(
                doc, start=start_previous, end=start
            )  # Count the total length from the previous new line up to this one

            data_dict = dict(
                span_start=start,
                span_end=end,
                A1=a1,
                A2=a2,
                A3=a3,
                A4=a4,
                BLANK_LINE=blank_line,
                length=length,
            )
            data.append(data_dict)

        df = pd.DataFrame(data)

        mu = df["length"].mean()
        sigma = df["length"].std()
        if np.isnan(sigma):
            sigma = 1

        cv = sigma / mu
        df["B1"] = (df["length"] - mu) / sigma
        df["B2"] = cv

        return df

    def __call__(self, doc: Doc) -> Doc:
        """
        Predict for each new line if it's an end of line or a space.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, with each new line annotated
        """

        matches = self.process(doc)
        new_lines = get_spans(matches, "new_line")

        if len(new_lines) > 0:
            df = self._get_df(doc=doc, new_lines=new_lines)
            df = self.model.predict(df)

            spans = []
            for span, prediction in zip(new_lines, df.PREDICTED_END_LINE):

                span.label_ = _get_label(prediction)
                span._.end_line = prediction

                spans.append(span)
                for t in span:
                    t._.end_line = prediction
                    if not prediction:
                        t._.excluded = True

            doc.spans["new_lines"] = spans
        return doc
__init__(nlp, end_lines_model, **kwargs)
Source code in edsnlp/pipelines/core/endlines/endlines.py
def __init__(
    self,
    nlp: Language,
    end_lines_model: Optional[Union[str, EndLinesModel]],
    **kwargs,
):

    super().__init__(
        nlp,
        terms=None,
        attr="TEXT",
        regex=dict(
            new_line=r"\n+",
        ),
        ignore_excluded=False,
        **kwargs,
    )

    if not Token.has_extension("end_line"):
        Token.set_extension("end_line", default=None)

    if not Span.has_extension("end_line"):
        Span.set_extension("end_line", default=None)

    self._read_model(end_lines_model)
_read_model(end_lines_model)
PARAMETER DESCRIPTION
end_lines_model

TYPE: Optional[Union[str, EndLinesModel]]

RAISES DESCRIPTION
TypeError
Source code in edsnlp/pipelines/core/endlines/endlines.py
def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]):
    """
    Parameters
    ----------
    end_lines_model : Optional[Union[str, EndLinesModel]]

    Raises
    ------
    TypeError
    """
    if end_lines_model is None:
        path = build_path(__file__, "base_model.pkl")

        with open(path, "rb") as inp:
            self.model = pickle.load(inp)
    elif type(end_lines_model) == str:
        with open(end_lines_model, "rb") as inp:
            self.model = pickle.load(inp)
    elif type(end_lines_model) == EndLinesModel:
        self.model = end_lines_model
    else:
        raise TypeError(
            "type(`end_lines_model`) should be one of {None, str, EndLinesModel}"
        )
_spacy_compute_a3a4(token)

Function to compute A3 and A4

PARAMETER DESCRIPTION
token

TYPE: Token

RETURNS DESCRIPTION
str
Source code in edsnlp/pipelines/core/endlines/endlines.py
@staticmethod
def _spacy_compute_a3a4(token: Token) -> str:
    """Function to compute A3 and A4

    Parameters
    ----------
    token : Token

    Returns
    -------
    str
    """

    if token.is_upper:
        return "UPPER"

    elif token.shape_.startswith("Xx"):
        return "S_UPPER"

    elif token.shape_.startswith("x"):
        return "LOWER"

    elif (token.is_digit) & (
        (token.doc[max(token.i - 1, 0)].is_punct)
        | (token.doc[min(token.i + 1, len(token.doc) - 1)].is_punct)
    ):
        return "ENUMERATION"

    elif token.is_digit:
        return "DIGIT"

    elif (token.is_punct) & (token.text in [".", ";", "..", "..."]):
        return "STRONG_PUNCT"

    elif (token.is_punct) & (token.text not in [".", ";", "..", "..."]):
        return "SOFT_PUNCT"

    else:
        return "OTHER"
_compute_length(doc, start, end)

Compute length without spaces

PARAMETER DESCRIPTION
doc

TYPE: Doc

start

TYPE: int

end

TYPE: int

RETURNS DESCRIPTION
int
Source code in edsnlp/pipelines/core/endlines/endlines.py
@staticmethod
def _compute_length(doc: Doc, start: int, end: int) -> int:
    """Compute length without spaces

    Parameters
    ----------
    doc : Doc
    start : int
    end : int

    Returns
    -------
    int
    """
    length = 0
    for t in doc[start:end]:
        length += len(t.text)

    return length
_get_df(doc, new_lines)

Get a pandas DataFrame to call the classifier

PARAMETER DESCRIPTION
doc

TYPE: Doc

new_lines

TYPE: List[Span]

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlines.py
def _get_df(self, doc: Doc, new_lines: List[Span]) -> pd.DataFrame:
    """Get a pandas DataFrame to call the classifier

    Parameters
    ----------
    doc : Doc
    new_lines : List[Span]

    Returns
    -------
    pd.DataFrame
    """

    data = []
    for i, span in enumerate(new_lines):
        start = span.start
        end = span.end

        max_index = len(doc) - 1
        a1_token = doc[max(start - 1, 0)]
        a2_token = doc[min(start + 1, max_index)]
        a1 = a1_token.orth
        a2 = a2_token.orth
        a3 = self._spacy_compute_a3a4(a1_token)
        a4 = self._spacy_compute_a3a4(a2_token)
        blank_line = "\n\n" in span.text

        if i > 0:
            start_previous = new_lines[i - 1].start + 1
        else:
            start_previous = 0

        length = self._compute_length(
            doc, start=start_previous, end=start
        )  # Count the total length from the previous new line up to this one

        data_dict = dict(
            span_start=start,
            span_end=end,
            A1=a1,
            A2=a2,
            A3=a3,
            A4=a4,
            BLANK_LINE=blank_line,
            length=length,
        )
        data.append(data_dict)

    df = pd.DataFrame(data)

    mu = df["length"].mean()
    sigma = df["length"].std()
    if np.isnan(sigma):
        sigma = 1

    cv = sigma / mu
    df["B1"] = (df["length"] - mu) / sigma
    df["B2"] = cv

    return df
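In other words, B1 is the standardised length of the text preceding each new line and B2 the coefficient of variation of those lengths. A small numeric illustration with made-up lengths:

import numpy as np
import pandas as pd

# Hypothetical lengths (characters, spaces excluded) of the text before each new line
df = pd.DataFrame(dict(length=[35, 80, 12]))

mu = df["length"].mean()
sigma = df["length"].std()
if np.isnan(sigma):  # single observation: fall back to 1, as in _get_df
    sigma = 1

df["B1"] = (df["length"] - mu) / sigma  # standardised length
df["B2"] = sigma / mu                   # coefficient of variation, identical for every row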
__call__(doc)

Predict for each new line if it's an end of line or a space.

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc
Source code in edsnlp/pipelines/core/endlines/endlines.py
def __call__(self, doc: Doc) -> Doc:
    """
    Predict for each new line if it's an end of line or a space.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, with each new line annotated
    """

    matches = self.process(doc)
    new_lines = get_spans(matches, "new_line")

    if len(new_lines) > 0:
        df = self._get_df(doc=doc, new_lines=new_lines)
        df = self.model.predict(df)

        spans = []
        for span, prediction in zip(new_lines, df.PREDICTED_END_LINE):

            span.label_ = _get_label(prediction)
            span._.end_line = prediction

            spans.append(span)
            for t in span:
                t._.end_line = prediction
                if not prediction:
                    t._.excluded = True

        doc.spans["new_lines"] = spans
    return doc
factory
create_component(nlp, name, model_path)
Source code in edsnlp/pipelines/core/endlines/factory.py
@deprecated_factory("endlines", "eds.endlines")
@Language.factory("eds.endlines")
def create_component(
    nlp: Language,
    name: str,
    model_path: Optional[str],
):
    return EndLines(nlp, end_lines_model=model_path)
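A usage sketch: model_path is left to None so that the bundled default model is loaded, as _read_model above shows, and the excluded extension is registered defensively since it is normally provided by an upstream normalisation component (an assumption, not shown in this excerpt).

import spacy
from spacy.tokens import Token

# Normally registered upstream (assumption); EndLines sets it on tokens it excludes
if not Token.has_extension("excluded"):
    Token.set_extension("excluded", default=False)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.endlines", config=dict(model_path=None))
nlp.add_pipe("eds.sentences")  # use_endlines=True by default, so it honours the predictions

doc = nlp("Compte rendu\nd'hospitalisation")
# Each newline span is labelled "end_line" or "space" in doc.spans["new_lines"].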
endlinesmodel
EndLinesModel

Model to classify whether an end-of-line character is a true line break or should be treated as a space.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
class EndLinesModel:
    """Model to classify if an end line is a real one or it should be a space.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    """

    def __init__(self, nlp: Language):
        self.nlp = nlp

    def _preprocess_data(self, corpus: Iterable[Doc]) -> pd.DataFrame:
        """
        Parameters
        ----------
        corpus : Iterable[Doc]
            Corpus of documents

        Returns
        -------
        pd.DataFrame
            Preprocessed data
        """
        # Extract the vocabulary
        string_store = self.nlp.vocab.strings

        # Iterate in the corpus and construct a dataframe
        train_data_list = []
        for i, doc in enumerate(corpus):
            train_data_list.append(self._get_attributes(doc, i))

        df = pd.concat(train_data_list)
        df.reset_index(inplace=True, drop=False)
        df.rename(columns={"ORTH": "A1", "index": "original_token_index"}, inplace=True)

        # Retrieve string representation of token_id and shape
        df["TEXT"] = df.A1.apply(self._get_string, string_store=string_store)
        df["SHAPE_"] = df.SHAPE.apply(self._get_string, string_store=string_store)

        # Convert new lines as an attribute instead of a row
        df = self._convert_line_to_attribute(df, expr="\n", col="END_LINE")
        df = self._convert_line_to_attribute(df, expr="\n\n", col="BLANK_LINE")
        df = df.loc[~(df.END_LINE | df.BLANK_LINE)]
        df = df.drop(columns="END_LINE")
        df = df.drop(columns="BLANK_LINE")
        df.rename(
            columns={"TEMP_END_LINE": "END_LINE", "TEMP_BLANK_LINE": "BLANK_LINE"},
            inplace=True,
        )

        # Construct A2 by shifting
        df = self._shift_col(df, "A1", "A2", direction="backward")

        # Compute A3 and A4
        df = self._compute_a3(df)
        df = self._shift_col(df, "A3", "A4", direction="backward")

        # SPACE is the class to predict. Set 1 if not an END_LINE
        df["SPACE"] = np.logical_not(df["END_LINE"]).astype("int")

        df[["END_LINE", "BLANK_LINE"]] = df[["END_LINE", "BLANK_LINE"]].fillna(
            True, inplace=False
        )

        # Assign a sentence id to each token
        df = df.groupby("DOC_ID").apply(self._retrieve_lines)
        df["SENTENCE_ID"] = df["SENTENCE_ID"].astype("int")

        # Compute B1 and B2
        df = self._compute_B(df)

        # Drop Tokens without info (last token of doc)
        df.dropna(subset=["A1", "A2", "A3", "A4"], inplace=True)

        # Export the vocabularies to be able to use the model with another corpus
        voc_a3a4 = self._create_vocabulary(df.A3_.cat.categories)
        voc_B2 = self._create_vocabulary(df.cv_bin.cat.categories)
        voc_B1 = self._create_vocabulary(df.l_norm_bin.cat.categories)

        vocabulary = {"A3A4": voc_a3a4, "B1": voc_B1, "B2": voc_B2}

        self.vocabulary = vocabulary

        return df

    def fit_and_predict(self, corpus: Iterable[Doc]) -> pd.DataFrame:
        """Fit the model and predict for the training data

        Parameters
        ----------
        corpus : Iterable[Doc]
            An iterable of Documents

        Returns
        -------
        pd.DataFrame
            one line by end_line prediction
        """

        # Preprocess data to have a pd DF
        df = self._preprocess_data(corpus)

        # Train and predict M1
        self._fit_M1(df.A1, df.A2, df.A3, df.A4, df.SPACE)
        outputs_M1 = self._predict_M1(
            df.A1,
            df.A2,
            df.A3,
            df.A4,
        )
        df["M1"] = outputs_M1["predictions"]
        df["M1_proba"] = outputs_M1["predictions_proba"]

        # Force Blank lines to 0
        df.loc[df.BLANK_LINE, "M1"] = 0

        # Train and predict M2
        df_endlines = df.loc[df.END_LINE]
        self._fit_M2(B1=df_endlines.B1, B2=df_endlines.B2, label=df_endlines.M1)
        outputs_M2 = self._predict_M2(B1=df_endlines.B1, B2=df_endlines.B2)

        df.loc[df.END_LINE, "M2"] = outputs_M2["predictions"]
        df.loc[df.END_LINE, "M2_proba"] = outputs_M2["predictions_proba"]

        df["M2"] = df["M2"].astype(
            pd.Int64Dtype()
        )  # cast to pd.Int64Dtype cause there are None values

        # M1M2
        df = df.loc[df.END_LINE]
        df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
            df["M1_proba"] / (1 - df["M1_proba"])
        )
        df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

        # Force Blank lines to 0
        df.loc[df.BLANK_LINE, ["M2", "M1M2"]] = 0

        # Make binary col
        df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

        return df

    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """Use the model for inference

        The df should have the following columns:
        `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

        Parameters
        ----------
        df : pd.DataFrame
            The df should have the following columns:
            `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

        Returns
        -------
        pd.DataFrame
            The result is added to the column `PREDICTED_END_LINE`
        """

        df = self._convert_raw_data_to_codes(df)

        outputs_M1 = self._predict_M1(df.A1, df.A2, df._A3, df._A4)
        df["M1"] = outputs_M1["predictions"]
        df["M1_proba"] = outputs_M1["predictions_proba"]

        outputs_M2 = self._predict_M2(B1=df._B1, B2=df._B2)
        df["M2"] = outputs_M2["predictions"]
        df["M2_proba"] = outputs_M2["predictions_proba"]
        df["M2"] = df["M2"].astype(
            pd.Int64Dtype()
        )  # cast to pd.Int64Dtype cause there are None values

        # M1M2
        df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
            df["M1_proba"] / (1 - df["M1_proba"])
        )
        df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

        # Force Blank lines to 0
        df.loc[
            df.BLANK_LINE,
            [
                "M1M2",
            ],
        ] = 0

        # Make binary col
        df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

        return df

    def save(self, path="base_model.pkl"):
        """Save a pickle of the model. It could be read by the pipeline later.

        Parameters
        ----------
        path : str, optional
            path to file .pkl, by default `base_model.pkl`
        """
        with open(path, "wb") as outp:
            del self.nlp
            pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)

    def _convert_A(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
        """
        Parameters
        ----------
        df : pd.DataFrame
        col : str
            column to translate

        Returns
        -------
        pd.DataFrame
        """
        cat_type_A = CategoricalDtype(
            categories=self.vocabulary["A3A4"].keys(), ordered=True
        )
        new_col = "_" + col
        df[new_col] = df[col].astype(cat_type_A)
        df[new_col] = df[new_col].cat.codes
        # Ensure that not known values are coded as OTHER
        df.loc[
            ~df[col].isin(self.vocabulary["A3A4"].keys()), new_col
        ] = self.vocabulary["A3A4"]["OTHER"]
        return df

    def _convert_B(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
        """
        Parameters
        ----------
        df : pd.DataFrame
            [description]
        col : str
            column to translate

        Returns
        -------
        pd.DataFrame
            [description]
        """
        # Translate B1
        index_B = pd.IntervalIndex(list(self.vocabulary[col].keys()))
        new_col = "_" + col
        df[new_col] = pd.cut(df[col], index_B)
        df[new_col] = df[new_col].cat.codes
        df.loc[df[col] >= index_B.right.max(), new_col] = max(
            self.vocabulary[col].values()
        )
        df.loc[df[col] <= index_B.left.min(), new_col] = min(
            self.vocabulary[col].values()
        )

        return df

    def _convert_raw_data_to_codes(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Function to translate data as extracted from spacy to the model codes.
        `A1` and `A2` are not translated cause are supposed to be already
        in good encoding.

        Parameters
        ----------
        df : pd.DataFrame
            It should have columns `['A3','A4','B1','B2']`

        Returns
        -------
        pd.DataFrame
        """
        df = self._convert_A(df, "A3")
        df = self._convert_A(df, "A4")
        df = self._convert_B(df, "B1")
        df = self._convert_B(df, "B2")
        return df

    def _convert_line_to_attribute(
        self, df: pd.DataFrame, expr: str, col: str
    ) -> pd.DataFrame:
        """
        Function to convert a line into an attribute (column) of the
        previous row. Particularly we use it to identify "\\n" and "\\n\\n"
        that are considered tokens, express this information as an attribute
        of the previous token.

        Parameters
        ----------
        df : pd.DataFrame
        expr : str
            pattern to search in the text. Ex.: "\\n"
        col : str
            name of the new column

        Returns
        -------
        pd.DataFrame
        """
        idx = df.TEXT.str.contains(expr)
        df.loc[idx, col] = True
        df[col] = df[col].fillna(False)
        df = self._shift_col(df, col, "TEMP_" + col, direction="backward")

        return df

    def _compute_a3(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        A3 (A4 respectively): typographic form  of left word (or right) :

        - All in capital letter
        - It starts with a capital letter
        - Starts by lowercase
        - It's a number
        - Strong punctuation
        - Soft punctuation
        - A number followed or preced by a punctuation (it's the case of enumerations)

        Parameters
        ----------
        df: pd.DataFrame

        Returns
        -------
        df: pd.DataFrame with the columns `A3` and `A3_`

        """
        df = self._shift_col(
            df, "IS_PUNCT", "IS_PUNCT_+1", direction="backward", fill=False
        )
        df = self._shift_col(
            df, "IS_PUNCT", "IS_PUNCT_-1", direction="forward", fill=False
        )

        CONDITION1 = df.IS_UPPER
        CONDITION2 = df.SHAPE_.str.startswith("Xx", na=False)
        CONDITION3 = df.SHAPE_.str.startswith("x", na=False)
        CONDITION4 = df.IS_DIGIT
        STRONG_PUNCT = [".", ";", "..", "..."]
        CONDITION5 = (df.IS_PUNCT) & (df.TEXT.isin(STRONG_PUNCT))
        CONDITION6 = (df.IS_PUNCT) & (~df.TEXT.isin(STRONG_PUNCT))
        CONDITION7 = (df.IS_DIGIT) & (df["IS_PUNCT_+1"] | df["IS_PUNCT_-1"])  # discuss

        df["A3_"] = None
        df.loc[CONDITION1, "A3_"] = "UPPER"
        df.loc[CONDITION2, "A3_"] = "S_UPPER"
        df.loc[CONDITION3, "A3_"] = "LOWER"
        df.loc[CONDITION4, "A3_"] = "DIGIT"
        df.loc[CONDITION5, "A3_"] = "STRONG_PUNCT"
        df.loc[CONDITION6, "A3_"] = "SOFT_PUNCT"
        df.loc[CONDITION7, "A3_"] = "ENUMERATION"

        df = df.drop(columns=["IS_PUNCT_+1", "IS_PUNCT_-1"])
        df["A3_"] = df["A3_"].astype("category")

        df["A3_"] = df["A3_"].cat.add_categories("OTHER")
        df["A3_"].fillna("OTHER", inplace=True)

        df["A3"] = df["A3_"].cat.codes

        return df

    def _fit_M1(
        self,
        A1: pd.Series,
        A2: pd.Series,
        A3: pd.Series,
        A4: pd.Series,
        label: pd.Series,
    ):
        """Function to train M1 classifier (Naive Bayes)

        Parameters
        ----------
        A1 : pd.Series
            [description]
        A2 : pd.Series
            [description]
        A3 : pd.Series
            [description]
        A4 : pd.Series
            [description]
        label : pd.Series
            [description]

        """
        # Encode classes to OneHotEncoder representation
        encoder_A1_A2 = self._fit_encoder_2S(A1, A2)
        self.encoder_A1_A2 = encoder_A1_A2

        encoder_A3_A4 = self._fit_encoder_2S(A3, A4)
        self.encoder_A3_A4 = encoder_A3_A4

        # M1
        m1 = MultinomialNB(alpha=1)

        X = self._get_X_for_M1(A1, A2, A3, A4)
        m1.fit(X, label)
        self.m1 = m1

    def _fit_M2(self, B1: pd.Series, B2: pd.Series, label: pd.Series):
        """Function to train M2 classifier (Naive Bayes)

        Parameters
        ----------
        B1 : pd.Series
        B2 : pd.Series
        label : pd.Series
        """

        # Encode classes to OneHotEncoder representation
        encoder_B1 = self._fit_encoder_1S(B1)
        self.encoder_B1 = encoder_B1
        encoder_B2 = self._fit_encoder_1S(B2)
        self.encoder_B2 = encoder_B2

        # Multinomial Naive Bayes
        m2 = MultinomialNB(alpha=1)
        X = self._get_X_for_M2(B1, B2)
        m2.fit(X, label)
        self.m2 = m2

    def _get_X_for_M1(
        self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
    ) -> np.ndarray:
        """Get X matrix for classifier

        Parameters
        ----------
        A1 : pd.Series
        A2 : pd.Series
        A3 : pd.Series
        A4 : pd.Series

        Returns
        -------
        np.ndarray
        """
        A1_enc = self._encode_series(self.encoder_A1_A2, A1)
        A2_enc = self._encode_series(self.encoder_A1_A2, A2)
        A3_enc = self._encode_series(self.encoder_A3_A4, A3)
        A4_enc = self._encode_series(self.encoder_A3_A4, A4)
        X = hstack([A1_enc, A2_enc, A3_enc, A4_enc])
        return X

    def _get_X_for_M2(self, B1: pd.Series, B2: pd.Series) -> np.ndarray:
        """Get X matrix for classifier

        Parameters
        ----------
        B1 : pd.Series
        B2 : pd.Series

        Returns
        -------
        np.ndarray
        """
        B1_enc = self._encode_series(self.encoder_B1, B1)
        B2_enc = self._encode_series(self.encoder_B2, B2)
        X = hstack([B1_enc, B2_enc])
        return X

    def _predict_M1(
        self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
    ) -> Dict[str, Any]:
        """Use M1 for prediction

        Parameters
        ----------
        A1 : pd.Series
        A2 : pd.Series
        A3 : pd.Series
        A4 : pd.Series

        Returns
        -------
        Dict[str, Any]
        """
        X = self._get_X_for_M1(A1, A2, A3, A4)
        predictions = self.m1.predict(X)
        predictions_proba = self.m1.predict_proba(X)[:, 1]
        outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
        return outputs

    def _predict_M2(self, B1: pd.Series, B2: pd.Series) -> Dict[str, Any]:
        """Use M2 for prediction

        Parameters
        ----------
        B1 : pd.Series
        B2 : pd.Series

        Returns
        -------
        Dict[str, Any]
        """
        X = self._get_X_for_M2(B1, B2)
        predictions = self.m2.predict(X)
        predictions_proba = self.m2.predict_proba(X)[:, 1]
        outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
        return outputs

    def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series) -> OneHotEncoder:
        """Fit a one hot encoder with 2 Series. It concatenates the series and after it fits.

        Parameters
        ----------
        S1 : pd.Series
        S2 : pd.Series

        Returns
        -------
        OneHotEncoder
        """
        _S1 = _convert_series_to_array(S1)
        _S2 = _convert_series_to_array(S2)
        S = np.concatenate([_S1, _S2])
        encoder = self._fit_one_hot_encoder(S)
        return encoder

    def _fit_encoder_1S(self, S1: pd.Series) -> OneHotEncoder:
        """Fit a one hot encoder with 1 Series.

        Parameters
        ----------
        S1 : pd.Series

        Returns
        -------
        OneHotEncoder
        """
        _S1 = _convert_series_to_array(S1)
        encoder = self._fit_one_hot_encoder(_S1)
        return encoder

    def _encode_series(self, encoder: OneHotEncoder, S: pd.Series) -> np.ndarray:
        """Use the one hot encoder to transform a series.

        Parameters
        ----------
        encoder : OneHotEncoder
        S : pd.Series
            a series to encode (transform)

        Returns
        -------
        np.ndarray
        """
        _S = _convert_series_to_array(S)
        S_enc = encoder.transform(_S)
        return S_enc

    def set_spans(self, corpus: Iterable[Doc], df: pd.DataFrame):
        """
        Function to set the results of the algorithm (pd.DataFrame)
        as spans of the spaCy document.

        Parameters
        ----------
        corpus : Iterable[Doc]
            Iterable of spaCy Documents
        df : pd.DataFrame
            It should have the columns:
            ["DOC_ID","original_token_index","PREDICTED_END_LINE"]
        """

        for doc_id, doc in enumerate(corpus):
            spans = []
            for token_i, pred in df.loc[
                df.DOC_ID == doc_id, ["original_token_index", "PREDICTED_END_LINE"]
            ].values:
                s = Span(doc, start=token_i, end=token_i + 1, label=_get_label(pred))

                spans.append(s)

            doc.spans["new_lines"] = spans

    @staticmethod
    def _retrieve_lines(dfg: DataFrameGroupBy) -> DataFrameGroupBy:
        """Function to give a sentence_id to each token.

        Parameters
        ----------
        dfg : DataFrameGroupBy

        Returns
        -------
        DataFrameGroupBy
            Same DataFrameGroupBy with the column `SENTENCE_ID`
        """
        sentences_ids = np.arange(dfg.END_LINE.sum())
        dfg.loc[dfg.END_LINE, "SENTENCE_ID"] = sentences_ids
        dfg["SENTENCE_ID"] = dfg["SENTENCE_ID"].fillna(method="bfill")
        return dfg

    @staticmethod
    def _create_vocabulary(x: iterable) -> dict:
        """Function to create a vocabulary for attributes in the training set.

        Parameters
        ----------
        x : iterable

        Returns
        -------
        dict
        """
        v = {}

        for i, key in enumerate(x):
            v[key] = i

        return v

    @staticmethod
    def _compute_B(df: pd.DataFrame) -> pd.DataFrame:
        """Function to compute B1 and B2

        Parameters
        ----------
        df : pd.DataFrame

        Returns
        -------
        pd.DataFrame
        """

        data = df.groupby(["DOC_ID", "SENTENCE_ID"]).agg(l=("LENGTH", "sum"))
        df_t = df.loc[df.END_LINE, ["DOC_ID", "SENTENCE_ID"]].merge(
            data, left_on=["DOC_ID", "SENTENCE_ID"], right_index=True, how="left"
        )

        stats_doc = df_t.groupby("DOC_ID").agg(mu=("l", "mean"), sigma=("l", "std"))
        stats_doc["sigma"].replace(
            0.0, 1.0, inplace=True
        )  # Replace the 0 std by unit std, otherwise it breaks the code.
        stats_doc["cv"] = stats_doc["sigma"] / stats_doc["mu"]

        df_t = df_t.drop(columns=["DOC_ID", "SENTENCE_ID"])
        df2 = df.merge(df_t, left_index=True, right_index=True, how="left")

        df2 = df2.merge(stats_doc, on=["DOC_ID"], how="left")
        df2["l_norm"] = (df2["l"] - df2["mu"]) / df2["sigma"]

        df2["cv_bin"] = pd.cut(df2["cv"], bins=10)
        df2["B2"] = df2["cv_bin"].cat.codes

        df2["l_norm_bin"] = pd.cut(df2["l_norm"], bins=10)
        df2["B1"] = df2["l_norm_bin"].cat.codes

        return df2

    @staticmethod
    def _shift_col(
        df: pd.DataFrame, col: str, new_col: str, direction="backward", fill=None
    ) -> pd.DataFrame:
        """Shifts a column one position into backward / forward direction.

        Parameters
        ----------
        df : pd.DataFrame
        col : str
            column to shift
        new_col : str
            column name to save the results
        direction : str, optional
            one of {"backward", "forward"}, by default "backward"
        fill : [type], optional
            , by default None

        Returns
        -------
        pd.DataFrame
            same df with `new_col` added.
        """
        df[new_col] = fill

        if direction == "backward":
            df.loc[df.index[:-1], new_col] = df[col].values[1:]

            different_doc_id = df["DOC_ID"].values[:-1] != df["DOC_ID"].values[1:]
            different_doc_id = np.append(different_doc_id, True)

        if direction == "forward":
            df.loc[df.index[1:], new_col] = df[col].values[:-1]
            different_doc_id = df["DOC_ID"].values[1:] != df["DOC_ID"].values[:-1]
            different_doc_id = np.append(True, different_doc_id)

        df.loc[different_doc_id, new_col] = fill
        return df

    @staticmethod
    def _get_attributes(doc: Doc, i=0):
        """Function to get the attributes of tokens of a spacy doc in a pd.DataFrame format.

        Parameters
        ----------
        doc : Doc
            spacy Doc
        i : int, optional
            document id, by default 0

        Returns
        -------
        pd.DataFrame
            Returns a dataframe with one line per token. It has the following columns :
            `[
            "ORTH",
            "LOWER",
            "SHAPE",
            "IS_DIGIT",
            "IS_SPACE",
            "IS_UPPER",
            "IS_PUNCT",
            "LENGTH",
            ]`
        """
        attributes = [
            "ORTH",
            "LOWER",
            "SHAPE",
            "IS_DIGIT",
            "IS_SPACE",
            "IS_UPPER",
            "IS_PUNCT",
            "LENGTH",
        ]
        attributes_array = doc.to_array(attributes)
        attributes_df = pd.DataFrame(attributes_array, columns=attributes)
        attributes_df["DOC_ID"] = i
        boolean_attr = []
        for a in attributes:
            if a[:3] == "IS_":
                boolean_attr.append(a)
        attributes_df[boolean_attr] = attributes_df[boolean_attr].astype("boolean")
        return attributes_df

    @staticmethod
    def _get_string(_id: int, string_store: StringStore) -> str:
        """Returns the string corresponding to the token_id

        Parameters
        ----------
        _id : int
            token id
        string_store : StringStore
            spaCy Language String Store

        Returns
        -------
        str
            string representation of the token.
        """
        return string_store[_id]

    @staticmethod
    def _fit_one_hot_encoder(X: np.ndarray) -> OneHotEncoder:
        """Fit a one hot encoder.

        Parameters
        ----------
        X : np.ndarray
            of shape (n,1)

        Returns
        -------
        OneHotEncoder
        """
        encoder = OneHotEncoder(handle_unknown="ignore")
        encoder.fit(X)
        return encoder
nlp = nlp instance-attribute
__init__(nlp)
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def __init__(self, nlp: Language):
    self.nlp = nlp
_preprocess_data(corpus)
PARAMETER DESCRIPTION
corpus

Corpus of documents

TYPE: Iterable[Doc]

RETURNS DESCRIPTION
pd.DataFrame

Preprocessed data

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _preprocess_data(self, corpus: Iterable[Doc]) -> pd.DataFrame:
    """
    Parameters
    ----------
    corpus : Iterable[Doc]
        Corpus of documents

    Returns
    -------
    pd.DataFrame
        Preprocessed data
    """
    # Extract the vocabulary
    string_store = self.nlp.vocab.strings

    # Iterate in the corpus and construct a dataframe
    train_data_list = []
    for i, doc in enumerate(corpus):
        train_data_list.append(self._get_attributes(doc, i))

    df = pd.concat(train_data_list)
    df.reset_index(inplace=True, drop=False)
    df.rename(columns={"ORTH": "A1", "index": "original_token_index"}, inplace=True)

    # Retrieve string representation of token_id and shape
    df["TEXT"] = df.A1.apply(self._get_string, string_store=string_store)
    df["SHAPE_"] = df.SHAPE.apply(self._get_string, string_store=string_store)

    # Convert new lines as an attribute instead of a row
    df = self._convert_line_to_attribute(df, expr="\n", col="END_LINE")
    df = self._convert_line_to_attribute(df, expr="\n\n", col="BLANK_LINE")
    df = df.loc[~(df.END_LINE | df.BLANK_LINE)]
    df = df.drop(columns="END_LINE")
    df = df.drop(columns="BLANK_LINE")
    df.rename(
        columns={"TEMP_END_LINE": "END_LINE", "TEMP_BLANK_LINE": "BLANK_LINE"},
        inplace=True,
    )

    # Construct A2 by shifting
    df = self._shift_col(df, "A1", "A2", direction="backward")

    # Compute A3 and A4
    df = self._compute_a3(df)
    df = self._shift_col(df, "A3", "A4", direction="backward")

    # SPACE is the class to predict. Set 1 if not an END_LINE
    df["SPACE"] = np.logical_not(df["END_LINE"]).astype("int")

    df[["END_LINE", "BLANK_LINE"]] = df[["END_LINE", "BLANK_LINE"]].fillna(
        True, inplace=False
    )

    # Assign a sentence id to each token
    df = df.groupby("DOC_ID").apply(self._retrieve_lines)
    df["SENTENCE_ID"] = df["SENTENCE_ID"].astype("int")

    # Compute B1 and B2
    df = self._compute_B(df)

    # Drop Tokens without info (last token of doc)
    df.dropna(subset=["A1", "A2", "A3", "A4"], inplace=True)

    # Export the vocabularies to be able to use the model with another corpus
    voc_a3a4 = self._create_vocabulary(df.A3_.cat.categories)
    voc_B2 = self._create_vocabulary(df.cv_bin.cat.categories)
    voc_B1 = self._create_vocabulary(df.l_norm_bin.cat.categories)

    vocabulary = {"A3A4": voc_a3a4, "B1": voc_B1, "B2": voc_B2}

    self.vocabulary = vocabulary

    return df
fit_and_predict(corpus)

Fit the model and predict for the training data

PARAMETER DESCRIPTION
corpus

An iterable of Documents

TYPE: Iterable[Doc]

RETURNS DESCRIPTION
pd.DataFrame

One row per end-of-line prediction

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def fit_and_predict(self, corpus: Iterable[Doc]) -> pd.DataFrame:
    """Fit the model and predict for the training data

    Parameters
    ----------
    corpus : Iterable[Doc]
        An iterable of Documents

    Returns
    -------
    pd.DataFrame
        one line by end_line prediction
    """

    # Preprocess data to have a pd DF
    df = self._preprocess_data(corpus)

    # Train and predict M1
    self._fit_M1(df.A1, df.A2, df.A3, df.A4, df.SPACE)
    outputs_M1 = self._predict_M1(
        df.A1,
        df.A2,
        df.A3,
        df.A4,
    )
    df["M1"] = outputs_M1["predictions"]
    df["M1_proba"] = outputs_M1["predictions_proba"]

    # Force Blank lines to 0
    df.loc[df.BLANK_LINE, "M1"] = 0

    # Train and predict M2
    df_endlines = df.loc[df.END_LINE]
    self._fit_M2(B1=df_endlines.B1, B2=df_endlines.B2, label=df_endlines.M1)
    outputs_M2 = self._predict_M2(B1=df_endlines.B1, B2=df_endlines.B2)

    df.loc[df.END_LINE, "M2"] = outputs_M2["predictions"]
    df.loc[df.END_LINE, "M2_proba"] = outputs_M2["predictions_proba"]

    df["M2"] = df["M2"].astype(
        pd.Int64Dtype()
    )  # cast to pd.Int64Dtype cause there are None values

    # M1M2
    df = df.loc[df.END_LINE]
    df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
        df["M1_proba"] / (1 - df["M1_proba"])
    )
    df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

    # Force Blank lines to 0
    df.loc[df.BLANK_LINE, ["M2", "M1M2"]] = 0

    # Make binary col
    df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

    return df
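For orientation, here is a minimal, hypothetical training sketch (the texts and the file name are placeholders; the official training recipe may include additional steps):

import spacy

from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel

nlp = spacy.blank("fr")

# Hypothetical corpus: raw texts whose line breaks we want to classify
texts = [
    "Le patient est\nhospitalisé pour une\n\npneumopathie.",
    "Antécédents :\n- diabète\n- HTA",
]
corpus = list(nlp.pipe(texts))

model = EndLinesModel(nlp=nlp)
df = model.fit_and_predict(corpus)  # one row per end-of-line token
model.save("base_model.pkl")        # note: save() removes the `nlp` attribute before pickling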
predict(df)

Use the model for inference

The df should have the following columns: ["A1","A2","A3","A4","B1","B2","BLANK_LINE"]

PARAMETER DESCRIPTION
df

The df should have the following columns: ["A1","A2","A3","A4","B1","B2","BLANK_LINE"]

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame

The result is added to the column PREDICTED_END_LINE

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def predict(self, df: pd.DataFrame) -> pd.DataFrame:
    """Use the model for inference

    The df should have the following columns:
    `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

    Parameters
    ----------
    df : pd.DataFrame
        The df should have the following columns:
        `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

    Returns
    -------
    pd.DataFrame
        The result is added to the column `PREDICTED_END_LINE`
    """

    df = self._convert_raw_data_to_codes(df)

    outputs_M1 = self._predict_M1(df.A1, df.A2, df._A3, df._A4)
    df["M1"] = outputs_M1["predictions"]
    df["M1_proba"] = outputs_M1["predictions_proba"]

    outputs_M2 = self._predict_M2(B1=df._B1, B2=df._B2)
    df["M2"] = outputs_M2["predictions"]
    df["M2_proba"] = outputs_M2["predictions_proba"]
    df["M2"] = df["M2"].astype(
        pd.Int64Dtype()
    )  # cast to pd.Int64Dtype cause there are None values

    # M1M2
    df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
        df["M1_proba"] / (1 - df["M1_proba"])
    )
    df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

    # Force Blank lines to 0
    df.loc[
        df.BLANK_LINE,
        [
            "M1M2",
        ],
    ] = 0

    # Make binary col
    df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

    return df
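Both fit_and_predict and predict combine the two classifiers by multiplying their odds: writing p_M1 and p_M2 for the probabilities of the "space" class returned by M1 and M2, the end of line is kept as a real line break only when (p_M2 / (1 - p_M2)) * (p_M1 / (1 - p_M1)) is at most 1. A minimal restatement of that rule (hypothetical helper, not part of the API):

def predicted_end_line(p_m1: float, p_m2: float) -> bool:
    """Return True when the end of line should be kept as a real line break."""
    # Both probabilities refer to the "space" class, so large combined odds
    # mean the newline should be merged into a space instead.
    odds = (p_m2 / (1 - p_m2)) * (p_m1 / (1 - p_m1))
    return not (odds > 1)  # mirrors PREDICTED_END_LINE = not M1M2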
save(path='base_model.pkl')

Save a pickle of the model, so that it can be read by the pipeline later.

PARAMETER DESCRIPTION
path

path to file .pkl, by default base_model.pkl

TYPE: str, optional DEFAULT: 'base_model.pkl'

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def save(self, path="base_model.pkl"):
    """Save a pickle of the model. It could be read by the pipeline later.

    Parameters
    ----------
    path : str, optional
        path to file .pkl, by default `base_model.pkl`
    """
    with open(path, "wb") as outp:
        del self.nlp
        pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)
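A short sketch of reading the pickle back (the file name is a placeholder); since save deletes the nlp attribute before pickling, the loaded object carries no spaCy pipeline:

import pickle

with open("base_model.pkl", "rb") as f:
    endlines_model = pickle.load(f)

# The loaded model can score pre-computed features through `predict`,
# or be consumed by the endlines pipeline component.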
_convert_A(df, col)
PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

col

column to translate

TYPE: str

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_A(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Parameters
    ----------
    df : pd.DataFrame
    col : str
        column to translate

    Returns
    -------
    pd.DataFrame
    """
    cat_type_A = CategoricalDtype(
        categories=self.vocabulary["A3A4"].keys(), ordered=True
    )
    new_col = "_" + col
    df[new_col] = df[col].astype(cat_type_A)
    df[new_col] = df[new_col].cat.codes
    # Ensure that not known values are coded as OTHER
    df.loc[
        ~df[col].isin(self.vocabulary["A3A4"].keys()), new_col
    ] = self.vocabulary["A3A4"]["OTHER"]
    return df
_convert_B(df, col)
PARAMETER DESCRIPTION
df

DataFrame containing the column to translate.

TYPE: pd.DataFrame

col

column to translate

TYPE: str

RETURNS DESCRIPTION
pd.DataFrame

Same DataFrame with the translated column added.

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_B(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Parameters
    ----------
    df : pd.DataFrame
        [description]
    col : str
        column to translate

    Returns
    -------
    pd.DataFrame
        [description]
    """
    # Translate B1
    index_B = pd.IntervalIndex(list(self.vocabulary[col].keys()))
    new_col = "_" + col
    df[new_col] = pd.cut(df[col], index_B)
    df[new_col] = df[new_col].cat.codes
    df.loc[df[col] >= index_B.right.max(), new_col] = max(
        self.vocabulary[col].values()
    )
    df.loc[df[col] <= index_B.left.min(), new_col] = min(
        self.vocabulary[col].values()
    )

    return df
_convert_raw_data_to_codes(df)

Translates the data extracted from spaCy into the model codes. A1 and A2 are not translated because they are assumed to already be properly encoded.

PARAMETER DESCRIPTION
df

It should have columns ['A3','A4','B1','B2']

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_raw_data_to_codes(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to translate data as extracted from spacy to the model codes.
    `A1` and `A2` are not translated cause are supposed to be already
    in good encoding.

    Parameters
    ----------
    df : pd.DataFrame
        It should have columns `['A3','A4','B1','B2']`

    Returns
    -------
    pd.DataFrame
    """
    df = self._convert_A(df, "A3")
    df = self._convert_A(df, "A4")
    df = self._convert_B(df, "B1")
    df = self._convert_B(df, "B2")
    return df
_convert_line_to_attribute(df, expr, col)

Converts a new-line token into an attribute (column) of the previous row. In particular, this is used to identify "\n" and "\n\n", which are tokens in their own right, and to record that information as an attribute of the previous token.

PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

expr

pattern to search in the text. Ex.: "\n"

TYPE: str

col

name of the new column

TYPE: str

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_line_to_attribute(
    self, df: pd.DataFrame, expr: str, col: str
) -> pd.DataFrame:
    """
    Function to convert a line into an attribute (column) of the
    previous row. Particularly we use it to identify "\\n" and "\\n\\n"
    that are considered tokens, express this information as an attribute
    of the previous token.

    Parameters
    ----------
    df : pd.DataFrame
    expr : str
        pattern to search in the text. Ex.: "\\n"
    col : str
        name of the new column

    Returns
    -------
    pd.DataFrame
    """
    idx = df.TEXT.str.contains(expr)
    df.loc[idx, col] = True
    df[col] = df[col].fillna(False)
    df = self._shift_col(df, col, "TEMP_" + col, direction="backward")

    return df
_compute_a3(df)

A3 (respectively A4): typographic form of the left (respectively right) word:

  • All in capital letters
  • Starts with a capital letter
  • Starts with a lowercase letter
  • A number
  • Strong punctuation
  • Soft punctuation
  • A number followed or preceded by punctuation (the case of enumerations)
PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

RETURNS DESCRIPTION
df
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _compute_a3(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    A3 (A4 respectively): typographic form  of left word (or right) :

    - All in capital letter
    - It starts with a capital letter
    - Starts by lowercase
    - It's a number
    - Strong punctuation
    - Soft punctuation
    - A number followed or preced by a punctuation (it's the case of enumerations)

    Parameters
    ----------
    df: pd.DataFrame

    Returns
    -------
    df: pd.DataFrame with the columns `A3` and `A3_`

    """
    df = self._shift_col(
        df, "IS_PUNCT", "IS_PUNCT_+1", direction="backward", fill=False
    )
    df = self._shift_col(
        df, "IS_PUNCT", "IS_PUNCT_-1", direction="forward", fill=False
    )

    CONDITION1 = df.IS_UPPER
    CONDITION2 = df.SHAPE_.str.startswith("Xx", na=False)
    CONDITION3 = df.SHAPE_.str.startswith("x", na=False)
    CONDITION4 = df.IS_DIGIT
    STRONG_PUNCT = [".", ";", "..", "..."]
    CONDITION5 = (df.IS_PUNCT) & (df.TEXT.isin(STRONG_PUNCT))
    CONDITION6 = (df.IS_PUNCT) & (~df.TEXT.isin(STRONG_PUNCT))
    CONDITION7 = (df.IS_DIGIT) & (df["IS_PUNCT_+1"] | df["IS_PUNCT_-1"])  # discuss

    df["A3_"] = None
    df.loc[CONDITION1, "A3_"] = "UPPER"
    df.loc[CONDITION2, "A3_"] = "S_UPPER"
    df.loc[CONDITION3, "A3_"] = "LOWER"
    df.loc[CONDITION4, "A3_"] = "DIGIT"
    df.loc[CONDITION5, "A3_"] = "STRONG_PUNCT"
    df.loc[CONDITION6, "A3_"] = "SOFT_PUNCT"
    df.loc[CONDITION7, "A3_"] = "ENUMERATION"

    df = df.drop(columns=["IS_PUNCT_+1", "IS_PUNCT_-1"])
    df["A3_"] = df["A3_"].astype("category")

    df["A3_"] = df["A3_"].cat.add_categories("OTHER")
    df["A3_"].fillna("OTHER", inplace=True)

    df["A3"] = df["A3_"].cat.codes

    return df
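As an illustration, the typographic categories roughly map to tokens as follows (hypothetical examples, not taken from the library):

# Hypothetical examples of the categories used for A3 / A4
examples = {
    "UPPER": "PATIENT",       # all capital letters
    "S_UPPER": "Monsieur",    # starts with a capital letter
    "LOWER": "hospitalisé",   # starts with a lowercase letter
    "DIGIT": "2022",          # a number
    "STRONG_PUNCT": ".",      # ".", ";", ".." or "..."
    "SOFT_PUNCT": ",",        # any other punctuation
    "ENUMERATION": "1",       # a digit next to punctuation, as in "1." or "1)"
    "OTHER": "3g",            # matches none of the conditions above
}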
_fit_M1(A1, A2, A3, A4, label)

Function to train M1 classifier (Naive Bayes)

PARAMETER DESCRIPTION
A1

Encoded orthographic form (ORTH) of the token.

TYPE: pd.Series

A2

Encoded orthographic form of the following token.

TYPE: pd.Series

A3

Encoded typographic form of the token (see _compute_a3).

TYPE: pd.Series

A4

Encoded typographic form of the following token.

TYPE: pd.Series

label

Class to predict (the SPACE column).

TYPE: pd.Series

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_M1(
    self,
    A1: pd.Series,
    A2: pd.Series,
    A3: pd.Series,
    A4: pd.Series,
    label: pd.Series,
):
    """Function to train M1 classifier (Naive Bayes)

    Parameters
    ----------
    A1 : pd.Series
        [description]
    A2 : pd.Series
        [description]
    A3 : pd.Series
        [description]
    A4 : pd.Series
        [description]
    label : pd.Series
        [description]

    """
    # Encode classes to OneHotEncoder representation
    encoder_A1_A2 = self._fit_encoder_2S(A1, A2)
    self.encoder_A1_A2 = encoder_A1_A2

    encoder_A3_A4 = self._fit_encoder_2S(A3, A4)
    self.encoder_A3_A4 = encoder_A3_A4

    # M1
    m1 = MultinomialNB(alpha=1)

    X = self._get_X_for_M1(A1, A2, A3, A4)
    m1.fit(X, label)
    self.m1 = m1
_fit_M2(B1, B2, label)

Function to train M2 classifier (Naive Bayes)

PARAMETER DESCRIPTION
B1

TYPE: pd.Series

B2

TYPE: pd.Series

label

TYPE: pd.Series

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_M2(self, B1: pd.Series, B2: pd.Series, label: pd.Series):
    """Function to train M2 classifier (Naive Bayes)

    Parameters
    ----------
    B1 : pd.Series
    B2 : pd.Series
    label : pd.Series
    """

    # Encode classes to OneHotEncoder representation
    encoder_B1 = self._fit_encoder_1S(B1)
    self.encoder_B1 = encoder_B1
    encoder_B2 = self._fit_encoder_1S(B2)
    self.encoder_B2 = encoder_B2

    # Multinomial Naive Bayes
    m2 = MultinomialNB(alpha=1)
    X = self._get_X_for_M2(B1, B2)
    m2.fit(X, label)
    self.m2 = m2
_get_X_for_M1(A1, A2, A3, A4)

Get X matrix for classifier

PARAMETER DESCRIPTION
A1

TYPE: pd.Series

A2

TYPE: pd.Series

A3

TYPE: pd.Series

A4

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _get_X_for_M1(
    self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
) -> np.ndarray:
    """Get X matrix for classifier

    Parameters
    ----------
    A1 : pd.Series
    A2 : pd.Series
    A3 : pd.Series
    A4 : pd.Series

    Returns
    -------
    np.ndarray
    """
    A1_enc = self._encode_series(self.encoder_A1_A2, A1)
    A2_enc = self._encode_series(self.encoder_A1_A2, A2)
    A3_enc = self._encode_series(self.encoder_A3_A4, A3)
    A4_enc = self._encode_series(self.encoder_A3_A4, A4)
    X = hstack([A1_enc, A2_enc, A3_enc, A4_enc])
    return X
_get_X_for_M2(B1, B2)

Get X matrix for classifier

PARAMETER DESCRIPTION
B1

TYPE: pd.Series

B2

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _get_X_for_M2(self, B1: pd.Series, B2: pd.Series) -> np.ndarray:
    """Get X matrix for classifier

    Parameters
    ----------
    B1 : pd.Series
    B2 : pd.Series

    Returns
    -------
    np.ndarray
    """
    B1_enc = self._encode_series(self.encoder_B1, B1)
    B2_enc = self._encode_series(self.encoder_B2, B2)
    X = hstack([B1_enc, B2_enc])
    return X
_predict_M1(A1, A2, A3, A4)

Use M1 for prediction

PARAMETER DESCRIPTION
A1

TYPE: pd.Series

A2

TYPE: pd.Series

A3

TYPE: pd.Series

A4

TYPE: pd.Series

RETURNS DESCRIPTION
Dict[str, Any]
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _predict_M1(
    self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
) -> Dict[str, Any]:
    """Use M1 for prediction

    Parameters
    ----------
    A1 : pd.Series
    A2 : pd.Series
    A3 : pd.Series
    A4 : pd.Series

    Returns
    -------
    Dict[str, Any]
    """
    X = self._get_X_for_M1(A1, A2, A3, A4)
    predictions = self.m1.predict(X)
    predictions_proba = self.m1.predict_proba(X)[:, 1]
    outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
    return outputs
_predict_M2(B1, B2)

Use M2 for prediction

PARAMETER DESCRIPTION
B1

TYPE: pd.Series

B2

TYPE: pd.Series

RETURNS DESCRIPTION
Dict[str, Any]
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _predict_M2(self, B1: pd.Series, B2: pd.Series) -> Dict[str, Any]:
    """Use M2 for prediction

    Parameters
    ----------
    B1 : pd.Series
    B2 : pd.Series

    Returns
    -------
    Dict[str, Any]
    """
    X = self._get_X_for_M2(B1, B2)
    predictions = self.m2.predict(X)
    predictions_proba = self.m2.predict_proba(X)[:, 1]
    outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
    return outputs
_fit_encoder_2S(S1, S2)

Fit a one-hot encoder on two Series: the two series are concatenated before fitting.

PARAMETER DESCRIPTION
S1

TYPE: pd.Series

S2

TYPE: pd.Series

RETURNS DESCRIPTION
OneHotEncoder
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series) -> OneHotEncoder:
    """Fit a one hot encoder with 2 Series. It concatenates the series and after it fits.

    Parameters
    ----------
    S1 : pd.Series
    S2 : pd.Series

    Returns
    -------
    OneHotEncoder
    """
    _S1 = _convert_series_to_array(S1)
    _S2 = _convert_series_to_array(S2)
    S = np.concatenate([_S1, _S2])
    encoder = self._fit_one_hot_encoder(S)
    return encoder
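For intuition, a standalone sketch of the same idea with toy codes: a single encoder is fitted on the union of two series, and handle_unknown="ignore" (used below by _fit_one_hot_encoder) maps codes unseen at fit time to an all-zero row:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

s1 = np.array([[1], [2], [3]])  # toy codes from the first series
s2 = np.array([[2], [4]])       # toy codes from the second series

encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(np.concatenate([s1, s2]))

# Code 5 was never seen at fit time: it is encoded as a row of zeros.
print(encoder.transform(np.array([[4], [5]])).toarray())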
_fit_encoder_1S(S1)

Fit a one-hot encoder on a single Series.

PARAMETER DESCRIPTION
S1

TYPE: pd.Series

RETURNS DESCRIPTION
OneHotEncoder
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_encoder_1S(self, S1: pd.Series) -> OneHotEncoder:
    """Fit a one hot encoder with 1 Series.

    Parameters
    ----------
    S1 : pd.Series

    Returns
    -------
    OneHotEncoder
    """
    _S1 = _convert_series_to_array(S1)
    encoder = self._fit_one_hot_encoder(_S1)
    return encoder
_encode_series(encoder, S)

Use the one hot encoder to transform a series.

PARAMETER DESCRIPTION
encoder

TYPE: OneHotEncoder

S

a series to encode (transform)

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _encode_series(self, encoder: OneHotEncoder, S: pd.Series) -> np.ndarray:
    """Use the one hot encoder to transform a series.

    Parameters
    ----------
    encoder : OneHotEncoder
    S : pd.Series
        a series to encode (transform)

    Returns
    -------
    np.ndarray
    """
    _S = _convert_series_to_array(S)
    S_enc = encoder.transform(_S)
    return S_enc
set_spans(corpus, df)

Function to set the results of the algorithm (pd.DataFrame) as spans of the spaCy document.

PARAMETER DESCRIPTION
corpus

Iterable of spaCy Documents

TYPE: Iterable[Doc]

df

It should have the columns: ["DOC_ID","original_token_index","PREDICTED_END_LINE"]

TYPE: pd.DataFrame

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def set_spans(self, corpus: Iterable[Doc], df: pd.DataFrame):
    """
    Function to set the results of the algorithm (pd.DataFrame)
    as spans of the spaCy document.

    Parameters
    ----------
    corpus : Iterable[Doc]
        Iterable of spaCy Documents
    df : pd.DataFrame
        It should have the columns:
        ["DOC_ID","original_token_index","PREDICTED_END_LINE"]
    """

    for doc_id, doc in enumerate(corpus):
        spans = []
        for token_i, pred in df.loc[
            df.DOC_ID == doc_id, ["original_token_index", "PREDICTED_END_LINE"]
        ].values:
            s = Span(doc, start=token_i, end=token_i + 1, label=_get_label(pred))

            spans.append(s)

        doc.spans["new_lines"] = spans
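Continuing the hypothetical training sketch shown earlier, the predictions can then be written back to the documents and inspected:

# `model`, `corpus` and `df` come from the earlier training sketch (hypothetical names)
model.set_spans(corpus, df)

for span in corpus[0].spans["new_lines"]:
    print(repr(span.text), span.label_)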
_retrieve_lines(dfg)

Function to give a sentence_id to each token.

PARAMETER DESCRIPTION
dfg

TYPE: DataFrameGroupBy

RETURNS DESCRIPTION
DataFrameGroupBy

Same DataFrameGroupBy with the column SENTENCE_ID

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _retrieve_lines(dfg: DataFrameGroupBy) -> DataFrameGroupBy:
    """Function to give a sentence_id to each token.

    Parameters
    ----------
    dfg : DataFrameGroupBy

    Returns
    -------
    DataFrameGroupBy
        Same DataFrameGroupBy with the column `SENTENCE_ID`
    """
    sentences_ids = np.arange(dfg.END_LINE.sum())
    dfg.loc[dfg.END_LINE, "SENTENCE_ID"] = sentences_ids
    dfg["SENTENCE_ID"] = dfg["SENTENCE_ID"].fillna(method="bfill")
    return dfg
_create_vocabulary(x)

Function to create a vocabulary for attributes in the training set.

PARAMETER DESCRIPTION
x

TYPE: iterable

RETURNS DESCRIPTION
dict
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _create_vocabulary(x: iterable) -> dict:
    """Function to create a vocabulary for attributes in the training set.

    Parameters
    ----------
    x : iterable

    Returns
    -------
    dict
    """
    v = {}

    for i, key in enumerate(x):
        v[key] = i

    return v
_compute_B(df)

Function to compute B1 and B2

PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _compute_B(df: pd.DataFrame) -> pd.DataFrame:
    """Function to compute B1 and B2

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame
    """

    data = df.groupby(["DOC_ID", "SENTENCE_ID"]).agg(l=("LENGTH", "sum"))
    df_t = df.loc[df.END_LINE, ["DOC_ID", "SENTENCE_ID"]].merge(
        data, left_on=["DOC_ID", "SENTENCE_ID"], right_index=True, how="left"
    )

    stats_doc = df_t.groupby("DOC_ID").agg(mu=("l", "mean"), sigma=("l", "std"))
    stats_doc["sigma"].replace(
        0.0, 1.0, inplace=True
    )  # Replace the 0 std by unit std, otherwise it breaks the code.
    stats_doc["cv"] = stats_doc["sigma"] / stats_doc["mu"]

    df_t = df_t.drop(columns=["DOC_ID", "SENTENCE_ID"])
    df2 = df.merge(df_t, left_index=True, right_index=True, how="left")

    df2 = df2.merge(stats_doc, on=["DOC_ID"], how="left")
    df2["l_norm"] = (df2["l"] - df2["mu"]) / df2["sigma"]

    df2["cv_bin"] = pd.cut(df2["cv"], bins=10)
    df2["B2"] = df2["cv_bin"].cat.codes

    df2["l_norm_bin"] = pd.cut(df2["l_norm"], bins=10)
    df2["B1"] = df2["l_norm_bin"].cat.codes

    return df2
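A toy recomputation of the quantities behind B1 and B2 (invented line lengths): B1 bins the line length normalised within its document, and B2 bins the document-level coefficient of variation of line lengths:

import pandas as pd

# One row per line of a single toy document, with the summed token lengths "l"
lines = pd.DataFrame({"DOC_ID": [0, 0, 0, 0], "l": [42.0, 45.0, 12.0, 40.0]})

stats = lines.groupby("DOC_ID").agg(mu=("l", "mean"), sigma=("l", "std"))
stats["cv"] = stats["sigma"] / stats["mu"]  # binned into B2

lines = lines.merge(stats, on="DOC_ID", how="left")
lines["l_norm"] = (lines["l"] - lines["mu"]) / lines["sigma"]  # binned into B1

print(lines[["l", "l_norm"]])
print(stats["cv"])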
_shift_col(df, col, new_col, direction='backward', fill=None)

Shifts a column by one position in the backward or forward direction.

PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

col

column to shift

TYPE: str

new_col

column name to save the results

TYPE: str

direction

one of {"backward", "forward"}, by default "backward"

TYPE: str, optional DEFAULT: 'backward'

fill

Value used to fill positions that have no counterpart (e.g. at document boundaries), by default None

TYPE: Any, optional DEFAULT: None

RETURNS DESCRIPTION
pd.DataFrame

same df with new_col added.

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _shift_col(
    df: pd.DataFrame, col: str, new_col: str, direction="backward", fill=None
) -> pd.DataFrame:
    """Shifts a column one position into backward / forward direction.

    Parameters
    ----------
    df : pd.DataFrame
    col : str
        column to shift
    new_col : str
        column name to save the results
    direction : str, optional
        one of {"backward", "forward"}, by default "backward"
    fill : [type], optional
        , by default None

    Returns
    -------
    pd.DataFrame
        same df with `new_col` added.
    """
    df[new_col] = fill

    if direction == "backward":
        df.loc[df.index[:-1], new_col] = df[col].values[1:]

        different_doc_id = df["DOC_ID"].values[:-1] != df["DOC_ID"].values[1:]
        different_doc_id = np.append(different_doc_id, True)

    if direction == "forward":
        df.loc[df.index[1:], new_col] = df[col].values[:-1]
        different_doc_id = df["DOC_ID"].values[1:] != df["DOC_ID"].values[:-1]
        different_doc_id = np.append(True, different_doc_id)

    df.loc[different_doc_id, new_col] = fill
    return df
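A small example of the shift semantics with toy values: the backward shift copies the next row's value onto the current row, and rows at a document boundary are reset to the fill value:

import pandas as pd

from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel

df = pd.DataFrame({"DOC_ID": [0, 0, 1, 1], "A1": [10, 11, 20, 21]})
df = EndLinesModel._shift_col(df, "A1", "A2", direction="backward")

# A2 is now [11, None, 21, None]: the last token of each document has no successor.
print(df)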
_get_attributes(doc, i=0)

Function to get the attributes of tokens of a spacy doc in a pd.DataFrame format.

PARAMETER DESCRIPTION
doc

spacy Doc

TYPE: Doc

i

document id, by default 0

TYPE: int, optional DEFAULT: 0

RETURNS DESCRIPTION
pd.DataFrame

Returns a dataframe with one line per token. It has the following columns : [ "ORTH", "LOWER", "SHAPE", "IS_DIGIT", "IS_SPACE", "IS_UPPER", "IS_PUNCT", "LENGTH", ]

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _get_attributes(doc: Doc, i=0):
    """Function to get the attributes of tokens of a spacy doc in a pd.DataFrame format.

    Parameters
    ----------
    doc : Doc
        spacy Doc
    i : int, optional
        document id, by default 0

    Returns
    -------
    pd.DataFrame
        Returns a dataframe with one line per token. It has the following columns :
        `[
        "ORTH",
        "LOWER",
        "SHAPE",
        "IS_DIGIT",
        "IS_SPACE",
        "IS_UPPER",
        "IS_PUNCT",
        "LENGTH",
        ]`
    """
    attributes = [
        "ORTH",
        "LOWER",
        "SHAPE",
        "IS_DIGIT",
        "IS_SPACE",
        "IS_UPPER",
        "IS_PUNCT",
        "LENGTH",
    ]
    attributes_array = doc.to_array(attributes)
    attributes_df = pd.DataFrame(attributes_array, columns=attributes)
    attributes_df["DOC_ID"] = i
    boolean_attr = []
    for a in attributes:
        if a[:3] == "IS_":
            boolean_attr.append(a)
    attributes_df[boolean_attr] = attributes_df[boolean_attr].astype("boolean")
    return attributes_df
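A quick look at the extracted attributes on a toy document (illustrative only); ORTH and SHAPE are hash IDs that _get_string maps back to text:

import spacy

from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel

nlp = spacy.blank("fr")
doc = nlp("Patient hospitalisé\nle 12.")

attributes = EndLinesModel._get_attributes(doc, i=0)
print(attributes[["ORTH", "SHAPE", "IS_PUNCT", "LENGTH", "DOC_ID"]])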
_get_string(_id, string_store)

Returns the string corresponding to the token_id

PARAMETER DESCRIPTION
_id

token id

TYPE: int

string_store

spaCy Language String Store

TYPE: StringStore

RETURNS DESCRIPTION
str

string representation of the token.

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _get_string(_id: int, string_store: StringStore) -> str:
    """Returns the string corresponding to the token_id

    Parameters
    ----------
    _id : int
        token id
    string_store : StringStore
        spaCy Language String Store

    Returns
    -------
    str
        string representation of the token.
    """
    return string_store[_id]
_fit_one_hot_encoder(X)

Fit a one hot encoder.

PARAMETER DESCRIPTION
X

of shape (n,1)

TYPE: np.ndarray

RETURNS DESCRIPTION
OneHotEncoder
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _fit_one_hot_encoder(X: np.ndarray) -> OneHotEncoder:
    """Fit a one hot encoder.

    Parameters
    ----------
    X : np.ndarray
        of shape (n,1)

    Returns
    -------
    OneHotEncoder
    """
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoder.fit(X)
    return encoder
context
context
ContextAdder

Bases: BaseComponent

Provides a generic context adder component.

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

context

The list of extensions to add to the Doc

TYPE: List[str]

Source code in edsnlp/pipelines/core/context/context.py
class ContextAdder(BaseComponent):
    """
    Provides a generic context adder component.

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    context : List[str]
        The list of extensions to add to the `Doc`
    """

    def __init__(
        self,
        nlp: Language,
        context: List[str],
    ):

        self.nlp = nlp
        self.context = context
        self.set_extensions()

    def set_extensions(self):
        for col in self.context:
            if not Doc.has_extension(col):
                Doc.set_extension(col, default=None)

    def __call__(self, doc: Doc) -> Doc:
        return doc
nlp = nlp instance-attribute
context = context instance-attribute
__init__(nlp, context)
Source code in edsnlp/pipelines/core/context/context.py
def __init__(
    self,
    nlp: Language,
    context: List[str],
):

    self.nlp = nlp
    self.context = context
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/core/context/context.py
def set_extensions(self):
    for col in self.context:
        if not Doc.has_extension(col):
            Doc.set_extension(col, default=None)
__call__(doc)
Source code in edsnlp/pipelines/core/context/context.py
def __call__(self, doc: Doc) -> Doc:
    return doc
factory
DEFAULT_CONFIG = dict(context=['note_id']) module-attribute
create_component(nlp, name, context)
Source code in edsnlp/pipelines/core/context/factory.py
@Language.factory("eds.context", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    context: List[str],
):

    return ContextAdder(
        nlp,
        context=context,
    )
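A minimal usage sketch (assuming the eds.* factories are registered, for instance by importing edsnlp.components):

import spacy
import edsnlp.components  # noqa: F401 -- assumption: registers the eds.* factories

nlp = spacy.blank("fr")
nlp.add_pipe("eds.context", config=dict(context=["note_id"]))

doc = nlp("Patient admis pour une pneumopathie.")
doc._.note_id = "hypothetical-note-id"  # the extension is created with a None default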
normalizer
normalizer
Normalizer

Bases: object

Normalisation pipeline. Modifies the NORM attribute, acting on four dimensions :

  • lowercase: using the default NORM
  • accents: deterministic and fixed-length normalisation of accents.
  • quotes: deterministic and fixed-length normalisation of quotation marks.
  • pollution: removal of pollutions.
PARAMETER DESCRIPTION
lowercase

Whether to remove case.

TYPE: bool

accents

Optional Accents object.

TYPE: Optional[Accents]

quotes

Optional Quotes object.

TYPE: Optional[Quotes]

pollution

Optional Pollution object.

TYPE: Optional[Pollution]

Source code in edsnlp/pipelines/core/normalizer/normalizer.py
class Normalizer(object):
    """
    Normalisation pipeline. Modifies the `NORM` attribute,
    acting on four dimensions :

    - `lowercase`: using the default `NORM`
    - `accents`: deterministic and fixed-length normalisation of accents.
    - `quotes`: deterministic and fixed-length normalisation of quotation marks.
    - `pollution`: removal of pollutions.

    Parameters
    ----------
    lowercase : bool
        Whether to remove case.
    accents : Optional[Accents]
        Optional `Accents` object.
    quotes : Optional[Quotes]
        Optional `Quotes` object.
    pollution : Optional[Pollution]
        Optional `Pollution` object.
    """

    def __init__(
        self,
        lowercase: bool,
        accents: Optional[Accents],
        quotes: Optional[Quotes],
        pollution: Optional[Pollution],
    ):
        self.lowercase = lowercase
        self.accents = accents
        self.quotes = quotes
        self.pollution = pollution

    def __call__(self, doc: Doc) -> Doc:
        """
        Apply the normalisation pipeline, one component at a time.

        Parameters
        ----------
        doc : Doc
            spaCy `Doc` object

        Returns
        -------
        Doc
            Doc object with `NORM` attribute modified
        """
        if not self.lowercase:
            remove_lowercase(doc)
        if self.accents is not None:
            self.accents(doc)
        if self.quotes is not None:
            self.quotes(doc)
        if self.pollution is not None:
            self.pollution(doc)

        return doc
lowercase = lowercase instance-attribute
accents = accents instance-attribute
quotes = quotes instance-attribute
pollution = pollution instance-attribute
__init__(lowercase, accents, quotes, pollution)
Source code in edsnlp/pipelines/core/normalizer/normalizer.py
def __init__(
    self,
    lowercase: bool,
    accents: Optional[Accents],
    quotes: Optional[Quotes],
    pollution: Optional[Pollution],
):
    self.lowercase = lowercase
    self.accents = accents
    self.quotes = quotes
    self.pollution = pollution
__call__(doc)

Apply the normalisation pipeline, one component at a time.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
Doc

Doc object with NORM attribute modified

Source code in edsnlp/pipelines/core/normalizer/normalizer.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def __call__(self, doc: Doc) -> Doc:
    """
    Apply the normalisation pipeline, one component at a time.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object

    Returns
    -------
    Doc
        Doc object with `NORM` attribute modified
    """
    if not self.lowercase:
        remove_lowercase(doc)
    if self.accents is not None:
        self.accents(doc)
    if self.quotes is not None:
        self.quotes(doc)
    if self.pollution is not None:
        self.pollution(doc)

    return doc
factory
DEFAULT_CONFIG = dict(accents=True, lowercase=True, quotes=True, pollution=True) module-attribute
create_component(nlp, name, accents, lowercase, quotes, pollution)
Source code in edsnlp/pipelines/core/normalizer/factory.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@deprecated_factory("normalizer", "eds.normalizer", default_config=DEFAULT_CONFIG)
@Language.factory("eds.normalizer", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    accents: Union[bool, Dict[str, Any]],
    lowercase: Union[bool, Dict[str, Any]],
    quotes: Union[bool, Dict[str, Any]],
    pollution: Union[bool, Dict[str, Any]],
):

    if accents:
        config = dict(**accents_config)
        if isinstance(accents, dict):
            config.update(accents)
        accents = registry.get("factories", "eds.accents")(nlp, "eds.accents", **config)

    if quotes:
        config = dict(**quotes_config)
        if isinstance(quotes, dict):
            config.update(quotes)
        quotes = registry.get("factories", "eds.quotes")(nlp, "eds.quotes", **config)

    if pollution:
        config = dict(**pollution_config)
        if isinstance(pollution, dict):
            config.update(pollution)
        pollution = registry.get("factories", "eds.pollution")(
            nlp, "eds.pollution", **config
        )

    normalizer = Normalizer(
        lowercase=lowercase,
        accents=accents or None,
        quotes=quotes or None,
        pollution=pollution or None,
    )

    return normalizer
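
A minimal usage sketch of the factory above, assuming EDS-NLP is installed and its factories registered with spaCy:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # default config: lowercase, accents, quotes, pollution

doc = nlp("Le patient est sûr de lui.")

# With the default configuration, NORM carries the lowercased,
# accent-stripped form of each token (e.g. "sûr" becomes "sur").
print([token.norm_ for token in doc])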
pollution
patterns
information = "(?s)(=====+\\s*)?(L\\s*e\\s*s\\sdonnées\\s*administratives,\\s*sociales\\s*|I?nfo\\s*rmation\\s*aux?\\s*patients?|L[’']AP-HP\\s*collecte\\s*vos\\s*données\\s*administratives|L[’']Assistance\\s*Publique\\s*-\\s*Hôpitaux\\s*de\\s*Paris\\s*\\(?AP-HP\\)?\\s*a\\s*créé\\s*une\\s*base\\s*de\\s*données).{,2000}https?:\\/\\/recherche\\.aphp\\.fr\\/eds\\/droit-opposition[\\s\\.]*" module-attribute
bars = '(?i)([nbw]|_|-|=){5,}' module-attribute
pollution = dict(information=information, bars=bars) module-attribute
pollution
Pollution

Bases: BaseComponent

Tags pollution tokens.

Populates a number of spaCy extensions:

  • Token._.pollution : indicates whether the token is a pollution
  • Doc._.clean : lists non-pollution tokens
  • Doc._.clean_ : original text with pollutions removed.
  • Doc._.char_clean_span : method to create a Span using character indices extracted using the cleaned text.
PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

pollution

Dictionary containing regular expressions of pollution.

TYPE: Dict[str, Union[str, List[str]]]

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
class Pollution(BaseComponent):
    """
    Tags pollution tokens.

    Populates a number of spaCy extensions :

    - `Token._.pollution` : indicates whether the token is a pollution
    - `Doc._.clean` : lists non-pollution tokens
    - `Doc._.clean_` : original text with pollutions removed.
    - `Doc._.char_clean_span` : method to create a Span using character
      indices extracted using the cleaned text.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Dict[str, Union[str, List[str]]]
        Dictionary containing regular expressions of pollution.
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        pollution: Optional[Dict[str, Union[str, List[str]]]],
    ):

        self.nlp = nlp

        if pollution is None:
            pollution = patterns.pollution

        self.pollution = pollution

        for k, v in self.pollution.items():
            if isinstance(v, str):
                self.pollution[k] = [v]

        self.regex_matcher = RegexMatcher()
        self.build_patterns()

    def build_patterns(self) -> None:
        """
        Builds the patterns for phrase matching.
        """

        # efficiently build spaCy matcher patterns
        for k, v in self.pollution.items():
            self.regex_matcher.add(k, v)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find pollutions in doc and clean candidate negations to remove pseudo negations

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        pollution:
            list of pollution spans
        """

        pollutions = self.regex_matcher(doc, as_spans=True)
        pollutions = filter_spans(pollutions)

        return pollutions

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags pollutions.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for pollutions.
        """
        pollutions = self.process(doc)

        for pollution in pollutions:

            for token in pollution:
                token._.excluded = True

        doc.spans["pollutions"] = pollutions

        return doc
nlp = nlp instance-attribute
pollution = pollution instance-attribute
regex_matcher = RegexMatcher() instance-attribute
__init__(nlp, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def __init__(
    self,
    nlp: Language,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):

    self.nlp = nlp

    if pollution is None:
        pollution = patterns.pollution

    self.pollution = pollution

    for k, v in self.pollution.items():
        if isinstance(v, str):
            self.pollution[k] = [v]

    self.regex_matcher = RegexMatcher()
    self.build_patterns()
build_patterns()

Builds the patterns for phrase matching.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
54
55
56
57
58
59
60
61
def build_patterns(self) -> None:
    """
    Builds the patterns for phrase matching.
    """

    # efficiently build spaCy matcher patterns
    for k, v in self.pollution.items():
        self.regex_matcher.add(k, v)
process(doc)

Finds pollution spans in the document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
pollution

list of pollution spans

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def process(self, doc: Doc) -> List[Span]:
    """
    Find pollutions in doc and clean candidate negations to remove pseudo negations

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    pollution:
        list of pollution spans
    """

    pollutions = self.regex_matcher(doc, as_spans=True)
    pollutions = filter_spans(pollutions)

    return pollutions
__call__(doc)

Tags pollutions.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for pollutions.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def __call__(self, doc: Doc) -> Doc:
    """
    Tags pollutions.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for pollutions.
    """
    pollutions = self.process(doc)

    for pollution in pollutions:

        for token in pollution:
            token._.excluded = True

    doc.spans["pollutions"] = pollutions

    return doc
factory
DEFAULT_CONFIG = dict(pollution=None) module-attribute
create_component(nlp, name, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/factory.py
14
15
16
17
18
19
20
21
22
23
24
@deprecated_factory("pollution", "eds.pollution", default_config=DEFAULT_CONFIG)
@Language.factory("eds.pollution", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    return Pollution(
        nlp,
        pollution=pollution,
    )
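
A short usage sketch, assuming EDS-NLP is installed; note that eds.pollution is already bundled in the eds.normalizer component documented above.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.pollution")

doc = nlp("Compte rendu.\nNBNBNBNBNBNBNBNB\nLe patient va bien.")

# Matched pollution spans are stored in a dedicated span group,
# and the corresponding tokens are flagged through the `excluded` extension.
print(doc.spans["pollutions"])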
accents
patterns
accents: List[Tuple[str, str]] = [('ç', 'c'), ('àáâä', 'a'), ('èéêë', 'e'), ('ìíîï', 'i'), ('òóôö', 'o'), ('ùúûü', 'u')] module-attribute
accents
Accents

Bases: object

Normalises accents, using a same-length strategy.

PARAMETER DESCRIPTION
accents

List of accentuated characters and their transcription.

TYPE: List[Tuple[str, str]]

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class Accents(object):
    """
    Normalises accents, using a same-length strategy.

    Parameters
    ----------
    accents : List[Tuple[str, str]]
        List of accentuated characters and their transcription.
    """

    def __init__(self, accents: Optional[List[Tuple[str, str]]]) -> None:
        if accents is None:
            accents = patterns.accents

        self.translation_table = str.maketrans(
            "".join(accent_group for accent_group, _ in accents),
            "".join(rep * len(accent_group) for accent_group, rep in accents),
        )

    def __call__(self, doc: Doc) -> Doc:
        """
        Remove accents from spacy `NORM` attribute.

        Parameters
        ----------
        doc : Doc
            The spaCy `Doc` object.

        Returns
        -------
        Doc
            The document, with accents removed in `Token.norm_`.
        """

        for token in doc:
            token.norm_ = token.norm_.translate(self.translation_table)

        return doc
translation_table = str.maketrans(''.join(accent_group for (accent_group, _) in accents), ''.join(rep * len(accent_group) for (accent_group, rep) in accents)) instance-attribute
__init__(accents)
Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
18
19
20
21
22
23
24
25
def __init__(self, accents: Optional[List[Tuple[str, str]]]) -> None:
    if accents is None:
        accents = patterns.accents

    self.translation_table = str.maketrans(
        "".join(accent_group for accent_group, _ in accents),
        "".join(rep * len(accent_group) for accent_group, rep in accents),
    )
__call__(doc)

Remove accents from spacy NORM attribute.

PARAMETER DESCRIPTION
doc

The spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
Doc

The document, with accents removed in Token.norm_.

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __call__(self, doc: Doc) -> Doc:
    """
    Remove accents from spacy `NORM` attribute.

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object.

    Returns
    -------
    Doc
        The document, with accents removed in `Token.norm_`.
    """

    for token in doc:
        token.norm_ = token.norm_.translate(self.translation_table)

    return doc
factory
DEFAULT_CONFIG = dict(accents=None) module-attribute
create_component(nlp, name, accents)
Source code in edsnlp/pipelines/core/normalizer/accents/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("accents", "eds.accents", default_config=DEFAULT_CONFIG)
@Language.factory("eds.accents", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    accents: Optional[List[Tuple[str, str]]],
):
    return Accents(
        accents=accents,
    )
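
To make the same-length strategy concrete, here is a small standalone sketch of the translation table built above, restricted to a subset of the default patterns:

accents = [("ç", "c"), ("àáâä", "a"), ("èéêë", "e")]

table = str.maketrans(
    "".join(group for group, _ in accents),
    "".join(rep * len(group) for group, rep in accents),
)

# Each accented character maps to exactly one replacement character,
# so the length of the string is preserved.
assert "éçà".translate(table) == "eca"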
lowercase
factory
remove_lowercase(doc)

Restores the case in the NORM attribute by copying each token's verbatim text. Should always be applied first.

PARAMETER DESCRIPTION
doc

The spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
Doc

The document, with case put back in NORM.

Source code in edsnlp/pipelines/core/normalizer/lowercase/factory.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
@Language.component("remove-lowercase")
@Language.component("eds.remove-lowercase")
def remove_lowercase(doc: Doc):
    """
    Add case on the `NORM` custom attribute. Should always be applied first.

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object.

    Returns
    -------
    Doc
        The document, with case put back in `NORM`.
    """

    for token in doc:
        token.norm_ = token.text

    return doc
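
A minimal sketch, assuming EDS-NLP is installed so that the component is registered with spaCy:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.remove-lowercase")

doc = nlp("Le Patient")

# NORM now matches the verbatim text rather than spaCy's lowercased default.
assert [token.norm_ for token in doc] == ["Le", "Patient"]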
quotes
quotes
Quotes

Bases: object

We normalise quotes, following this source: https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html.

PARAMETER DESCRIPTION
quotes

List of quotation characters and their transcription.

TYPE: List[Tuple[str, str]]

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class Quotes(object):
    """
    We normalise quotes, following this
    `source <https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html>`_.

    Parameters
    ----------
    quotes : List[Tuple[str, str]]
        List of quotation characters and their transcription.
    """

    def __init__(self, quotes: Optional[List[Tuple[str, str]]]) -> None:
        if quotes is None:
            quotes = quotes_and_apostrophes

        self.translation_table = str.maketrans(
            "".join(quote_group for quote_group, _ in quotes),
            "".join(rep * len(quote_group) for quote_group, rep in quotes),
        )

    def __call__(self, doc: Doc) -> Doc:
        """
        Normalises quotes.

        Parameters
        ----------
        doc : Doc
            Document to process.

        Returns
        -------
        Doc
            Same document, with quotes normalised.
        """

        for token in doc:
            token.norm_ = token.norm_.translate(self.translation_table)

        return doc
translation_table = str.maketrans(''.join(quote_group for (quote_group, _) in quotes), ''.join(rep * len(quote_group) for (quote_group, rep) in quotes)) instance-attribute
__init__(quotes)
Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
19
20
21
22
23
24
25
26
def __init__(self, quotes: Optional[List[Tuple[str, str]]]) -> None:
    if quotes is None:
        quotes = quotes_and_apostrophes

    self.translation_table = str.maketrans(
        "".join(quote_group for quote_group, _ in quotes),
        "".join(rep * len(quote_group) for quote_group, rep in quotes),
    )
__call__(doc)

Normalises quotes.

PARAMETER DESCRIPTION
doc

Document to process.

TYPE: Doc

RETURNS DESCRIPTION
Doc

Same document, with quotes normalised.

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def __call__(self, doc: Doc) -> Doc:
    """
    Normalises quotes.

    Parameters
    ----------
    doc : Doc
        Document to process.

    Returns
    -------
    Doc
        Same document, with quotes normalised.
    """

    for token in doc:
        token.norm_ = token.norm_.translate(self.translation_table)

    return doc
patterns
quotes: List[str] = ['"', '〃', 'ײ', '᳓', '″', '״', '‶', '˶', 'ʺ', '“', '”', '˝', '‟'] module-attribute
apostrophes: List[str] = ['`', '΄', ''', 'ˈ', 'ˊ', 'ᑊ', 'ˋ', 'ꞌ', 'ᛌ', '𖽒', '𖽑', '‘', '’', 'י', '՚', '‛', '՝', '`', '`', '′', '׳', '´', 'ʹ', '˴', 'ߴ', '‵', 'ߵ', 'ʹ', 'ʻ', 'ʼ', '´', '᾽', 'ʽ', '῾', 'ʾ', '᾿'] module-attribute
quotes_and_apostrophes: List[Tuple[str, str]] = [(''.join(quotes), '"'), (''.join(apostrophes), "'")] module-attribute
factory
DEFAULT_CONFIG = dict(quotes=None) module-attribute
create_component(nlp, name, quotes)
Source code in edsnlp/pipelines/core/normalizer/quotes/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("quotes", "eds.quotes", default_config=DEFAULT_CONFIG)
@Language.factory("eds.quotes", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    quotes: Optional[List[Tuple[str, str]]],
):
    return Quotes(
        quotes=quotes,
    )
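
A minimal sketch, assuming EDS-NLP is installed; typographic quotes and apostrophes are replaced by their ASCII counterparts in the NORM attribute.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.quotes")

doc = nlp("Etat “stable” selon l’équipe.")

# Curly quotes are mapped to '"' and the curly apostrophe to "'".
print([token.norm_ for token in doc])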
advanced
factory
DEFAULT_CONFIG = dict(window=10, verbose=0, ignore_excluded=False, attr='NORM') module-attribute
create_component(nlp, name, regex_config, window, verbose, ignore_excluded, attr)
Source code in edsnlp/pipelines/core/advanced/factory.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
@deprecated_factory(
    "advanced-regex", "eds.advanced-regex", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.advanced-regex", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex_config: Dict[str, Any],
    window: int,
    verbose: int,
    ignore_excluded: bool,
    attr: str,
):

    return AdvancedRegex(
        nlp,
        regex_config=regex_config,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
        attr=attr,
    )
advanced
AdvancedRegex

Bases: GenericMatcher

Allows additional matching in the surrounding context of the main match group, for qualification/filtering.

PARAMETER DESCRIPTION
nlp

spaCy Language object.

TYPE: Language

regex_config

Configuration for the main expression.

TYPE: Dict[str, Any]

window

Number of tokens to consider before and after the main expression.

TYPE: int

attr

Attribute to match on, eg TEXT, NORM, etc.

TYPE: str

verbose

Verbosity level, useful for debugging.

TYPE: int

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/core/advanced/advanced.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class AdvancedRegex(GenericMatcher):
    """
    Allows additional matching in the surrounding context of the main match group,
    for qualification/filtering.

    Parameters
    ----------
    nlp : Language
        spaCy `Language` object.
    regex_config : Dict[str, Any]
        Configuration for the main expression.
    window : int
        Number of tokens to consider before and after the main expression.
    attr : str
        Attribute to match on, eg `TEXT`, `NORM`, etc.
    verbose : int
        Verbosity level, useful for debugging.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        regex_config: Dict[str, Any],
        window: int,
        attr: str,
        verbose: int,
        ignore_excluded: bool,
    ):
        self.regex_config = _check_regex_config(regex_config)
        self.window = window
        regex = regex_config

        self.verbose = verbose

        super().__init__(
            nlp=nlp,
            terms=dict(),
            regex=regex,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.ignore_excluded = ignore_excluded

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Doc.has_extension("my_ents"):
            Doc.set_extension("my_ents", default=[])

        if not Span.has_extension("matcher_name"):
            Span.set_extension("matcher_name", default=None)

        if not Span.has_extension("before_extract"):
            Span.set_extension("before_extract", default=None)
        if not Span.has_extension("after_extract"):
            Span.set_extension("after_extract", default=None)

        if not Span.has_extension("window"):
            Span.set_extension("window", default=None)

        if not Span.has_extension("before_snippet"):
            Span.set_extension("before_snippet", default=None)
        if not Span.has_extension("after_snippet"):
            Span.set_extension("after_snippet", default=None)

    def process(self, doc: Doc) -> List[Span]:
        """
        Process the document, looking for named entities.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        List[Span]
            List of detected spans.
        """

        ents = super().process(doc)
        ents = self._postprocessing_pipeline(ents)

        return ents

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        ents = self.process(doc)

        ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

    def _postprocessing_pipeline(self, ents: List[Span]):
        # add a window within the sentence around entities
        ents = [self._add_window(ent) for ent in ents]

        # Remove entities based on the snippet located just before and after the entity
        ents = filter(self._exclude_filter, ents)

        # Extract informations from the entity's context via regex
        ents = [self._snippet_extraction(ent) for ent in ents]

        return ents

    def _add_window(self, ent: Span) -> Span:
        ent._.window = ent.doc[
            max(ent.start - self.window, ent.sent.start) : min(
                ent.end + self.window, ent.sent.end
            )
        ]

        # include the entity in the snippets so that we can extract
        # the number when it is attached to the word, e.g. "3PA"
        ent._.before_snippet = ent.doc[
            max(ent.start - self.window, ent.sent.start) : ent.end
        ]
        ent._.after_snippet = ent.doc[
            ent.start : min(ent.end + self.window, ent.sent.end)
        ]
        return ent

    def get_text(self, span: Span, label) -> str:
        attr = self.regex_config[label].get("attr", self.attr)

        return get_text(
            doclike=span,
            attr=attr,
            ignore_excluded=self.ignore_excluded,
        )

    def _exclude_filter(self, ent: Span) -> Span:
        label = ent.label_

        before_exclude = self.regex_config[label].get("before_exclude", None)
        after_exclude = self.regex_config[label].get("after_exclude", None)

        if before_exclude is not None:
            t = ent._.before_snippet
            t = self.get_text(t, label)
            if re.compile(before_exclude).search(t) is not None:
                if self.verbose:
                    logger.info(
                        f"excluded (before) string: {t} - pattern {before_exclude}"
                    )
                return False

        if after_exclude is not None:
            t = ent._.after_snippet
            t = self.get_text(t, label)
            if re.compile(after_exclude).search(t) is not None:
                if self.verbose:
                    logger.info(
                        f"excluded (after) string: {t} - pattern {after_exclude}"
                    )
                return False

        return True

    def _snippet_extraction(self, ent: Span) -> Span:
        label = ent.label_

        before_extract = self.regex_config[label].get("before_extract", [])
        after_extract = self.regex_config[label].get("after_extract", [])

        if type(before_extract) == str:
            before_extract = [before_extract]
        if type(after_extract) == str:
            after_extract = [after_extract]

        t = ent._.before_snippet
        t = self.get_text(t, label)
        ent._.before_extract = []
        for pattern in before_extract:
            pattern = re.compile(pattern)
            match = pattern.search(t)
            ent._.before_extract.append(match.groups()[0] if match else None)

        t = ent._.after_snippet
        t = self.get_text(t, label)
        ent._.after_extract = []
        for pattern in after_extract:
            pattern = re.compile(pattern)
            match = pattern.search(t)
            ent._.after_extract.append(match.groups()[0] if match else None)

        return ent
regex_config = _check_regex_config(regex_config) instance-attribute
window = window instance-attribute
verbose = verbose instance-attribute
ignore_excluded = ignore_excluded instance-attribute
__init__(nlp, regex_config, window, attr, verbose, ignore_excluded)
Source code in edsnlp/pipelines/core/advanced/advanced.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    nlp: Language,
    regex_config: Dict[str, Any],
    window: int,
    attr: str,
    verbose: int,
    ignore_excluded: bool,
):
    self.regex_config = _check_regex_config(regex_config)
    self.window = window
    regex = regex_config

    self.verbose = verbose

    super().__init__(
        nlp=nlp,
        terms=dict(),
        regex=regex,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.ignore_excluded = ignore_excluded

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/core/advanced/advanced.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@staticmethod
def set_extensions() -> None:
    if not Doc.has_extension("my_ents"):
        Doc.set_extension("my_ents", default=[])

    if not Span.has_extension("matcher_name"):
        Span.set_extension("matcher_name", default=None)

    if not Span.has_extension("before_extract"):
        Span.set_extension("before_extract", default=None)
    if not Span.has_extension("after_extract"):
        Span.set_extension("after_extract", default=None)

    if not Span.has_extension("window"):
        Span.set_extension("window", default=None)

    if not Span.has_extension("before_snippet"):
        Span.set_extension("before_snippet", default=None)
    if not Span.has_extension("after_snippet"):
        Span.set_extension("after_snippet", default=None)
process(doc)

Process the document, looking for named entities.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
List[Span]

List of detected spans.

Source code in edsnlp/pipelines/core/advanced/advanced.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def process(self, doc: Doc) -> List[Span]:
    """
    Process the document, looking for named entities.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    List[Span]
        List of detected spans.
    """

    ents = super().process(doc)
    ents = self._postprocessing_pipeline(ents)

    return ents
__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/core/advanced/advanced.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    ents = self.process(doc)

    ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
_postprocessing_pipeline(ents)
Source code in edsnlp/pipelines/core/advanced/advanced.py
129
130
131
132
133
134
135
136
137
138
139
def _postprocessing_pipeline(self, ents: List[Span]):
    # add a window within the sentence around entities
    ents = [self._add_window(ent) for ent in ents]

    # Remove entities based on the snippet located just before and after the entity
    ents = filter(self._exclude_filter, ents)

    # Extract informations from the entity's context via regex
    ents = [self._snippet_extraction(ent) for ent in ents]

    return ents
_add_window(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def _add_window(self, ent: Span) -> Span:
    ent._.window = ent.doc[
        max(ent.start - self.window, ent.sent.start) : min(
            ent.end + self.window, ent.sent.end
        )
    ]

    # include the entity in the snippets so that we can extract
    # the number when it is attached to the word, e.g. "3PA"
    ent._.before_snippet = ent.doc[
        max(ent.start - self.window, ent.sent.start) : ent.end
    ]
    ent._.after_snippet = ent.doc[
        ent.start : min(ent.end + self.window, ent.sent.end)
    ]
    return ent
get_text(span, label)
Source code in edsnlp/pipelines/core/advanced/advanced.py
158
159
160
161
162
163
164
165
def get_text(self, span: Span, label) -> str:
    attr = self.regex_config[label].get("attr", self.attr)

    return get_text(
        doclike=span,
        attr=attr,
        ignore_excluded=self.ignore_excluded,
    )
_exclude_filter(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def _exclude_filter(self, ent: Span) -> Span:
    label = ent.label_

    before_exclude = self.regex_config[label].get("before_exclude", None)
    after_exclude = self.regex_config[label].get("after_exclude", None)

    if before_exclude is not None:
        t = ent._.before_snippet
        t = self.get_text(t, label)
        if re.compile(before_exclude).search(t) is not None:
            if self.verbose:
                logger.info(
                    f"excluded (before) string: {t} - pattern {before_exclude}"
                )
            return False

    if after_exclude is not None:
        t = ent._.after_snippet
        t = self.get_text(t, label)
        if re.compile(after_exclude).search(t) is not None:
            if self.verbose:
                logger.info(
                    f"excluded (after) string: {t} - pattern {after_exclude}"
                )
            return False

    return True
_snippet_extraction(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def _snippet_extraction(self, ent: Span) -> Span:
    label = ent.label_

    before_extract = self.regex_config[label].get("before_extract", [])
    after_extract = self.regex_config[label].get("after_extract", [])

    if type(before_extract) == str:
        before_extract = [before_extract]
    if type(after_extract) == str:
        after_extract = [after_extract]

    t = ent._.before_snippet
    t = self.get_text(t, label)
    ent._.before_extract = []
    for pattern in before_extract:
        pattern = re.compile(pattern)
        match = pattern.search(t)
        ent._.before_extract.append(match.groups()[0] if match else None)

    t = ent._.after_snippet
    t = self.get_text(t, label)
    ent._.after_extract = []
    for pattern in after_extract:
        pattern = re.compile(pattern)
        match = pattern.search(t)
        ent._.after_extract.append(match.groups()[0] if match else None)

    return ent
_check_regex_config(regex_config)
Source code in edsnlp/pipelines/core/advanced/advanced.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
def _check_regex_config(regex_config):
    for k, v in regex_config.items():
        if type(v) is not dict:
            raise TypeError(
                f"The value of the key {k} is of type {type(v)}, but a dict is expected"
            )

        single_group_regex_keys = ["before_extract", "after_extract"]

        for single_group_regex_key in single_group_regex_keys:
            if single_group_regex_key in v:
                # ensure it is a list
                if type(v[single_group_regex_key]) is not list:
                    v[single_group_regex_key] = [v[single_group_regex_key]]

                for i, regex in enumerate(v[single_group_regex_key]):
                    n_groups = re.compile(regex).groups

                    if n_groups == 0:
                        # Adding grouping parenthesis
                        v[single_group_regex_key][i] = r"(" + regex + r")"
                    elif n_groups != 1:
                        # Accepting only 1 group per regex
                        raise ValueError(
                            f"The RegEx for {repr(k)} ({repr(regex)}) "
                            f"stored in {repr(single_group_regex_key)} "
                            f"contains {n_groups} capturing groups, 1 expected"
                        )

    return regex_config
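
A hedged usage sketch: the before_exclude, after_extract and attr keys are those read by the class above, while the regex key holding the main pattern is an assumption about the underlying GenericMatcher; the clinical example is illustrative only.

import spacy

regex_config = dict(
    fracture=dict(
        regex=[r"fracture"],
        attr="NORM",
        before_exclude="petite|faible",  # discard matches preceded by these cues
        after_extract=r"(\d+)",          # capture a number following the entity
    )
)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")  # the component relies on sentence boundaries
nlp.add_pipe("eds.advanced-regex", config=dict(regex_config=regex_config))

doc = nlp("Le scanner montre une fracture 3 du poignet.")

for ent in doc.ents:
    print(ent, ent._.after_extract)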

qualifiers

factories
base
Qualifier

Bases: BaseComponent

Base class for qualifier components such as negation and family context detection.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

**terms

Terms to look for.

TYPE: Dict[str, Optional[List[str]]]

Source code in edsnlp/pipelines/qualifiers/base.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class Qualifier(BaseComponent):
    """
    Implements the NegEx algorithm.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    explain : bool
        Whether to keep track of cues for each entity.
    **terms : Dict[str, Optional[List[str]]]
        Terms to look for.
    """

    defaults = dict()

    def __init__(
        self,
        nlp: Language,
        attr: str,
        on_ents_only: bool,
        explain: bool,
        **terms: Dict[str, Optional[List[str]]],
    ):

        if attr.upper() == "NORM":
            check_normalizer(nlp)

        self.phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

        self.on_ents_only = on_ents_only
        self.explain = explain

    def get_defaults(
        self, **kwargs: Dict[str, Optional[List[str]]]
    ) -> Dict[str, List[str]]:
        """
        Merge terms with their defaults. Null keys are replaced with defaults.

        Returns
        -------
        Dict[str, List[str]]
            Merged dictionary
        """
        # Filter out empty keys
        kwargs = {k: v for k, v in kwargs.items() if v is not None}

        # Update defaults
        terms = self.defaults.copy()
        terms.update(kwargs)

        return terms

    def get_matches(self, doc: Doc) -> List[Span]:
        """
        Extract matches.

        Parameters
        ----------
        doc : Doc
            spaCy `Doc` object.

        Returns
        -------
        List[Span]
            List of detected spans
        """
        if self.on_ents_only:

            sents = set([ent.sent for ent in doc.ents])
            match_iterator = map(
                lambda sent: self.phrase_matcher(sent, as_spans=True), sents
            )

            matches = chain.from_iterable(match_iterator)

        else:
            matches = self.phrase_matcher(doc, as_spans=True)

        return list(matches)

    def __call__(self, doc: Doc) -> Doc:
        return self.process(doc)
defaults = dict() class-attribute
phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr) instance-attribute
on_ents_only = on_ents_only instance-attribute
explain = explain instance-attribute
__init__(nlp, attr, on_ents_only, explain, **terms)
Source code in edsnlp/pipelines/qualifiers/base.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def __init__(
    self,
    nlp: Language,
    attr: str,
    on_ents_only: bool,
    explain: bool,
    **terms: Dict[str, Optional[List[str]]],
):

    if attr.upper() == "NORM":
        check_normalizer(nlp)

    self.phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
    self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

    self.on_ents_only = on_ents_only
    self.explain = explain
get_defaults(**kwargs)

Merge terms with their defaults. Null keys are replaced with defaults.

RETURNS DESCRIPTION
Dict[str, List[str]]

Merged dictionary

Source code in edsnlp/pipelines/qualifiers/base.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def get_defaults(
    self, **kwargs: Dict[str, Optional[List[str]]]
) -> Dict[str, List[str]]:
    """
    Merge terms with their defaults. Null keys are replaced with defaults.

    Returns
    -------
    Dict[str, List[str]]
        Merged dictionary
    """
    # Filter out empty keys
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    # Update defaults
    terms = self.defaults.copy()
    terms.update(kwargs)

    return terms
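
As a standalone illustration of the merge performed above (the cue lists are purely illustrative): keys passed as None fall back to the class defaults, while explicit lists override them.

defaults = dict(family=["mère", "père"], termination=["mais"])

overrides = dict(family=None, termination=["mais", "cependant"])

# Filter out None values, then update a copy of the defaults.
merged = {**defaults, **{k: v for k, v in overrides.items() if v is not None}}

assert merged == dict(family=["mère", "père"], termination=["mais", "cependant"])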
get_matches(doc)

Extract matches.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
List[Span]

List of detected spans

Source code in edsnlp/pipelines/qualifiers/base.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def get_matches(self, doc: Doc) -> List[Span]:
    """
    Extract matches.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object.

    Returns
    -------
    List[Span]
        List of detected spans
    """
    if self.on_ents_only:

        sents = set([ent.sent for ent in doc.ents])
        match_iterator = map(
            lambda sent: self.phrase_matcher(sent, as_spans=True), sents
        )

        matches = chain.from_iterable(match_iterator)

    else:
        matches = self.phrase_matcher(doc, as_spans=True)

    return list(matches)
__call__(doc)
Source code in edsnlp/pipelines/qualifiers/base.py
114
115
def __call__(self, doc: Doc) -> Doc:
    return self.process(doc)
check_normalizer(nlp)
Source code in edsnlp/pipelines/qualifiers/base.py
12
13
14
15
16
17
18
19
20
21
22
def check_normalizer(nlp: Language) -> None:
    components = {name: component for name, component in nlp.pipeline}
    normalizer = components.get("normalizer")

    if normalizer and not normalizer.lowercase:
        logger.warning(
            "You have chosen the NORM attribute, but disabled lowercasing "
            "in your normalisation pipeline. "
            "This WILL hurt performance : you might want to use the "
            "LOWER attribute instead."
        )
family
family
FamilyContext

Bases: Qualifier

Implements a family context detection algorithm.

The component looks for terms indicating family references in the text.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

family

List of terms indicating family reference.

TYPE: Optional[List[str]]

terminations

List of termination terms, to separate syntagmas.

TYPE: Optional[List[str]]

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]

explain

Whether to keep track of cues for each entity.

TYPE: bool

use_sections

Whether to use annotated sections (namely antécédents familiaux).

TYPE: bool DEFAULT: False

Source code in edsnlp/pipelines/qualifiers/family/family.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class FamilyContext(Qualifier):
    """
    Implements a family context detection algorithm.

    The components looks for terms indicating family references in the text.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    family : Optional[List[str]]
        List of terms indicating family reference.
    terminations : Optional[List[str]]
        List of termination terms, to separate syntagmas.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionnary of regex patterns.
    explain : bool
        Whether to keep track of cues for each entity.
    use_sections : bool, by default `False`
        Whether to use annotated sections (namely `antécédents familiaux`).
    """

    defaults = dict(
        family=family,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        family: Optional[List[str]],
        termination: Optional[List[str]],
        use_sections: bool,
        explain: bool,
        on_ents_only: bool,
    ):

        terms = self.get_defaults(
            family=family,
            termination=termination,
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.set_extensions()

        self.sections = use_sections and (
            "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
        )
        if use_sections and not self.sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `section` pipeline, but it was not set. "
                "Skipping that step."
            )

    @staticmethod
    def set_extensions() -> None:
        if not Token.has_extension("family"):
            Token.set_extension("family", default=False)

        if not Token.has_extension("family_"):
            Token.set_extension(
                "family_",
                getter=lambda token: "FAMILY" if token._.family else "PATIENT",
            )

        if not Span.has_extension("family"):
            Span.set_extension("family", default=False)

        if not Span.has_extension("family_"):
            Span.set_extension(
                "family_",
                getter=lambda span: "FAMILY" if span._.family else "PATIENT",
            )

        if not Span.has_extension("family_cues"):
            Span.set_extension("family_cues", default=[])

        if not Doc.has_extension("family"):
            Doc.set_extension("family", default=[])

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to family context.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for context
        """
        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        sections = []

        if self.sections:
            sections = [
                Span(doc, section.start, section.end, label="FAMILY")
                for section in doc.spans["sections"]
                if section.label_ == "antécédents familiaux"
            ]

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

            if self.on_ents_only and not ents:
                continue

            cues = get_spans(sub_matches, "family")
            cues += sub_sections

            if not cues:
                continue

            family = bool(cues)

            if not family:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.family = True

            for ent in ents:
                ent._.family = True
                if self.explain:
                    ent._.family_cues += cues
                if not self.on_ents_only:
                    for token in ent:
                        token._.family = True

        return doc
defaults = dict(family=family, termination=termination) class-attribute
sections = use_sections and ('eds.sections' in nlp.pipe_names or 'sections' in nlp.pipe_names) instance-attribute
__init__(nlp, attr, family, termination, use_sections, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/family/family.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def __init__(
    self,
    nlp: Language,
    attr: str,
    family: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    explain: bool,
    on_ents_only: bool,
):

    terms = self.get_defaults(
        family=family,
        termination=termination,
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.set_extensions()

    self.sections = use_sections and (
        "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
    )
    if use_sections and not self.sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `section` pipeline, but it was not set. "
            "Skipping that step."
        )
set_extensions()
Source code in edsnlp/pipelines/qualifiers/family/family.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
@staticmethod
def set_extensions() -> None:
    if not Token.has_extension("family"):
        Token.set_extension("family", default=False)

    if not Token.has_extension("family_"):
        Token.set_extension(
            "family_",
            getter=lambda token: "FAMILY" if token._.family else "PATIENT",
        )

    if not Span.has_extension("family"):
        Span.set_extension("family", default=False)

    if not Span.has_extension("family_"):
        Span.set_extension(
            "family_",
            getter=lambda span: "FAMILY" if span._.family else "PATIENT",
        )

    if not Span.has_extension("family_cues"):
        Span.set_extension("family_cues", default=[])

    if not Doc.has_extension("family"):
        Doc.set_extension("family", default=[])
process(doc)

Finds entities related to family context.

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc
Source code in edsnlp/pipelines/qualifiers/family/family.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to family context.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for context
    """
    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    sections = []

    if self.sections:
        sections = [
            Span(doc, section.start, section.end, label="FAMILY")
            for section in doc.spans["sections"]
            if section.label_ == "antécédents familiaux"
        ]

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

        if self.on_ents_only and not ents:
            continue

        cues = get_spans(sub_matches, "family")
        cues += sub_sections

        if not cues:
            continue

        family = bool(cues)

        if not family:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.family = True

        for ent in ents:
            ent._.family = True
            if self.explain:
                ent._.family_cues += cues
            if not self.on_ents_only:
                for token in ent:
                    token._.family = True

    return doc
patterns
family: List[str] = ['aïeul', 'aïeux', 'antécédent familial', 'antécédents familiaux', 'arrière-grand-mère', 'arrière-grand-père', 'arrière-grands-parents', 'cousin', 'cousine', 'cousines', 'cousins', 'enfant', 'enfants', 'épouse', 'époux', 'familial', 'familiale', 'familiales', 'familiaux', 'famille', 'fiancé', 'fiancée', 'fils', 'frère', 'frères', 'grand-mère', 'grand-père', 'grands-parents', 'maman', 'mari', 'mère', 'oncle', 'papa', 'parent', 'parents', 'père', 'soeur', 'sœur', 'sœurs', 'soeurs', 'tante'] module-attribute
factory
DEFAULT_CONFIG = dict(family=None, termination=None, attr='NORM', use_sections=False, explain=False, on_ents_only=True) module-attribute
create_component(nlp, name, family, termination, attr, explain, on_ents_only, use_sections)
Source code in edsnlp/pipelines/qualifiers/family/factory.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@deprecated_factory("family", "eds.family", default_config=DEFAULT_CONFIG)
@Language.factory("eds.family", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    family: Optional[List[str]],
    termination: Optional[List[str]],
    attr: str,
    explain: bool,
    on_ents_only: bool,
    use_sections: bool,
):
    return FamilyContext(
        nlp,
        family=family,
        termination=termination,
        attr=attr,
        explain=explain,
        on_ents_only=on_ents_only,
        use_sections=use_sections,
    )
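
A usage sketch assuming EDS-NLP is installed; the eds.matcher component is only there to produce an entity to qualify, and the expected outputs are indicated in comments rather than asserted.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(cancer=["cancer"])))
nlp.add_pipe("eds.family", config=dict(explain=True))

doc = nlp("Antécédents : cancer du sein chez la mère.")

ent = doc.ents[0]
print(ent._.family)       # expected to be True, "mère" being a family cue
print(ent._.family_cues)  # the matched cues, kept because explain=True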
negation
patterns
pseudo: List[str] = ['aucun changement', 'aucun doute', 'aucune hésitation', 'aucune diminution', 'ne permet pas d', 'ne permet pas de', "n'exclut pas", 'non négligeable', "pas d'amélioration", "pas d'augmentation", "pas d'autre", 'pas de changement', 'pas de diminution', 'pas de doute', 'pas exclu', 'pas exclue', 'pas exclues', 'pas exclus', 'pas immunisé', 'pas immunisée', 'pas immunisés', 'pas immunisées', 'sans amélioration', 'sans aucun doute', 'sans augmentation', 'sans certitude', 'sans changement', 'sans diminution', 'sans doute', 'sans être certain'] module-attribute
preceding: List[str] = ['à la place de', 'absence', 'absence de signe de', 'absence de', 'aucun signe de', 'aucun', 'aucune preuve', 'aucune', 'aucunes', 'aucuns', 'décline', 'décliné', 'dépourvu', 'dépourvue', 'dépourvues', 'dépourvus', 'disparition de', 'disparition des', 'excluent', 'exclut', 'impossibilité de', 'immunisé', 'immunisée', 'immunisés', 'immunisées', 'incompatible avec', 'incompatibles avec', 'jamais', 'ne manifestaient pas', 'ne manifestait pas', 'ne manifeste pas', 'ne manifestent pas', 'ne pas', 'ne présentaient pas', 'ne présentait pas', 'ne présente pas', 'ne présentent pas', 'ne ressemble pas', 'ne ressemblent pas', 'négatif pour', "n'est pas", "n'était pas", 'ni', 'niant', 'nie', 'nié', 'nullement', 'pas d', 'pas de cause de', 'pas de signe de', 'pas de signes de', 'pas de', 'pas nécessaire de', 'pas', "permet d'exclure", "plus d'aspect de", 'sans manifester de', 'sans présenter de', 'sans', 'symptôme atypique'] module-attribute
following: List[str] = [':0', ': 0', ':non', ': non', 'absent', 'absente', 'absentes', 'absents', 'dépourvu', 'dépourvue', 'dépourvues', 'dépourvus', 'disparaissent', 'disparait', 'est exclu', 'est exclue', 'immunisé', 'immunisée', 'immunisés', 'immunisées', 'impossible', 'improbable', 'négatif', 'négatifs', 'négative', 'négatives', 'négligeable', 'négligeables', 'nié', 'niée', 'non', 'pas nécessaire', 'peu probable', 'sont exclues', 'sont exclus'] module-attribute
verbs: List[str] = ['éliminer', 'exclure', 'interdire', 'nier', 'réfuter', 'rejeter'] module-attribute
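
These cue lists feed the Negation component documented just below. A hedged usage sketch, where the negation extension name is assumed by analogy with the family component above:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(diabete=["diabète"])))
nlp.add_pipe("eds.negation")

doc = nlp("Le patient ne présente pas de diabète.")

ent = doc.ents[0]
print(ent._.negation)  # expected True: "ne présente pas" is a preceding cue (extension name assumed)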
negation
Negation

Bases: Qualifier

Implements the NegEx algorithm.

The component looks for five kinds of expressions in the text:

  • preceding negations, i.e. cues that precede a negated expression

  • following negations, i.e. cues that follow a negated expression

  • pseudo negations: expressions that contain a negation cue but are not negations (e.g. "pas de doute"/"no doubt")

  • negation verbs, i.e. verbs that indicate a negation

  • terminations, i.e. words that delimit propositions. The negation spans from the preceding cue to the termination.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

attr

spaCy's attribute to use

TYPE: str

pseudo

List of pseudo negation terms.

TYPE: Optional[List[str]]

preceding

List of preceding negation terms

TYPE: Optional[List[str]]

following

List of following negation terms.

TYPE: Optional[List[str]]

termination

List of termination terms.

TYPE: Optional[List[str]]

verbs

List of negation verbs.

TYPE: Optional[List[str]]

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool
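A minimal usage sketch, assuming the eds.sentences and eds.matcher components documented elsewhere in this reference; the matcher configuration and the example sentence are illustrative:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
# Illustrative matcher, so that the document contains an entity to qualify.
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur"])))
nlp.add_pipe("eds.negation")

doc = nlp("Le patient ne présente pas de douleur.")
ent = doc.ents[0]

print(ent._.negation)   # True
print(ent._.negation_)  # "NEG"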

Source code in edsnlp/pipelines/qualifiers/negation/negation.py
class Negation(Qualifier):
    """
    Implements the NegEx algorithm.

    The component looks for five kinds of expressions in the text :

    - preceding negations, ie cues that precede a negated expression

    - following negations, ie cues that follow a negated expression

    - pseudo negations : contain a negation cue, but are not negations
      (eg "pas de doute"/"no doubt")

    - negation verbs, ie verbs that indicate a negation

    - terminations, ie words that delimit propositions.
      The negation spans from the preceding cue to the termination.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    attr : str
        spaCy's attribute to use
    pseudo : Optional[List[str]]
        List of pseudo negation terms.
    preceding : Optional[List[str]]
        List of preceding negation terms
    following : Optional[List[str]]
        List of following negation terms.
    termination : Optional[List[str]]
        List of termination terms.
    verbs : Optional[List[str]]
        List of negation verbs.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        pseudo=pseudo,
        verbs=verbs,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        termination: Optional[List[str]],
        verbs: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            termination=termination,
            verbs=verbs,
        )
        terms["verbs"] = self.load_verbs(terms["verbs"])

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.within_ents = within_ents
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("negation"):
            Token.set_extension("negation", default=False)

        if not Token.has_extension("negated"):
            Token.set_extension(
                "negated", getter=deprecated_getter_factory("negated", "negation")
            )

        if not Token.has_extension("negation_"):
            Token.set_extension(
                "negation_",
                getter=lambda token: "NEG" if token._.negation else "AFF",
            )

        if not Token.has_extension("polarity_"):
            Token.set_extension(
                "polarity_",
                getter=deprecated_getter_factory("polarity_", "negation_"),
            )

        if not Span.has_extension("negation"):
            Span.set_extension("negation", default=False)

        if not Span.has_extension("negated"):
            Span.set_extension(
                "negated", getter=deprecated_getter_factory("negated", "negation")
            )

        if not Span.has_extension("negation_cues"):
            Span.set_extension("negation_cues", default=[])

        if not Span.has_extension("negation_"):
            Span.set_extension(
                "negation_",
                getter=lambda span: "NEG" if span._.negation else "AFF",
            )

        if not Span.has_extension("polarity_"):
            Span.set_extension(
                "polarity_",
                getter=deprecated_getter_factory("polarity_", "negation_"),
            )

        if not Doc.has_extension("negations"):
            Doc.set_extension("negations", default=[])

    def load_verbs(self, verbs: List[str]) -> List[str]:
        """
        Conjugate negating verbs to specific tenses.

        Parameters
        ----------
        verbs: list of negating verbs to conjugate

        Returns
        -------
        list_neg_verbs: List of negating verbs conjugated to specific tenses.
        """

        neg_verbs = get_verbs(verbs)

        neg_verbs = neg_verbs.loc[
            ((neg_verbs["mode"] == "Indicatif") & (neg_verbs["tense"] == "Présent"))
            | (neg_verbs["tense"] == "Participe Présent")
            | (neg_verbs["tense"] == "Participe Passé")
        ]

        list_neg_verbs = list(neg_verbs["term"].unique())

        return list_neg_verbs

    def annotate_entity(
        self,
        ent: Span,
        sub_preceding: List[Span],
        sub_following: List[Span],
    ) -> None:
        """
        Annotate entities using preceding and following negations.

        Parameters
        ----------
        ent : Span
            Entity to annotate
        sub_preceding : List[Span]
            List of preceding negations cues
        sub_following : List[Span]
            List of following negations cues
        """
        if self.within_ents:
            cues = [m for m in sub_preceding if m.end <= ent.end]
            cues += [m for m in sub_following if m.start >= ent.start]
        else:
            cues = [m for m in sub_preceding if m.end <= ent.start]
            cues += [m for m in sub_following if m.start >= ent.end]

        negation = ent._.negation or bool(cues)

        ent._.negation = negation

        if self.explain and negation:
            ent._.negation_cues += cues

        if not self.on_ents_only and negation:
            for token in ent:
                token._.negation = True

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to negation.

        Parameters
        ----------
        doc: spaCy `Doc` object

        Returns
        -------
        doc: spaCy `Doc` object, annotated for negation
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            # Verbs precede negated content
            sub_preceding += get_spans(sub_matches, "verbs")

            if not sub_preceding + sub_following:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.negation = any(
                        m.end <= token.i for m in sub_preceding
                    ) or any(m.start > token.i for m in sub_following)

            for ent in ents:
                self.annotate_entity(
                    ent=ent,
                    sub_preceding=sub_preceding,
                    sub_following=sub_following,
                )

        return doc

    def __call__(self, doc: Doc) -> Doc:
        return self.process(doc)
defaults = dict(following=following, preceding=preceding, pseudo=pseudo, verbs=verbs, termination=termination) class-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, termination, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs=verbs,
    )
    terms["verbs"] = self.load_verbs(terms["verbs"])

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.within_ents = within_ents
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("negation"):
        Token.set_extension("negation", default=False)

    if not Token.has_extension("negated"):
        Token.set_extension(
            "negated", getter=deprecated_getter_factory("negated", "negation")
        )

    if not Token.has_extension("negation_"):
        Token.set_extension(
            "negation_",
            getter=lambda token: "NEG" if token._.negation else "AFF",
        )

    if not Token.has_extension("polarity_"):
        Token.set_extension(
            "polarity_",
            getter=deprecated_getter_factory("polarity_", "negation_"),
        )

    if not Span.has_extension("negation"):
        Span.set_extension("negation", default=False)

    if not Span.has_extension("negated"):
        Span.set_extension(
            "negated", getter=deprecated_getter_factory("negated", "negation")
        )

    if not Span.has_extension("negation_cues"):
        Span.set_extension("negation_cues", default=[])

    if not Span.has_extension("negation_"):
        Span.set_extension(
            "negation_",
            getter=lambda span: "NEG" if span._.negation else "AFF",
        )

    if not Span.has_extension("polarity_"):
        Span.set_extension(
            "polarity_",
            getter=deprecated_getter_factory("polarity_", "negation_"),
        )

    if not Doc.has_extension("negations"):
        Doc.set_extension("negations", default=[])
load_verbs(verbs)

Conjugate negating verbs to specific tenses.

PARAMETER DESCRIPTION
verbs

List of negating verbs to conjugate.

TYPE: List[str]

RETURNS DESCRIPTION
list_neg_verbs

List of negating verbs conjugated to specific tenses.
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def load_verbs(self, verbs: List[str]) -> List[str]:
    """
    Conjugate negating verbs to specific tenses.

    Parameters
    ----------
    verbs: list of negating verbs to conjugate

    Returns
    -------
    list_neg_verbs: List of negating verbs conjugated to specific tenses.
    """

    neg_verbs = get_verbs(verbs)

    neg_verbs = neg_verbs.loc[
        ((neg_verbs["mode"] == "Indicatif") & (neg_verbs["tense"] == "Présent"))
        | (neg_verbs["tense"] == "Participe Présent")
        | (neg_verbs["tense"] == "Participe Passé")
    ]

    list_neg_verbs = list(neg_verbs["term"].unique())

    return list_neg_verbs
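For illustration, the same mask applied to a small hand-written conjugation table; the rows below are ordinary French conjugations of « exclure », not captured output of get_verbs:

import pandas as pd

# Hypothetical miniature conjugation table with the columns used above.
conjugations = pd.DataFrame(
    [
        ("exclure", "Indicatif", "Présent", "3s", "exclut"),
        ("exclure", "Indicatif", "Imparfait", "3s", "excluait"),
        ("exclure", "Participe", "Participe Présent", None, "excluant"),
        ("exclure", "Participe", "Participe Passé", None, "exclu"),
    ],
    columns=["verb", "mode", "tense", "person", "term"],
)

# Same filter as load_verbs: keep the present indicative and both participles.
kept = conjugations.loc[
    ((conjugations["mode"] == "Indicatif") & (conjugations["tense"] == "Présent"))
    | (conjugations["tense"] == "Participe Présent")
    | (conjugations["tense"] == "Participe Passé")
]

print(list(kept["term"].unique()))  # ['exclut', 'excluant', 'exclu']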
annotate_entity(ent, sub_preceding, sub_following)

Annotate entities using preceding and following negations.

PARAMETER DESCRIPTION
ent

Entity to annotate

TYPE: Span

sub_preceding

List of preceding negations cues

TYPE: List[Span]

sub_following

List of following negations cues

TYPE: List[Span]

Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def annotate_entity(
    self,
    ent: Span,
    sub_preceding: List[Span],
    sub_following: List[Span],
) -> None:
    """
    Annotate entities using preceding and following negations.

    Parameters
    ----------
    ent : Span
        Entity to annotate
    sub_preceding : List[Span]
        List of preceding negations cues
    sub_following : List[Span]
        List of following negations cues
    """
    if self.within_ents:
        cues = [m for m in sub_preceding if m.end <= ent.end]
        cues += [m for m in sub_following if m.start >= ent.start]
    else:
        cues = [m for m in sub_preceding if m.end <= ent.start]
        cues += [m for m in sub_following if m.start >= ent.end]

    negation = ent._.negation or bool(cues)

    ent._.negation = negation

    if self.explain and negation:
        ent._.negation_cues += cues

    if not self.on_ents_only and negation:
        for token in ent:
            token._.negation = True
process(doc)

Finds entities related to negation.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for negation.
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to negation.

    Parameters
    ----------
    doc: spaCy `Doc` object

    Returns
    -------
    doc: spaCy `Doc` object, annotated for negation
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        # Verbs precede negated content
        sub_preceding += get_spans(sub_matches, "verbs")

        if not sub_preceding + sub_following:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.negation = any(
                    m.end <= token.i for m in sub_preceding
                ) or any(m.start > token.i for m in sub_following)

        for ent in ents:
            self.annotate_entity(
                ent=ent,
                sub_preceding=sub_preceding,
                sub_following=sub_following,
            )

    return doc
__call__(doc)
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def __call__(self, doc: Doc) -> Doc:
    return self.process(doc)
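Since __call__ simply delegates to process, the component can also be applied by hand to an already tokenised Doc, for instance after retrieving it from the pipeline built in the usage sketch above:

negation = nlp.get_pipe("eds.negation")
doc = negation(doc)  # equivalent to negation.process(doc)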
factory
DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, termination=None, verbs=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, termination, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/negation/factory.py
@deprecated_factory("negation", "eds.negation", default_config=DEFAULT_CONFIG)
@Language.factory("eds.negation", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    return Negation(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs=verbs,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
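The DEFAULT_CONFIG values above can be overridden through the config argument of nlp.add_pipe. A sketch enabling cue tracking, reusing the same illustrative matcher setup as before:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur"])))
nlp.add_pipe("eds.negation", config=dict(explain=True))

doc = nlp("Le patient ne présente pas de douleur.")
ent = doc.ents[0]

print(ent._.negation)       # True
print(ent._.negation_cues)  # the preceding cue span(s) that triggered the negation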
reported_speech
reported_speech
ReportedSpeech

Bases: Qualifier

Implements a reported speech detection algorithm.

The component looks for terms indicating patient statements and for quotations, in order to detect patient speech.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

quotation

String gathering all quotation cues.

TYPE: str

verbs

List of reported speech verbs.

TYPE: List[str]

following

List of terms following a reported speech.

TYPE: List[str]

preceding

List of terms preceding a reported speech.

TYPE: List[str]

filter_matches

Whether to filter out overlapping matches.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool
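A minimal usage sketch, with the same illustrative matcher setup as for the negation component; the example sentence is illustrative:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur"])))
nlp.add_pipe("eds.reported_speech")

doc = nlp("Le patient dit qu'il ressent une douleur thoracique.")
ent = doc.ents[0]

print(ent._.reported_speech)   # True
print(ent._.reported_speech_)  # "REPORTED"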

Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
class ReportedSpeech(Qualifier):
    """
    Implements a reported speech detection algorithm.

    The component looks for terms indicating patient statements,
    and quotations to detect patient speech.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    quotation : str
        String gathering all quotation cues.
    verbs : List[str]
        List of reported speech verbs.
    following : List[str]
        List of terms following a reported speech.
    preceding : List[str]
        List of terms preceding a reported speech.
    filter_matches : bool
        Whether to filter out overlapping matches.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM",
        or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        verbs=verbs,
        quotation=quotation,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        quotation: Optional[List[str]],
        verbs: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            quotation=quotation,
            verbs=verbs,
        )
        terms["verbs"] = self.load_verbs(terms["verbs"])

        quotation = terms.pop("quotation")

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.regex_matcher = RegexMatcher(attr=attr)
        self.regex_matcher.build_patterns(dict(quotation=quotation))

        self.within_ents = within_ents

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("reported_speech"):
            Token.set_extension("reported_speech", default=False)

        if not Token.has_extension("reported_speech_"):
            Token.set_extension(
                "reported_speech_",
                getter=lambda token: "REPORTED"
                if token._.reported_speech
                else "DIRECT",
            )

        if not Span.has_extension("reported_speech"):
            Span.set_extension("reported_speech", default=False)

        if not Span.has_extension("reported_speech_"):
            Span.set_extension(
                "reported_speech_",
                getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
            )

        if not Span.has_extension("reported_speech_cues"):
            Span.set_extension("reported_speech_cues", default=[])

        if not Doc.has_extension("rspeechs"):
            Doc.set_extension("rspeechs", default=[])

    def load_verbs(self, verbs: List[str]) -> List[str]:
        """
        Conjugate reporting verbs to specific tenses (third person)

        Parameters
        ----------
        verbs: list of reporting verbs to conjugate

        Returns
        -------
        list_rep_verbs: List of reporting verbs conjugated to specific tenses.
        """

        rep_verbs = get_verbs(verbs)

        rep_verbs = rep_verbs.loc[
            (
                (rep_verbs["mode"] == "Indicatif")
                & (rep_verbs["tense"] == "Présent")
                & (rep_verbs["person"].isin(["3s", "3p"]))
            )
            | (rep_verbs["tense"] == "Participe Présent")
            | (rep_verbs["tense"] == "Participe Passé")
        ]

        list_rep_verbs = list(rep_verbs["term"].unique())

        return list_rep_verbs

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to reported speech.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for reported speech
        """

        matches = self.get_matches(doc)
        matches += list(self.regex_matcher(doc, as_spans=True))

        boundaries = self._boundaries(doc)

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            sub_verbs = get_spans(sub_matches, "verbs")
            sub_quotation = get_spans(sub_matches, "quotation")

            if not sub_preceding + sub_following + sub_verbs + sub_quotation:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.reported_speech = (
                        any(m.end <= token.i for m in sub_preceding + sub_verbs)
                        or any(m.start > token.i for m in sub_following)
                        or any(
                            ((m.start < token.i) & (m.end > token.i + 1))
                            for m in sub_quotation
                        )
                    )
            for ent in ents:

                if self.within_ents:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                    cues += [m for m in sub_following if m.start >= ent.start]
                else:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                    cues += [m for m in sub_following if m.start >= ent.end]

                cues += [
                    m
                    for m in sub_quotation
                    if (m.start < ent.start) & (m.end > ent.end)
                ]

                reported_speech = ent._.reported_speech or bool(cues)
                ent._.reported_speech = reported_speech

                if self.explain:
                    ent._.reported_speech_cues += cues

                if not self.on_ents_only and reported_speech:
                    for token in ent:
                        token._.reported_speech = True
        return doc
defaults = dict(following=following, preceding=preceding, verbs=verbs, quotation=quotation) class-attribute
regex_matcher = RegexMatcher(attr=attr) instance-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
    )
    terms["verbs"] = self.load_verbs(terms["verbs"])

    quotation = terms.pop("quotation")

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.regex_matcher = RegexMatcher(attr=attr)
    self.regex_matcher.build_patterns(dict(quotation=quotation))

    self.within_ents = within_ents

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("reported_speech"):
        Token.set_extension("reported_speech", default=False)

    if not Token.has_extension("reported_speech_"):
        Token.set_extension(
            "reported_speech_",
            getter=lambda token: "REPORTED"
            if token._.reported_speech
            else "DIRECT",
        )

    if not Span.has_extension("reported_speech"):
        Span.set_extension("reported_speech", default=False)

    if not Span.has_extension("reported_speech_"):
        Span.set_extension(
            "reported_speech_",
            getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
        )

    if not Span.has_extension("reported_speech_cues"):
        Span.set_extension("reported_speech_cues", default=[])

    if not Doc.has_extension("rspeechs"):
        Doc.set_extension("rspeechs", default=[])
load_verbs(verbs)

Conjugate reporting verbs to specific tenses (third person)

PARAMETER DESCRIPTION
verbs

List of reporting verbs to conjugate.

TYPE: List[str]

RETURNS DESCRIPTION
list_rep_verbs

List of reporting verbs conjugated to specific tenses.
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def load_verbs(self, verbs: List[str]) -> List[str]:
    """
    Conjugate reporting verbs to specific tenses (third person)

    Parameters
    ----------
    verbs: list of reporting verbs to conjugate

    Returns
    -------
    list_rep_verbs: List of reporting verbs conjugated to specific tenses.
    """

    rep_verbs = get_verbs(verbs)

    rep_verbs = rep_verbs.loc[
        (
            (rep_verbs["mode"] == "Indicatif")
            & (rep_verbs["tense"] == "Présent")
            & (rep_verbs["person"].isin(["3s", "3p"]))
        )
        | (rep_verbs["tense"] == "Participe Présent")
        | (rep_verbs["tense"] == "Participe Passé")
    ]

    list_rep_verbs = list(rep_verbs["term"].unique())

    return list_rep_verbs
process(doc)

Finds entities related to reported speech.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for reported speech.
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to reported speech.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for reported speech
    """

    matches = self.get_matches(doc)
    matches += list(self.regex_matcher(doc, as_spans=True))

    boundaries = self._boundaries(doc)

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        sub_verbs = get_spans(sub_matches, "verbs")
        sub_quotation = get_spans(sub_matches, "quotation")

        if not sub_preceding + sub_following + sub_verbs + sub_quotation:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.reported_speech = (
                    any(m.end <= token.i for m in sub_preceding + sub_verbs)
                    or any(m.start > token.i for m in sub_following)
                    or any(
                        ((m.start < token.i) & (m.end > token.i + 1))
                        for m in sub_quotation
                    )
                )
        for ent in ents:

            if self.within_ents:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                cues += [m for m in sub_following if m.start >= ent.start]
            else:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                cues += [m for m in sub_following if m.start >= ent.end]

            cues += [
                m
                for m in sub_quotation
                if (m.start < ent.start) & (m.end > ent.end)
            ]

            reported_speech = ent._.reported_speech or bool(cues)
            ent._.reported_speech = reported_speech

            if self.explain:
                ent._.reported_speech_cues += cues

            if not self.on_ents_only and reported_speech:
                for token in ent:
                    token._.reported_speech = True
    return doc
patterns
verbs: List[str] = ['affirmer', 'ajouter', 'assurer', 'confirmer', 'demander', 'dire', 'déclarer', 'décrire', 'décrire', 'démontrer', 'expliquer', 'faire remarquer', 'indiquer', 'informer', 'insinuer', 'insister', 'jurer', 'nier', 'nier', 'noter', 'objecter', 'observer', 'parler', 'promettre', 'préciser', 'prétendre', 'prévenir', 'raconter', 'rappeler', 'rapporter', 'reconnaître', 'réfuter', 'répliquer', 'répondre', 'répéter', 'révéler', 'se plaindre', 'souhaiter', 'souligner', 'supplier', 'verbaliser', 'vouloir', 'vouloir'] module-attribute
following: List[str] = ["d'après le patient", "d'après la patiente"] module-attribute
preceding: List[str] = ['pas de critique de', 'crainte de', 'menace de', 'insiste sur le fait que', "d'après le patient", "d'après la patiente", 'peur de'] module-attribute
quotation: str = '(\\".+\\")|(\\«.+\\»)' module-attribute
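The quotation pattern above simply matches a segment enclosed in straight or French quotes. A quick check with the standard re module, reusing the pattern verbatim:

import re

quotation = '(\\".+\\")|(\\«.+\\»)'  # pattern reproduced from above

text = "Le patient répète « je vais mieux » à plusieurs reprises."
match = re.search(quotation, text)
print(match.group(0))  # « je vais mieux »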
factory
DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, quotation=None, verbs=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/reported_speech/factory.py
@deprecated_factory("rspeech", "eds.reported_speech", default_config=DEFAULT_CONFIG)
@deprecated_factory(
    "reported_speech", "eds.reported_speech", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.reported_speech", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    return ReportedSpeech(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
history
patterns
history = ['antécédents', 'atcd', 'atcds', 'tacds', 'antécédent'] module-attribute
history
History

Bases: Qualifier

Implements a history detection algorithm.

The component looks for terms indicating history in the text.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

history

List of terms indicating medical history reference.

TYPE: Optional[List[str]]

termination

List of syntagme termination terms.

TYPE: Optional[List[str]]

use_sections

Whether to use section pipeline to detect medical history section.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]

explain

Whether to keep track of cues for each entity.

TYPE: bool
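A minimal usage sketch, with an illustrative matcher so that the document contains an entity to qualify:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(diabete=["diabète"])))
nlp.add_pipe("eds.history")

doc = nlp("Le patient a des antécédents de diabète.")
ent = doc.ents[0]

print(ent._.history)   # True
print(ent._.history_)  # "ATCD"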

Source code in edsnlp/pipelines/qualifiers/history/history.py
class History(Qualifier):
    """
    Implements a history detection algorithm.

    The component looks for terms indicating history in the text.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    history : Optional[List[str]]
        List of terms indicating medical history reference.
    termination : Optional[List[str]]
        List of syntagme termination terms.
    use_sections : bool
        Whether to use section pipeline to detect medical history section.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionary of regex patterns.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        history=history,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        history: Optional[List[str]],
        termination: Optional[List[str]],
        use_sections: bool,
        explain: bool,
        on_ents_only: bool,
    ):

        terms = self.get_defaults(
            history=history,
            termination=termination,
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.set_extensions()

        self.sections = use_sections and (
            "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
        )
        if use_sections and not self.sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `section` pipeline, but it was not set. "
                "Skipping that step."
            )

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("history"):
            Token.set_extension("history", default=False)

        if not Token.has_extension("antecedents"):
            Token.set_extension(
                "antecedents",
                getter=deprecated_getter_factory("antecedents", "history"),
            )

        if not Token.has_extension("antecedent"):
            Token.set_extension(
                "antecedent",
                getter=deprecated_getter_factory("antecedent", "history"),
            )

        if not Token.has_extension("history_"):
            Token.set_extension(
                "history_",
                getter=lambda token: "ATCD" if token._.history else "CURRENT",
            )

        if not Token.has_extension("antecedents_"):
            Token.set_extension(
                "antecedents_",
                getter=deprecated_getter_factory("antecedents_", "history_"),
            )

        if not Token.has_extension("antecedent_"):
            Token.set_extension(
                "antecedent_",
                getter=deprecated_getter_factory("antecedent_", "history_"),
            )

        if not Span.has_extension("history"):
            Span.set_extension("history", default=False)

        if not Span.has_extension("antecedents"):
            Span.set_extension(
                "antecedents",
                getter=deprecated_getter_factory("antecedents", "history"),
            )

        if not Span.has_extension("antecedent"):
            Span.set_extension(
                "antecedent",
                getter=deprecated_getter_factory("antecedent", "history"),
            )

        if not Span.has_extension("history_"):
            Span.set_extension(
                "history_",
                getter=lambda span: "ATCD" if span._.history else "CURRENT",
            )

        if not Span.has_extension("antecedents_"):
            Span.set_extension(
                "antecedents_",
                getter=deprecated_getter_factory("antecedents_", "history_"),
            )

        if not Span.has_extension("antecedent_"):
            Span.set_extension(
                "antecedent_",
                getter=deprecated_getter_factory("antecedent_", "history_"),
            )

        if not Span.has_extension("history_cues"):
            Span.set_extension("history_cues", default=[])

        if not Span.has_extension("antecedents_cues"):
            Span.set_extension(
                "antecedents_cues",
                getter=deprecated_getter_factory("antecedents_cues", "history_cues"),
            )

        if not Span.has_extension("antecedent_cues"):
            Span.set_extension(
                "antecedent_cues",
                getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
            )

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to history.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for history
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        sections = []

        if self.sections:
            sections = [
                Span(doc, section.start, section.end, label="ATCD")
                for section in doc.spans["sections"]
                if section.label_ == "antécédents"
            ]

        for start, end in boundaries:
            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

            if self.on_ents_only and not ents:
                continue

            cues = get_spans(sub_matches, "history")
            cues += sub_sections

            history = bool(cues)

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.history = history

            for ent in ents:
                ent._.history = ent._.history or history

                if self.explain:
                    ent._.history_cues += cues

                if not self.on_ents_only and ent._.history:
                    for token in ent:
                        token._.history = True

        return doc
defaults = dict(history=history, termination=termination) class-attribute
sections = use_sections and ('eds.sections' in nlp.pipe_names or 'sections' in nlp.pipe_names) instance-attribute
__init__(nlp, attr, history, termination, use_sections, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/history/history.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    history: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    explain: bool,
    on_ents_only: bool,
):

    terms = self.get_defaults(
        history=history,
        termination=termination,
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.set_extensions()

    self.sections = use_sections and (
        "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
    )
    if use_sections and not self.sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `section` pipeline, but it was not set. "
            "Skipping that step."
        )
set_extensions()
Source code in edsnlp/pipelines/qualifiers/history/history.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("history"):
        Token.set_extension("history", default=False)

    if not Token.has_extension("antecedents"):
        Token.set_extension(
            "antecedents",
            getter=deprecated_getter_factory("antecedents", "history"),
        )

    if not Token.has_extension("antecedent"):
        Token.set_extension(
            "antecedent",
            getter=deprecated_getter_factory("antecedent", "history"),
        )

    if not Token.has_extension("history_"):
        Token.set_extension(
            "history_",
            getter=lambda token: "ATCD" if token._.history else "CURRENT",
        )

    if not Token.has_extension("antecedents_"):
        Token.set_extension(
            "antecedents_",
            getter=deprecated_getter_factory("antecedents_", "history_"),
        )

    if not Token.has_extension("antecedent_"):
        Token.set_extension(
            "antecedent_",
            getter=deprecated_getter_factory("antecedent_", "history_"),
        )

    if not Span.has_extension("history"):
        Span.set_extension("history", default=False)

    if not Span.has_extension("antecedents"):
        Span.set_extension(
            "antecedents",
            getter=deprecated_getter_factory("antecedents", "history"),
        )

    if not Span.has_extension("antecedent"):
        Span.set_extension(
            "antecedent",
            getter=deprecated_getter_factory("antecedent", "history"),
        )

    if not Span.has_extension("history_"):
        Span.set_extension(
            "history_",
            getter=lambda span: "ATCD" if span._.history else "CURRENT",
        )

    if not Span.has_extension("antecedents_"):
        Span.set_extension(
            "antecedents_",
            getter=deprecated_getter_factory("antecedents_", "history_"),
        )

    if not Span.has_extension("antecedent_"):
        Span.set_extension(
            "antecedent_",
            getter=deprecated_getter_factory("antecedent_", "history_"),
        )

    if not Span.has_extension("history_cues"):
        Span.set_extension("history_cues", default=[])

    if not Span.has_extension("antecedents_cues"):
        Span.set_extension(
            "antecedents_cues",
            getter=deprecated_getter_factory("antecedents_cues", "history_cues"),
        )

    if not Span.has_extension("antecedent_cues"):
        Span.set_extension(
            "antecedent_cues",
            getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
        )
process(doc)

Finds entities related to history.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for history

Source code in edsnlp/pipelines/qualifiers/history/history.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to history.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for history
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    sections = []

    if self.sections:
        sections = [
            Span(doc, section.start, section.end, label="ATCD")
            for section in doc.spans["sections"]
            if section.label_ == "antécédents"
        ]

    for start, end in boundaries:
        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

        if self.on_ents_only and not ents:
            continue

        cues = get_spans(sub_matches, "history")
        cues += sub_sections

        history = bool(cues)

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.history = history

        for ent in ents:
            ent._.history = ent._.history or history

            if self.explain:
                ent._.history_cues += cues

            if not self.on_ents_only and ent._.history:
                for token in ent:
                    token._.history = True

    return doc
factory
DEFAULT_CONFIG = dict(attr='NORM', history=patterns.history, termination=termination, use_sections=False, explain=False, on_ents_only=True) module-attribute
create_component(nlp, name, history, termination, use_sections, attr, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/history/factory.py
@deprecated_factory("antecedents", "eds.history", default_config=DEFAULT_CONFIG)
@deprecated_factory("eds.antecedents", "eds.history", default_config=DEFAULT_CONFIG)
@deprecated_factory("history", "eds.history", default_config=DEFAULT_CONFIG)
@Language.factory("eds.history", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    history: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    attr: str,
    explain: bool,
    on_ents_only: bool,
):
    return History(
        nlp,
        attr=attr,
        history=history,
        termination=termination,
        use_sections=use_sections,
        explain=explain,
        on_ents_only=on_ents_only,
    )
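When use_sections=True, the eds.sections component must run earlier in the pipeline, otherwise the warning shown in __init__ is emitted and section information is ignored. A hedged ordering sketch, assuming the eds.normalizer and eds.sections components documented elsewhere in this reference:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")  # section detection matches on the normalised attribute
nlp.add_pipe("eds.sections")    # must come before eds.history
nlp.add_pipe("eds.matcher", config=dict(terms=dict(diabete=["diabète"])))
nlp.add_pipe("eds.history", config=dict(use_sections=True))

Entities found inside a section labelled « antécédents » are then flagged as history, in addition to the term-based cues.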
hypothesis
patterns
pseudo: List[str] = ['aucun doute', 'même si', 'pas de condition', 'pas de doute', 'sans aucun doute', 'sans condition', 'sans risque'] module-attribute
confirmation: List[str] = ['certain', 'certaine', 'certainement', 'certaines', 'certains', 'confirmer', 'évidemment', 'évident', 'évidente', 'montrer que', 'visiblement'] module-attribute
preceding: List[str] = ['à condition', 'à la condition que', 'à moins que', 'au cas où', 'conditionnellement', 'doute', 'en admettant que', 'en cas', 'en considérant que', 'en supposant que', 'éventuellement', 'faudrait', 'hypothèse', 'hypothèses', 'idée de', 'pas confirmer', 'pas sûr', 'pas sûre', 'peut correspondre', 'peut-être', 'peuvent correspondre', 'possible', 'possiblement', 'potentiel', 'potentielle', 'potentiellement', 'potentielles', 'potentiels', 'prédisposant à', 'probable', 'probablement', 'probables', "recherche d'", 'recherche de', 'recherche des', 'risque', 'sauf si', 'selon', 'si', "s'il", 'soit', 'sous condition', 'sous réserve', 'suspicion'] module-attribute
following: List[str] = ['?', 'envisagé', 'envisageable', 'envisageables', 'envisagées', 'envisagés', 'hypothétique', 'hypothétiquement', 'hypothétiques', 'pas certain', 'pas certaine', 'pas clair', 'pas claire', 'pas confirmé', 'pas confirmée', 'pas confirmées', 'pas confirmés', 'pas évident', 'pas évidente', 'pas sûr', 'pas sûre', 'possible', 'potentiel', 'potentielle', 'potentiels', 'probable', 'probables', ': \n', ':\n'] module-attribute
verbs_hyp: List[str] = ['douter', 'envisager', "s'apparenter", 'sembler', 'soupçonner', 'suggérer', 'suspecter'] module-attribute
verbs_eds: List[str] = ['abandonner', 'abolir', 'aborder', 'accepter', 'accidenter', 'accompagnemer', 'accompagner', 'acoller', 'acquérir', 'activer', 'actualiser', 'adapter', 'adhérer', 'adjuver', 'admettre', 'administrer', 'adopter', 'adresser', 'aggraver', 'agir', 'agréer', 'aider', 'aimer', 'alcooliser', 'alerter', 'alimenter', 'aller', 'allonger', 'alléger', 'alterner', 'altérer', 'amender', 'amener', 'améliorer', 'amyotrophier', 'améliorer', 'analyser', 'anesthésier', 'animer', 'annexer', 'annuler', 'anonymiser', 'anticiper', 'anticoaguler', 'apercevoir', 'aplatir', 'apparaître', 'appareiller', 'appeler', 'appliquer', 'apporter', 'apprendre', 'apprécier', 'appuyer', 'argumenter', 'arquer', 'arrêter', 'arriver', 'arrêter', 'articuler', 'aspirer', 'asseoir', 'assister', 'associer', 'assurer', 'assécher', 'attacher', 'atteindre', 'attendre', 'attribuer', 'augmenter', 'autonomiser', 'autoriser', 'avaler', 'avancer', 'avertir', 'avoir', 'avérer', 'aérer', 'baisser', 'ballonner', 'blesser', 'bloquer', 'boire', 'border', 'brancher', 'brûler', 'bénéficier', 'cadrer', 'calcifier', 'calculer', 'calmer', 'canaliser', 'capter', 'carencer', 'casser', 'centrer', 'cerner', 'certifier', 'changer', 'charger', 'chevaucher', 'choisir', 'chronomoduler', 'chuter', 'cicatriser', 'circoncire', 'circuler', 'classer', 'codéiner', 'coincer', 'colorer', 'combler', 'commander', 'commencer', 'communiquer', 'comparer', 'compliquer', 'compléter', 'comporter', 'comprendre', 'comprimer', 'concerner', 'conclure', 'condamner', 'conditionner', 'conduire', 'confiner', 'confirmer', 'confronter', 'congeler', 'conjoindre', 'conjuguer', 'connaître', 'connecter', 'conseiller', 'conserver', 'considérer', 'consommer', 'constater', 'constituer', 'consulter', 'contacter', 'contaminer', 'contenir', 'contentionner', 'continuer', 'contracter', 'contrarier', 'contribuer', 'contrôler', 'convaincre', 'convenir', 'convier', 'convoquer', 'copier', 'correspondre', 'corriger', 'corréler', 'coucher', 'coupler', 'couvrir', 'crapotter', 'creuser', 'croire', 'croiser', 'créer', 'crémer', 'crépiter', 'cumuler', 'curariser', 'céder', 'dater', 'demander', 'demeurer', 'destiner', 'devenir', 'devoir', 'diagnostiquer', 'dialyser', 'dicter', 'diffuser', 'différencier', 'différer', 'digérer', 'dilater', 'diluer', 'diminuer', 'diner', 'dire', 'diriger', 'discuter', 'disparaître', 'disposer', 'dissocier', 'disséminer', 'disséquer', 'distendre', 'distinguer', 'divorcer', 'documenter', 'donner', 'dorer', 'doser', 'doubler', 'durer', 'dyaliser', 'dyspner', 'débuter', 'décaler', 'déceler', 'décider', 'déclarer', 'déclencher', 'découvrir', 'décrire', 'décroître', 'décurariser', 'décéder', 'dédier', 'définir', 'dégrader', 'délivrer', 'dépasser', 'dépendre', 'déplacer', 'dépolir', 'déposer', 'dériver', 'dérouler', 'désappareiller', 'désigner', 'désinfecter', 'désorienter', 'détecter', 'déterminer', 'détruire', 'développer', 'dévouer', 'dîner', 'écraser', 'effacer', 'effectuer', 'effondrer', 'emboliser', 'emmener', 'empêcher', 'encadrer', 'encourager', 'endormir', 'endurer', 'enlever', 'enregistrer', 'entamer', 'entendre', 'entourer', 'entraîner', 'entreprendre', 'entrer', 'envahir', 'envisager', 'envoyer', 'espérer', 'essayer', 'estimer', 'être', 'examiner', 'excentrer', 'exciser', 'exclure', 'expirer', 'expliquer', 'explorer', 'exposer', 'exprimer', 'extérioriser', 'exécuter', 'faciliter', 'faire', 'fatiguer', 'favoriser', 'faxer', 'fermer', 'figurer', 'fixer', 'focaliser', 'foncer', 'former', 'fournir', 'fractionner', 'fragmenter', 'fuiter', 'fusionner', 
'garder', 'graver', 'guider', 'gérer', 'gêner', 'honorer', 'hopsitaliser', 'hospitaliser', 'hydrater', 'hyperartérialiser', 'hyperfixer', 'hypertrophier', 'hésiter', 'identifier', 'illustrer', 'immuniser', 'impacter', 'implanter', 'impliquer', 'importer', 'imposer', 'impregner', 'imprimer', 'inclure', 'indifferencier', 'indiquer', 'infecter', 'infertiliser', 'infiltrer', 'informer', 'inhaler', 'initier', 'injecter', 'inscrire', 'insister', 'installer', 'interdire', 'interpréter', 'interrompre', 'intervenir', 'intituler', 'introduire', 'intéragir', 'inverser', 'inviter', 'ioder', 'ioniser', 'irradier', 'itérativer', 'joindre', 'juger', 'justifier', 'laisser', 'laminer', 'lancer', 'latéraliser', 'laver', 'lever', 'lier', 'ligaturer', 'limiter', 'lire', 'localiser', 'loger', 'louper', 'luire', 'lutter', 'lyricer', 'lyser', 'maculer', 'macérer', 'maintenir', 'majorer', 'malaiser', 'manger', 'manifester', 'manipuler', 'manquer', 'marcher', 'marier', 'marmoner', 'marquer', 'masquer', 'masser', 'mater', 'mener', 'mesurer', 'meteoriser', 'mettre', 'mitiger', 'modifier', 'moduler', 'modérer', 'monter', 'montrer', 'motiver', 'moucheter', 'mouler', 'mourir', 'multiopéréer', 'munir', 'muter', 'médicaliser', 'météoriser', 'naître', 'normaliser', 'noter', 'nuire', 'numériser', 'nécessiter', 'négativer', 'objectiver', 'observer', 'obstruer', 'obtenir', 'occasionner', 'occuper', 'opposer', 'opérer', 'organiser', 'orienter', 'ouvrir', 'palper', 'parasiter', 'paraître', 'parcourir', 'parer', 'paresthésier', 'parfaire', 'partager', 'partir', 'parvenir', 'passer', 'penser', 'percevoir', 'perdre', 'perforer', 'permettre', 'persister', 'personnaliser', 'peser', 'pigmenter', 'piloter', 'placer', 'plaindre', 'planifier', 'plier', 'plonger', 'porter', 'poser', 'positionner', 'posséder', 'poursuivre', 'pousser', 'pouvoir', 'pratiquer', 'preciser', 'prendre', 'prescrire', 'prier', 'produire', 'programmer', 'prolonger', 'prononcer', 'proposer', 'prouver', 'provoquer', 'préciser', 'précéder', 'prédominer', 'préexister', 'préférer', 'prélever', 'préparer', 'présenter', 'préserver', 'prévenir', 'prévoir', 'puruler', 'pénétrer', 'radiofréquencer', 'ralentir', 'ramener', 'rappeler', 'rapporter', 'rapprocher', 'rassurer', 'rattacher', 'rattraper', 'realiser', 'recenser', 'recevoir', 'rechercher', 'recommander', 'reconnaître', 'reconsulter', 'recontacter', 'recontrôler', 'reconvoquer', 'recouvrir', 'recueillir', 'recuperer', 'redescendre', 'rediscuter', 'refaire', 'refouler', 'refuser', 'regarder', 'rehausser', 'relancer', 'relayer', 'relever', 'relire', 'relâcher', 'remanier', 'remarquer', 'remercier', 'remettre', 'remonter', 'remplacer', 'remplir', 'rencontrer', 'rendormir', 'rendre', 'renfermer', 'renforcer', 'renouveler', 'renseigner', 'rentrer', 'reparler', 'repasser', 'reporter', 'reprendre', 'represcrire', 'reproduire', 'reprogrammer', 'représenter', 'repérer', 'requérir', 'respecter', 'ressembler', 'ressentir', 'rester', 'restreindre', 'retarder', 'retenir', 'retirer', 'retrouver', 'revasculariser', 'revenir', 'reverticaliser', 'revoir', 'rompre', 'rouler', 'réadapter', 'réadmettre', 'réadresser', 'réaliser', 'récidiver', 'récupérer', 'rédiger', 'réduire', 'réessayer', 'réexpliquer', 'référer', 'régler', 'régresser', 'réhausser', 'réopérer', 'répartir', 'répondre', 'répéter', 'réserver', 'résorber', 'résoudre', 'réséquer', 'réveiller', 'révéler', 'réévaluer', 'rêver', 'sacrer', 'saisir', 'satisfaire', 'savoir', 'scanner', 'scolariser', 'sembler', 'sensibiliser', 'sentir', 'serrer', 'servir', 'sevrer', 'signaler', 
'signer', 'situer', 'siéger', 'soigner', 'sommeiller', 'sonder', 'sortir', 'souffler', 'souhaiter', 'soulager', 'soussigner', 'souvenir', 'spécialiser', 'stabiliser', 'statuer', 'stenter', 'stopper', 'stratifier', 'subir', 'substituer', 'sucrer', 'suggérer', 'suivre', 'supporter', 'supprimer', 'surajouter', 'surmonter', 'surveiller', 'survenir', 'suspecter', 'suspendre', 'suturer', 'synchroniser', 'systématiser', 'sécréter', 'sécuriser', 'sédater', 'séjourner', 'séparer', 'taire', 'taper', 'teinter', 'tendre', 'tenir', 'tenter', 'terminer', 'tester', 'thromboser', 'tirer', 'tiroir', 'tissulaire', 'titulariser', 'tolérer', 'tourner', 'tracer', 'trachéotomiser', 'traduire', 'traiter', 'transcrire', 'transférer', 'transmettre', 'transporter', 'trasnfixer', 'travailler', 'tronquer', 'trouver', 'téléphoner', 'ulcérer', 'uriner', 'utiliser', 'vacciner', 'valider', 'valoir', 'varier', 'vasculariser', 'venir', 'verifier', 'vieillir', 'viser', 'visualiser', 'vivre', 'voir', 'vouloir', 'vérifier', 'ébaucher', 'écarter', 'échographier', 'échoguider', 'échoir', 'échouer', 'éclairer', 'écraser', 'élargir', 'éliminer', 'émousser', 'épaissir', 'épargner', 'épuiser', 'épurer', 'équilibrer', 'établir', 'étager', 'étendre', 'étiqueter', 'étrangler', 'évaluer', 'éviter', 'évoluer', 'évoquer', 'être'] module-attribute
hypothesis
Hypothesis

Bases: Qualifier

Hypothesis detection with spaCy.

The component looks for five kinds of expressions in the text:

  • preceding hypothesis, i.e. cues that precede a hypothetical expression
  • following hypothesis, i.e. cues that follow a hypothetical expression
  • pseudo hypothesis: expressions that contain a hypothesis cue but are not hypotheses (e.g. "pas de doute"/"no doubt")
  • hypothetical verbs: verbs indicating hypothesis (e.g. "douter")
  • classic verbs conjugated to the conditional, thus indicating hypothesis
PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

pseudo

List of pseudo hypothesis cues.

TYPE: Optional[List[str]]

preceding

List of preceding hypothesis cues.

TYPE: Optional[List[str]]

following

List of following hypothesis cues.

TYPE: Optional[List[str]]

verbs_hyp

List of hypothetical verbs.

TYPE: Optional[List[str]]

verbs_eds

List of mainstream verbs.

TYPE: Optional[List[str]]

filter_matches

Whether to filter out overlapping matches.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]
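For reference, a minimal usage sketch. Only eds.hypothesis is documented on this page; the eds.sentences and eds.matcher factories and the terms config key are assumptions borrowed from the rest of the library:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe(
    "eds.matcher",
    config=dict(terms=dict(pneumopathie=["pneumopathie"])),
)
nlp.add_pipe("eds.hypothesis")

doc = nlp("Le patient pourrait présenter une pneumopathie.")

for ent in doc.ents:
    # "pourrait" is the conditional of "pouvoir", a verb from verbs_eds,
    # so the entity should be qualified as hypothetical
    print(ent.text, ent._.hypothesis, ent._.hypothesis_)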

Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
class Hypothesis(Qualifier):
    """
    Hypothesis detection with spaCy.

    The component looks for five kinds of expressions in the text :

    - preceding hypothesis, ie cues that precede a hypothetic expression
    - following hypothesis, ie cues that follow a hypothetic expression
    - pseudo hypothesis : contain a hypothesis cue, but are not hypothesis
      (eg "pas de doute"/"no doubt")
    - hypothetic verbs : verbs indicating hypothesis (eg "douter")
    - classic verbs conjugated to the conditional, thus indicating hypothesis

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    pseudo : Optional[List[str]]
        List of pseudo hypothesis cues.
    preceding : Optional[List[str]]
        List of preceding hypothesis cues
    following : Optional[List[str]]
        List of following hypothesis cues.
    verbs_hyp : Optional[List[str]]
        List of hypothetic verbs.
    verbs_eds : Optional[List[str]]
        List of mainstream verbs.
    filter_matches : bool
        Whether to filter out overlapping matches.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionnary of regex patterns.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        pseudo=pseudo,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        termination: Optional[List[str]],
        verbs_eds: Optional[List[str]],
        verbs_hyp: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            termination=termination,
            verbs_eds=verbs_eds,
            verbs_hyp=verbs_hyp,
        )
        terms["verbs"] = self.load_verbs(
            verbs_hyp=terms.pop("verbs_hyp"),
            verbs_eds=terms.pop("verbs_eds"),
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.within_ents = within_ents
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Token.has_extension("hypothesis"):
            Token.set_extension("hypothesis", default=False)

        if not Token.has_extension("hypothesis_"):
            Token.set_extension(
                "hypothesis_",
                getter=lambda token: "HYP" if token._.hypothesis else "CERT",
            )

        if not Span.has_extension("hypothesis"):
            Span.set_extension("hypothesis", default=False)

        if not Span.has_extension("hypothesis_"):
            Span.set_extension(
                "hypothesis_",
                getter=lambda span: "HYP" if span._.hypothesis else "CERT",
            )

        if not Span.has_extension("hypothesis_cues"):
            Span.set_extension("hypothesis_cues", default=[])

        if not Doc.has_extension("hypothesis"):
            Doc.set_extension("hypothesis", default=[])

    def load_verbs(
        self,
        verbs_hyp: List[str],
        verbs_eds: List[str],
    ) -> List[str]:
        """
        Conjugate "classic" verbs to conditional, and add hypothesis
        verbs conjugated to all tenses.

        Parameters
        ----------
        verbs_hyp: List of verbs that specifically imply an hypothesis.
        verbs_eds: List of general verbs.

        Returns
        -------
        list of hypothesis verbs conjugated at all tenses and classic
        verbs conjugated to conditional.
        """

        classic_verbs = get_verbs(verbs_eds)
        classic_verbs = classic_verbs.loc[classic_verbs["mode"] == "Conditionnel"]
        list_classic_verbs = list(classic_verbs["term"].unique())

        hypo_verbs = get_verbs(verbs_hyp)
        list_hypo_verbs = list(hypo_verbs["term"].unique())

        return list_hypo_verbs + list_classic_verbs

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to hypothesis.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for hypothesis
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            sub_verbs = get_spans(sub_matches, "verbs")

            if not sub_preceding + sub_following + sub_verbs:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.hypothesis = any(
                        m.end <= token.i for m in sub_preceding + sub_verbs
                    ) or any(m.start > token.i for m in sub_following)

            for ent in ents:

                if self.within_ents:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                    cues += [m for m in sub_following if m.start >= ent.start]
                else:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                    cues += [m for m in sub_following if m.start >= ent.end]

                hypothesis = ent._.hypothesis or bool(cues)

                ent._.hypothesis = hypothesis

                if self.explain and hypothesis:
                    ent._.hypothesis_cues += cues

                if not self.on_ents_only and hypothesis:
                    for token in ent:
                        token._.hypothesis = True

        return doc
defaults = dict(following=following, preceding=preceding, pseudo=pseudo, termination=termination, verbs_eds=verbs_eds, verbs_hyp=verbs_hyp) class-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, termination, verbs_eds, verbs_hyp, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs_eds: Optional[List[str]],
    verbs_hyp: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
    )
    terms["verbs"] = self.load_verbs(
        verbs_hyp=terms.pop("verbs_hyp"),
        verbs_eds=terms.pop("verbs_eds"),
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.within_ents = within_ents
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
@staticmethod
def set_extensions() -> None:
    if not Token.has_extension("hypothesis"):
        Token.set_extension("hypothesis", default=False)

    if not Token.has_extension("hypothesis_"):
        Token.set_extension(
            "hypothesis_",
            getter=lambda token: "HYP" if token._.hypothesis else "CERT",
        )

    if not Span.has_extension("hypothesis"):
        Span.set_extension("hypothesis", default=False)

    if not Span.has_extension("hypothesis_"):
        Span.set_extension(
            "hypothesis_",
            getter=lambda span: "HYP" if span._.hypothesis else "CERT",
        )

    if not Span.has_extension("hypothesis_cues"):
        Span.set_extension("hypothesis_cues", default=[])

    if not Doc.has_extension("hypothesis"):
        Doc.set_extension("hypothesis", default=[])
load_verbs(verbs_hyp, verbs_eds)

Conjugate "classic" verbs to the conditional, and add hypothesis verbs conjugated in all tenses.

PARAMETER DESCRIPTION
verbs_hyp

List of verbs that specifically imply a hypothesis.

TYPE: List[str]

verbs_eds

List of general verbs.

TYPE: List[str]

RETURNS DESCRIPTION
List of hypothesis verbs conjugated in all tenses and classic verbs conjugated to the conditional.
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def load_verbs(
    self,
    verbs_hyp: List[str],
    verbs_eds: List[str],
) -> List[str]:
    """
    Conjugate "classic" verbs to conditional, and add hypothesis
    verbs conjugated to all tenses.

    Parameters
    ----------
    verbs_hyp: List of verbs that specifically imply an hypothesis.
    verbs_eds: List of general verbs.

    Returns
    -------
    list of hypothesis verbs conjugated at all tenses and classic
    verbs conjugated to conditional.
    """

    classic_verbs = get_verbs(verbs_eds)
    classic_verbs = classic_verbs.loc[classic_verbs["mode"] == "Conditionnel"]
    list_classic_verbs = list(classic_verbs["term"].unique())

    hypo_verbs = get_verbs(verbs_hyp)
    list_hypo_verbs = list(hypo_verbs["term"].unique())

    return list_hypo_verbs + list_classic_verbs
process(doc)

Finds entities related to hypothesis.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for hypothesis
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to hypothesis.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for hypothesis
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        sub_verbs = get_spans(sub_matches, "verbs")

        if not sub_preceding + sub_following + sub_verbs:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.hypothesis = any(
                    m.end <= token.i for m in sub_preceding + sub_verbs
                ) or any(m.start > token.i for m in sub_following)

        for ent in ents:

            if self.within_ents:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                cues += [m for m in sub_following if m.start >= ent.start]
            else:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                cues += [m for m in sub_following if m.start >= ent.end]

            hypothesis = ent._.hypothesis or bool(cues)

            ent._.hypothesis = hypothesis

            if self.explain and hypothesis:
                ent._.hypothesis_cues += cues

            if not self.on_ents_only and hypothesis:
                for token in ent:
                    token._.hypothesis = True

    return doc
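With explain=True (see the factory defaults below), the spans of the triggering cues are kept on each qualified entity. A sketch under the same assumptions as the previous example:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(pneumopathie=["pneumopathie"])))
nlp.add_pipe("eds.hypothesis", config=dict(explain=True))

doc = nlp("Une pneumopathie pourrait être en cause.")

for ent in doc.ents:
    if ent._.hypothesis:
        # The cues that led to the qualification are stored on the span
        print(ent.text, [cue.text for cue in ent._.hypothesis_cues])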
factory
DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, termination=None, verbs_hyp=None, verbs_eds=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, termination, verbs_eds, verbs_hyp, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/hypothesis/factory.py
@deprecated_factory("hypothesis", "eds.hypothesis", default_config=DEFAULT_CONFIG)
@Language.factory("eds.hypothesis", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs_eds: Optional[List[str]],
    verbs_hyp: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    return Hypothesis(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )

ner

covid
patterns
covid = ['covid([-\\s]?19)?', 'sars[-\\s]?cov[-\\s]?2', 'corona[-\\s]?virus'] module-attribute
diseases = ['pneumopathies?', 'infections?'] module-attribute
pattern = '(' + make_pattern(diseases) + '\\s[àa]u?\\s)?' + make_pattern(covid) module-attribute
factory
DEFAULT_CONFIG = dict(attr='LOWER', ignore_excluded=False) module-attribute
create_component(nlp, name, attr, ignore_excluded)
Source code in edsnlp/pipelines/ner/covid/factory.py
@Language.factory("eds.covid", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: Union[str, Dict[str, str]],
    ignore_excluded: bool,
):

    return GenericMatcher(
        nlp,
        terms=None,
        regex=dict(covid=patterns.pattern),
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
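A minimal usage sketch for this component; the covid entity label is assumed to follow the regex key used above:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.covid")

doc = nlp("Patient hospitalisé pour une pneumopathie à SARS-CoV-2.")

for ent in doc.ents:
    print(ent.text, ent.label_)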
scores
base_score
Score

Bases: AdvancedRegex

Matcher component to extract a numeric score

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

score_name

The name of the extracted score

TYPE: str

regex

A list of regexes to identify the score

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('NORM')

TYPE: str

after_extract

Regex with capturing group to get the score value

TYPE: str

score_normalization

Function that takes the "raw" value extracted from the after_extract regex, and returns None if no score could be extracted, or the desired score value otherwise.

TYPE: Callable[[Union[str,None]], Any]

window

Number of tokens to include after the score's mention to find the score's value

TYPE: int

Source code in edsnlp/pipelines/ner/scores/base_score.py
class Score(AdvancedRegex):
    """
    Matcher component to extract a numeric score

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    score_name : str
        The name of the extracted score
    regex : List[str]
        A list of regexes to identify the score
    attr : str
        Wether to match on the text ('TEXT') or on the normalized text ('NORM')
    after_extract : str
        Regex with capturing group to get the score value
    score_normalization : Callable[[Union[str,None]], Any]
        Function that takes the "raw" value extracted from the `after_extract` regex,
        and should return
        - None if no score could be extracted
        - The desired score value else
    window : int
        Number of token to include after the score's mention to find the
        score's value
    """

    def __init__(
        self,
        nlp: Language,
        score_name: str,
        regex: List[str],
        attr: str,
        after_extract: str,
        score_normalization: Union[str, Callable[[Union[str, None]], Any]],
        window: int,
        verbose: int,
        ignore_excluded: bool,
    ):

        regex_config = {
            score_name: dict(regex=regex, attr=attr, after_extract=after_extract)
        }

        super().__init__(
            nlp=nlp,
            regex_config=regex_config,
            window=window,
            verbose=verbose,
            ignore_excluded=ignore_excluded,
            attr=attr,
        )

        self.score_name = score_name

        if isinstance(score_normalization, str):
            self.score_normalization = registry.get("misc", score_normalization)
        else:
            self.score_normalization = score_normalization

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Score, Score).set_extensions()
        if not Span.has_extension("score_name"):
            Span.set_extension("score_name", default=None)
        if not Span.has_extension("score_value"):
            Span.set_extension("score_value", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        ents = super(Score, Score).process(self, doc)
        ents = self.score_filtering(ents)

        ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

    def score_filtering(self, ents: List[Span]) -> List[Span]:
        """
        Extracts, if available, the value of the score.
        Normalizes the score via the provided `self.score_normalization` method.

        Parameters
        ----------
        ents: List[Span]
            List of spaCy's spans extracted by the score matcher

        Returns
        -------
        ents: List[Span]
            List of spaCy's spans, with, if found, an added `score_value` extension
        """
        to_keep_ents = []
        for ent in ents:
            value = ent._.after_extract[0]
            normalized_value = self.score_normalization(value)
            if normalized_value is not None:
                ent._.score_name = self.score_name
                ent._.score_value = int(value)
                to_keep_ents.append(ent)

        return to_keep_ents
score_name = score_name instance-attribute
score_normalization = registry.get('misc', score_normalization) instance-attribute
__init__(nlp, score_name, regex, attr, after_extract, score_normalization, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/base_score.py
def __init__(
    self,
    nlp: Language,
    score_name: str,
    regex: List[str],
    attr: str,
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    window: int,
    verbose: int,
    ignore_excluded: bool,
):

    regex_config = {
        score_name: dict(regex=regex, attr=attr, after_extract=after_extract)
    }

    super().__init__(
        nlp=nlp,
        regex_config=regex_config,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
        attr=attr,
    )

    self.score_name = score_name

    if isinstance(score_normalization, str):
        self.score_normalization = registry.get("misc", score_normalization)
    else:
        self.score_normalization = score_normalization

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/ner/scores/base_score.py
@staticmethod
def set_extensions() -> None:
    super(Score, Score).set_extensions()
    if not Span.has_extension("score_name"):
        Span.set_extension("score_name", default=None)
    if not Span.has_extension("score_value"):
        Span.set_extension("score_value", default=None)
__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/ner/scores/base_score.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    ents = super(Score, Score).process(self, doc)
    ents = self.score_filtering(ents)

    ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
score_filtering(ents)

Extracts, if available, the value of the score. Normalizes the score via the provided self.score_normalization method.

PARAMETER DESCRIPTION
ents

List of spaCy's spans extracted by the score matcher

TYPE: List[Span]

RETURNS DESCRIPTION
ents

List of spaCy's spans, with, if found, an added score_value extension

Source code in edsnlp/pipelines/ner/scores/base_score.py
def score_filtering(self, ents: List[Span]) -> List[Span]:
    """
    Extracts, if available, the value of the score.
    Normalizes the score via the provided `self.score_normalization` method.

    Parameters
    ----------
    ents: List[Span]
        List of spaCy's spans extracted by the score matcher

    Returns
    -------
    ents: List[Span]
        List of spaCy's spans, with, if found, an added `score_value` extension
    """
    to_keep_ents = []
    for ent in ents:
        value = ent._.after_extract[0]
        normalized_value = self.score_normalization(value)
        if normalized_value is not None:
            ent._.score_name = self.score_name
            ent._.score_value = int(value)
            to_keep_ents.append(ent)

    return to_keep_ents
factory
DEFAULT_CONFIG = dict(attr='NORM', window=7, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, score_name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/factory.py
@deprecated_factory("score", "eds.score", default_config=DEFAULT_CONFIG)
@Language.factory("eds.score", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    score_name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=score_name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
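As a sketch of how a custom score could be declared with this factory (the score name, regexes and normalization key below are purely illustrative, not part of the library):

import spacy
from typing import Union

# Register a normalization function under a custom key, mirroring the
# built-in scores documented below (the key name is illustrative)
@spacy.registry.misc("score_normalization.custom_example")
def score_normalization(extracted_score: Union[str, None]):
    if extracted_score is not None and extracted_score.isdigit():
        return int(extracted_score)

nlp = spacy.blank("fr")
nlp.add_pipe(
    "eds.score",
    config=dict(
        score_name="custom_example",
        regex=[r"\bexemple\b"],
        after_extract=r"exemple.*?[\n\W]*?(\d+)",
        score_normalization="score_normalization.custom_example",
    ),
)

doc = nlp("Score exemple : 3")
for ent in doc.ents:
    # score_name and the normalized value are stored as span extensions
    print(ent._.score_name, ent._.score_value)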
charlson
patterns
regex = ['charlson'] module-attribute
after_extract = 'charlson.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.charlson' module-attribute
score_normalization(extracted_score)

Charlson score normalization. If available, returns the integer value of the Charlson score.

Source code in edsnlp/pipelines/ner/scores/charlson/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    Charlson score normalization.
    If available, returns the integer value of the Charlson score.
    """
    score_range = list(range(0, 30))
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=7, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/charlson/factory.py
@deprecated_factory("charlson", "eds.charlson", default_config=DEFAULT_CONFIG)
@Language.factory("eds.charlson", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
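A short usage sketch of the preset Charlson component; the printed values depend on the default window and normalization shown above:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.charlson")

doc = nlp("Charlson à l'admission : 5.")

for ent in doc.ents:
    # score_value holds the normalized integer, score_name the component name
    print(ent._.score_name, ent._.score_value)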
emergency
priority
patterns
regex = ['\\bpriorite\\b'] module-attribute
after_extract = 'priorite.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.priority' module-attribute
score_normalization(extracted_score)

Priority score normalization. If available, returns the integer value of the priority score.

Source code in edsnlp/pipelines/ner/scores/emergency/priority/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    Priority score normalization.
    If available, returns the integer value of the priority score.
    """
    score_range = list(range(0, 6))
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=7, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/emergency/priority/factory.py
@deprecated_factory(
    "emergency.priority", "eds.emergency.priority", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.emergency.priority", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
ccmu
patterns
regex = ['\\bccmu\\b'] module-attribute
after_extract = 'ccmu.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.ccmu' module-attribute
score_normalization(extracted_score)

CCMU score normalization. If available, returns the integer value of the CCMU score.

Source code in edsnlp/pipelines/ner/scores/emergency/ccmu/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    CCMU score normalization.
    If available, returns the integer value of the CCMU score.
    """
    score_range = [1, 2, 3, 4, 5]
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=20, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
@deprecated_factory(
    "emergency.ccmu", "eds.emergency.ccmu", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.emergency.ccmu", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
gemsa
patterns
regex = ['\\bgemsa\\b'] module-attribute
after_extract = 'gemsa.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.gemsa' module-attribute
score_normalization(extracted_score)

GEMSA score normalization. If available, returns the integer value of the GEMSA score.

Source code in edsnlp/pipelines/ner/scores/emergency/gemsa/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    GEMSA score normalization.
    If available, returns the integer value of the GEMSA score.
    """
    score_range = [1, 2, 3, 4, 5, 6]
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=20, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
@deprecated_factory(
    "emergency.gemsa", "eds.emergency.gemsa", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.emergency.gemsa", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
sofa
sofa
Sofa

Bases: Score

Matcher component to extract the SOFA score

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

score_name

The name of the extracted score

TYPE: str

regex

A list of regexes to identify the SOFA score

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM')

TYPE: str

method_regex

Regex with capturing group to get the score extraction method (e.g. "à l'admission", "à 24H", "Maximum")

TYPE: str

value_regex

Regex to extract the score value

TYPE: str

score_normalization

Function that takes the "raw" value extracted from the after_extract regex, and returns None if no score could be extracted, or the desired score value otherwise.

TYPE: Callable[[Union[str,None]], Any]

window

Number of tokens to include after the score's mention to find the score's value

TYPE: int

Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
class Sofa(Score):
    """
    Matcher component to extract the SOFA score

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    score_name : str
        The name of the extracted score
    regex : List[str]
        A list of regexes to identify the SOFA score
    attr : str
        Wether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM')
    method_regex : str
        Regex with capturing group to get the score extraction method
        (e.g. "à l'admission", "à 24H", "Maximum")
    value_regex : str
        Regex to extract the score value
    score_normalization : Callable[[Union[str,None]], Any]
        Function that takes the "raw" value extracted from the `after_extract` regex,
        and should return
        - None if no score could be extracted
        - The desired score value else
    window : int
        Number of token to include after the score's mention to find the
        score's value
    """

    def __init__(
        self,
        nlp: Language,
        score_name: str,
        regex: List[str],
        attr: str,
        method_regex: str,
        value_regex: str,
        score_normalization: Union[str, Callable[[Union[str, None]], Any]],
        window: int,
        verbose: int,
        ignore_excluded: bool,
    ):

        super().__init__(
            nlp,
            score_name=score_name,
            regex=regex,
            after_extract=[],
            score_normalization=score_normalization,
            attr=attr,
            window=window,
            verbose=verbose,
            ignore_excluded=ignore_excluded,
        )

        self.method_regex = method_regex
        self.value_regex = value_regex

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Sofa, Sofa).set_extensions()
        if not Span.has_extension("score_method"):
            Span.set_extension("score_method", default=None)

    def score_filtering(self, ents: List[Span]) -> List[Span]:
        """
        Extracts, if available, the value of the score.
        Normalizes the score via the provided `self.score_normalization` method.

        Parameters
        ----------
        ents: List[Span]
            List of spaCy's spans extracted by the score matcher

        Returns
        -------
        ents: List[Span]
            List of spaCy's spans, with, if found, an added `score_value` extension
        """

        to_keep_ents = []

        for ent in ents:
            after_snippet = get_text(
                ent._.after_snippet,
                attr=self.attr,
                ignore_excluded=self.ignore_excluded,
            )
            matches = re.search(self.method_regex, after_snippet)

            if matches is None:
                method = "Non précisée"
                value = after_snippet

            else:
                groups = matches.groupdict()
                value = groups["after_value"]
                if groups["max"] is not None:
                    method = "Maximum"
                elif groups["vqheures"] is not None:
                    method = "24H"
                elif groups["admission"] is not None:
                    method = "A l'admission"

            digit_value = re.match(
                self.value_regex, value
            )  # Use match instead of search to only look at the beginning
            digit_value = None if digit_value is None else digit_value.groups()[0]

            normalized_value = self.score_normalization(digit_value)
            if normalized_value is not None:
                ent._.score_name = self.score_name
                ent._.score_value = int(normalized_value)
                ent._.score_method = method
                to_keep_ents.append(ent)

        return to_keep_ents
method_regex = method_regex instance-attribute
value_regex = value_regex instance-attribute
__init__(nlp, score_name, regex, attr, method_regex, value_regex, score_normalization, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
def __init__(
    self,
    nlp: Language,
    score_name: str,
    regex: List[str],
    attr: str,
    method_regex: str,
    value_regex: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    window: int,
    verbose: int,
    ignore_excluded: bool,
):

    super().__init__(
        nlp,
        score_name=score_name,
        regex=regex,
        after_extract=[],
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )

    self.method_regex = method_regex
    self.value_regex = value_regex

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
@staticmethod
def set_extensions() -> None:
    super(Sofa, Sofa).set_extensions()
    if not Span.has_extension("score_method"):
        Span.set_extension("score_method", default=None)
score_filtering(ents)

Extracts, if available, the value of the score. Normalizes the score via the provided self.score_normalization method.

PARAMETER DESCRIPTION
ents

List of spaCy's spans extracted by the score matcher

TYPE: List[Span]

RETURNS DESCRIPTION
ents

List of spaCy's spans, with, if found, an added score_value extension

Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
def score_filtering(self, ents: List[Span]) -> List[Span]:
    """
    Extracts, if available, the value of the score.
    Normalizes the score via the provided `self.score_normalization` method.

    Parameters
    ----------
    ents: List[Span]
        List of spaCy's spans extracted by the score matcher

    Returns
    -------
    ents: List[Span]
        List of spaCy's spans, with, if found, an added `score_value` extension
    """

    to_keep_ents = []

    for ent in ents:
        after_snippet = get_text(
            ent._.after_snippet,
            attr=self.attr,
            ignore_excluded=self.ignore_excluded,
        )
        matches = re.search(self.method_regex, after_snippet)

        if matches is None:
            method = "Non précisée"
            value = after_snippet

        else:
            groups = matches.groupdict()
            value = groups["after_value"]
            if groups["max"] is not None:
                method = "Maximum"
            elif groups["vqheures"] is not None:
                method = "24H"
            elif groups["admission"] is not None:
                method = "A l'admission"

        digit_value = re.match(
            self.value_regex, value
        )  # Use match instead of search to only look at the beginning
        digit_value = None if digit_value is None else digit_value.groups()[0]

        normalized_value = self.score_normalization(digit_value)
        if normalized_value is not None:
            ent._.score_name = self.score_name
            ent._.score_value = int(normalized_value)
            ent._.score_method = method
            to_keep_ents.append(ent)

    return to_keep_ents
patterns
regex = ['\\bsofa\\b'] module-attribute
method_regex = 'sofa.*?((?P<max>max\\w*)|(?P<vqheures>24h\\w*)|(?P<admission>admission\\w*))(?P<after_value>(.|\\n)*)' module-attribute
value_regex = '.*?.[\\n\\W]*?(\\d+)[^h\\d]' module-attribute
score_normalization_str = 'score_normalization.sofa' module-attribute
score_normalization(extracted_score)

Sofa score normalization. If available, returns the integer value of the SOFA score.

Source code in edsnlp/pipelines/ner/scores/sofa/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    Sofa score normalization.
    If available, returns the integer value of the SOFA score.
    """
    score_range = list(range(0, 30))
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
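To illustrate how method_regex and value_regex interact, a standalone re demo on an already-normalized, illustrative snippet:

import re

# Patterns copied from the module attributes above
method_regex = (
    r"sofa.*?((?P<max>max\w*)|(?P<vqheures>24h\w*)"
    r"|(?P<admission>admission\w*))(?P<after_value>(.|\n)*)"
)
value_regex = r".*?.[\n\W]*?(\d+)[^h\d]"

snippet = "sofa maximum a 8 ce jour"

match = re.search(method_regex, snippet)
groups = match.groupdict()
# groups["max"] == "maximum", so the extraction method resolves to "Maximum"
value = groups["after_value"]  # " a 8 ce jour"

# re.match anchors at the beginning of the remaining text
digit = re.match(value_regex, value).group(1)
print(digit)  # "8", which score_normalization then accepts as the integer 8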
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, method_regex=patterns.method_regex, value_regex=patterns.value_regex, score_normalization=patterns.score_normalization_str, attr='NORM', window=20, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, method_regex, value_regex, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/sofa/factory.py
@deprecated_factory("SOFA", "eds.SOFA", default_config=DEFAULT_CONFIG)
@Language.factory("eds.SOFA", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    method_regex: str,
    value_regex: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Sofa(
        nlp,
        score_name=name,
        regex=regex,
        method_regex=method_regex,
        value_regex=value_regex,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
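A short usage sketch for the eds.SOFA factory, reading the additional score_method extension (the example text is illustrative; the resolved method depends on the snippet captured around the mention):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.SOFA")

doc = nlp("SOFA maximum : 8 ce jour.")

for ent in doc.ents:
    # score_method is "Maximum", "24H", "A l'admission" or "Non précisée"
    print(ent._.score_value, ent._.score_method)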

misc

dates
dates
parsers = [parser for parser in default_parsers if parser != 'relative-time'] module-attribute
parser1 = DateDataParser(languages=['fr'], settings={'PREFER_DAY_OF_MONTH': 'first', 'PREFER_DATES_FROM': 'past', 'PARSERS': parsers, 'RETURN_AS_TIMEZONE_AWARE': False}) module-attribute
parser2 = DateDataParser(languages=['fr'], settings={'PREFER_DAY_OF_MONTH': 'first', 'PREFER_DATES_FROM': 'past', 'PARSERS': ['relative-time'], 'RETURN_AS_TIMEZONE_AWARE': False}) module-attribute
Dates

Bases: BaseComponent

Tags and normalizes dates, using the open-source dateparser library.

The pipeline uses spaCy's filter_spans function. It filters out false positives, and introduces a hierarchy between patterns. For instance, in case of ambiguity, the pipeline will decide that a date is a date without a year rather than a date without a day.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: spacy.language.Language

absolute

List of regular expressions for absolute dates.

TYPE: Union[List[str], str]

full

List of regular expressions for full dates in YYYY-MM-DD format.

TYPE: Union[List[str], str]

relative

List of regular expressions for relative dates (e.g. hier, la semaine prochaine).

TYPE: Union[List[str], str]

no_year

List of regular expressions for dates that do not display a year.

TYPE: Union[List[str], str]

no_day

List of regular expressions for dates that do not display a day.

TYPE: Union[List[str], str]

year_only

List of regular expressions for dates that only display a year.

TYPE: Union[List[str], str]

current

List of regular expressions for dates that relate to the current month, week, year, etc.

TYPE: Union[List[str], str]

false_positive

List of regular expressions for false positives (e.g. phone numbers).

TYPE: Union[List[str], str]

on_ents_only

Whether to look for dates in the whole document or only in specific sentences:

  • If True: only look in the sentences of each entity in doc.ents
  • If False: look in the whole document
  • If given a string key or a list of strings: only look in the sentences of each entity in doc.spans[key]

TYPE: Union[bool, str, List[str]]
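A usage sketch, under two assumptions that are not documented on this page: the factory is registered as eds.dates (by analogy with the other components in this reference), and matched spans are stored in doc.spans["dates"]:

import spacy

nlp = spacy.blank("fr")
# Factory name assumed by analogy with the other components
nlp.add_pipe("eds.dates")

doc = nlp("Consultation du 12 janvier 2021, à revoir dans trois semaines.")

# The span group name is an assumption; parsed_date and parsed_delta are the
# extensions registered by set_extensions below
for date in doc.spans.get("dates", []):
    print(date.text, date._.parsed_date)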

Source code in edsnlp/pipelines/misc/dates/dates.py
class Dates(BaseComponent):
    """
    Tags and normalizes dates, using the open-source `dateparser` library.

    The pipeline uses spaCy's `filter_spans` function.
    It filters out false positives, and introduce a hierarchy between patterns.
    For instance, in case of ambiguity, the pipeline will decide that a date is a
    date without a year rather than a date without a day.

    Parameters
    ----------
    nlp : spacy.language.Language
        Language pipeline object
    absolute : Union[List[str], str]
        List of regular expressions for absolute dates.
    full : Union[List[str], str]
        List of regular expressions for full dates in YYYY-MM-DD format.
    relative : Union[List[str], str]
        List of regular expressions for relative dates
        (eg `hier`, `la semaine prochaine`).
    no_year : Union[List[str], str]
        List of regular expressions for dates that do not display a year.
    no_day : Union[List[str], str]
        List of regular expressions for dates that do not display a day.
    year_only : Union[List[str], str]
        List of regular expressions for dates that only display a year.
    current : Union[List[str], str]
        List of regular expressions for dates that relate to
        the current month, week, year, etc.
    false_positive : Union[List[str], str]
        List of regular expressions for false positive (eg phone numbers, etc).
    on_ents_only : Union[bool, str, List[str]]
        Wether to look on dates in the whole document or in specific sentences:

        - If `True`: Only look in the sentences of each entity in doc.ents
        - If False: Look in the whole document
        - If given a string `key` or list of string: Only look in the sentences of
          each entity in `#!python doc.spans[key]`
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        absolute: Optional[List[str]],
        full: Optional[List[str]],
        relative: Optional[List[str]],
        no_year: Optional[List[str]],
        no_day: Optional[List[str]],
        year_only: Optional[List[str]],
        current: Optional[List[str]],
        false_positive: Optional[List[str]],
        on_ents_only: bool,
        attr: str,
    ):

        self.nlp = nlp

        if no_year is None:
            no_year = patterns.no_year_pattern
        if year_only is None:
            year_only = patterns.full_year_pattern
        if no_day is None:
            no_day = patterns.no_day_pattern
        if absolute is None:
            absolute = patterns.absolute_date_pattern
        if relative is None:
            relative = patterns.relative_date_pattern
        if full is None:
            full = patterns.full_date_pattern
        if current is None:
            current = patterns.current_pattern
        if false_positive is None:
            false_positive = patterns.false_positive_pattern

        if isinstance(absolute, str):
            absolute = [absolute]
        if isinstance(relative, str):
            relative = [relative]
        if isinstance(no_year, str):
            no_year = [no_year]
        if isinstance(no_day, str):
            no_day = [no_day]
        if isinstance(year_only, str):
            year_only = [year_only]
        if isinstance(full, str):
            full = [full]
        if isinstance(current, str):
            current = [current]
        if isinstance(false_positive, str):
            false_positive = [false_positive]

        self.on_ents_only = on_ents_only
        self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

        self.regex_matcher.add("false_positive", false_positive)
        self.regex_matcher.add("full_date", full)
        self.regex_matcher.add("absolute", absolute)
        self.regex_matcher.add("relative", relative)
        self.regex_matcher.add("no_year", no_year)
        self.regex_matcher.add("no_day", no_day)
        self.regex_matcher.add("year_only", year_only)
        self.regex_matcher.add("current", current)

        self.parser = date_parser
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Doc.has_extension("note_datetime"):
            Doc.set_extension("note_datetime", default=None)

        if not Span.has_extension("parsed_date"):
            Span.set_extension("parsed_date", default=None)

        if not Span.has_extension("parsed_delta"):
            Span.set_extension("parsed_delta", default=None)

        if not Span.has_extension("date"):
            Span.set_extension("date", getter=date_getter)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find dates in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        dates:
            list of date spans
        """

        if self.on_ents_only:

            if type(self.on_ents_only) == bool:
                ents = doc.ents
            else:
                if type(self.on_ents_only) == str:
                    self.on_ents_only = [self.on_ents_only]
                ents = []
                for key in self.on_ents_only:
                    ents.extend(list(doc.spans[key]))

            dates = []
            for sent in set([ent.sent for ent in ents]):
                dates = chain(
                    dates,
                    self.regex_matcher(
                        sent,
                        as_spans=True,
                        # return_groupdict=True,
                    ),
                )

        else:
            dates = self.regex_matcher(
                doc,
                as_spans=True,
                # return_groupdict=True,
            )

        # dates = apply_groupdict(dates)

        dates = filter_spans(dates)
        dates = [date for date in dates if date.label_ != "false_positive"]

        return dates

    def get_date(self, date: Span) -> Optional[datetime]:
        """
        Get normalised date using `dateparser`.

        Parameters
        ----------
        date : Span
            Date span.

        Returns
        -------
        Optional[datetime]
            If a date is recognised, returns a Python `datetime` object.
            Returns `None` otherwise.
        """

        text_date = date.text

        if date.label_ == "no_day":
            text_date = "01/" + re.sub(r"[\.\/\s]", "/", text_date)

        elif date.label_ == "full_date":
            text_date = re.sub(r"[\.\/\s]", "-", text_date)

            try:
                return datetime.strptime(text_date, "%Y-%m-%d")
            except ValueError:
                try:
                    return datetime.strptime(text_date, "%Y-%d-%m")
                except ValueError:
                    return None

        # text_date = re.sub(r"\.", "-", text_date)

        return self.parser(text_date)

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags dates.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for dates
        """
        dates = self.process(doc)

        for date in dates:
            d = self.get_date(date)

            if d is None:
                date._.parsed_date = None
            else:
                date._.parsed_date = d
                date._.parsed_delta = d - datetime.now() + timedelta(seconds=10)

        doc.spans["dates"] = dates

        return doc
nlp = nlp instance-attribute
on_ents_only = on_ents_only instance-attribute
regex_matcher = RegexMatcher(attr=attr, alignment_mode='strict') instance-attribute
parser = date_parser instance-attribute
__init__(nlp, absolute, full, relative, no_year, no_day, year_only, current, false_positive, on_ents_only, attr)
Source code in edsnlp/pipelines/misc/dates/dates.py
def __init__(
    self,
    nlp: Language,
    absolute: Optional[List[str]],
    full: Optional[List[str]],
    relative: Optional[List[str]],
    no_year: Optional[List[str]],
    no_day: Optional[List[str]],
    year_only: Optional[List[str]],
    current: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: bool,
    attr: str,
):

    self.nlp = nlp

    if no_year is None:
        no_year = patterns.no_year_pattern
    if year_only is None:
        year_only = patterns.full_year_pattern
    if no_day is None:
        no_day = patterns.no_day_pattern
    if absolute is None:
        absolute = patterns.absolute_date_pattern
    if relative is None:
        relative = patterns.relative_date_pattern
    if full is None:
        full = patterns.full_date_pattern
    if current is None:
        current = patterns.current_pattern
    if false_positive is None:
        false_positive = patterns.false_positive_pattern

    if isinstance(absolute, str):
        absolute = [absolute]
    if isinstance(relative, str):
        relative = [relative]
    if isinstance(no_year, str):
        no_year = [no_year]
    if isinstance(no_day, str):
        no_day = [no_day]
    if isinstance(year_only, str):
        year_only = [year_only]
    if isinstance(full, str):
        full = [full]
    if isinstance(current, str):
        current = [current]
    if isinstance(false_positive, str):
        false_positive = [false_positive]

    self.on_ents_only = on_ents_only
    self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

    self.regex_matcher.add("false_positive", false_positive)
    self.regex_matcher.add("full_date", full)
    self.regex_matcher.add("absolute", absolute)
    self.regex_matcher.add("relative", relative)
    self.regex_matcher.add("no_year", no_year)
    self.regex_matcher.add("no_day", no_day)
    self.regex_matcher.add("year_only", year_only)
    self.regex_matcher.add("current", current)

    self.parser = date_parser
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/dates/dates.py
@staticmethod
def set_extensions() -> None:

    if not Doc.has_extension("note_datetime"):
        Doc.set_extension("note_datetime", default=None)

    if not Span.has_extension("parsed_date"):
        Span.set_extension("parsed_date", default=None)

    if not Span.has_extension("parsed_delta"):
        Span.set_extension("parsed_delta", default=None)

    if not Span.has_extension("date"):
        Span.set_extension("date", getter=date_getter)
process(doc)

Find dates in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
dates

list of date spans

Source code in edsnlp/pipelines/misc/dates/dates.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find dates in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    dates:
        list of date spans
    """

    if self.on_ents_only:

        if type(self.on_ents_only) == bool:
            ents = doc.ents
        else:
            if type(self.on_ents_only) == str:
                self.on_ents_only = [self.on_ents_only]
            ents = []
            for key in self.on_ents_only:
                ents.extend(list(doc.spans[key]))

        dates = []
        for sent in set([ent.sent for ent in ents]):
            dates = chain(
                dates,
                self.regex_matcher(
                    sent,
                    as_spans=True,
                    # return_groupdict=True,
                ),
            )

    else:
        dates = self.regex_matcher(
            doc,
            as_spans=True,
            # return_groupdict=True,
        )

    # dates = apply_groupdict(dates)

    dates = filter_spans(dates)
    dates = [date for date in dates if date.label_ != "false_positive"]

    return dates
get_date(date)

Get normalised date using dateparser.

PARAMETER DESCRIPTION
date

Date span.

TYPE: Span

RETURNS DESCRIPTION
Optional[datetime]

If a date is recognised, returns a Python datetime object. Returns None otherwise.

Source code in edsnlp/pipelines/misc/dates/dates.py
def get_date(self, date: Span) -> Optional[datetime]:
    """
    Get normalised date using `dateparser`.

    Parameters
    ----------
    date : Span
        Date span.

    Returns
    -------
    Optional[datetime]
        If a date is recognised, returns a Python `datetime` object.
        Returns `None` otherwise.
    """

    text_date = date.text

    if date.label_ == "no_day":
        text_date = "01/" + re.sub(r"[\.\/\s]", "/", text_date)

    elif date.label_ == "full_date":
        text_date = re.sub(r"[\.\/\s]", "-", text_date)

        try:
            return datetime.strptime(text_date, "%Y-%m-%d")
        except ValueError:
            try:
                return datetime.strptime(text_date, "%Y-%d-%m")
            except ValueError:
                return None

    # text_date = re.sub(r"\.", "-", text_date)

    return self.parser(text_date)
__call__(doc)

Tags dates.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for dates

Source code in edsnlp/pipelines/misc/dates/dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Tags dates.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for dates
    """
    dates = self.process(doc)

    for date in dates:
        d = self.get_date(date)

        if d is None:
            date._.parsed_date = None
        else:
            date._.parsed_date = d
            date._.parsed_delta = d - datetime.now() + timedelta(seconds=10)

    doc.spans["dates"] = dates

    return doc
td2str(td)

Transforms a timedelta object to a string representation.

PARAMETER DESCRIPTION
td

The timedelta object to represent.

TYPE: timedelta

RETURNS DESCRIPTION
str

Usable representation for the timedelta object.

Source code in edsnlp/pipelines/misc/dates/dates.py
def td2str(td: timedelta):
    """
    Transforms a timedelta object to a string representation.

    Parameters
    ----------
    td : timedelta
        The timedelta object to represent.

    Returns
    -------
    str
        Usable representation for the timedelta object.
    """
    seconds = td.total_seconds()
    days = int(seconds / 3600 / 24)
    return f"TD{days:+d}"
date_getter(date)

Getter for dates. Uses the information from note_datetime.

PARAMETER DESCRIPTION
date

Date detected by the pipeline.

TYPE: Span

RETURNS DESCRIPTION
str

Normalized date.

Source code in edsnlp/pipelines/misc/dates/dates.py
def date_getter(date: Span) -> str:
    """
    Getter for dates. Uses the information from `note_datetime`.

    Parameters
    ----------
    date : Span
        Date detected by the pipeline.

    Returns
    -------
    str
        Normalized date.
    """

    d = date._.parsed_date

    if d is None:
        # dateparser could not interpret the date.
        return "????-??-??"

    delta = date._.parsed_delta
    note_datetime = date.doc._.note_datetime

    if date.label_ in {"absolute", "full_date", "no_day"}:
        normalized = d.strftime("%Y-%m-%d")
    elif date.label_ == "no_year":
        if note_datetime:
            year = note_datetime.strftime("%Y")
        else:
            year = "????"
        normalized = d.strftime(f"{year}-%m-%d")
    else:
        if note_datetime:
            # We need to adjust the timedelta, since most dates are set at 00h00.
            # The slightest difference leads to a day difference.
            d = note_datetime + delta
            normalized = d.strftime("%Y-%m-%d")
        else:
            normalized = td2str(d - datetime.now())

    return normalized
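
To illustrate the branches above with the doc from the earlier sketch (a hedged example; exact outputs depend on the parsed spans): absolute dates are formatted directly, no_year dates borrow the year from note_datetime, and relative dates are resolved against note_datetime when it is set, or rendered as a TD±n offset otherwise.

from datetime import datetime

doc._.note_datetime = datetime(2021, 3, 15)  # hypothetical document date

for span in doc.spans["dates"]:
    print(span.text, span._.date)
# "12/03/2021"      -> "2021-03-12"
# "dans 3 semaines" -> roughly "2021-04-05" (note_datetime + parsed_delta)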
date_parser(text_date)

Function to parse dates. It first tries all available parsers ('timestamp', 'custom-formats', 'absolute-time') except 'relative-time'. If no date is found, it retries with 'relative-time'.

When only the year is identified, it returns a datetime object with month and day set to 1.

PARAMETER DESCRIPTION
text_date

TYPE: str

RETURNS DESCRIPTION
datetime
Source code in edsnlp/pipelines/misc/dates/dates.py
def date_parser(text_date: str) -> datetime:
    """
    Function to parse dates. It first tries all available parsers
    ('timestamp', 'custom-formats', 'absolute-time') except 'relative-time'.
    If no date is found, it retries with 'relative-time'.

    When only the year is identified, it returns a datetime object with
    month and day set to 1.


    Parameters
    ----------
    text_date : str

    Returns
    -------
    datetime
    """

    parsed_date = parser1.get_date_data(text_date)
    if parsed_date.date_obj:
        if parsed_date.period == "year":
            return datetime(year=parsed_date.date_obj.year, month=1, day=1)
        else:
            return parsed_date.date_obj
    else:
        parsed_date2 = parser2.get_date_data(text_date)
        return parsed_date2.date_obj
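
The parser1 and parser2 objects used above are module-level dateparser instances that are not shown on this page. A plausible sketch of the two-pass setup, assuming dateparser's DateDataParser API and its PARSERS setting (the exact configuration in edsnlp may differ):

from dateparser.date import DateDataParser

# First pass: every parser except relative expressions (assumed split)
parser1 = DateDataParser(
    languages=["fr"],
    settings={"PARSERS": ["timestamp", "custom-formats", "absolute-time"]},
)
# Second pass: relative expressions only
parser2 = DateDataParser(
    languages=["fr"],
    settings={"PARSERS": ["relative-time"]},
)

date_parser("3 janvier 2020")  # datetime(2020, 1, 3, 0, 0)
date_parser("il y a 3 jours")  # handled by the second pass, relative to today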
apply_groupdict(dates)
Source code in edsnlp/pipelines/misc/dates/dates.py
def apply_groupdict(
    dates: Iterable[Tuple[Span, Dict[str, str]]]
) -> Generator[Span, None, None]:
    for span, groupdict in dates:
        span._.groupdict = groupdict
        yield span
parse_groupdict(day=None, month=None, year=None, hour=None, minute=None, second=None, **kwargs)

Parse date groupdict.

PARAMETER DESCRIPTION
day

String representation of the day, by default None

TYPE: str, optional DEFAULT: None

month

String representation of the month, by default None

TYPE: str, optional DEFAULT: None

year

String representation of the year, by default None

TYPE: str, optional DEFAULT: None

hour

String representation of the hour, by default None

TYPE: str, optional DEFAULT: None

minute

String representation of the minute, by default None

TYPE: str, optional DEFAULT: None

second

String representation of the minute, by default None

TYPE: str, optional DEFAULT: None

RETURNS DESCRIPTION
Dict[str, int]

Parsed groupdict.

Source code in edsnlp/pipelines/misc/dates/dates.py
def parse_groupdict(
    day: str = None,
    month: str = None,
    year: str = None,
    hour: str = None,
    minute: str = None,
    second: str = None,
    **kwargs: Dict[str, str],
) -> Dict[str, int]:
    """
    Parse date groupdict.

    Parameters
    ----------
    day : str, optional
        String representation of the day, by default None
    month : str, optional
        String representation of the month, by default None
    year : str, optional
        String representation of the year, by default None
    hour : str, optional
        String representation of the hour, by default None
    minute : str, optional
        String representation of the minute, by default None
    second : str, optional
        String representation of the minute, by default None

    Returns
    -------
    Dict[str, int]
        Parsed groupdict.
    """

    result = dict()

    if day is not None:
        result["day"] = day2int(day)

    if month is not None:
        result["month"] = month2int(month)

    if year is not None:
        result["year"] = str2int(year)

    if hour is not None:
        result["hour"] = str2int(hour)

    if minute is not None:
        result["minute"] = str2int(minute)

    if second is not None:
        result["second"] = str2int(second)

    result.update(**kwargs)

    return result
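
A quick illustration of the expected output, assuming the day2int/month2int helpers documented in the parsing module below:

parse_groupdict(day="12", month="janvier", year="2021", hour="8")
# {'day': 12, 'month': 1, 'year': 2021, 'hour': 8}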
parsing
month2int = time2int_factory(months.letter_months_dict) module-attribute
day2int = time2int_factory(days.letter_days_dict) module-attribute
str2int(time)

Converts a string to an integer. Returns None if the string cannot be converted.

PARAMETER DESCRIPTION
time

String representation

TYPE: str

RETURNS DESCRIPTION
int

Integer conversion.

Source code in edsnlp/pipelines/misc/dates/parsing.py
def str2int(time: str) -> int:
    """
    Converts a string to an integer. Returns `None` if the string cannot be converted.

    Parameters
    ----------
    time : str
        String representation

    Returns
    -------
    int
        Integer conversion.
    """
    try:
        return int(time)
    except ValueError:
        return None
time2int_factory(patterns)

Factory for a time2int conversion function.

PARAMETER DESCRIPTION
patterns

Dictionary of conversion/pattern.

TYPE: Dict[str, int]

RETURNS DESCRIPTION
Callable[[str], int]

String to integer function.

Source code in edsnlp/pipelines/misc/dates/parsing.py
def time2int_factory(patterns: Dict[str, int]) -> Callable[[str], int]:
    """
    Factory for a `time2int` conversion function.

    Parameters
    ----------
    patterns : Dict[str, int]
        Dictionary of conversion/pattern.

    Returns
    -------
    Callable[[str], int]
        String to integer function.
    """

    def time2int(time: str) -> int:
        """
        Converts a string representation to the proper integer,
        iterating over a dictionary of pattern/conversion pairs.

        Parameters
        ----------
        time : str
            String representation

        Returns
        -------
        int
            Integer conversion
        """
        m = str2int(time)

        if m is not None:
            return m

        for pattern, key in patterns.items():
            if re.match(f"^{pattern}$", time):
                m = key
                break

        return m

    return time2int
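
With the letter_days_dict and letter_months_dict dictionaries listed further down, both numeric strings and spelled-out French forms are converted (a quick sanity check, not from the original docs):

day2int("12")         # 12
day2int("douze")      # 12
month2int("février")  # 2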
factory
DEFAULT_CONFIG = dict(no_year=None, year_only=None, no_day=None, absolute=None, relative=None, full=None, current=None, false_positive=None, on_ents_only=False, attr='LOWER') module-attribute
create_component(nlp, name, no_year, year_only, no_day, absolute, full, relative, current, false_positive, on_ents_only, attr)
Source code in edsnlp/pipelines/misc/dates/factory.py
@deprecated_factory("dates", "eds.dates", default_config=DEFAULT_CONFIG)
@Language.factory("eds.dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    no_year: Optional[List[str]],
    year_only: Optional[List[str]],
    no_day: Optional[List[str]],
    absolute: Optional[List[str]],
    full: Optional[List[str]],
    relative: Optional[List[str]],
    current: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: bool,
    attr: str,
):
    return Dates(
        nlp,
        no_year=no_year,
        absolute=absolute,
        relative=relative,
        year_only=year_only,
        no_day=no_day,
        full=full,
        current=current,
        false_positive=false_positive,
        on_ents_only=on_ents_only,
        attr=attr,
    )
patterns
raw_delimiters = ['\\/', '\\-'] module-attribute
delimiters = raw_delimiters + ['\\.', '[^\\S\\r\\n]+'] module-attribute
raw_delimiter_pattern = make_pattern(raw_delimiters) module-attribute
raw_delimiter_with_spaces_pattern = make_pattern(raw_delimiters + ['[^\\S\\r\\n]+']) module-attribute
delimiter_pattern = make_pattern(delimiters) module-attribute
ante_num_pattern = '(?<!{raw_delimiter_pattern})' module-attribute
post_num_pattern = '(?!{raw_delimiter_pattern})' module-attribute
full_year_pattern = ante_num_pattern + fy_pattern + post_num_pattern module-attribute
absolute_date_pattern: List[str] = [ante_num_pattern + day_pattern + d + month_pattern + d + year_pattern + post_num_pattern for d in delimiters] + [ante_num_pattern + year_pattern + d + numeric_month_pattern + d + numeric_day_pattern + post_num_pattern for d in delimiters] module-attribute
full_date_pattern = [ante_num_pattern + fy_pattern + d + lz_numeric_month_pattern + d + lz_numeric_day_pattern + post_num_pattern for d in ['-', '\\.']] module-attribute
no_year_pattern = [day + raw_delimiter_with_spaces_pattern + month for day in [ante_num_pattern + numeric_day_pattern, letter_day_pattern] for month in [numeric_month_pattern + post_num_pattern, letter_month_pattern]] module-attribute
no_day_pattern = [letter_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern, ante_num_pattern + lz_numeric_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern] module-attribute
relative_date_pattern = relative_pattern module-attribute
since_pattern = ['(?<=depuis)' + '.{,5}' + pattern for pattern in absolute_date_pattern + no_year_pattern + full_date_pattern + [relative_pattern]] module-attribute
false_positive_pattern = make_pattern(['(\\d+' + delimiter_pattern + '){3,}\\d+', '\\d\\/\\d']) module-attribute
current
current_patterns: List[str] = ['cette\\sann[ée]e(?![-\\s]l[àa])', 'ce\\sjour', 'ces\\sjours[-\\s]ci', "aujourd'?hui", 'ce\\smois([-\\s]ci)?', 'cette\\ssemaine', 'cet?\\s([ée]t[ée]|automne|hiver|printemps)'] module-attribute
current_pattern = make_pattern(current_patterns, with_breaks=True) module-attribute
relative
ago_pattern = 'il\\s+y\\s+a\\s+.{,10}?\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)' module-attribute
in_pattern = 'dans\\s+.{,10}?\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)' module-attribute
last_pattern = "l['ae]\\s*(semaine|année|an|mois)\\s+derni[èe]re?" module-attribute
next_pattern = "l['ae]\\s*(semaine|année|an|mois)\\s+prochaine?" module-attribute
since_pattern = '(?<=depuis\\s)\\s*.{,10}\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)(\\s+derni[èe]re?)?' module-attribute
during_pattern = '(pendant|pdt|pour)\\s+.{,10}?\\s+(heures?|jours?|mois|ann[ée]es?|ans?)' module-attribute
week_patterns = ['(avant\\-?\\s*)?hier', '(apr[èe]s\\-?\\s*)?demain'] module-attribute
week_pattern = make_pattern(week_patterns, with_breaks=True) module-attribute
relative_pattern = make_pattern(patterns=[ago_pattern, in_pattern, last_pattern, next_pattern, since_pattern, week_pattern], with_breaks=True) module-attribute
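These patterns are plain regular expressions and can be inspected directly. A small sketch, assuming they are importable from edsnlp.pipelines.misc.dates.patterns.relative (the module path is inferred from this page's layout):

import re

from edsnlp.pipelines.misc.dates.patterns.relative import relative_pattern

for phrase in ["il y a 3 jours", "la semaine dernière", "avant-hier", "dans 2 mois"]:
    print(phrase, bool(re.search(relative_pattern, phrase)))  # expected to print True for each phrase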
atomic
time
hour_pattern = '(?<!\\d)(?P<hour>0?[1-9]|1\\d|2[0-3])(?!\\d)' module-attribute
lz_hour_pattern = '(?<!\\d)(?P<hour>0[1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
minute_pattern = '(?<!\\d)(?P<minute>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_minute_pattern = '(?<!\\d)(?P<minute>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
second_pattern = '(?<!\\d)(?P<second>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_second_pattern = '(?<!\\d)(?P<second>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
time_pattern = '(\\s.{,3}' + '{hour_pattern}[h:]({lz_minute_pattern})?' + '((:|m|min){lz_second_pattern})?' + ')?' module-attribute
years
year_patterns: List[str] = ['19\\d\\d'] + [str(year) for year in range(2000, date.today().year + 2)] module-attribute
full_year_pattern = '(?<!\\d)' + full_year_pattern + '(?!\\d)' module-attribute
year_pattern = '(?<!\\d)' + year_pattern + '(?!\\d)' module-attribute
months
letter_months_dict: Dict[str, int] = {'(janvier|janv\\.?)': 1, '(f[ée]vrier|f[ée]v\\.?)': 2, '(mars|mar\\.?)': 3, '(avril|avr\\.?)': 4, 'mai': 5, 'juin': 6, '(juillet|juill?\\.?)': 7, 'ao[uû]t': 8, '(septembre|sept?\\.?)': 9, '(octobre|oct\\.?)': 10, '(novembre|nov\\.)': 11, '(d[ée]cembre|d[ée]c\\.?)': 12} module-attribute
letter_months: List[str] = list(letter_months_dict.keys()) module-attribute
month_pattern = '(?P<month>{letter_month_pattern}|{numeric_month_pattern})' module-attribute
letter_month_pattern = '(?P<month>{letter_month_pattern})' module-attribute
numeric_month_pattern = '(?P<month>{numeric_month_pattern})' module-attribute
lz_numeric_month_pattern = '(?P<month>{lz_numeric_month_pattern})' module-attribute
days
letter_days_dict: Dict[str, int] = {'(premier|1\\s*er)': 1, 'deux': 2, 'trois': 3, 'quatre': 4, 'cinq': 5, 'six': 6, 'sept': 7, 'huit': 8, 'neuf': 9, 'dix': 10, 'onze': 11, 'douze': 12, 'treize': 13, 'quatorze': 14, 'quinze': 15, 'seize': 16, 'dix\\-?\\s*sept': 17, 'dix\\-?\\s*huit': 18, 'dix\\-?\\s*neuf': 19, 'vingt': 20, 'vingt\\-?\\s*et\\-?\\s*un': 21, 'vingt\\-?\\s*deux': 22, 'vingt\\-?\\s*trois': 23, 'vingt\\-?\\s*quatre': 24, 'vingt\\-?\\s*cinq': 25, 'vingt\\-?\\s*six': 26, 'vingt\\-?\\s*sept': 27, 'vingt\\-?\\s*huit': 28, 'vingt\\-?\\s*neuf': 29, 'trente': 30, 'trente\\-?\\s*et\\-?\\s*un': 31} module-attribute
letter_days: List[str] = list(letter_days_dict.keys()) module-attribute
nlz_numeric_day_pattern = '(?<!\\d)([1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
day_pattern = '(?P<day>{letter_day_pattern}|{numeric_day_pattern})' module-attribute
letter_day_pattern = '(?P<day>{letter_day_pattern})' module-attribute
numeric_day_pattern = '(?P<day>{numeric_day_pattern})' module-attribute
lz_numeric_day_pattern = '(?P<day>{lz_numeric_day_pattern})' module-attribute
measures
measures
Measure

Bases: abc.ABC

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measure(abc.ABC):
    INTEGER = r"(?:[0-9]+)"
    CONJUNCTIONS = "et|ou"
    COMPOSERS = r"[x*]|par"

    UNITS = {}
    COMPOSITE = None

    @abc.abstractmethod
    def __iter__(self) -> Iterable["SimpleMeasure"]:
        """
        Iterate over the items of the measure (only one for SimpleMeasure)

        Returns
        -------
        iterable : Iterable["SimpleMeasure"]
        """

    @abc.abstractmethod
    def __getitem__(self, item) -> "SimpleMeasure":
        """
        Access items of the measure (only one for SimpleMeasure)

        Parameters
        ----------
        item : int

        Returns
        -------
        measure : SimpleMeasure
        """
INTEGER = '(?:[0-9]+)' class-attribute
CONJUNCTIONS = 'et|ou' class-attribute
COMPOSERS = '[x*]|par' class-attribute
UNITS = {} class-attribute
COMPOSITE = None class-attribute
__iter__()

Iterate over the items of the measure (only one for SimpleMeasure)

RETURNS DESCRIPTION
iterable

TYPE: Iterable["SimpleMeasure"]

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __iter__(self) -> Iterable["SimpleMeasure"]:
    """
    Iterate over the items of the measure (only one for SimpleMeasure)

    Returns
    -------
    iterable : Iterable["SimpleMeasure"]
    """
__getitem__(item)

Access items of the measure (only one for SimpleMeasure)

PARAMETER DESCRIPTION
item

TYPE: int

RETURNS DESCRIPTION
measure

TYPE: SimpleMeasure

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __getitem__(self, item) -> "SimpleMeasure":
    """
    Access items of the measure (only one for SimpleMeasure)

    Parameters
    ----------
    item : int

    Returns
    -------
    measure : SimpleMeasure
    """
SimpleMeasure

Bases: Measure

Source code in edsnlp/pipelines/misc/measures/measures.py
class SimpleMeasure(Measure):
    def __init__(self, value, unit):
        """
        The SimpleMeasure class contains the value and unit
        for a single non-composite measure

        Parameters
        ----------
        value : float
        unit : str
        """
        super().__init__()
        self.value = value
        self.unit = unit

    @classmethod
    @abc.abstractmethod
    def parse(
        self, int_part: str, dec_part: str, unit: str, infix: bool
    ) -> "SimpleMeasure":
        """
        Class method to create an instance from the match groups

        int_part : str
            The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
        dec_part : str
            The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
        unit : str
            The normalized variant of the unit (eg "m" for 12 metre 50)
        infix : bool
            Whether the unit was before (True) or after (False) the decimal part
        """

    def _get_scale_to(self, unit: str):
        return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]

    def __iter__(self):
        return iter((self,))

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        return [self][item]

    def __str__(self):
        return f"{self.value}{self.unit}"

    def __repr__(self):
        return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"

    def __eq__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) == other.value

    def __lt__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) < other.value

    def __le__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) <= other.value
value = value instance-attribute
unit = unit instance-attribute
__init__(value, unit)

The SimpleMeasure class contains the value and unit for a single non-composite measure

PARAMETER DESCRIPTION
value

TYPE: float

unit

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, value, unit):
    """
    The SimpleMeasure class contains the value and unit
    for a single non-composite measure

    Parameters
    ----------
    value : float
    unit : str
    """
    super().__init__()
    self.value = value
    self.unit = unit
parse(int_part, dec_part, unit, infix)

Class method to create an instance from the match groups

int_part : str
    The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
dec_part : str
    The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
unit : str
    The normalized variant of the unit (eg "m" for 12 metre 50)
infix : bool
    Whether the unit was before (True) or after (False) the decimal part

Source code in edsnlp/pipelines/misc/measures/measures.py
@classmethod
@abc.abstractmethod
def parse(
    self, int_part: str, dec_part: str, unit: str, infix: bool
) -> "SimpleMeasure":
    """
    Class method to create an instance from the match groups

    int_part : str
        The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
    dec_part : str
        The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
    unit : str
        The normalized variant of the unit (eg "m" for 12 metre 50)
    infix : bool
        Whether the unit was before (True) or after (False) the decimal part
    """
_get_scale_to(unit)
Source code in edsnlp/pipelines/misc/measures/measures.py
def _get_scale_to(self, unit: str):
    return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter((self,))
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    return [self][item]
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return f"{self.value}{self.unit}"
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"
__eq__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __eq__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) == other.value
__lt__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __lt__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) < other.value
__le__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __le__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) <= other.value
CompositeMeasure

Bases: Measure

The CompositeMeasure class contains a sequence of multiple SimpleMeasure instances

PARAMETER DESCRIPTION
measures

TYPE: List[SimpleMeasure]

Source code in edsnlp/pipelines/misc/measures/measures.py
class CompositeMeasure(Measure):
    """
    The CompositeMeasure class contains a sequence
    of multiple SimpleMeasure instances

    Parameters
    ----------
    measures : List[SimpleMeasure]
    """

    def __init__(self, measures: Iterable["SimpleMeasure"]):
        super().__init__()
        self.measures = list(measures)

    def __iter__(self):
        return iter(self.measures)

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        res = self.measures[item]
        return res

    def __str__(self):
        return " x ".join(map(str, self.measures))

    def __repr__(self):
        return f"{self.__class__.__name__}({repr(self.measures)})"
measures = list(measures) instance-attribute
__init__(measures)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, measures: Iterable["SimpleMeasure"]):
    super().__init__()
    self.measures = list(measures)
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter(self.measures)
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    res = self.measures[item]
    return res
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return " x ".join(map(str, self.measures))
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({repr(self.measures)})"
Measures

Bases: BaseComponent

Matcher component to extract measures. A measure is most often composed of a number and a unit, like

> 1,26 cm

The unit can also be positioned in place of the decimal dot/comma:

> 1 cm 26

Some measures can be composite:

> 1,26 cm x 2,34 mm

And sometimes they are factorized:

> Les trois kystes mesurent 1, 2 et 3cm.

The recognized measures are stored in the "measures" SpanGroup. Each span has a Measure object stored in the "value" extension attribute.

PARAMETER DESCRIPTION
nlp

The SpaCy object.

TYPE: Language

measures

The registry names of the measures to extract

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('NORM')

TYPE: str

ignore_excluded

Whether to exclude pollution patterns when matching in the text

TYPE: bool
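
For reference, a minimal usage sketch (the eds.measures factory name, the "measures" span group and the Span._.value extension come from the source below; eds.normalizer is an assumption here, used only because the default attr is 'NORM'):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # assumed: provides the normalized text matched by attr='NORM'
nlp.add_pipe("eds.measures")

doc = nlp("Le nodule mesure 1,26 cm x 2,34 mm.")

for span in doc.spans["measures"]:
    # span._.value is a Measure; its unit properties (e.g. .cm, .mm) convert the value
    print(span.text, span._.value)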

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measures(BaseComponent):
    """
    Matcher component to extract measures.
    A measure is most often composed of a number and a unit, like
    > 1,26 cm
    The unit can also be positioned in place of the decimal dot/comma
    > 1 cm 26
    Some measures can be composite
    > 1,26 cm x 2,34 mm
    And sometimes they are factorized
    > Les trois kystes mesurent 1, 2 et 3cm.

    The recognized measures are stored in the "measures" SpanGroup.
    Each span has a `Measure` object stored in the "value" extension attribute.

    Parameters
    ----------
    nlp : Language
        The SpaCy object.
    measures : List[str]
        The registry names of the measures to extract
    attr : str
        Whether to match on the text ('TEXT') or on the normalized text ('NORM')
    ignore_excluded : bool
        Whether to exclude pollution patterns when matching in the text
    """

    def __init__(
        self,
        nlp: Language,
        measures: List[str],
        attr: str,
        ignore_excluded: bool,
    ):

        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.extraction_regexes = {}
        self.measures: Dict[str, Measure] = {}
        for name in measures:
            cls: Measure = spacy.registry.misc.get(name)
            self.measures[name] = cls
            regexes = make_patterns(cls)
            self.regex_matcher.add(name, regexes["trigger"])
            self.extraction_regexes[name] = regexes["extraction"]

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Measures, Measures).set_extensions()
        if not Span.has_extension("value"):
            Span.set_extension("value", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds measures to document's "measures" SpanGroup.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

        # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
        # while keeping the corresponding groupdicts
        matches = {
            match: matches[match]
            for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
        }

        measures = []
        for match, groupdict in matches.items():
            measure_name = match.label_
            extraction_regex = self.extraction_regexes[measure_name]

            parsed_values = []

            shared_unit_part = next(
                (key for key, val in groupdict.items() if val is not None), None
            )
            for sub_match in regex.finditer(extraction_regex, match.text):
                sub_groupdict = dict(sub_match.groupdict())

                # Integer part of the match
                int_part = sub_groupdict.pop("int_part", 0)

                # Decimal part of the match, if any
                dec_part = sub_groupdict.pop("dec_part", 0) or 0

                # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
                # the unit must be infix: we extract it now using non empty groupdict
                # entries
                infix_unit_part = next(
                    (key for key, val in sub_groupdict.items() if val is not None),
                    None,
                )
                unit_part = infix_unit_part or shared_unit_part

                # Create one SimpleMeasure per submatch inside each match...
                parsed_values.append(
                    self.measures[measure_name].parse(
                        int_part=int_part,
                        dec_part=dec_part,
                        unit=unit_part,
                        infix=infix_unit_part is not None,
                    )
                )

            # ... and compose these measures together if there are more than one
            measure = Span(doc, start=match.start, end=match.end, label=measure_name)
            measure._.value = (
                parsed_values[0]
                if len(parsed_values) == 1
                else self.measures[measure_name].COMPOSITE(parsed_values)
                if self.measures[measure_name].COMPOSITE is not None
                else parsed_values[-1]
            )
            measures.append(match)

        doc.spans["measures"] = sorted(measures)

        return doc
regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute
extraction_regexes = {} instance-attribute
measures: Dict[str, Measure] = {} instance-attribute
__init__(nlp, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(
    self,
    nlp: Language,
    measures: List[str],
    attr: str,
    ignore_excluded: bool,
):

    self.regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.extraction_regexes = {}
    self.measures: Dict[str, Measure] = {}
    for name in measures:
        cls: Measure = spacy.registry.misc.get(name)
        self.measures[name] = cls
        regexes = make_patterns(cls)
        self.regex_matcher.add(name, regexes["trigger"])
        self.extraction_regexes[name] = regexes["extraction"]

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/measures/measures.py
@staticmethod
def set_extensions() -> None:
    super(Measures, Measures).set_extensions()
    if not Span.has_extension("value"):
        Span.set_extension("value", default=None)
__call__(doc)

Adds measures to document's "measures" SpanGroup.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/misc/measures/measures.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds measures to document's "measures" SpanGroup.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

    # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
    # while keeping the corresponding groupdicts
    matches = {
        match: matches[match]
        for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
    }

    measures = []
    for match, groupdict in matches.items():
        measure_name = match.label_
        extraction_regex = self.extraction_regexes[measure_name]

        parsed_values = []

        shared_unit_part = next(
            (key for key, val in groupdict.items() if val is not None), None
        )
        for sub_match in regex.finditer(extraction_regex, match.text):
            sub_groupdict = dict(sub_match.groupdict())

            # Integer part of the match
            int_part = sub_groupdict.pop("int_part", 0)

            # Decimal part of the match, if any
            dec_part = sub_groupdict.pop("dec_part", 0) or 0

            # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
            # the unit must be infix: we extract it now using non empty groupdict
            # entries
            infix_unit_part = next(
                (key for key, val in sub_groupdict.items() if val is not None),
                None,
            )
            unit_part = infix_unit_part or shared_unit_part

            # Create one SimpleMeasure per submatch inside each match...
            parsed_values.append(
                self.measures[measure_name].parse(
                    int_part=int_part,
                    dec_part=dec_part,
                    unit=unit_part,
                    infix=infix_unit_part is not None,
                )
            )

        # ... and compose these measures together if there are more than one
        measure = Span(doc, start=match.start, end=match.end, label=measure_name)
        measure._.value = (
            parsed_values[0]
            if len(parsed_values) == 1
            else self.measures[measure_name].COMPOSITE(parsed_values)
            if self.measures[measure_name].COMPOSITE is not None
            else parsed_values[-1]
        )
        measures.append(match)

    doc.spans["measures"] = sorted(measures)

    return doc
disj_capture(regexes, capture=True)
Source code in edsnlp/pipelines/misc/measures/measures.py
def disj_capture(regexes, capture=True):
    return "|".join(
        ("(?P<{key}>{forms})" if capture else "{forms}").format(
            key=key, forms="|".join(forms)
        )
        for key, forms in regexes.items()
    )
rightmost_largest_sort_key(span)
Source code in edsnlp/pipelines/misc/measures/measures.py
def rightmost_largest_sort_key(span):
    return span.end, (len(span))
make_patterns(measure)

Build recognition and extraction patterns for a given Measure class

PARAMETER DESCRIPTION
measure

The measure to build recognition and extraction patterns for

TYPE: 'Measure'

RETURNS DESCRIPTION
trigger

TYPE: List[str]

extraction

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def make_patterns(measure: "Measure") -> Dict[str, Union[List[str], str]]:
    """
    Build recognition and extraction patterns for a given Measure class

    Parameters
    ----------
    measure: Measure class
        The measure to build recognition and extraction patterns for

    Returns
    -------
    trigger : List[str]
    extraction : str
    """
    unit_prefix_reg = disj_capture(
        {key: [entry["prefix"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_abbreviation_reg = disj_capture(
        {key: [entry["abbr"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_reg = rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"

    number_reg = rf"(?:{measure.INTEGER}(?:[,.]{measure.INTEGER})?)"
    infix_measure_reg = rf"(?:{measure.INTEGER}{unit_reg}{measure.INTEGER})"

    # Simple measure
    simple_measure_reg = rf"{number_reg}\s*{unit_reg}"
    trigger = [
        simple_measure_reg,
        infix_measure_reg,
        # Factorized measures separated by a conjunction
        rf"{number_reg}(?=(?:\s*[,]\s*{number_reg})*\s*"
        rf"(?:{measure.CONJUNCTIONS})\s*{number_reg}\s*{unit_reg})",
    ]
    if measure.COMPOSITE:
        # Factorized composite measures (3 x 2cm)
        trigger.append(
            rf"(?<![a-z]){number_reg}"
            rf"(?:\s*(?:{measure.COMPOSERS})\s*{number_reg})*\s*{unit_reg}"
        )
        # Expanded composite measures (3cm x 2cm)
        trigger.append(
            rf"(?<![a-z])(?:{infix_measure_reg}|{simple_measure_reg})"
            rf"(\s*(?:{measure.COMPOSERS})\s*"
            rf"(?:{infix_measure_reg}|{simple_measure_reg}))*"
        )

    unit_reg_capture = (
        rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"
    )

    return {
        "trigger": trigger,
        "extraction": rf"(?P<int_part>{measure.INTEGER})\s*(?:[,.]|"
        rf"{unit_reg_capture})?\s*(?P<dec_part>{measure.INTEGER})?",
    }
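
To make the returned structure concrete, calling it on the Size measure defined in the patterns module below yields a list of trigger regexes and a single extraction regex (illustrative sketch):

patterns = make_patterns(Size)

patterns["trigger"]     # regexes for simple ("1,26 cm"), infix ("1 cm 26"),
                        # factorized ("1, 2 et 3cm") and composite ("1 x 2 cm") forms
patterns["extraction"]  # one regex capturing int_part, dec_part and the unit of each sub-match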
make_simple_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_simple_getter(name):
    def getter(self):
        """
        Get a scaled numerical value of a measure

        Parameters
        ----------
        self

        Returns
        -------
        float
        """
        return self.value * self._get_scale_to(name)

    return getter
make_multi_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_multi_getter(name: str) -> Callable[["CompositeMeasure"], Tuple[float]]:
    def getter(self) -> Tuple[float]:
        """
        Get the scaled numerical values of a multi-measure

        Parameters
        ----------
        self

        Returns
        -------
        float
        """
        return tuple(getattr(measure, name) for measure in self.measures)

    return getter
patterns
CompositeSize

Bases: CompositeMeasure

Composite size measure. Supports the following units: mm, cm, dm, m.

Source code in edsnlp/pipelines/misc/measures/patterns.py
class CompositeSize(CompositeMeasure):
    """
    Composite size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    mm = property(make_multi_getter("mm"))
    cm = property(make_multi_getter("cm"))
    dm = property(make_multi_getter("dm"))
    m = property(make_multi_getter("m"))
mm = property(make_multi_getter('mm')) class-attribute
cm = property(make_multi_getter('cm')) class-attribute
dm = property(make_multi_getter('dm')) class-attribute
m = property(make_multi_getter('m')) class-attribute
Size

Bases: SimpleMeasure

Size measure. Supports the following units: mm, cm, dm, m.

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.size")
class Size(SimpleMeasure):
    """
    Size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    COMPOSITE = CompositeSize
    UNITS = {
        "mm": {"prefix": "mill?im", "abbr": "mm", "value": 1},
        "cm": {"prefix": "centim", "abbr": "cm", "value": 10},
        "dm": {"prefix": "decim", "abbr": "dm", "value": 100},
        "m": {"prefix": "metre", "abbr": "m", "value": 1000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mm = property(make_simple_getter("mm"))
    cm = property(make_simple_getter("cm"))
    dm = property(make_simple_getter("dm"))
    m = property(make_simple_getter("m"))
COMPOSITE = CompositeSize class-attribute
UNITS = {'mm': {'prefix': 'mill?im', 'abbr': 'mm', 'value': 1}, 'cm': {'prefix': 'centim', 'abbr': 'cm', 'value': 10}, 'dm': {'prefix': 'decim', 'abbr': 'dm', 'value': 100}, 'm': {'prefix': 'metre', 'abbr': 'm', 'value': 1000}} class-attribute
mm = property(make_simple_getter('mm')) class-attribute
cm = property(make_simple_getter('cm')) class-attribute
dm = property(make_simple_getter('dm')) class-attribute
m = property(make_simple_getter('m')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
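
A quick check of the parse and conversion behaviour (the conversion factors follow the UNITS table above, where everything is scaled to millimetres):

size = Size.parse(int_part="1", dec_part="26", unit="cm", infix=False)
str(size)  # "1.26cm"
size.mm    # 12.6
size.m     # 0.0126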
Weight

Bases: SimpleMeasure

Weight measure. Supports the following units: mg, cg, dg, g, kg.

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.weight")
class Weight(SimpleMeasure):
    """
    Weight measure. Supports the following units:
    - mg
    - cg
    - dg
    - g
    - kg
    """

    COMPOSITE = None
    UNITS = {
        "mg": {"prefix": "mill?ig", "abbr": "mg", "value": 1},
        "cg": {"prefix": "centig", "abbr": "cg", "value": 10},
        "dg": {"prefix": "decig", "abbr": "dg", "value": 100},
        "g": {"prefix": "gram", "abbr": "g", "value": 1000},
        "kg": {"prefix": "kilo", "abbr": "kg", "value": 1000000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mg = property(make_simple_getter("mg"))
    cg = property(make_simple_getter("cg"))
    dg = property(make_simple_getter("dg"))
    g = property(make_simple_getter("g"))
    kg = property(make_simple_getter("kg"))
COMPOSITE = None class-attribute
UNITS = {'mg': {'prefix': 'mill?ig', 'abbr': 'mg', 'value': 1}, 'cg': {'prefix': 'centig', 'abbr': 'cg', 'value': 10}, 'dg': {'prefix': 'decig', 'abbr': 'dg', 'value': 100}, 'g': {'prefix': 'gram', 'abbr': 'g', 'value': 1000}, 'kg': {'prefix': 'kilo', 'abbr': 'kg', 'value': 1000000}} class-attribute
mg = property(make_simple_getter('mg')) class-attribute
cg = property(make_simple_getter('cg')) class-attribute
dg = property(make_simple_getter('dg')) class-attribute
g = property(make_simple_getter('g')) class-attribute
kg = property(make_simple_getter('kg')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
Angle

Bases: SimpleMeasure

Angle measure. Supports the following unit: h.

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.angle")
class Angle(SimpleMeasure):
    """
    Angle measure. Supports the following units:
    - h
    """

    COMPOSITE = None
    UNITS = {
        "h": {"prefix": "heur", "abbr": "h", "value": 1},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        if infix:
            result = float(int_part) + int(dec_part) / 60.0
            return cls(result, unit)
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    h = property(make_simple_getter("h"))
COMPOSITE = None class-attribute
UNITS = {'h': {'prefix': 'heur', 'abbr': 'h', 'value': 1}} class-attribute
h = property(make_simple_getter('h')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    if infix:
        result = float(int_part) + int(dec_part) / 60.0
        return cls(result, unit)
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
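The infix branch above handles hour notations such as 3h15, where the decimal part is interpreted as minutes. A minimal illustrative sketch, based only on the parse method shown above:

from edsnlp.pipelines.misc.measures.patterns import Angle

# Decimal notation, e.g. "3.5h": the two parts are joined into a float
a1 = Angle.parse(int_part="3", dec_part="5", unit="h")               # 3.5 h

# Infix notation, e.g. "3h15": the decimal part is read as minutes
a2 = Angle.parse(int_part="3", dec_part="15", unit="h", infix=True)  # 3 + 15 / 60 = 3.25 h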
factory
DEFAULT_CONFIG = dict(attr='NORM', ignore_excluded=False, measures=['eds.measures.size', 'eds.measures.weight', 'eds.measures.angle']) module-attribute
create_component(nlp, name, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/factory.py
@Language.factory("eds.measures", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    measures: Union[str, List[str], Dict[str, Dict]],
    attr: str,
    ignore_excluded: bool,
):
    return Measures(
        nlp,
        measures=measures,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
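A hedged usage sketch for the factory above; the config keys mirror DEFAULT_CONFIG. The output containers are not described in this section, so reading doc.spans["measures"] and span._.value below is an assumption rather than a documented guarantee.

import spacy

nlp = spacy.blank("fr")
# Restrict the component to size and weight measures (see DEFAULT_CONFIG above)
nlp.add_pipe(
    "eds.measures",
    config=dict(measures=["eds.measures.size", "eds.measures.weight"]),
)

doc = nlp("Le patient mesure 1m80 et pèse 75 kg.")

# Assumption: matches are exposed in doc.spans["measures"], with the parsed
# measure available under span._.value.
for span in doc.spans["measures"]:
    print(span, span._.value)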
consultation_dates
patterns
consultation_mention = ['rendez-vous pris', 'consultation', 'consultation.{1,8}examen', 'examen clinique', 'de compte rendu', "date de l'examen", 'examen realise le', 'date de la visite'] module-attribute
town_mention = ['paris', 'kremlin.bicetre', 'creteil', 'boulogne.billancourt', 'villejuif', 'clamart', 'bobigny', 'clichy', 'ivry.sur.seine', 'issy.les.moulineaux', 'draveil', 'limeil', 'champcueil', 'roche.guyon', 'bondy', 'colombes', 'hendaye', 'herck.sur.mer', 'labruyere', 'garches', 'sevran', 'hyeres'] module-attribute
document_date_mention = ['imprime le', 'signe electroniquement', 'signe le', 'saisi le', 'dicte le', 'tape le', 'date de reference', 'date\\s*:', 'dactylographie le', 'date du rapport'] module-attribute
consultation_dates
ConsultationDates

Bases: GenericMatcher

Class to extract consultation dates from "CR-CONS" documents.

The pipeline populates the doc.spans['consultation_dates'] list.

For each extraction s in this list, the corresponding date is available as s._.consultation_date.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

consultation_mention

List of RegEx for consultation mentions.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the extraction if False

TYPE: Union[List[str], bool]

town_mention

List of RegEx for the towns of all AP-HP hospitals.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the extraction if False

TYPE: Union[List[str], bool]

document_date_mention

List of RegEx for document dates.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the extraction if False

TYPE: Union[List[str], bool]
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
class ConsultationDates(GenericMatcher):
    """
    Class to extract consultation dates from "CR-CONS" documents.

    The pipeline populates the `#!python doc.spans['consultation_dates']` list.

    For each extraction `s` in this list, the corresponding date is available
    as `s._.consultation_date`.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    consultation_mention : Union[List[str], bool]
        List of RegEx for consultation mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False

    town_mention : Union[List[str], bool]
        List of RegEx for all AP-HP hospitals' towns mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False
    document_date_mention : Union[List[str], bool]
        List of RegEx for document date.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False
    """

    def __init__(
        self,
        nlp: Language,
        consultation_mention: Union[List[str], bool],
        town_mention: Union[List[str], bool],
        document_date_mention: Union[List[str], bool],
        attr: str,
        **kwargs,
    ):

        logger.warning("This pipeline is still in beta")
        logger.warning(
            "This pipeline should ONLY be used on notes "
            "where `note_class_source_value == 'CR-CONS'`"
        )
        logger.warning(
            """This pipeline requires to use the normalizer pipeline with:
        lowercase=True,
        accents=True,
        quotes=True"""
        )

        if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

            config = dict(**DEFAULT_CONFIG)
            config["on_ents_only"] = "consultation_mentions"

            self.date_matcher = Dates(nlp, **config)

        else:
            self.date_matcher = None

        if not consultation_mention:
            consultation_mention = []
        elif consultation_mention is True:
            consultation_mention = consult_regex.consultation_mention

        if not document_date_mention:
            document_date_mention = []
        elif document_date_mention is True:
            document_date_mention = consult_regex.document_date_mention

        if not town_mention:
            town_mention = []
        elif town_mention is True:
            town_mention = consult_regex.town_mention

        regex = dict(
            consultation_mention=consultation_mention,
            town_mention=town_mention,
            document_date_mention=document_date_mention,
        )

        super().__init__(
            nlp,
            regex=regex,
            terms=dict(),
            attr=attr,
            ignore_excluded=False,
            **kwargs,
        )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Span.has_extension("consultation_date"):
            Span.set_extension("consultation_date", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Finds entities

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object with additionnal doc.spans['consultation_dates] SpanGroup
        """

        ents = self.process(doc)

        doc.spans["consultation_mentions"] = ents
        doc.spans["consultation_dates"] = []

        if self.date_matcher is not None:
            doc = self.date_matcher(doc)

        for mention in ents:
            # Looking for a date
            # - In the same sentence
            # - Not less than 10 tokens AFTER the consultation mention
            matching_dates = [
                date
                for date in doc.spans["dates"]
                if (
                    (mention.sent == date.sent)
                    and (date.start > mention.start)
                    and (date.start - mention.end <= 10)
                )
            ]

            if matching_dates:
                # We keep the first mention of a date
                kept_date = min(matching_dates, key=lambda d: d.start)
                span = doc[mention.start : kept_date.end]
                span.label_ = mention.label_
                span._.consultation_date = kept_date._.parsed_date

                doc.spans["consultation_dates"].append(span)

        del doc.spans["consultation_mentions"]

        return doc
date_matcher = Dates(nlp, **config) instance-attribute
__init__(nlp, consultation_mention, town_mention, document_date_mention, attr, **kwargs)
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __init__(
    self,
    nlp: Language,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
    attr: str,
    **kwargs,
):

    logger.warning("This pipeline is still in beta")
    logger.warning(
        "This pipeline should ONLY be used on notes "
        "where `note_class_source_value == 'CR-CONS'`"
    )
    logger.warning(
        """This pipeline requires to use the normalizer pipeline with:
    lowercase=True,
    accents=True,
    quotes=True"""
    )

    if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

        config = dict(**DEFAULT_CONFIG)
        config["on_ents_only"] = "consultation_mentions"

        self.date_matcher = Dates(nlp, **config)

    else:
        self.date_matcher = None

    if not consultation_mention:
        consultation_mention = []
    elif consultation_mention is True:
        consultation_mention = consult_regex.consultation_mention

    if not document_date_mention:
        document_date_mention = []
    elif document_date_mention is True:
        document_date_mention = consult_regex.document_date_mention

    if not town_mention:
        town_mention = []
    elif town_mention is True:
        town_mention = consult_regex.town_mention

    regex = dict(
        consultation_mention=consultation_mention,
        town_mention=town_mention,
        document_date_mention=document_date_mention,
    )

    super().__init__(
        nlp,
        regex=regex,
        terms=dict(),
        attr=attr,
        ignore_excluded=False,
        **kwargs,
    )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
@staticmethod
def set_extensions() -> None:
    if not Span.has_extension("consultation_date"):
        Span.set_extension("consultation_date", default=None)
__call__(doc)

Finds entities

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, with an additional doc.spans['consultation_dates'] SpanGroup
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Finds entities

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object with additionnal doc.spans['consultation_dates] SpanGroup
    """

    ents = self.process(doc)

    doc.spans["consultation_mentions"] = ents
    doc.spans["consultation_dates"] = []

    if self.date_matcher is not None:
        doc = self.date_matcher(doc)

    for mention in ents:
        # Looking for a date
        # - In the same sentence
        # - Not less than 10 tokens AFTER the consultation mention
        matching_dates = [
            date
            for date in doc.spans["dates"]
            if (
                (mention.sent == date.sent)
                and (date.start > mention.start)
                and (date.start - mention.end <= 10)
            )
        ]

        if matching_dates:
            # We keep the first mention of a date
            kept_date = min(matching_dates, key=lambda d: d.start)
            span = doc[mention.start : kept_date.end]
            span.label_ = mention.label_
            span._.consultation_date = kept_date._.parsed_date

            doc.spans["consultation_dates"].append(span)

    del doc.spans["consultation_mentions"]

    return doc
factory
DEFAULT_CONFIG = dict(consultation_mention=True, town_mention=False, document_date_mention=False, attr='NORM') module-attribute
create_component(nlp, name, attr, consultation_mention, town_mention, document_date_mention)
Source code in edsnlp/pipelines/misc/consultation_dates/factory.py
@deprecated_factory(
    "consultation_dates",
    "eds.consultation_dates",
    default_config=DEFAULT_CONFIG,
)
@Language.factory("eds.consultation_dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
):
    return ConsultationDates(
        nlp,
        attr=attr,
        consultation_mention=consultation_mention,
        document_date_mention=document_date_mention,
        town_mention=town_mention,
    )
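A usage sketch for the factory above. Per the warnings emitted by the constructor, the component expects the normalizer (lowercase, accents, quotes) and should only be run on CR-CONS notes; the output containers follow the class description (doc.spans['consultation_dates'] and span._.consultation_date). The sample text and the town_mention=True override are illustrative assumptions.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # lowercase, accents and quotes are required
nlp.add_pipe("eds.consultation_dates", config=dict(town_mention=True))

# Illustrative CR-CONS snippet
doc = nlp("Consultation du 03/09/2021 à Clamart.")

for span in doc.spans["consultation_dates"]:
    print(span, span._.consultation_date)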
reason
patterns
reasons = dict(reasons=['(?i)motif de l.?hospitalisation : .+', '(?i)hospitalis[ée].?.*(pour|. cause|suite [àa]).+', '(?i)(consulte|prise en charge(?!\\set\\svous\\sassurer\\sun\\straitement\\sadapté)).*pour.+', '(?i)motif\\sd.hospitalisation\\s:.+', '(?i)au total\\s?\\:?\\s?\\n?.+', '(?i)motif\\sde\\sla\\sconsultation', '(?i)motif\\sd.admission', '(?i)conclusion\\smedicale']) module-attribute
sections_reason = ['motif', 'conclusion'] module-attribute
section_exclude = ['antécédents', 'antécédents familiaux', 'histoire de la maladie'] module-attribute
reason
Reason

Bases: GenericMatcher

Pipeline to identify the reason of the hospitalisation.

It declares a Span extension called ents_reason and adds the key reasons to doc.spans.

It also declares the boolean extension is_reason. This extension is set to True for the Reason Spans but also for the entities that overlap the reason span.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

reasons

The terminology of reasons.

TYPE: Optional[Dict[str, Union[List[str], str]]]

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'. We can also add a key for each regex.

TYPE: str

use_sections

Whether to use the sections pipeline to improve results.

TYPE: bool

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/misc/reason/reason.py
class Reason(GenericMatcher):
    """Pipeline to identify the reason of the hospitalisation.

    It declares a Span extension called `ents_reason` and adds
    the key `reasons` to doc.spans.

    It also declares the boolean extension `is_reason`.
    This extension is set to True for the Reason Spans but also
    for the entities that overlap the reason span.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    reasons : Optional[Dict[str, Union[List[str], str]]]
        The terminology of reasons.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with
        the key 'term_attr'. We can also add a key for each regex.
    use_sections : bool,
        whether or not use the `sections` pipeline to improve results.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        reasons: Optional[Dict[str, Union[List[str], str]]],
        attr: Union[Dict[str, str], str],
        use_sections: bool,
        ignore_excluded: bool,
    ):

        if reasons is None:
            reasons = patterns.reasons

        super().__init__(
            nlp,
            terms=None,
            regex=reasons,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.use_sections = use_sections and (
            "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
        )
        if use_sections and not self.use_sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `eds.section` pipeline, but it was not set. "
                "Skipping that step."
            )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Span.has_extension("ents_reason"):
            Span.set_extension("ents_reason", default=None)

        if not Span.has_extension("is_reason"):
            Span.set_extension("is_reason", default=False)

    def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
        """Enhance the list of reasons with the section information.
        If the reason overlaps with history, so it will be removed from the list

        Parameters
        ----------
        sections : Iterable
            Spans of sections identified with the `sections` pipeline
        reasons : Iterable
            Reasons list identified by the regex

        Returns
        -------
        List
            Updated list of spans reasons
        """

        for section in sections:
            if section.label_ in patterns.sections_reason:
                reasons.append(section)

            if section.label_ in patterns.section_exclude:
                for reason in reasons:
                    if check_inclusion(reason, section.start, section.end):
                        reasons.remove(reason)

        return reasons

    def __call__(self, doc: Doc) -> Doc:
        """Find spans related to the reasons of the hospitalisation

        Parameters
        ----------
        doc : Doc

        Returns
        -------
        Doc
        """
        matches = self.process(doc)
        reasons = get_spans(matches, "reasons")

        if self.use_sections:
            sections = doc.spans["sections"]
            reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

        doc.spans["reasons"] = reasons

        # Entities
        if len(doc.ents) > 0:
            for reason in reasons:  # TODO optimize this iteration
                ent_list = []
                for ent in doc.ents:
                    if check_inclusion(ent, reason.start, reason.end):
                        ent_list.append(ent)
                        ent._.is_reason = True

                reason._.ents_reason = ent_list
                reason._.is_reason = True

        return doc
use_sections = use_sections and ('eds.sections' in self.nlp.pipe_names or 'sections' in self.nlp.pipe_names) instance-attribute
__init__(nlp, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/reason.py
def __init__(
    self,
    nlp: Language,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: Union[Dict[str, str], str],
    use_sections: bool,
    ignore_excluded: bool,
):

    if reasons is None:
        reasons = patterns.reasons

    super().__init__(
        nlp,
        terms=None,
        regex=reasons,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.use_sections = use_sections and (
        "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
    )
    if use_sections and not self.use_sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `eds.section` pipeline, but it was not set. "
            "Skipping that step."
        )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/reason/reason.py
@staticmethod
def set_extensions() -> None:

    if not Span.has_extension("ents_reason"):
        Span.set_extension("ents_reason", default=None)

    if not Span.has_extension("is_reason"):
        Span.set_extension("is_reason", default=False)
_enhance_with_sections(sections, reasons)

Enhance the list of reasons with the section information. If a reason overlaps with the history section, it is removed from the list.

PARAMETER DESCRIPTION
sections

Spans of sections identified with the sections pipeline

TYPE: Iterable

reasons

Reasons list identified by the regex

TYPE: Iterable

RETURNS DESCRIPTION
List

Updated list of reason spans

Source code in edsnlp/pipelines/misc/reason/reason.py
def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
    """Enhance the list of reasons with the section information.
    If the reason overlaps with history, so it will be removed from the list

    Parameters
    ----------
    sections : Iterable
        Spans of sections identified with the `sections` pipeline
    reasons : Iterable
        Reasons list identified by the regex

    Returns
    -------
    List
        Updated list of spans reasons
    """

    for section in sections:
        if section.label_ in patterns.sections_reason:
            reasons.append(section)

        if section.label_ in patterns.section_exclude:
            for reason in reasons:
                if check_inclusion(reason, section.start, section.end):
                    reasons.remove(reason)

    return reasons
__call__(doc)

Find spans related to the reasons of the hospitalisation

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
Doc
Source code in edsnlp/pipelines/misc/reason/reason.py
def __call__(self, doc: Doc) -> Doc:
    """Find spans related to the reasons of the hospitalisation

    Parameters
    ----------
    doc : Doc

    Returns
    -------
    Doc
    """
    matches = self.process(doc)
    reasons = get_spans(matches, "reasons")

    if self.use_sections:
        sections = doc.spans["sections"]
        reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

    doc.spans["reasons"] = reasons

    # Entities
    if len(doc.ents) > 0:
        for reason in reasons:  # TODO optimize this iteration
            ent_list = []
            for ent in doc.ents:
                if check_inclusion(ent, reason.start, reason.end):
                    ent_list.append(ent)
                    ent._.is_reason = True

            reason._.ents_reason = ent_list
            reason._.is_reason = True

    return doc
factory
DEFAULT_CONFIG = dict(reasons=None, attr='TEXT', use_sections=False, ignore_excluded=False) module-attribute
create_component(nlp, name, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/factory.py
@deprecated_factory("reason", "eds.reason", default_config=DEFAULT_CONFIG)
@Language.factory("eds.reason", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: str,
    use_sections: bool,
    ignore_excluded: bool,
):
    return Reason(
        nlp,
        reasons=reasons,
        attr=attr,
        use_sections=use_sections,
        ignore_excluded=ignore_excluded,
    )
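A usage sketch for the factory above, with the optional use_sections flag. The printed extensions follow the class description (doc.spans['reasons'] and reason._.is_reason); reason._.ents_reason lists overlapping entities only when an NER component has run beforehand. The sample note itself is an illustrative assumption.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.sections")  # optional: lets eds.reason exploit section titles
nlp.add_pipe("eds.reason", config=dict(use_sections=True))

doc = nlp(
    "Compte rendu d'hospitalisation.\n"
    "Motif :\n"
    "Patient admis pour décompensation cardiaque.\n"
)

for reason in doc.spans["reasons"]:
    print(reason, reason._.is_reason)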
sections
patterns

These section titles were extracted from a work performed by Ivan Lerner at AP-HP, which supplied a number of documents annotated for section titles.

The section titles were reviewed by Gilles Chatellier, who gave meaningful insights.

See the sections/section-dataset notebook for details.

allergies = ['allergies'] module-attribute
antecedents = ['antecedents', 'antecedents medicaux et chirurgicaux', 'antecedents personnels', 'antecedents medicaux', 'antecedents chirurgicaux', 'atcd'] module-attribute
antecedents_familiaux = ['antecedents familiaux'] module-attribute
traitements_entree = ['attitude therapeutique initiale', "traitement a l'entree", 'traitement actuel', 'traitement en cours', "traitements a l'entree"] module-attribute
conclusion = ['au total', 'conclusion', 'conclusion de sortie', 'syntese medicale / conclusion', 'synthese', 'synthese medicale', 'synthese medicale/conclusion', 'conclusion medicale'] module-attribute
conclusion_entree = ["conclusion a l'entree"] module-attribute
habitus = ['contexte familial et social', 'habitus', 'mode de vie', 'mode de vie - scolarite', 'situation sociale, mode de vie'] module-attribute
correspondants = ['correspondants'] module-attribute
diagnostic = ['diagnostic retenu'] module-attribute
donnees_biometriques_entree = ["donnees biometriques et parametres vitaux a l'entree", "parametres vitaux et donnees biometriques a l'entree"] module-attribute
examens = ['examen clinique', "examen clinique a l'entree"] module-attribute
examens_complementaires = ['examen(s) complementaire(s)', 'examens complementaires', "examens complementaires a l'entree", 'examens complementaires realises pendant le sejour', 'examens para-cliniques'] module-attribute
facteurs_de_risques = ['facteurs de risque', 'facteurs de risques'] module-attribute
histoire_de_la_maladie = ['histoire de la maladie', 'histoire de la maladie - explorations', 'histoire de la maladie actuelle', 'histoire du poids', 'histoire recente', 'histoire recente de la maladie', 'rappel clinique', 'resume', 'resume clinique'] module-attribute
actes = ['intervention'] module-attribute
motif = ['motif', "motif d'hospitalisation", "motif de l'hospitalisation", 'motif medical'] module-attribute
prescriptions = ['prescriptions de sortie', 'prescriptions medicales de sortie'] module-attribute
traitements_sortie = ['traitement de sortie'] module-attribute
sections = {'allergies': allergies, 'antécédents': antecedents, 'antécédents familiaux': antecedents_familiaux, 'traitements entrée': traitements_entree, 'conclusion': conclusion, 'conclusion entrée': conclusion_entree, 'habitus': habitus, 'correspondants': correspondants, 'diagnostic': diagnostic, 'données biométriques entrée': donnees_biometriques_entree, 'examens': examens, 'examens complémentaires': examens_complementaires, 'facteurs de risques': facteurs_de_risques, 'histoire de la maladie': histoire_de_la_maladie, 'actes': actes, 'motif': motif, 'prescriptions': prescriptions, 'traitements sortie': traitements_sortie} module-attribute
sections
Sections

Bases: GenericMatcher

Divides the document into sections.

By default, we are using a dataset of documents annotated for section titles, using the work done by Ivan Lerner, reviewed by Gilles Chatellier.

Detected sections are :

  • allergies ;
  • antécédents ;
  • antécédents familiaux ;
  • traitements entrée ;
  • conclusion ;
  • conclusion entrée ;
  • habitus ;
  • correspondants ;
  • diagnostic ;
  • données biométriques entrée ;
  • examens ;
  • examens complémentaires ;
  • facteurs de risques ;
  • histoire de la maladie ;
  • actes ;
  • motif ;
  • prescriptions ;
  • traitements sortie.

The component looks for section titles within the document, and stores them in the section_title extension.

For ease-of-use, the component also populates a section extension, which contains a list of spans corresponding to the "sections" of the document. These span from the start of one section title to the next, which can introduce obvious bias should an intermediate section title go undetected.

PARAMETER DESCRIPTION
nlp

spaCy pipeline object.

TYPE: Language

sections

Dictionary of terms to look for.

TYPE: Dict[str, List[str]]

attr

Default attribute to match on.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/misc/sections/sections.py
class Sections(GenericMatcher):
    """
    Divides the document into sections.

    By default, we are using a dataset of documents annotated for section titles,
    using the work done by Ivan Lerner, reviewed by Gilles Chatellier.

    Detected sections are :

    - allergies ;
    - antécédents ;
    - antécédents familiaux ;
    - traitements entrée ;
    - conclusion ;
    - conclusion entrée ;
    - habitus ;
    - correspondants ;
    - diagnostic ;
    - données biométriques entrée ;
    - examens ;
    - examens complémentaires ;
    - facteurs de risques ;
    - histoire de la maladie ;
    - actes ;
    - motif ;
    - prescriptions ;
    - traitements sortie.

    The component looks for section titles within the document,
    and stores them in the `section_title` extension.

    For ease-of-use, the component also populates a `section` extension,
    which contains a list of spans corresponding to the "sections" of the
    document. These span from the start of one section title to the next,
    which can introduce obvious bias should an intermediate section title
    goes undetected.

    Parameters
    ----------
    nlp : Language
        spaCy pipeline object.
    sections : Dict[str, List[str]]
        Dictionary of terms to look for.
    attr : str
        Default attribute to match on.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        sections: Dict[str, List[str]],
        add_patterns: bool,
        attr: str,
        ignore_excluded: bool,
    ):

        logger.warning(
            "The component Sections is still in Beta. Use at your own risks."
        )

        if sections is None:
            sections = patterns.sections

        self.add_patterns = add_patterns
        if add_patterns:
            for k, v in sections.items():
                sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

        super().__init__(
            nlp,
            terms=None,
            regex=sections,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.set_extensions()

        if not nlp.has_pipe("normalizer") and not not nlp.has_pipe("eds.normalizer"):
            logger.warning("You should add pipe `eds.normalizer`")

    @staticmethod
    def set_extensions():

        if not Span.has_extension("section_title"):
            Span.set_extension("section_title", default=None)

        if not Span.has_extension("section"):
            Span.set_extension("section", default=None)

    # noinspection PyProtectedMember
    def __call__(self, doc: Doc) -> Doc:
        """
        Divides the doc into sections

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for sections
        """
        titles = filter_spans(self.process(doc))

        if self.add_patterns:
            # Remove preceding newline
            titles = [
                Span(doc, title.start + 1, title.end - 1, label=title.label_)
                for title in titles
            ]

        sections = []

        for t1, t2 in zip(titles[:-1], titles[1:]):
            section = Span(doc, t1.start, t2.start, label=t1.label)
            section._.section_title = t1
            sections.append(section)

        if titles:
            t = titles[-1]
            section = Span(doc, t.start, len(doc), label=t.label)
            section._.section_title = t
            sections.append(section)

        doc.spans["sections"] = sections
        doc.spans["section_titles"] = titles

        return doc
add_patterns = add_patterns instance-attribute
__init__(nlp, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/sections.py
def __init__(
    self,
    nlp: Language,
    sections: Dict[str, List[str]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):

    logger.warning(
        "The component Sections is still in Beta. Use at your own risks."
    )

    if sections is None:
        sections = patterns.sections

    self.add_patterns = add_patterns
    if add_patterns:
        for k, v in sections.items():
            sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

    super().__init__(
        nlp,
        terms=None,
        regex=sections,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.set_extensions()

    if not nlp.has_pipe("normalizer") and not not nlp.has_pipe("eds.normalizer"):
        logger.warning("You should add pipe `eds.normalizer`")
set_extensions()
Source code in edsnlp/pipelines/misc/sections/sections.py
@staticmethod
def set_extensions():

    if not Span.has_extension("section_title"):
        Span.set_extension("section_title", default=None)

    if not Span.has_extension("section"):
        Span.set_extension("section", default=None)
__call__(doc)

Divides the doc into sections

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for sections

Source code in edsnlp/pipelines/misc/sections/sections.py
def __call__(self, doc: Doc) -> Doc:
    """
    Divides the doc into sections

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for sections
    """
    titles = filter_spans(self.process(doc))

    if self.add_patterns:
        # Remove preceding newline
        titles = [
            Span(doc, title.start + 1, title.end - 1, label=title.label_)
            for title in titles
        ]

    sections = []

    for t1, t2 in zip(titles[:-1], titles[1:]):
        section = Span(doc, t1.start, t2.start, label=t1.label)
        section._.section_title = t1
        sections.append(section)

    if titles:
        t = titles[-1]
        section = Span(doc, t.start, len(doc), label=t.label)
        section._.section_title = t
        sections.append(section)

    doc.spans["sections"] = sections
    doc.spans["section_titles"] = titles

    return doc
factory
DEFAULT_CONFIG = dict(sections=None, add_patterns=True, attr='NORM', ignore_excluded=True) module-attribute
create_component(nlp, name, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/factory.py
@deprecated_factory("sections", "eds.sections", default_config=DEFAULT_CONFIG)
@Language.factory("eds.sections", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    sections: Optional[Dict[str, List[str]]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):
    return Sections(
        nlp,
        sections=sections,
        add_patterns=add_patterns,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
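A usage sketch for the factory above. The component recommends adding eds.normalizer first (its default attr is NORM); the printed containers follow the class description (doc.spans['sections'], doc.spans['section_titles'] and the section_title extension). The sample note is an illustrative assumption.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # recommended: the default attr is "NORM"
nlp.add_pipe("eds.sections")

text = (
    "Compte rendu d'hospitalisation.\n"
    "Motif :\n"
    "Patient admis pour une pneumopathie.\n"
    "Antécédents :\n"
    "Diabète de type 2.\n"
)

doc = nlp(text)

# Each section spans from its title to the start of the next one
for section in doc.spans["sections"]:
    print(section.label_, "->", section._.section_title)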