
edsnlp.pipelines.misc

dates

dates

parsers = [parser for parser in default_parsers if parser != 'relative-time'] module-attribute
parser1 = DateDataParser(languages=['fr'], settings={'PREFER_DAY_OF_MONTH': 'first', 'PREFER_DATES_FROM': 'past', 'PARSERS': parsers, 'RETURN_AS_TIMEZONE_AWARE': False}) module-attribute
parser2 = DateDataParser(languages=['fr'], settings={'PREFER_DAY_OF_MONTH': 'first', 'PREFER_DATES_FROM': 'past', 'PARSERS': ['relative-time'], 'RETURN_AS_TIMEZONE_AWARE': False}) module-attribute
Dates

Bases: BaseComponent

Tags and normalizes dates, using the open-source dateparser library.

The pipeline uses spaCy's filter_spans function. It filters out false positives and introduces a hierarchy between patterns. For instance, in case of ambiguity, the pipeline will decide that a date is a date without a year rather than a date without a day.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: spacy.language.Language

absolute

List of regular expressions for absolute dates.

TYPE: Union[List[str], str]

full

List of regular expressions for full dates in YYYY-MM-DD format.

TYPE: Union[List[str], str]

relative

List of regular expressions for relative dates (eg hier, la semaine prochaine).

TYPE: Union[List[str], str]

no_year

List of regular expressions for dates that do not display a year.

TYPE: Union[List[str], str]

no_day

List of regular expressions for dates that do not display a day.

TYPE: Union[List[str], str]

year_only

List of regular expressions for dates that only display a year.

TYPE: Union[List[str], str]

current

List of regular expressions for dates that relate to the current month, week, year, etc.

TYPE: Union[List[str], str]

false_positive

List of regular expressions for false positives (eg phone numbers, etc).

TYPE: Union[List[str], str]

on_ents_only

Whether to look for dates in the whole document or only in specific sentences:

  • If True: Only look in the sentences of each entity in doc.ents
  • If False: Look in the whole document
  • If given a string key or a list of strings: Only look in the sentences of each entity in doc.spans[key]

TYPE: Union[bool, str, List[str]]
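
A minimal usage sketch (the component is registered under eds.dates by the factory shown further down this page; the outputs in comments are the expected results):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.dates")  # parameters left unset fall back to the built-in patterns

doc = nlp("Le patient est venu le 23 août 2021.")

date = doc.spans["dates"][0]
date.label_         # "absolute"
date._.parsed_date  # datetime(2021, 8, 23, 0, 0), parsed by dateparser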

Source code in edsnlp/pipelines/misc/dates/dates.py
class Dates(BaseComponent):
    """
    Tags and normalizes dates, using the open-source `dateparser` library.

    The pipeline uses spaCy's `filter_spans` function.
    It filters out false positives and introduces a hierarchy between patterns.
    For instance, in case of ambiguity, the pipeline will decide that a date is a
    date without a year rather than a date without a day.

    Parameters
    ----------
    nlp : spacy.language.Language
        Language pipeline object
    absolute : Union[List[str], str]
        List of regular expressions for absolute dates.
    full : Union[List[str], str]
        List of regular expressions for full dates in YYYY-MM-DD format.
    relative : Union[List[str], str]
        List of regular expressions for relative dates
        (eg `hier`, `la semaine prochaine`).
    no_year : Union[List[str], str]
        List of regular expressions for dates that do not display a year.
    no_day : Union[List[str], str]
        List of regular expressions for dates that do not display a day.
    year_only : Union[List[str], str]
        List of regular expressions for dates that only display a year.
    current : Union[List[str], str]
        List of regular expressions for dates that relate to
        the current month, week, year, etc.
    false_positive : Union[List[str], str]
        List of regular expressions for false positives (eg phone numbers, etc).
    on_ents_only : Union[bool, str, List[str]]
        Whether to look for dates in the whole document or only in specific sentences:

        - If `True`: Only look in the sentences of each entity in doc.ents
        - If `False`: Look in the whole document
        - If given a string `key` or a list of strings: Only look in the sentences of
          each entity in `#!python doc.spans[key]`
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        absolute: Optional[List[str]],
        full: Optional[List[str]],
        relative: Optional[List[str]],
        no_year: Optional[List[str]],
        no_day: Optional[List[str]],
        year_only: Optional[List[str]],
        current: Optional[List[str]],
        false_positive: Optional[List[str]],
        on_ents_only: bool,
        attr: str,
    ):

        self.nlp = nlp

        if no_year is None:
            no_year = patterns.no_year_pattern
        if year_only is None:
            year_only = patterns.full_year_pattern
        if no_day is None:
            no_day = patterns.no_day_pattern
        if absolute is None:
            absolute = patterns.absolute_date_pattern
        if relative is None:
            relative = patterns.relative_date_pattern
        if full is None:
            full = patterns.full_date_pattern
        if current is None:
            current = patterns.current_pattern
        if false_positive is None:
            false_positive = patterns.false_positive_pattern

        if isinstance(absolute, str):
            absolute = [absolute]
        if isinstance(relative, str):
            relative = [relative]
        if isinstance(no_year, str):
            no_year = [no_year]
        if isinstance(no_day, str):
            no_day = [no_day]
        if isinstance(year_only, str):
            year_only = [year_only]
        if isinstance(full, str):
            full = [full]
        if isinstance(current, str):
            current = [current]
        if isinstance(false_positive, str):
            false_positive = [false_positive]

        self.on_ents_only = on_ents_only
        self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

        self.regex_matcher.add("false_positive", false_positive)
        self.regex_matcher.add("full_date", full)
        self.regex_matcher.add("absolute", absolute)
        self.regex_matcher.add("relative", relative)
        self.regex_matcher.add("no_year", no_year)
        self.regex_matcher.add("no_day", no_day)
        self.regex_matcher.add("year_only", year_only)
        self.regex_matcher.add("current", current)

        self.parser = date_parser
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Doc.has_extension("note_datetime"):
            Doc.set_extension("note_datetime", default=None)

        if not Span.has_extension("parsed_date"):
            Span.set_extension("parsed_date", default=None)

        if not Span.has_extension("parsed_delta"):
            Span.set_extension("parsed_delta", default=None)

        if not Span.has_extension("date"):
            Span.set_extension("date", getter=date_getter)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find dates in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        dates:
            list of date spans
        """

        if self.on_ents_only:

            if type(self.on_ents_only) == bool:
                ents = doc.ents
            else:
                if type(self.on_ents_only) == str:
                    self.on_ents_only = [self.on_ents_only]
                ents = []
                for key in self.on_ents_only:
                    ents.extend(list(doc.spans[key]))

            dates = []
            for sent in set([ent.sent for ent in ents]):
                dates = chain(
                    dates,
                    self.regex_matcher(
                        sent,
                        as_spans=True,
                        # return_groupdict=True,
                    ),
                )

        else:
            dates = self.regex_matcher(
                doc,
                as_spans=True,
                # return_groupdict=True,
            )

        # dates = apply_groupdict(dates)

        dates = filter_spans(dates)
        dates = [date for date in dates if date.label_ != "false_positive"]

        return dates

    def get_date(self, date: Span) -> Optional[datetime]:
        """
        Get normalised date using `dateparser`.

        Parameters
        ----------
        date : Span
            Date span.

        Returns
        -------
        Optional[datetime]
            If a date is recognised, returns a Python `datetime` object.
            Returns `None` otherwise.
        """

        text_date = date.text

        if date.label_ == "no_day":
            text_date = "01/" + re.sub(r"[\.\/\s]", "/", text_date)

        elif date.label_ == "full_date":
            text_date = re.sub(r"[\.\/\s]", "-", text_date)

            try:
                return datetime.strptime(text_date, "%Y-%m-%d")
            except ValueError:
                try:
                    return datetime.strptime(text_date, "%Y-%d-%m")
                except ValueError:
                    return None

        # text_date = re.sub(r"\.", "-", text_date)

        return self.parser(text_date)

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags dates.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for dates
        """
        dates = self.process(doc)

        for date in dates:
            d = self.get_date(date)

            if d is None:
                date._.parsed_date = None
            else:
                date._.parsed_date = d
                date._.parsed_delta = d - datetime.now() + timedelta(seconds=10)

        doc.spans["dates"] = dates

        return doc
nlp = nlp instance-attribute
on_ents_only = on_ents_only instance-attribute
regex_matcher = RegexMatcher(attr=attr, alignment_mode='strict') instance-attribute
parser = date_parser instance-attribute
__init__(nlp, absolute, full, relative, no_year, no_day, year_only, current, false_positive, on_ents_only, attr)
Source code in edsnlp/pipelines/misc/dates/dates.py
def __init__(
    self,
    nlp: Language,
    absolute: Optional[List[str]],
    full: Optional[List[str]],
    relative: Optional[List[str]],
    no_year: Optional[List[str]],
    no_day: Optional[List[str]],
    year_only: Optional[List[str]],
    current: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: bool,
    attr: str,
):

    self.nlp = nlp

    if no_year is None:
        no_year = patterns.no_year_pattern
    if year_only is None:
        year_only = patterns.full_year_pattern
    if no_day is None:
        no_day = patterns.no_day_pattern
    if absolute is None:
        absolute = patterns.absolute_date_pattern
    if relative is None:
        relative = patterns.relative_date_pattern
    if full is None:
        full = patterns.full_date_pattern
    if current is None:
        current = patterns.current_pattern
    if false_positive is None:
        false_positive = patterns.false_positive_pattern

    if isinstance(absolute, str):
        absolute = [absolute]
    if isinstance(relative, str):
        relative = [relative]
    if isinstance(no_year, str):
        no_year = [no_year]
    if isinstance(no_day, str):
        no_day = [no_day]
    if isinstance(year_only, str):
        year_only = [year_only]
    if isinstance(full, str):
        full = [full]
    if isinstance(current, str):
        current = [current]
    if isinstance(false_positive, str):
        false_positive = [false_positive]

    self.on_ents_only = on_ents_only
    self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

    self.regex_matcher.add("false_positive", false_positive)
    self.regex_matcher.add("full_date", full)
    self.regex_matcher.add("absolute", absolute)
    self.regex_matcher.add("relative", relative)
    self.regex_matcher.add("no_year", no_year)
    self.regex_matcher.add("no_day", no_day)
    self.regex_matcher.add("year_only", year_only)
    self.regex_matcher.add("current", current)

    self.parser = date_parser
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/dates/dates.py
@staticmethod
def set_extensions() -> None:

    if not Doc.has_extension("note_datetime"):
        Doc.set_extension("note_datetime", default=None)

    if not Span.has_extension("parsed_date"):
        Span.set_extension("parsed_date", default=None)

    if not Span.has_extension("parsed_delta"):
        Span.set_extension("parsed_delta", default=None)

    if not Span.has_extension("date"):
        Span.set_extension("date", getter=date_getter)
process(doc)

Find dates in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
dates

list of date spans

Source code in edsnlp/pipelines/misc/dates/dates.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find dates in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    dates:
        list of date spans
    """

    if self.on_ents_only:

        if type(self.on_ents_only) == bool:
            ents = doc.ents
        else:
            if type(self.on_ents_only) == str:
                self.on_ents_only = [self.on_ents_only]
            ents = []
            for key in self.on_ents_only:
                ents.extend(list(doc.spans[key]))

        dates = []
        for sent in set([ent.sent for ent in ents]):
            dates = chain(
                dates,
                self.regex_matcher(
                    sent,
                    as_spans=True,
                    # return_groupdict=True,
                ),
            )

    else:
        dates = self.regex_matcher(
            doc,
            as_spans=True,
            # return_groupdict=True,
        )

    # dates = apply_groupdict(dates)

    dates = filter_spans(dates)
    dates = [date for date in dates if date.label_ != "false_positive"]

    return dates
get_date(date)

Get normalised date using dateparser.

PARAMETER DESCRIPTION
date

Date span.

TYPE: Span

RETURNS DESCRIPTION
Optional[datetime]

If a date is recognised, returns a Python datetime object. Returns None otherwise.

Source code in edsnlp/pipelines/misc/dates/dates.py
def get_date(self, date: Span) -> Optional[datetime]:
    """
    Get normalised date using `dateparser`.

    Parameters
    ----------
    date : Span
        Date span.

    Returns
    -------
    Optional[datetime]
        If a date is recognised, returns a Python `datetime` object.
        Returns `None` otherwise.
    """

    text_date = date.text

    if date.label_ == "no_day":
        text_date = "01/" + re.sub(r"[\.\/\s]", "/", text_date)

    elif date.label_ == "full_date":
        text_date = re.sub(r"[\.\/\s]", "-", text_date)

        try:
            return datetime.strptime(text_date, "%Y-%m-%d")
        except ValueError:
            try:
                return datetime.strptime(text_date, "%Y-%d-%m")
            except ValueError:
                return None

    # text_date = re.sub(r"\.", "-", text_date)

    return self.parser(text_date)
__call__(doc)

Tags dates.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for dates

Source code in edsnlp/pipelines/misc/dates/dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Tags dates.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for dates
    """
    dates = self.process(doc)

    for date in dates:
        d = self.get_date(date)

        if d is None:
            date._.parsed_date = None
        else:
            date._.parsed_date = d
            date._.parsed_delta = d - datetime.now() + timedelta(seconds=10)

    doc.spans["dates"] = dates

    return doc
td2str(td)

Transforms a timedelta object to a string representation.

PARAMETER DESCRIPTION
td

The timedelta object to represent.

TYPE: timedelta

RETURNS DESCRIPTION
str

Usable representation for the timedelta object.

Source code in edsnlp/pipelines/misc/dates/dates.py
def td2str(td: timedelta):
    """
    Transforms a timedelta object to a string representation.

    Parameters
    ----------
    td : timedelta
        The timedelta object to represent.

    Returns
    -------
    str
        Usable representation for the timedelta object.
    """
    seconds = td.total_seconds()
    days = int(seconds / 3600 / 24)
    return f"TD{days:+d}"
date_getter(date)

Getter for dates. Uses the information from note_datetime.

PARAMETER DESCRIPTION
date

Date detected by the pipeline.

TYPE: Span

RETURNS DESCRIPTION
str

Normalized date.

Source code in edsnlp/pipelines/misc/dates/dates.py
def date_getter(date: Span) -> str:
    """
    Getter for dates. Uses the information from `note_datetime`.

    Parameters
    ----------
    date : Span
        Date detected by the pipeline.

    Returns
    -------
    str
        Normalized date.
    """

    d = date._.parsed_date

    if d is None:
        # dateparser could not interpret the date.
        return "????-??-??"

    delta = date._.parsed_delta
    note_datetime = date.doc._.note_datetime

    if date.label_ in {"absolute", "full_date", "no_day"}:
        normalized = d.strftime("%Y-%m-%d")
    elif date.label_ == "no_year":
        if note_datetime:
            year = note_datetime.strftime("%Y")
        else:
            year = "????"
        normalized = d.strftime(f"{year}-%m-%d")
    else:
        if note_datetime:
            # We need to adjust the timedelta, since most dates are set at 00h00.
            # The slightest difference leads to a day difference.
            d = note_datetime + delta
            normalized = d.strftime("%Y-%m-%d")
        else:
            normalized = td2str(d - datetime.now())

    return normalized
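A sketch of the getter with a known note_datetime, reusing the pipeline from the usage example above (the exact output assumes dateparser resolves the relative expression):

from datetime import datetime

doc = nlp("Hospitalisation il y a 3 jours.")
doc._.note_datetime = datetime(2021, 8, 27)

date = doc.spans["dates"][0]
date._.date  # "2021-08-24": the relative date is anchored on note_datetime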
date_parser(text_date)

Function to parse dates. It first tries all available parsers ('timestamp', 'custom-formats', 'absolute-time') except 'relative-time'. If no date is found, it retries with 'relative-time'.

When just the year is identified, it returns a datetime object with month and day equal to 1.

PARAMETER DESCRIPTION
text_date

TYPE: str

RETURNS DESCRIPTION
datetime
Source code in edsnlp/pipelines/misc/dates/dates.py
def date_parser(text_date: str) -> datetime:
    """
    Function to parse dates. It first tries all available parsers
    ('timestamp', 'custom-formats', 'absolute-time') except 'relative-time'.
    If no date is found, it retries with 'relative-time'.

    When just the year is identified, it returns a datetime object with
    month and day equal to 1.


    Parameters
    ----------
    text_date : str

    Returns
    -------
    datetime
    """

    parsed_date = parser1.get_date_data(text_date)
    if parsed_date.date_obj:
        if parsed_date.period == "year":
            return datetime(year=parsed_date.date_obj.year, month=1, day=1)
        else:
            return parsed_date.date_obj
    else:
        parsed_date2 = parser2.get_date_data(text_date)
        return parsed_date2.date_obj
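Illustrative calls (expected behaviour, given the settings of parser1 and parser2 above):

date_parser("23 août 2021")  # datetime(2021, 8, 23, 0, 0)
date_parser("2020")          # datetime(2020, 1, 1, 0, 0), year-only dates are floored to January 1st
date_parser("hier")          # falls through to parser2, the 'relative-time' retry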
apply_groupdict(dates)
Source code in edsnlp/pipelines/misc/dates/dates.py
def apply_groupdict(
    dates: Iterable[Tuple[Span, Dict[str, str]]]
) -> Generator[Span, None, None]:
    for span, groupdict in dates:
        span._.groupdict = groupdict
        yield span
parse_groupdict(day=None, month=None, year=None, hour=None, minute=None, second=None, **kwargs)

Parse date groupdict.

PARAMETER DESCRIPTION
day

String representation of the day, by default None

TYPE: str, optional DEFAULT: None

month

String representation of the month, by default None

TYPE: str, optional DEFAULT: None

year

String representation of the year, by default None

TYPE: str, optional DEFAULT: None

hour

String representation of the hour, by default None

TYPE: str, optional DEFAULT: None

minute

String representation of the minute, by default None

TYPE: str, optional DEFAULT: None

second

String representation of the second, by default None

TYPE: str, optional DEFAULT: None

RETURNS DESCRIPTION
Dict[str, int]

Parsed groupdict.

Source code in edsnlp/pipelines/misc/dates/dates.py
def parse_groupdict(
    day: str = None,
    month: str = None,
    year: str = None,
    hour: str = None,
    minute: str = None,
    second: str = None,
    **kwargs: Dict[str, str],
) -> Dict[str, int]:
    """
    Parse date groupdict.

    Parameters
    ----------
    day : str, optional
        String representation of the day, by default None
    month : str, optional
        String representation of the month, by default None
    year : str, optional
        String representation of the year, by default None
    hour : str, optional
        String representation of the hour, by default None
    minute : str, optional
        String representation of the minute, by default None
    second : str, optional
        String representation of the second, by default None

    Returns
    -------
    Dict[str, int]
        Parsed groupdict.
    """

    result = dict()

    if day is not None:
        result["day"] = day2int(day)

    if month is not None:
        result["month"] = month2int(month)

    if year is not None:
        result["year"] = str2int(year)

    if hour is not None:
        result["hour"] = str2int(hour)

    if minute is not None:
        result["minute"] = str2int(minute)

    if second is not None:
        result["second"] = str2int(second)

    result.update(**kwargs)

    return result
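For example, with the letter patterns defined in the patterns module below:

parse_groupdict(day="premier", month="janvier", year="2021")
# {'day': 1, 'month': 1, 'year': 2021}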

parsing

month2int = time2int_factory(months.letter_months_dict) module-attribute
day2int = time2int_factory(days.letter_days_dict) module-attribute
str2int(time)

Converts a string to an integer. Returns None if the string cannot be converted.

PARAMETER DESCRIPTION
time

String representation

TYPE: str

RETURNS DESCRIPTION
int

Integer conversion.

Source code in edsnlp/pipelines/misc/dates/parsing.py
def str2int(time: str) -> int:
    """
    Converts a string to an integer. Returns `None` if the string cannot be converted.

    Parameters
    ----------
    time : str
        String representation

    Returns
    -------
    int
        Integer conversion.
    """
    try:
        return int(time)
    except ValueError:
        return None
time2int_factory(patterns)

Factory for a time2int conversion function.

PARAMETER DESCRIPTION
patterns

Dictionary mapping patterns to integer values.

TYPE: Dict[str, int]

RETURNS DESCRIPTION
Callable[[str], int]

String to integer function.

Source code in edsnlp/pipelines/misc/dates/parsing.py
def time2int_factory(patterns: Dict[str, int]) -> Callable[[str], int]:
    """
    Factory for a `time2int` conversion function.

    Parameters
    ----------
    patterns : Dict[str, int]
        Dictionary mapping patterns to integer values.

    Returns
    -------
    Callable[[str], int]
        String to integer function.
    """

    def time2int(time: str) -> int:
        """
        Converts a string representation to the proper integer,
        iterating over a dictionary of pattern/conversion.

        Parameters
        ----------
        time : str
            String representation

        Returns
        -------
        int
            Integer conversion
        """
        m = str2int(time)

        if m is not None:
            return m

        for pattern, key in patterns.items():
            if re.match(f"^{pattern}$", time):
                m = key
                break

        return m

    return time2int
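The module attributes month2int and day2int listed above are built with this factory. For example:

month2int("août")    # 8, matched against letter_months_dict
month2int("08")      # 8, numeric strings short-circuit through str2int
day2int("dix-sept")  # 17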

factory

DEFAULT_CONFIG = dict(no_year=None, year_only=None, no_day=None, absolute=None, relative=None, full=None, current=None, false_positive=None, on_ents_only=False, attr='LOWER') module-attribute
create_component(nlp, name, no_year, year_only, no_day, absolute, full, relative, current, false_positive, on_ents_only, attr)
Source code in edsnlp/pipelines/misc/dates/factory.py
@deprecated_factory("dates", "eds.dates", default_config=DEFAULT_CONFIG)
@Language.factory("eds.dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    no_year: Optional[List[str]],
    year_only: Optional[List[str]],
    no_day: Optional[List[str]],
    absolute: Optional[List[str]],
    full: Optional[List[str]],
    relative: Optional[List[str]],
    current: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: bool,
    attr: str,
):
    return Dates(
        nlp,
        no_year=no_year,
        absolute=absolute,
        relative=relative,
        year_only=year_only,
        no_day=no_day,
        full=full,
        current=current,
        false_positive=false_positive,
        on_ents_only=on_ents_only,
        attr=attr,
    )
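Adding the component with a custom configuration (standard spaCy API; any parameter left to None falls back to the corresponding built-in pattern):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe(
    "eds.dates",
    config=dict(on_ents_only=False, attr="LOWER"),
)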

patterns

raw_delimiters = ['\\/', '\\-'] module-attribute
delimiters = raw_delimiters + ['\\.', '[^\\S\\r\\n]+'] module-attribute
raw_delimiter_pattern = make_pattern(raw_delimiters) module-attribute
raw_delimiter_with_spaces_pattern = make_pattern(raw_delimiters + ['[^\\S\\r\\n]+']) module-attribute
delimiter_pattern = make_pattern(delimiters) module-attribute
ante_num_pattern = '(?<!{raw_delimiter_pattern})' module-attribute
post_num_pattern = '(?!{raw_delimiter_pattern})' module-attribute
full_year_pattern = ante_num_pattern + fy_pattern + post_num_pattern module-attribute
absolute_date_pattern: List[str] = [ante_num_pattern + day_pattern + d + month_pattern + d + year_pattern + post_num_pattern for d in delimiters] + [ante_num_pattern + year_pattern + d + numeric_month_pattern + d + numeric_day_pattern + post_num_pattern for d in delimiters] module-attribute
full_date_pattern = [ante_num_pattern + fy_pattern + d + lz_numeric_month_pattern + d + lz_numeric_day_pattern + post_num_pattern for d in ['-', '\\.']] module-attribute
no_year_pattern = [day + raw_delimiter_with_spaces_pattern + month for day in [ante_num_pattern + numeric_day_pattern, letter_day_pattern] for month in [numeric_month_pattern + post_num_pattern, letter_month_pattern]] module-attribute
no_day_pattern = [letter_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern, ante_num_pattern + lz_numeric_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern] module-attribute
relative_date_pattern = relative_pattern module-attribute
since_pattern = ['(?<=depuis)' + '.{,5}' + pattern for pattern in absolute_date_pattern + no_year_pattern + full_date_pattern + [relative_pattern]] module-attribute
false_positive_pattern = make_pattern(['(\\d+' + delimiter_pattern + '){3,}\\d+', '\\d\\/\\d']) module-attribute
current
current_patterns: List[str] = ['cette\\sann[ée]e(?![-\\s]l[àa])', 'ce\\sjour', 'ces\\sjours[-\\s]ci', "aujourd'?hui", 'ce\\smois([-\\s]ci)?', 'cette\\ssemaine', 'cet?\\s([ée]t[ée]|automne|hiver|printemps)'] module-attribute
current_pattern = make_pattern(current_patterns, with_breaks=True) module-attribute
relative
ago_pattern = 'il\\s+y\\s+a\\s+.{,10}?\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)' module-attribute
in_pattern = 'dans\\s+.{,10}?\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)' module-attribute
last_pattern = "l['ae]\\s*(semaine|année|an|mois)\\s+derni[èe]re?" module-attribute
next_pattern = "l['ae]\\s*(semaine|année|an|mois)\\s+prochaine?" module-attribute
since_pattern = '(?<=depuis\\s)\\s*.{,10}\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)(\\s+derni[èe]re?)?' module-attribute
during_pattern = '(pendant|pdt|pour)\\s+.{,10}?\\s+(heures?|jours?|mois|ann[ée]es?|ans?)' module-attribute
week_patterns = ['(avant\\-?\\s*)?hier', '(apr[èe]s\\-?\\s*)?demain'] module-attribute
week_pattern = make_pattern(week_patterns, with_breaks=True) module-attribute
relative_pattern = make_pattern(patterns=[ago_pattern, in_pattern, last_pattern, next_pattern, since_pattern, week_pattern], with_breaks=True) module-attribute
atomic
time
hour_pattern = '(?<!\\d)(?P<hour>0?[1-9]|1\\d|2[0-3])(?!\\d)' module-attribute
lz_hour_pattern = '(?<!\\d)(?P<hour>0[1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
minute_pattern = '(?<!\\d)(?P<minute>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_minute_pattern = '(?<!\\d)(?P<minute>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
second_pattern = '(?<!\\d)(?P<second>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_second_pattern = '(?<!\\d)(?P<second>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
time_pattern = '(\\s.{,3}' + '{hour_pattern}[h:]({lz_minute_pattern})?' + '((:|m|min){lz_second_pattern})?' + ')?' module-attribute
years
year_patterns: List[str] = ['19\\d\\d'] + [str(year) for year in range(2000, date.today().year + 2)] module-attribute
full_year_pattern = '(?<!\\d)' + full_year_pattern + '(?!\\d)' module-attribute
year_pattern = '(?<!\\d)' + year_pattern + '(?!\\d)' module-attribute
months
letter_months_dict: Dict[str, int] = {'(janvier|janv\\.?)': 1, '(f[ée]vrier|f[ée]v\\.?)': 2, '(mars|mar\\.?)': 3, '(avril|avr\\.?)': 4, 'mai': 5, 'juin': 6, '(juillet|juill?\\.?)': 7, 'ao[uû]t': 8, '(septembre|sept?\\.?)': 9, '(octobre|oct\\.?)': 10, '(novembre|nov\\.)': 11, '(d[ée]cembre|d[ée]c\\.?)': 12} module-attribute
letter_months: List[str] = list(letter_months_dict.keys()) module-attribute
month_pattern = '(?P<month>{letter_month_pattern}|{numeric_month_pattern})' module-attribute
letter_month_pattern = '(?P<month>{letter_month_pattern})' module-attribute
numeric_month_pattern = '(?P<month>{numeric_month_pattern})' module-attribute
lz_numeric_month_pattern = '(?P<month>{lz_numeric_month_pattern})' module-attribute
days
letter_days_dict: Dict[str, int] = {'(premier|1\\s*er)': 1, 'deux': 2, 'trois': 3, 'quatre': 4, 'cinq': 5, 'six': 6, 'sept': 7, 'huit': 8, 'neuf': 9, 'dix': 10, 'onze': 11, 'douze': 12, 'treize': 13, 'quatorze': 14, 'quinze': 15, 'seize': 16, 'dix\\-?\\s*sept': 17, 'dix\\-?\\s*huit': 18, 'dix\\-?\\s*neuf': 19, 'vingt': 20, 'vingt\\-?\\s*et\\-?\\s*un': 21, 'vingt\\-?\\s*deux': 22, 'vingt\\-?\\s*trois': 23, 'vingt\\-?\\s*quatre': 24, 'vingt\\-?\\s*cinq': 25, 'vingt\\-?\\s*six': 26, 'vingt\\-?\\s*sept': 27, 'vingt\\-?\\s*huit': 28, 'vingt\\-?\\s*neuf': 29, 'trente': 30, 'trente\\-?\\s*et\\-?\\s*un': 31} module-attribute
letter_days: List[str] = list(letter_days_dict.keys()) module-attribute
nlz_numeric_day_pattern = '(?<!\\d)([1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
day_pattern = '(?P<day>{letter_day_pattern}|{numeric_day_pattern})' module-attribute
letter_day_pattern = '(?P<day>{letter_day_pattern})' module-attribute
numeric_day_pattern = '(?P<day>{numeric_day_pattern})' module-attribute
lz_numeric_day_pattern = '(?P<day>{lz_numeric_day_pattern})' module-attribute

measures

measures

Measure

Bases: abc.ABC

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measure(abc.ABC):
    INTEGER = r"(?:[0-9]+)"
    CONJUNCTIONS = "et|ou"
    COMPOSERS = r"[x*]|par"

    UNITS = {}
    COMPOSITE = None

    @abc.abstractmethod
    def __iter__(self) -> Iterable["SimpleMeasure"]:
        """
        Iterate over items of the measure (only one for SimpleMeasure)

        Returns
        -------
        iterable : Iterable["SimpleMeasure"]
        """

    @abc.abstractmethod
    def __getitem__(self, item) -> "SimpleMeasure":
        """
        Access items of the measure (only one for SimpleMeasure)

        Parameters
        ----------
        item : int

        Returns
        -------
        measure : SimpleMeasure
        """
INTEGER = '(?:[0-9]+)' class-attribute
CONJUNCTIONS = 'et|ou' class-attribute
COMPOSERS = '[x*]|par' class-attribute
UNITS = {} class-attribute
COMPOSITE = None class-attribute
__iter__()

Iterate over items of the measure (only one for SimpleMeasure)

RETURNS DESCRIPTION
iterable

TYPE: Iterable["SimpleMeasure"]

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __iter__(self) -> Iterable["SimpleMeasure"]:
    """
    Iterate over items of the measure (only one for SimpleMeasure)

    Returns
    -------
    iterable : Iterable["SimpleMeasure"]
    """
__getitem__(item)

Access items of the measure (only one for SimpleMeasure)

PARAMETER DESCRIPTION
item

TYPE: int

RETURNS DESCRIPTION
measure

TYPE: SimpleMeasure

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __getitem__(self, item) -> "SimpleMeasure":
    """
    Access items of the measure (only one for SimpleMeasure)

    Parameters
    ----------
    item : int

    Returns
    -------
    measure : SimpleMeasure
    """
SimpleMeasure

Bases: Measure

Source code in edsnlp/pipelines/misc/measures/measures.py
class SimpleMeasure(Measure):
    def __init__(self, value, unit):
        """
        The SimpleMeasure class contains the value and unit
        for a single non-composite measure

        Parameters
        ----------
        value : float
        unit : str
        """
        super().__init__()
        self.value = value
        self.unit = unit

    @classmethod
    @abc.abstractmethod
    def parse(
        self, int_part: str, dec_part: str, unit: str, infix: bool
    ) -> "SimpleMeasure":
        """
        Class method to create an instance from the match groups

        int_part : str
            The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
        dec_part : str
            The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
        unit : str
            The normalized variant of the unit (eg "m" for 12 metre 50)
        infix : bool
            Whether the unit was before (True) or after (False) the decimal part
        """

    def _get_scale_to(self, unit: str):
        return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]

    def __iter__(self):
        return iter((self,))

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        return [self][item]

    def __str__(self):
        return f"{self.value}{self.unit}"

    def __repr__(self):
        return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"

    def __eq__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) == other.value

    def __lt__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) < other.value

    def __le__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) <= other.value
value = value instance-attribute
unit = unit instance-attribute
__init__(value, unit)

The SimpleMeasure class contains the value and unit for a single non-composite measure

PARAMETER DESCRIPTION
value

TYPE: float

unit

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, value, unit):
    """
    The SimpleMeasure class contains the value and unit
    for a single non-composite measure

    Parameters
    ----------
    value : float
    unit : str
    """
    super().__init__()
    self.value = value
    self.unit = unit
parse(int_part, dec_part, unit, infix)

Class method to create an instance from the match groups

PARAMETER DESCRIPTION
int_part

The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)

TYPE: str

dec_part

The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)

TYPE: str

unit

The normalized variant of the unit (eg "m" for 12 metre 50)

TYPE: str

infix

Whether the unit was before (True) or after (False) the decimal part

TYPE: bool

Source code in edsnlp/pipelines/misc/measures/measures.py
@classmethod
@abc.abstractmethod
def parse(
    self, int_part: str, dec_part: str, unit: str, infix: bool
) -> "SimpleMeasure":
    """
    Class method to create an instance from the match groups

    int_part : str
        The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
    dec_part : str
        The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
    unit : str
        The normalized variant of the unit (eg "m" for 12 metre 50)
    infix : bool
        Whether the unit was before (True) or after (False) the decimal part
    """
_get_scale_to(unit)
Source code in edsnlp/pipelines/misc/measures/measures.py
def _get_scale_to(self, unit: str):
    return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter((self,))
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    return [self][item]
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return f"{self.value}{self.unit}"
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"
__eq__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __eq__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) == other.value
__lt__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __lt__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) < other.value
__le__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __le__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) <= other.value
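The comparison operators convert the right-hand measure into the left-hand one's units through the generated unit properties. For instance, with the Size measure from the patterns module below:

Size(1.0, "cm") == Size(10.0, "mm")  # True: compares Size(1.0, "cm").mm, i.e. 10.0, with 10.0
Size(5.0, "mm") < Size(1.0, "cm")    # True: compares Size(5.0, "mm").cm, i.e. 0.5, with 1.0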
CompositeMeasure

Bases: Measure

The CompositeMeasure class contains a sequence of multiple SimpleMeasure instances

PARAMETER DESCRIPTION
measures

TYPE: List[SimpleMeasure]

Source code in edsnlp/pipelines/misc/measures/measures.py
class CompositeMeasure(Measure):
    """
    The CompositeMeasure class contains a sequence
    of multiple SimpleMeasure instances

    Parameters
    ----------
    measures : List[SimpleMeasure]
    """

    def __init__(self, measures: Iterable["SimpleMeasure"]):
        super().__init__()
        self.measures = list(measures)

    def __iter__(self):
        return iter(self.measures)

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        res = self.measures[item]
        return res

    def __str__(self):
        return " x ".join(map(str, self.measures))

    def __repr__(self):
        return f"{self.__class__.__name__}({repr(self.measures)})"
measures = list(measures) instance-attribute
__init__(measures)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, measures: Iterable["SimpleMeasure"]):
    super().__init__()
    self.measures = list(measures)
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter(self.measures)
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    res = self.measures[item]
    return res
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return " x ".join(map(str, self.measures))
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({repr(self.measures)})"
Measures

Bases: BaseComponent

Matcher component to extract measures. A measure is most often composed of a number and a unit, like

> 1,26 cm

The unit can also be positioned in place of the decimal dot/comma:

> 1 cm 26

Some measures can be composite:

> 1,26 cm x 2,34 mm

And sometimes they are factorized:

> Les trois kystes mesurent 1, 2 et 3cm.

The recognized measures are stored in the "measures" SpanGroup. Each span has a Measure object stored in the "value" extension attribute.

PARAMETER DESCRIPTION
nlp

The SpaCy object.

TYPE: Language

measures

The registry names of the measures to extract

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('NORM')

TYPE: str

ignore_excluded

Whether to exclude pollution patterns when matching in the text

TYPE: bool

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measures(BaseComponent):
    """
    Matcher component to extract measures.
    A measure is most often composed of a number and a unit like
    > 1,26 cm
    The unit can also be positioned in place of the decimal dot/comma
    > 1 cm 26
    Some measures can be composite
    > 1,26 cm x 2,34 mm
    And sometimes they are factorized
    > Les trois kystes mesurent 1, 2 et 3cm.

    The recognized measures are stored in the "measures" SpanGroup.
    Each span has a `Measure` object stored in the "value" extension attribute.

    Parameters
    ----------
    nlp : Language
        The SpaCy object.
    measures : List[str]
        The registry names of the measures to extract
    attr : str
        Whether to match on the text ('TEXT') or on the normalized text ('NORM')
    ignore_excluded : bool
        Whether to exclude pollution patterns when matching in the text
    """

    def __init__(
        self,
        nlp: Language,
        measures: List[str],
        attr: str,
        ignore_excluded: bool,
    ):

        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.extraction_regexes = {}
        self.measures: Dict[str, Measure] = {}
        for name in measures:
            cls: Measure = spacy.registry.misc.get(name)
            self.measures[name] = cls
            regexes = make_patterns(cls)
            self.regex_matcher.add(name, regexes["trigger"])
            self.extraction_regexes[name] = regexes["extraction"]

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Measures, Measures).set_extensions()
        if not Span.has_extension("value"):
            Span.set_extension("value", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds measures to the document's "measures" SpanGroup.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

        # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
        # while keeping the corresponding groupdicts
        matches = {
            match: matches[match]
            for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
        }

        measures = []
        for match, groupdict in matches.items():
            measure_name = match.label_
            extraction_regex = self.extraction_regexes[measure_name]

            parsed_values = []

            shared_unit_part = next(
                (key for key, val in groupdict.items() if val is not None), None
            )
            for sub_match in regex.finditer(extraction_regex, match.text):
                sub_groupdict = dict(sub_match.groupdict())

                # Integer part of the match
                int_part = sub_groupdict.pop("int_part", 0)

                # Decimal part of the match, if any
                dec_part = sub_groupdict.pop("dec_part", 0) or 0

                # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
                # the unit must be infix: we extract it now using non empty groupdict
                # entries
                infix_unit_part = next(
                    (key for key, val in sub_groupdict.items() if val is not None),
                    None,
                )
                unit_part = infix_unit_part or shared_unit_part

                # Create one SimpleMeasure per submatch inside each match...
                parsed_values.append(
                    self.measures[measure_name].parse(
                        int_part=int_part,
                        dec_part=dec_part,
                        unit=unit_part,
                        infix=infix_unit_part is not None,
                    )
                )

            # ... and compose these measures together if there are more than one
            measure = Span(doc, start=match.start, end=match.end, label=measure_name)
            measure._.value = (
                parsed_values[0]
                if len(parsed_values) == 1
                else self.measures[measure_name].COMPOSITE(parsed_values)
                if self.measures[measure_name].COMPOSITE is not None
                else parsed_values[-1]
            )
            measures.append(match)

        doc.spans["measures"] = sorted(measures)

        return doc
regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute
extraction_regexes = {} instance-attribute
measures: Dict[str, Measure] = {} instance-attribute
__init__(nlp, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(
    self,
    nlp: Language,
    measures: List[str],
    attr: str,
    ignore_excluded: bool,
):

    self.regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.extraction_regexes = {}
    self.measures: Dict[str, Measure] = {}
    for name in measures:
        cls: Measure = spacy.registry.misc.get(name)
        self.measures[name] = cls
        regexes = make_patterns(cls)
        self.regex_matcher.add(name, regexes["trigger"])
        self.extraction_regexes[name] = regexes["extraction"]

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/measures/measures.py
@staticmethod
def set_extensions() -> None:
    super(Measures, Measures).set_extensions()
    if not Span.has_extension("value"):
        Span.set_extension("value", default=None)
__call__(doc)

Adds measures to the document's "measures" SpanGroup.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/misc/measures/measures.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds measures to the document's "measures" SpanGroup.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

    # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
    # while keeping the corresponding groupdicts
    matches = {
        match: matches[match]
        for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
    }

    measures = []
    for match, groupdict in matches.items():
        measure_name = match.label_
        extraction_regex = self.extraction_regexes[measure_name]

        parsed_values = []

        shared_unit_part = next(
            (key for key, val in groupdict.items() if val is not None), None
        )
        for sub_match in regex.finditer(extraction_regex, match.text):
            sub_groupdict = dict(sub_match.groupdict())

            # Integer part of the match
            int_part = sub_groupdict.pop("int_part", 0)

            # Decimal part of the match, if any
            dec_part = sub_groupdict.pop("dec_part", 0) or 0

            # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
            # the unit must be infix: we extract it now using non empty groupdict
            # entries
            infix_unit_part = next(
                (key for key, val in sub_groupdict.items() if val is not None),
                None,
            )
            unit_part = infix_unit_part or shared_unit_part

            # Create one SimpleMeasure per submatch inside each match...
            parsed_values.append(
                self.measures[measure_name].parse(
                    int_part=int_part,
                    dec_part=dec_part,
                    unit=unit_part,
                    infix=infix_unit_part is not None,
                )
            )

        # ... and compose these measures together if there are more than one
        measure = Span(doc, start=match.start, end=match.end, label=measure_name)
        measure._.value = (
            parsed_values[0]
            if len(parsed_values) == 1
            else self.measures[measure_name].COMPOSITE(parsed_values)
            if self.measures[measure_name].COMPOSITE is not None
            else parsed_values[-1]
        )
        measures.append(match)

    doc.spans["measures"] = sorted(measures)

    return doc
disj_capture(regexes, capture=True)
Source code in edsnlp/pipelines/misc/measures/measures.py
def disj_capture(regexes, capture=True):
    return "|".join(
        ("(?P<{key}>{forms})" if capture else "{forms}").format(
            key=key, forms="|".join(forms)
        )
        for key, forms in regexes.items()
    )
rightmost_largest_sort_key(span)
Source code in edsnlp/pipelines/misc/measures/measures.py
def rightmost_largest_sort_key(span):
    return span.end, (len(span))
make_patterns(measure)

Build recognition and extraction patterns for a given Measure class

PARAMETER DESCRIPTION
measure

The measure to build recognition and extraction patterns for

TYPE: 'Measure'

RETURNS DESCRIPTION
trigger

TYPE: List[str]

extraction

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def make_patterns(measure: "Measure") -> Dict[str, Union[List[str], str]]:
    """
    Build recognition and extraction patterns for a given Measure class

    Parameters
    ----------
    measure: Measure class
        The measure to build recognition and extraction patterns for

    Returns
    -------
    trigger : List[str]
    extraction : str
    """
    unit_prefix_reg = disj_capture(
        {key: [entry["prefix"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_abbreviation_reg = disj_capture(
        {key: [entry["abbr"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_reg = rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"

    number_reg = rf"(?:{measure.INTEGER}(?:[,.]{measure.INTEGER})?)"
    infix_measure_reg = rf"(?:{measure.INTEGER}{unit_reg}{measure.INTEGER})"

    # Simple measure
    simple_measure_reg = rf"{number_reg}\s*{unit_reg}"
    trigger = [
        simple_measure_reg,
        infix_measure_reg,
        # Factorized measures separated by a conjunction
        rf"{number_reg}(?=(?:\s*[,]\s*{number_reg})*\s*"
        rf"(?:{measure.CONJUNCTIONS})\s*{number_reg}\s*{unit_reg})",
    ]
    if measure.COMPOSITE:
        # Factorized composite measures (3 x 2cm)
        trigger.append(
            rf"(?<![a-z]){number_reg}"
            rf"(?:\s*(?:{measure.COMPOSERS})\s*{number_reg})*\s*{unit_reg}"
        )
        # Expanded composite measures (3cm x 2cm)
        trigger.append(
            rf"(?<![a-z])(?:{infix_measure_reg}|{simple_measure_reg})"
            rf"(\s*(?:{measure.COMPOSERS})\s*"
            rf"(?:{infix_measure_reg}|{simple_measure_reg}))*"
        )

    unit_reg_capture = (
        rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"
    )

    return {
        "trigger": trigger,
        "extraction": rf"(?P<int_part>{measure.INTEGER})\s*(?:[,.]|"
        rf"{unit_reg_capture})?\s*(?P<dec_part>{measure.INTEGER})?",
    }
make_simple_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_simple_getter(name):
    def getter(self):
        """
        Get a scaled numerical value of a measure

        Parameters
        ----------
        self

        Returns
        -------
        float
        """
        return self.value * self._get_scale_to(name)

    return getter
make_multi_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_multi_getter(name: str) -> Callable[["CompositeMeasure"], Tuple[float]]:
    def getter(self) -> Tuple[float]:
        """
        Get the scaled numerical values of a multi-measure

        Parameters
        ----------
        self

        Returns
        -------
        Tuple[float]
        """
        return tuple(getattr(measure, name) for measure in self.measures)

    return getter

patterns

CompositeSize

Bases: CompositeMeasure

Composite size measure. Supports the following units:
- mm
- cm
- dm
- m

Source code in edsnlp/pipelines/misc/measures/patterns.py
class CompositeSize(CompositeMeasure):
    """
    Composite size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    mm = property(make_multi_getter("mm"))
    cm = property(make_multi_getter("cm"))
    dm = property(make_multi_getter("dm"))
    m = property(make_multi_getter("m"))
mm = property(make_multi_getter('mm')) class-attribute
cm = property(make_multi_getter('cm')) class-attribute
dm = property(make_multi_getter('dm')) class-attribute
m = property(make_multi_getter('m')) class-attribute
Size

Bases: SimpleMeasure

Size measure. Supports the following units:

  • mm
  • cm
  • dm
  • m

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.size")
class Size(SimpleMeasure):
    """
    Size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    COMPOSITE = CompositeSize
    UNITS = {
        "mm": {"prefix": "mill?im", "abbr": "mm", "value": 1},
        "cm": {"prefix": "centim", "abbr": "cm", "value": 10},
        "dm": {"prefix": "decim", "abbr": "dm", "value": 100},
        "m": {"prefix": "metre", "abbr": "m", "value": 1000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mm = property(make_simple_getter("mm"))
    cm = property(make_simple_getter("cm"))
    dm = property(make_simple_getter("dm"))
    m = property(make_simple_getter("m"))
COMPOSITE = CompositeSize class-attribute
UNITS = {'mm': {'prefix': 'mill?im', 'abbr': 'mm', 'value': 1}, 'cm': {'prefix': 'centim', 'abbr': 'cm', 'value': 10}, 'dm': {'prefix': 'decim', 'abbr': 'dm', 'value': 100}, 'm': {'prefix': 'metre', 'abbr': 'm', 'value': 1000}} class-attribute
mm = property(make_simple_getter('mm')) class-attribute
cm = property(make_simple_getter('cm')) class-attribute
dm = property(make_simple_getter('dm')) class-attribute
m = property(make_simple_getter('m')) class-attribute
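A minimal sketch of parsing a value directly with this class; the arguments mirror the named groups of the extraction pattern built by make_patterns:

from edsnlp.pipelines.misc.measures.patterns import Size

size = Size.parse(int_part="3", dec_part="5", unit="cm")  # e.g. from "3,5 cm"
print(size.cm)  # 3.5
print(size.mm)  # 35.0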
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
Weight

Bases: SimpleMeasure

Weight measure. Supports the following units:

  • mg
  • cg
  • dg
  • g
  • kg

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.weight")
class Weight(SimpleMeasure):
    """
    Weight measure. Supports the following units:
    - mg
    - cg
    - dg
    - g
    - kg
    """

    COMPOSITE = None
    UNITS = {
        "mg": {"prefix": "mill?ig", "abbr": "mg", "value": 1},
        "cg": {"prefix": "centig", "abbr": "cg", "value": 10},
        "dg": {"prefix": "decig", "abbr": "dg", "value": 100},
        "g": {"prefix": "gram", "abbr": "g", "value": 1000},
        "kg": {"prefix": "kilo", "abbr": "kg", "value": 1000000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mg = property(make_simple_getter("mg"))
    cg = property(make_simple_getter("cg"))
    dg = property(make_simple_getter("dg"))
    g = property(make_simple_getter("g"))
    kg = property(make_simple_getter("kg"))
COMPOSITE = None class-attribute
UNITS = {'mg': {'prefix': 'mill?ig', 'abbr': 'mg', 'value': 1}, 'cg': {'prefix': 'centig', 'abbr': 'cg', 'value': 10}, 'dg': {'prefix': 'decig', 'abbr': 'dg', 'value': 100}, 'g': {'prefix': 'gram', 'abbr': 'g', 'value': 1000}, 'kg': {'prefix': 'kilo', 'abbr': 'kg', 'value': 1000000}} class-attribute
mg = property(make_simple_getter('mg')) class-attribute
cg = property(make_simple_getter('cg')) class-attribute
dg = property(make_simple_getter('dg')) class-attribute
g = property(make_simple_getter('g')) class-attribute
kg = property(make_simple_getter('kg')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
Angle

Bases: SimpleMeasure

Angle measure. Supports the following units:

  • h

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.angle")
class Angle(SimpleMeasure):
    """
    Angle measure. Supports the following units:
    - h
    """

    COMPOSITE = None
    UNITS = {
        "h": {"prefix": "heur", "abbr": "h", "value": 1},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        if infix:
            result = float(int_part) + int(dec_part) / 60.0
            return cls(result, unit)
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    h = property(make_simple_getter("h"))
COMPOSITE = None class-attribute
UNITS = {'h': {'prefix': 'heur', 'abbr': 'h', 'value': 1}} class-attribute
h = property(make_simple_getter('h')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    if infix:
        result = float(int_part) + int(dec_part) / 60.0
        return cls(result, unit)
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
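
The infix flag changes how the decimal part is read: in an infix match such as "3h15", the second number is interpreted as minutes rather than as a decimal fraction. A quick sketch:

from edsnlp.pipelines.misc.measures.patterns import Angle

print(Angle.parse("3", "15", "h", infix=True).h)  # 3.25 (15 minutes = 0.25 h)
print(Angle.parse("3", "5", "h").h)               # 3.5 (plain decimal form)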

factory

DEFAULT_CONFIG = dict(attr='NORM', ignore_excluded=False, measures=['eds.measures.size', 'eds.measures.weight', 'eds.measures.angle']) module-attribute
create_component(nlp, name, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/factory.py
@Language.factory("eds.measures", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    measures: Union[str, List[str], Dict[str, Dict]],
    attr: str,
    ignore_excluded: bool,
):
    return Measures(
        nlp,
        measures=measures,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
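
A minimal pipeline sketch; the default attr is NORM, hence the normalizer. The doc.spans["measures"] key and the _.value extension are assumptions, by analogy with the other components documented here:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.measures")  # size, weight and angle measures by default

doc = nlp("Nodule de 3 x 2cm, poids 1,5 kg.")
for measure in doc.spans["measures"]:  # assumed output key
    print(measure, measure._.value)    # assumed extension holding the parsed measure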

consultation_dates

patterns

consultation_mention = ['rendez-vous pris', 'consultation', 'consultation.{1,8}examen', 'examen clinique', 'de compte rendu', "date de l'examen", 'examen realise le', 'date de la visite'] module-attribute
town_mention = ['paris', 'kremlin.bicetre', 'creteil', 'boulogne.billancourt', 'villejuif', 'clamart', 'bobigny', 'clichy', 'ivry.sur.seine', 'issy.les.moulineaux', 'draveil', 'limeil', 'champcueil', 'roche.guyon', 'bondy', 'colombes', 'hendaye', 'herck.sur.mer', 'labruyere', 'garches', 'sevran', 'hyeres'] module-attribute
document_date_mention = ['imprime le', 'signe electroniquement', 'signe le', 'saisi le', 'dicte le', 'tape le', 'date de reference', 'date\\s*:', 'dactylographie le', 'date du rapport'] module-attribute

consultation_dates

ConsultationDates

Bases: GenericMatcher

Class to extract consultation dates from "CR-CONS" documents.

The pipeline populates the doc.spans['consultation_dates'] list.

For each extraction s in this list, the corresponding date is available as s._.consultation_date.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

consultation_mention

List of RegEx for consultation mentions.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the feature if False

TYPE: Union[List[str], bool]

town_mention

List of RegEx for all AP-HP hospitals' town mentions.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the feature if False

TYPE: Union[List[str], bool]

document_date_mention

List of RegEx for document dates.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the feature if False

TYPE: Union[List[str], bool]
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
class ConsultationDates(GenericMatcher):
    """
    Class to extract consultation dates from "CR-CONS" documents.

    The pipeline populates the `#!python doc.spans['consultation_dates']` list.

    For each extraction `s` in this list, the corresponding date is available
    as `s._.consultation_date`.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    consultation_mention : Union[List[str], bool]
        List of RegEx for consultation mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if `True`, disables the feature if `False`

    town_mention : Union[List[str], bool]
        List of RegEx for all AP-HP hospitals' towns mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if `True`, disables the feature if `False`
    document_date_mention : Union[List[str], bool]
        List of RegEx for document date.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if `True`, disables the feature if `False`
    """

    def __init__(
        self,
        nlp: Language,
        consultation_mention: Union[List[str], bool],
        town_mention: Union[List[str], bool],
        document_date_mention: Union[List[str], bool],
        attr: str,
        **kwargs,
    ):

        logger.warning("This pipeline is still in beta")
        logger.warning(
            "This pipeline should ONLY be used on notes "
            "where `note_class_source_value == 'CR-CONS'`"
        )
        logger.warning(
            """This pipeline requires to use the normalizer pipeline with:
        lowercase=True,
        accents=True,
        quotes=True"""
        )

        if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

            config = dict(**DEFAULT_CONFIG)
            config["on_ents_only"] = "consultation_mentions"

            self.date_matcher = Dates(nlp, **config)

        else:
            self.date_matcher = None

        if not consultation_mention:
            consultation_mention = []
        elif consultation_mention is True:
            consultation_mention = consult_regex.consultation_mention

        if not document_date_mention:
            document_date_mention = []
        elif document_date_mention is True:
            document_date_mention = consult_regex.document_date_mention

        if not town_mention:
            town_mention = []
        elif town_mention is True:
            town_mention = consult_regex.town_mention

        regex = dict(
            consultation_mention=consultation_mention,
            town_mention=town_mention,
            document_date_mention=document_date_mention,
        )

        super().__init__(
            nlp,
            regex=regex,
            terms=dict(),
            attr=attr,
            ignore_excluded=False,
            **kwargs,
        )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Span.has_extension("consultation_date"):
            Span.set_extension("consultation_date", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Finds entities

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
    doc: spaCy Doc object with additional doc.spans['consultation_dates'] SpanGroup
        """

        ents = self.process(doc)

        doc.spans["consultation_mentions"] = ents
        doc.spans["consultation_dates"] = []

        if self.date_matcher is not None:
            doc = self.date_matcher(doc)

        for mention in ents:
            # Looking for a date
            # - In the same sentence
            # - Not less than 10 tokens AFTER the consultation mention
            matching_dates = [
                date
                for date in doc.spans["dates"]
                if (
                    (mention.sent == date.sent)
                    and (date.start > mention.start)
                    and (date.start - mention.end <= 10)
                )
            ]

            if matching_dates:
                # We keep the first mention of a date
                kept_date = min(matching_dates, key=lambda d: d.start)
                span = doc[mention.start : kept_date.end]
                span.label_ = mention.label_
                span._.consultation_date = kept_date._.parsed_date

                doc.spans["consultation_dates"].append(span)

        del doc.spans["consultation_mentions"]

        return doc
date_matcher = Dates(nlp, **config) instance-attribute
__init__(nlp, consultation_mention, town_mention, document_date_mention, attr, **kwargs)
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __init__(
    self,
    nlp: Language,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
    attr: str,
    **kwargs,
):

    logger.warning("This pipeline is still in beta")
    logger.warning(
        "This pipeline should ONLY be used on notes "
        "where `note_class_source_value == 'CR-CONS'`"
    )
    logger.warning(
        """This pipeline requires to use the normalizer pipeline with:
    lowercase=True,
    accents=True,
    quotes=True"""
    )

    if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

        config = dict(**DEFAULT_CONFIG)
        config["on_ents_only"] = "consultation_mentions"

        self.date_matcher = Dates(nlp, **config)

    else:
        self.date_matcher = None

    if not consultation_mention:
        consultation_mention = []
    elif consultation_mention is True:
        consultation_mention = consult_regex.consultation_mention

    if not document_date_mention:
        document_date_mention = []
    elif document_date_mention is True:
        document_date_mention = consult_regex.document_date_mention

    if not town_mention:
        town_mention = []
    elif town_mention is True:
        town_mention = consult_regex.town_mention

    regex = dict(
        consultation_mention=consultation_mention,
        town_mention=town_mention,
        document_date_mention=document_date_mention,
    )

    super().__init__(
        nlp,
        regex=regex,
        terms=dict(),
        attr=attr,
        ignore_excluded=False,
        **kwargs,
    )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
@staticmethod
def set_extensions() -> None:
    if not Span.has_extension("consultation_date"):
        Span.set_extension("consultation_date", default=None)
__call__(doc)

Finds entities

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object with an additional doc.spans['consultation_dates'] SpanGroup
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Finds entities

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object with additional doc.spans['consultation_dates'] SpanGroup
    """

    ents = self.process(doc)

    doc.spans["consultation_mentions"] = ents
    doc.spans["consultation_dates"] = []

    if self.date_matcher is not None:
        doc = self.date_matcher(doc)

    for mention in ents:
        # Looking for a date
        # - In the same sentence
        # - Not less than 10 tokens AFTER the consultation mention
        matching_dates = [
            date
            for date in doc.spans["dates"]
            if (
                (mention.sent == date.sent)
                and (date.start > mention.start)
                and (date.start - mention.end <= 10)
            )
        ]

        if matching_dates:
            # We keep the first mention of a date
            kept_date = min(matching_dates, key=lambda d: d.start)
            span = doc[mention.start : kept_date.end]
            span.label_ = mention.label_
            span._.consultation_date = kept_date._.parsed_date

            doc.spans["consultation_dates"].append(span)

    del doc.spans["consultation_mentions"]

    return doc

factory

DEFAULT_CONFIG = dict(consultation_mention=True, town_mention=False, document_date_mention=False, attr='NORM') module-attribute
create_component(nlp, name, attr, consultation_mention, town_mention, document_date_mention)
Source code in edsnlp/pipelines/misc/consultation_dates/factory.py
@deprecated_factory(
    "consultation_dates",
    "eds.consultation_dates",
    default_config=DEFAULT_CONFIG,
)
@Language.factory("eds.consultation_dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
):
    return ConsultationDates(
        nlp,
        attr=attr,
        consultation_mention=consultation_mention,
        document_date_mention=document_date_mention,
        town_mention=town_mention,
    )
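
A minimal usage sketch, following the warnings above: the normalizer is configured with lowercase, accents and quotes, and eds.sentences is added because the mention/date pairing relies on sentence boundaries:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe(
    "eds.normalizer",
    config=dict(lowercase=True, accents=True, quotes=True),
)
nlp.add_pipe("eds.consultation_dates")

doc = nlp("Consultation du 03/10/2018. Examen clinique sans particularité.")
for span in doc.spans["consultation_dates"]:
    print(span, span._.consultation_date)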

reason

patterns

reasons = dict(reasons=['(?i)motif de l.?hospitalisation : .+', '(?i)hospitalis[ée].?.*(pour|. cause|suite [àa]).+', '(?i)(consulte|prise en charge(?!\\set\\svous\\sassurer\\sun\\straitement\\sadapté)).*pour.+', '(?i)motif\\sd.hospitalisation\\s:.+', '(?i)au total\\s?\\:?\\s?\\n?.+', '(?i)motif\\sde\\sla\\sconsultation', '(?i)motif\\sd.admission', '(?i)conclusion\\smedicale']) module-attribute
sections_reason = ['motif', 'conclusion'] module-attribute
section_exclude = ['antécédents', 'antécédents familiaux', 'histoire de la maladie'] module-attribute

reason

Reason

Bases: GenericMatcher

Pipeline to identify the reason of the hospitalisation.

It declares a Span extension called ents_reason and adds the key reasons to doc.spans.

It also declares the boolean extension is_reason. This extension is set to True for the Reason Spans but also for the entities that overlap the reason span.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

reasons

The terminology of reasons.

TYPE: Optional[Dict[str, Union[List[str], str]]]

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'. We can also add a key for each regex.

TYPE: str

use_sections

Whether or not to use the sections pipeline to improve results.

TYPE: bool

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/misc/reason/reason.py
class Reason(GenericMatcher):
    """Pipeline to identify the reason of the hospitalisation.

    It declares a Span extension called `ents_reason` and adds
    the key `reasons` to doc.spans.

    It also declares the boolean extension `is_reason`.
    This extension is set to True for the Reason Spans but also
    for the entities that overlap the reason span.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    reasons : Optional[Dict[str, Union[List[str], str]]]
        The terminology of reasons.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with
        the key 'term_attr'. We can also add a key for each regex.
    use_sections : bool
        whether or not to use the `sections` pipeline to improve results.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        reasons: Optional[Dict[str, Union[List[str], str]]],
        attr: Union[Dict[str, str], str],
        use_sections: bool,
        ignore_excluded: bool,
    ):

        if reasons is None:
            reasons = patterns.reasons

        super().__init__(
            nlp,
            terms=None,
            regex=reasons,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.use_sections = use_sections and (
            "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
        )
        if use_sections and not self.use_sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `eds.section` pipeline, but it was not set. "
                "Skipping that step."
            )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Span.has_extension("ents_reason"):
            Span.set_extension("ents_reason", default=None)

        if not Span.has_extension("is_reason"):
            Span.set_extension("is_reason", default=False)

    def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
        """Enhance the list of reasons with the section information.
        If a reason overlaps with the history section, it is removed from the list

        Parameters
        ----------
        sections : Iterable
            Spans of sections identified with the `sections` pipeline
        reasons : Iterable
            Reasons list identified by the regex

        Returns
        -------
        List
            Updated list of spans reasons
        """

        for section in sections:
            if section.label_ in patterns.sections_reason:
                reasons.append(section)

            if section.label_ in patterns.section_exclude:
                # Iterate over a copy: removing items while iterating skips elements
                for reason in list(reasons):
                    if check_inclusion(reason, section.start, section.end):
                        reasons.remove(reason)

        return reasons

    def __call__(self, doc: Doc) -> Doc:
        """Find spans related to the reasons of the hospitalisation

        Parameters
        ----------
        doc : Doc

        Returns
        -------
        Doc
        """
        matches = self.process(doc)
        reasons = get_spans(matches, "reasons")

        if self.use_sections:
            sections = doc.spans["sections"]
            reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

        doc.spans["reasons"] = reasons

        # Entities
        if len(doc.ents) > 0:
            for reason in reasons:  # TODO optimize this iteration
                ent_list = []
                for ent in doc.ents:
                    if check_inclusion(ent, reason.start, reason.end):
                        ent_list.append(ent)
                        ent._.is_reason = True

                reason._.ents_reason = ent_list
                reason._.is_reason = True

        return doc
use_sections = use_sections and ('eds.sections' in self.nlp.pipe_names or 'sections' in self.nlp.pipe_names) instance-attribute
__init__(nlp, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/reason.py
def __init__(
    self,
    nlp: Language,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: Union[Dict[str, str], str],
    use_sections: bool,
    ignore_excluded: bool,
):

    if reasons is None:
        reasons = patterns.reasons

    super().__init__(
        nlp,
        terms=None,
        regex=reasons,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.use_sections = use_sections and (
        "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
    )
    if use_sections and not self.use_sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `eds.section` pipeline, but it was not set. "
            "Skipping that step."
        )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/reason/reason.py
@staticmethod
def set_extensions() -> None:

    if not Span.has_extension("ents_reason"):
        Span.set_extension("ents_reason", default=None)

    if not Span.has_extension("is_reason"):
        Span.set_extension("is_reason", default=False)
_enhance_with_sections(sections, reasons)

Enhance the list of reasons with the section information. If a reason overlaps with the history section, it is removed from the list

PARAMETER DESCRIPTION
sections

Spans of sections identified with the sections pipeline

TYPE: Iterable

reasons

Reasons list identified by the regex

TYPE: Iterable

RETURNS DESCRIPTION
List

Updated list of spans reasons

Source code in edsnlp/pipelines/misc/reason/reason.py
def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
    """Enhance the list of reasons with the section information.
    If a reason overlaps with the history section, it is removed from the list

    Parameters
    ----------
    sections : Iterable
        Spans of sections identified with the `sections` pipeline
    reasons : Iterable
        Reasons list identified by the regex

    Returns
    -------
    List
        Updated list of spans reasons
    """

    for section in sections:
        if section.label_ in patterns.sections_reason:
            reasons.append(section)

        if section.label_ in patterns.section_exclude:
            # Iterate over a copy: removing items while iterating skips elements
            for reason in list(reasons):
                if check_inclusion(reason, section.start, section.end):
                    reasons.remove(reason)

    return reasons
__call__(doc)

Find spans related to the reasons of the hospitalisation

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
Doc
Source code in edsnlp/pipelines/misc/reason/reason.py
def __call__(self, doc: Doc) -> Doc:
    """Find spans related to the reasons of the hospitalisation

    Parameters
    ----------
    doc : Doc

    Returns
    -------
    Doc
    """
    matches = self.process(doc)
    reasons = get_spans(matches, "reasons")

    if self.use_sections:
        sections = doc.spans["sections"]
        reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

    doc.spans["reasons"] = reasons

    # Entities
    if len(doc.ents) > 0:
        for reason in reasons:  # TODO optimize this iteration
            ent_list = []
            for ent in doc.ents:
                if check_inclusion(ent, reason.start, reason.end):
                    ent_list.append(ent)
                    ent._.is_reason = True

            reason._.ents_reason = ent_list
            reason._.is_reason = True

    return doc

factory

DEFAULT_CONFIG = dict(reasons=None, attr='TEXT', use_sections=False, ignore_excluded=False) module-attribute
create_component(nlp, name, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/factory.py
@deprecated_factory("reason", "eds.reason", default_config=DEFAULT_CONFIG)
@Language.factory("eds.reason", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: str,
    use_sections: bool,
    ignore_excluded: bool,
):
    return Reason(
        nlp,
        reasons=reasons,
        attr=attr,
        use_sections=use_sections,
        ignore_excluded=ignore_excluded,
    )
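
A minimal usage sketch with the default configuration (matching on the raw text, attr="TEXT"); note that _.ents_reason and _.is_reason are only filled when entities are present, i.e. when a NER component runs earlier in the pipeline:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.reason")

doc = nlp("Patient hospitalisé pour douleur thoracique.")
for reason in doc.spans["reasons"]:
    print(reason)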

sections

patterns

These section titles were extracted from a work performed by Ivan Lerner at AP-HP, which supplied a number of documents annotated for section titles.

The section titles were reviewed by Gilles Chatellier, who gave meaningful insights.

See the sections/section-dataset notebook for details.

allergies = ['allergies'] module-attribute
antecedents = ['antecedents', 'antecedents medicaux et chirurgicaux', 'antecedents personnels', 'antecedents medicaux', 'antecedents chirurgicaux', 'atcd'] module-attribute
antecedents_familiaux = ['antecedents familiaux'] module-attribute
traitements_entree = ['attitude therapeutique initiale', "traitement a l'entree", 'traitement actuel', 'traitement en cours', "traitements a l'entree"] module-attribute
conclusion = ['au total', 'conclusion', 'conclusion de sortie', 'syntese medicale / conclusion', 'synthese', 'synthese medicale', 'synthese medicale/conclusion', 'conclusion medicale'] module-attribute
conclusion_entree = ["conclusion a l'entree"] module-attribute
habitus = ['contexte familial et social', 'habitus', 'mode de vie', 'mode de vie - scolarite', 'situation sociale, mode de vie'] module-attribute
correspondants = ['correspondants'] module-attribute
diagnostic = ['diagnostic retenu'] module-attribute
donnees_biometriques_entree = ["donnees biometriques et parametres vitaux a l'entree", "parametres vitaux et donnees biometriques a l'entree"] module-attribute
examens = ['examen clinique', "examen clinique a l'entree"] module-attribute
examens_complementaires = ['examen(s) complementaire(s)', 'examens complementaires', "examens complementaires a l'entree", 'examens complementaires realises pendant le sejour', 'examens para-cliniques'] module-attribute
facteurs_de_risques = ['facteurs de risque', 'facteurs de risques'] module-attribute
histoire_de_la_maladie = ['histoire de la maladie', 'histoire de la maladie - explorations', 'histoire de la maladie actuelle', 'histoire du poids', 'histoire recente', 'histoire recente de la maladie', 'rappel clinique', 'resume', 'resume clinique'] module-attribute
actes = ['intervention'] module-attribute
motif = ['motif', "motif d'hospitalisation", "motif de l'hospitalisation", 'motif medical'] module-attribute
prescriptions = ['prescriptions de sortie', 'prescriptions medicales de sortie'] module-attribute
traitements_sortie = ['traitement de sortie'] module-attribute
sections = {'allergies': allergies, 'antécédents': antecedents, 'antécédents familiaux': antecedents_familiaux, 'traitements entrée': traitements_entree, 'conclusion': conclusion, 'conclusion entrée': conclusion_entree, 'habitus': habitus, 'correspondants': correspondants, 'diagnostic': diagnostic, 'données biométriques entrée': donnees_biometriques_entree, 'examens': examens, 'examens complémentaires': examens_complementaires, 'facteurs de risques': facteurs_de_risques, 'histoire de la maladie': histoire_de_la_maladie, 'actes': actes, 'motif': motif, 'prescriptions': prescriptions, 'traitements sortie': traitements_sortie} module-attribute

sections

Sections

Bases: GenericMatcher

Divides the document into sections.

By default, we use a dataset of documents annotated for section titles, based on the work done by Ivan Lerner and reviewed by Gilles Chatellier.

Detected sections are:

  • allergies ;
  • antécédents ;
  • antécédents familiaux ;
  • traitements entrée ;
  • conclusion ;
  • conclusion entrée ;
  • habitus ;
  • correspondants ;
  • diagnostic ;
  • données biométriques entrée ;
  • examens ;
  • examens complémentaires ;
  • facteurs de risques ;
  • histoire de la maladie ;
  • actes ;
  • motif ;
  • prescriptions ;
  • traitements sortie.

The component looks for section titles within the document, and stores them in the section_title extension.

For ease-of-use, the component also populates a section extension, which contains a list of spans corresponding to the "sections" of the document. These span from the start of one section title to the next, which can introduce obvious bias should an intermediate section title go undetected.

PARAMETER DESCRIPTION
nlp

spaCy pipeline object.

TYPE: Language

sections

Dictionary of terms to look for.

TYPE: Dict[str, List[str]]

attr

Default attribute to match on.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/misc/sections/sections.py
class Sections(GenericMatcher):
    """
    Divides the document into sections.

    By default, we use a dataset of documents annotated for section titles,
    based on the work done by Ivan Lerner and reviewed by Gilles Chatellier.

    Detected sections are :

    - allergies ;
    - antécédents ;
    - antécédents familiaux ;
    - traitements entrée ;
    - conclusion ;
    - conclusion entrée ;
    - habitus ;
    - correspondants ;
    - diagnostic ;
    - données biométriques entrée ;
    - examens ;
    - examens complémentaires ;
    - facteurs de risques ;
    - histoire de la maladie ;
    - actes ;
    - motif ;
    - prescriptions ;
    - traitements sortie.

    The component looks for section titles within the document,
    and stores them in the `section_title` extension.

    For ease-of-use, the component also populates a `section` extension,
    which contains a list of spans corresponding to the "sections" of the
    document. These span from the start of one section title to the next,
    which can introduce obvious bias should an intermediate section title
    go undetected.

    Parameters
    ----------
    nlp : Language
        spaCy pipeline object.
    sections : Dict[str, List[str]]
        Dictionary of terms to look for.
    attr : str
        Default attribute to match on.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        sections: Dict[str, List[str]],
        add_patterns: bool,
        attr: str,
        ignore_excluded: bool,
    ):

        logger.warning(
            "The component Sections is still in Beta. Use at your own risks."
        )

        if sections is None:
            sections = patterns.sections

        self.add_patterns = add_patterns
        if add_patterns:
            for k, v in sections.items():
                sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

        super().__init__(
            nlp,
            terms=None,
            regex=sections,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.set_extensions()

        if not nlp.has_pipe("normalizer") and not nlp.has_pipe("eds.normalizer"):
            logger.warning("You should add pipe `eds.normalizer`")

    @staticmethod
    def set_extensions():

        if not Span.has_extension("section_title"):
            Span.set_extension("section_title", default=None)

        if not Span.has_extension("section"):
            Span.set_extension("section", default=None)

    # noinspection PyProtectedMember
    def __call__(self, doc: Doc) -> Doc:
        """
        Divides the doc into sections

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for sections
        """
        titles = filter_spans(self.process(doc))

        if self.add_patterns:
            # Remove preceding newline
            titles = [
                Span(doc, title.start + 1, title.end - 1, label=title.label_)
                for title in titles
            ]

        sections = []

        for t1, t2 in zip(titles[:-1], titles[1:]):
            section = Span(doc, t1.start, t2.start, label=t1.label)
            section._.section_title = t1
            sections.append(section)

        if titles:
            t = titles[-1]
            section = Span(doc, t.start, len(doc), label=t.label)
            section._.section_title = t
            sections.append(section)

        doc.spans["sections"] = sections
        doc.spans["section_titles"] = titles

        return doc
add_patterns = add_patterns instance-attribute
__init__(nlp, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/sections.py
def __init__(
    self,
    nlp: Language,
    sections: Dict[str, List[str]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):

    logger.warning(
        "The component Sections is still in Beta. Use at your own risks."
    )

    if sections is None:
        sections = patterns.sections

    self.add_patterns = add_patterns
    if add_patterns:
        for k, v in sections.items():
            sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

    super().__init__(
        nlp,
        terms=None,
        regex=sections,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.set_extensions()

    if not nlp.has_pipe("normalizer") and not nlp.has_pipe("eds.normalizer"):
        logger.warning("You should add pipe `eds.normalizer`")
set_extensions()
Source code in edsnlp/pipelines/misc/sections/sections.py
@staticmethod
def set_extensions():

    if not Span.has_extension("section_title"):
        Span.set_extension("section_title", default=None)

    if not Span.has_extension("section"):
        Span.set_extension("section", default=None)
__call__(doc)

Divides the doc into sections

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for sections

Source code in edsnlp/pipelines/misc/sections/sections.py
def __call__(self, doc: Doc) -> Doc:
    """
    Divides the doc into sections

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for sections
    """
    titles = filter_spans(self.process(doc))

    if self.add_patterns:
        # Remove preceding newline
        titles = [
            Span(doc, title.start + 1, title.end - 1, label=title.label_)
            for title in titles
        ]

    sections = []

    for t1, t2 in zip(titles[:-1], titles[1:]):
        section = Span(doc, t1.start, t2.start, label=t1.label)
        section._.section_title = t1
        sections.append(section)

    if titles:
        t = titles[-1]
        section = Span(doc, t.start, len(doc), label=t.label)
        section._.section_title = t
        sections.append(section)

    doc.spans["sections"] = sections
    doc.spans["section_titles"] = titles

    return doc

factory

DEFAULT_CONFIG = dict(sections=None, add_patterns=True, attr='NORM', ignore_excluded=True) module-attribute
create_component(nlp, name, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/factory.py
@deprecated_factory("sections", "eds.sections", default_config=DEFAULT_CONFIG)
@Language.factory("eds.sections", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    sections: Optional[Dict[str, List[str]]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):
    return Sections(
        nlp,
        sections=sections,
        add_patterns=add_patterns,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
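
A minimal usage sketch; the component matches on NORM (hence the normalizer), and with add_patterns=True (the default) each title must sit on its own line:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.sections")

text = "\nMotif :\nDouleur thoracique.\n\nConclusion :\nRAS.\n"
doc = nlp(text)
for section in doc.spans["sections"]:
    print(section.label_, "->", section._.section_title)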