edsnlp.pipelines.misc

reason

patterns

reasons = dict(reasons=['(?i)motif de l.?hospitalisation : .+', '(?i)hospitalis[ée].?.*(pour|. cause|suite [àa]).+', '(?i)(consulte|prise en charge(?!\\set\\svous\\sassurer\\sun\\straitement\\sadapté)).*pour.+', '(?i)motif\\sd.hospitalisation\\s:.+', '(?i)au total\\s?\\:?\\s?\\n?.+', '(?i)motif\\sde\\sla\\sconsultation', '(?i)motif\\sd.admission', '(?i)conclusion\\smedicale']) module-attribute
sections_reason = ['motif', 'conclusion'] module-attribute
section_exclude = ['antécédents', 'antécédents familiaux', 'histoire de la maladie'] module-attribute

factory

DEFAULT_CONFIG = dict(reasons=None, attr='TEXT', use_sections=False, ignore_excluded=False) module-attribute
create_component(nlp, name, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/factory.py
@deprecated_factory("reason", "eds.reason", default_config=DEFAULT_CONFIG)
@Language.factory("eds.reason", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: str,
    use_sections: bool,
    ignore_excluded: bool,
):
    return Reason(
        nlp,
        reasons=reasons,
        attr=attr,
        use_sections=use_sections,
        ignore_excluded=ignore_excluded,
    )

reason

Reason

Bases: GenericMatcher

Pipeline to identify the reason for hospitalisation.

It declares a Span extension called ents_reason and adds the key reasons to doc.spans.

It also declares the boolean extension is_reason. This extension is set to True for reason spans, as well as for the entities that overlap a reason span.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

reasons

The terminology of reasons.

TYPE: Optional[Dict[str, Union[List[str], str]]]

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'. We can also add a key for each regex.

TYPE: str

use_sections

Whether or not to use the sections pipeline to improve results.

TYPE: bool

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool
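
A minimal usage sketch (assuming edsnlp is installed, so that the eds.reason factory is registered with spaCy; the example text is illustrative):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.reason")  # default configuration and default reason patterns

doc = nlp("Motif de l'hospitalisation : douleur thoracique.")
print(doc.spans["reasons"])  # the matched reason span(s)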

Source code in edsnlp/pipelines/misc/reason/reason.py
class Reason(GenericMatcher):
    """Pipeline to identify the reason of the hospitalisation.

    It declares a Span extension called `ents_reason` and adds
    the key `reasons` to doc.spans.

    It also declares the boolean extension `is_reason`.
    This extension is set to True for the Reason Spans but also
    for the entities that overlap the reason span.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    reasons : Optional[Dict[str, Union[List[str], str]]]
        The terminology of reasons.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with
        the key 'term_attr'. We can also add a key for each regex.
    use_sections : bool
        Whether or not to use the `sections` pipeline to improve results.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        reasons: Optional[Dict[str, Union[List[str], str]]],
        attr: Union[Dict[str, str], str],
        use_sections: bool,
        ignore_excluded: bool,
    ):

        if reasons is None:
            reasons = patterns.reasons

        super().__init__(
            nlp,
            terms=None,
            regex=reasons,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.use_sections = use_sections and (
            "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
        )
        if use_sections and not self.use_sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `eds.section` pipeline, but it was not set. "
                "Skipping that step."
            )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Span.has_extension("ents_reason"):
            Span.set_extension("ents_reason", default=None)

        if not Span.has_extension("is_reason"):
            Span.set_extension("is_reason", default=False)

    def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
        """Enhance the list of reasons with the section information.
        If the reason overlaps with history, so it will be removed from the list

        Parameters
        ----------
        sections : Iterable
            Spans of sections identified with the `sections` pipeline
        reasons : Iterable
            Reasons list identified by the regex

        Returns
        -------
        List
            Updated list of reason spans
        """

        for section in sections:
            if section.label_ in patterns.sections_reason:
                reasons.append(section)

            if section.label_ in patterns.section_exclude:
                for reason in list(reasons):  # iterate over a copy, as items may be removed
                    if check_inclusion(reason, section.start, section.end):
                        reasons.remove(reason)

        return reasons

    def __call__(self, doc: Doc) -> Doc:
        """Find spans related to the reasons of the hospitalisation

        Parameters
        ----------
        doc : Doc

        Returns
        -------
        Doc
        """
        matches = self.process(doc)
        reasons = get_spans(matches, "reasons")

        if self.use_sections:
            sections = doc.spans["sections"]
            reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

        doc.spans["reasons"] = reasons

        # Entities
        if len(doc.ents) > 0:
            for reason in reasons:  # TODO optimize this iteration
                ent_list = []
                for ent in doc.ents:
                    if check_inclusion(ent, reason.start, reason.end):
                        ent_list.append(ent)
                        ent._.is_reason = True

                reason._.ents_reason = ent_list
                reason._.is_reason = True

        return doc
use_sections = use_sections and ('eds.sections' in self.nlp.pipe_names or 'sections' in self.nlp.pipe_names) instance-attribute
__init__(nlp, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/reason.py
def __init__(
    self,
    nlp: Language,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: Union[Dict[str, str], str],
    use_sections: bool,
    ignore_excluded: bool,
):

    if reasons is None:
        reasons = patterns.reasons

    super().__init__(
        nlp,
        terms=None,
        regex=reasons,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.use_sections = use_sections and (
        "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
    )
    if use_sections and not self.use_sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `eds.section` pipeline, but it was not set. "
            "Skipping that step."
        )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/reason/reason.py
@staticmethod
def set_extensions() -> None:

    if not Span.has_extension("ents_reason"):
        Span.set_extension("ents_reason", default=None)

    if not Span.has_extension("is_reason"):
        Span.set_extension("is_reason", default=False)
_enhance_with_sections(sections, reasons)

Enhance the list of reasons with the section information. If a reason overlaps with the history section, it is removed from the list.

PARAMETER DESCRIPTION
sections

Spans of sections identified with the sections pipeline

TYPE: Iterable

reasons

Reasons list identified by the regex

TYPE: Iterable

RETURNS DESCRIPTION
List

Updated list of reason spans

Source code in edsnlp/pipelines/misc/reason/reason.py
def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
    """Enhance the list of reasons with the section information.
    If the reason overlaps with history, so it will be removed from the list

    Parameters
    ----------
    sections : Iterable
        Spans of sections identified with the `sections` pipeline
    reasons : Iterable
        Reasons list identified by the regex

    Returns
    -------
    List
        Updated list of reason spans
    """

    for section in sections:
        if section.label_ in patterns.sections_reason:
            reasons.append(section)

        if section.label_ in patterns.section_exclude:
            for reason in list(reasons):  # iterate over a copy, as items may be removed
                if check_inclusion(reason, section.start, section.end):
                    reasons.remove(reason)

    return reasons
__call__(doc)

Find spans related to the reason for hospitalisation

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
Doc
Source code in edsnlp/pipelines/misc/reason/reason.py
def __call__(self, doc: Doc) -> Doc:
    """Find spans related to the reasons of the hospitalisation

    Parameters
    ----------
    doc : Doc

    Returns
    -------
    Doc
    """
    matches = self.process(doc)
    reasons = get_spans(matches, "reasons")

    if self.use_sections:
        sections = doc.spans["sections"]
        reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

    doc.spans["reasons"] = reasons

    # Entities
    if len(doc.ents) > 0:
        for reason in reasons:  # TODO optimize this iteration
            ent_list = []
            for ent in doc.ents:
                if check_inclusion(ent, reason.start, reason.end):
                    ent_list.append(ent)
                    ent._.is_reason = True

            reason._.ents_reason = ent_list
            reason._.is_reason = True

    return doc

dates

dates

eds.dates pipeline.

PERIOD_PROXIMITY_THRESHOLD = 3 module-attribute
Dates

Bases: BaseComponent

Tags and normalizes dates, using the open-source dateparser library.

The pipeline uses spaCy's filter_spans function. It filters out false positives and introduces a hierarchy between patterns. For instance, in case of ambiguity, the pipeline will decide that a date is a date without a year rather than a date without a day.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: spacy.language.Language

absolute

List of regular expressions for absolute dates.

TYPE: Union[List[str], str]

relative

List of regular expressions for relative dates (eg hier, la semaine prochaine).

TYPE: Union[List[str], str]

duration

List of regular expressions for durations (eg pendant trois mois).

TYPE: Union[List[str], str]

false_positive

List of regular expressions for false positives (eg phone numbers, etc).

TYPE: Union[List[str], str]

on_ents_only

Whether to look for dates in the whole document or only in specific sentences:

  • If True: Only look in the sentences of each entity in doc.ents
  • If False: Look in the whole document
  • If given a string key or list of string: Only look in the sentences of each entity in doc.spans[key]

TYPE: Union[bool, str, List[str]]

detect_periods

Whether to detect periods (experimental)

TYPE: bool

attr

spaCy attribute to use

TYPE: str
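
A minimal usage sketch (assuming edsnlp is installed, so that the eds.dates factory is registered with spaCy; the example text is illustrative):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.dates")  # default configuration

doc = nlp("Le patient est hospitalisé depuis le 12 août 2020.")
for date in doc.spans["dates"]:
    print(date, date._.date)  # span text and its parsed date model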

Source code in edsnlp/pipelines/misc/dates/dates.py
class Dates(BaseComponent):
    """
    Tags and normalizes dates, using the open-source `dateparser` library.

    The pipeline uses spaCy's `filter_spans` function.
    It filters out false positives, and introduces a hierarchy between patterns.
    For instance, in case of ambiguity, the pipeline will decide that a date is a
    date without a year rather than a date without a day.

    Parameters
    ----------
    nlp : spacy.language.Language
        Language pipeline object
    absolute : Union[List[str], str]
        List of regular expressions for absolute dates.
    relative : Union[List[str], str]
        List of regular expressions for relative dates
        (eg `hier`, `la semaine prochaine`).
    duration : Union[List[str], str]
        List of regular expressions for durations
        (eg `pendant trois mois`).
    false_positive : Union[List[str], str]
        List of regular expressions for false positives (eg phone numbers, etc).
    on_ents_only : Union[bool, str, List[str]]
        Whether to look for dates in the whole document or only in specific sentences:

        - If `True`: Only look in the sentences of each entity in doc.ents
        - If False: Look in the whole document
        - If given a string `key` or list of string: Only look in the sentences of
          each entity in `#!python doc.spans[key]`
    detect_periods : bool
        Whether to detect periods (experimental)
    attr : str
        spaCy attribute to use
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        absolute: Optional[List[str]],
        relative: Optional[List[str]],
        duration: Optional[List[str]],
        false_positive: Optional[List[str]],
        on_ents_only: Union[bool, List[str]],
        detect_periods: bool,
        attr: str,
    ):

        self.nlp = nlp

        if absolute is None:
            absolute = patterns.absolute_pattern
        if relative is None:
            relative = patterns.relative_pattern
        if duration is None:
            duration = patterns.duration_pattern
        if false_positive is None:
            false_positive = patterns.false_positive_pattern

        if isinstance(absolute, str):
            absolute = [absolute]
        if isinstance(relative, str):
            relative = [relative]
        if isinstance(duration, str):
            duration = [duration]
        if isinstance(false_positive, str):
            false_positive = [false_positive]

        self.on_ents_only = on_ents_only
        self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

        self.regex_matcher.add("false_positive", false_positive)
        self.regex_matcher.add("absolute", absolute)
        self.regex_matcher.add("relative", relative)
        self.regex_matcher.add("duration", duration)

        self.detect_periods = detect_periods

        if detect_periods:
            logger.warning("The period extractor is experimental.")

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """
        Set extensions for the dates pipeline.
        """

        if not Span.has_extension("datetime"):
            Span.set_extension("datetime", default=None)

        if not Span.has_extension("date"):
            Span.set_extension("date", default=None)

        if not Span.has_extension("period"):
            Span.set_extension("period", default=None)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find dates in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        dates:
            list of date spans
        """

        if self.on_ents_only:

            if type(self.on_ents_only) == bool:
                ents = doc.ents
            else:
                if type(self.on_ents_only) == str:
                    self.on_ents_only = [self.on_ents_only]
                ents = []
                for key in self.on_ents_only:
                    ents.extend(list(doc.spans[key]))

            dates = []
            for sent in set([ent.sent for ent in ents]):
                dates = chain(
                    dates,
                    self.regex_matcher(
                        sent,
                        as_spans=True,
                        return_groupdict=True,
                    ),
                )

        else:
            dates = self.regex_matcher(
                doc,
                as_spans=True,
                return_groupdict=True,
            )

        dates = filter_spans(dates)
        dates = [date for date in dates if date[0].label_ != "false_positive"]

        return dates

    def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
        """
        Parse dates using the groupdict returned by the matcher.

        Parameters
        ----------
        dates : List[Tuple[Span, Dict[str, str]]]
            List of tuples containing the spans and groupdict
            returned by the matcher.

        Returns
        -------
        List[Span]
            List of processed spans, with the date parsed.
        """

        for span, groupdict in dates:
            if span.label_ == "relative":
                parsed = RelativeDate.parse_obj(groupdict)
            elif span.label_ == "absolute":
                parsed = AbsoluteDate.parse_obj(groupdict)
            else:
                parsed = Duration.parse_obj(groupdict)

            span._.date = parsed

        return [span for span, _ in dates]

    def process_periods(self, dates: List[Span]) -> List[Span]:
        """
        Experimental period detection.

        Parameters
        ----------
        dates : List[Span]
            List of detected dates.

        Returns
        -------
        List[Span]
            List of detected periods.
        """

        if len(dates) < 2:
            return []

        periods = []
        seen = set()

        dates = list(sorted(dates, key=lambda d: d.start))

        for d1, d2 in zip(dates[:-1], dates[1:]):

            if d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION:
                pass
            elif d1 in seen or d1._.date.mode is None or d2._.date.mode is None:
                continue

            if (
                d1.end - d2.start < PERIOD_PROXIMITY_THRESHOLD
                and d1._.date.mode != d2._.date.mode
            ):

                period = Span(d1.doc, d1.start, d2.end, label="period")

                # If one date is a duration,
                # the other may not have a registered mode.
                m1 = d1._.date.mode or Mode.FROM
                m2 = d2._.date.mode or Mode.FROM

                period._.period = Period.parse_obj(
                    {
                        m1.value: d1,
                        m2.value: d2,
                    }
                )

                seen.add(d1)
                seen.add(d2)

                periods.append(period)

        return periods

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags dates.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        doc : Doc
            spaCy Doc object, annotated for dates
        """
        dates = self.process(doc)
        dates = self.parse(dates)

        doc.spans["dates"] = dates

        if self.detect_periods:
            doc.spans["periods"] = self.process_periods(dates)

        return doc
nlp = nlp instance-attribute
on_ents_only = on_ents_only instance-attribute
regex_matcher = RegexMatcher(attr=attr, alignment_mode='strict') instance-attribute
detect_periods = detect_periods instance-attribute
__init__(nlp, absolute, relative, duration, false_positive, on_ents_only, detect_periods, attr)
Source code in edsnlp/pipelines/misc/dates/dates.py
def __init__(
    self,
    nlp: Language,
    absolute: Optional[List[str]],
    relative: Optional[List[str]],
    duration: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: Union[bool, List[str]],
    detect_periods: bool,
    attr: str,
):

    self.nlp = nlp

    if absolute is None:
        absolute = patterns.absolute_pattern
    if relative is None:
        relative = patterns.relative_pattern
    if duration is None:
        duration = patterns.duration_pattern
    if false_positive is None:
        false_positive = patterns.false_positive_pattern

    if isinstance(absolute, str):
        absolute = [absolute]
    if isinstance(relative, str):
        relative = [relative]
    if isinstance(duration, str):
        duration = [duration]
    if isinstance(false_positive, str):
        false_positive = [false_positive]

    self.on_ents_only = on_ents_only
    self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

    self.regex_matcher.add("false_positive", false_positive)
    self.regex_matcher.add("absolute", absolute)
    self.regex_matcher.add("relative", relative)
    self.regex_matcher.add("duration", duration)

    self.detect_periods = detect_periods

    if detect_periods:
        logger.warning("The period extractor is experimental.")

    self.set_extensions()
set_extensions()

Set extensions for the dates pipeline.

Source code in edsnlp/pipelines/misc/dates/dates.py
@staticmethod
def set_extensions() -> None:
    """
    Set extensions for the dates pipeline.
    """

    if not Span.has_extension("datetime"):
        Span.set_extension("datetime", default=None)

    if not Span.has_extension("date"):
        Span.set_extension("date", default=None)

    if not Span.has_extension("period"):
        Span.set_extension("period", default=None)
process(doc)

Find dates in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
dates

list of date spans

Source code in edsnlp/pipelines/misc/dates/dates.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find dates in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    dates:
        list of date spans
    """

    if self.on_ents_only:

        if type(self.on_ents_only) == bool:
            ents = doc.ents
        else:
            if type(self.on_ents_only) == str:
                self.on_ents_only = [self.on_ents_only]
            ents = []
            for key in self.on_ents_only:
                ents.extend(list(doc.spans[key]))

        dates = []
        for sent in set([ent.sent for ent in ents]):
            dates = chain(
                dates,
                self.regex_matcher(
                    sent,
                    as_spans=True,
                    return_groupdict=True,
                ),
            )

    else:
        dates = self.regex_matcher(
            doc,
            as_spans=True,
            return_groupdict=True,
        )

    dates = filter_spans(dates)
    dates = [date for date in dates if date[0].label_ != "false_positive"]

    return dates
parse(dates)

Parse dates using the groupdict returned by the matcher.

PARAMETER DESCRIPTION
dates

List of tuples containing the spans and groupdict returned by the matcher.

TYPE: List[Tuple[Span, Dict[str, str]]]

RETURNS DESCRIPTION
List[Span]

List of processed spans, with the date parsed.

Source code in edsnlp/pipelines/misc/dates/dates.py
def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
    """
    Parse dates using the groupdict returned by the matcher.

    Parameters
    ----------
    dates : List[Tuple[Span, Dict[str, str]]]
        List of tuples containing the spans and groupdict
        returned by the matcher.

    Returns
    -------
    List[Span]
        List of processed spans, with the date parsed.
    """

    for span, groupdict in dates:
        if span.label_ == "relative":
            parsed = RelativeDate.parse_obj(groupdict)
        elif span.label_ == "absolute":
            parsed = AbsoluteDate.parse_obj(groupdict)
        else:
            parsed = Duration.parse_obj(groupdict)

        span._.date = parsed

    return [span for span, _ in dates]
process_periods(dates)

Experimental period detection.

PARAMETER DESCRIPTION
dates

List of detected dates.

TYPE: List[Span]

RETURNS DESCRIPTION
List[Span]

List of detected periods.

Source code in edsnlp/pipelines/misc/dates/dates.py
def process_periods(self, dates: List[Span]) -> List[Span]:
    """
    Experimental period detection.

    Parameters
    ----------
    dates : List[Span]
        List of detected dates.

    Returns
    -------
    List[Span]
        List of detected periods.
    """

    if len(dates) < 2:
        return []

    periods = []
    seen = set()

    dates = list(sorted(dates, key=lambda d: d.start))

    for d1, d2 in zip(dates[:-1], dates[1:]):

        if d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION:
            pass
        elif d1 in seen or d1._.date.mode is None or d2._.date.mode is None:
            continue

        if (
            d1.end - d2.start < PERIOD_PROXIMITY_THRESHOLD
            and d1._.date.mode != d2._.date.mode
        ):

            period = Span(d1.doc, d1.start, d2.end, label="period")

            # If one date is a duration,
            # the other may not have a registered mode.
            m1 = d1._.date.mode or Mode.FROM
            m2 = d2._.date.mode or Mode.FROM

            period._.period = Period.parse_obj(
                {
                    m1.value: d1,
                    m2.value: d2,
                }
            )

            seen.add(d1)
            seen.add(d2)

            periods.append(period)

    return periods
__call__(doc)

Tags dates.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for dates

TYPE: Doc

Source code in edsnlp/pipelines/misc/dates/dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Tags dates.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    doc : Doc
        spaCy Doc object, annotated for dates
    """
    dates = self.process(doc)
    dates = self.parse(dates)

    doc.spans["dates"] = dates

    if self.detect_periods:
        doc.spans["periods"] = self.process_periods(dates)

    return doc

models

Direction

Bases: Enum

Source code in edsnlp/pipelines/misc/dates/models.py
class Direction(Enum):

    FUTURE = "FUTURE"
    PAST = "PAST"
    CURRENT = "CURRENT"
FUTURE = 'FUTURE' class-attribute
PAST = 'PAST' class-attribute
CURRENT = 'CURRENT' class-attribute
Mode

Bases: Enum

Source code in edsnlp/pipelines/misc/dates/models.py
class Mode(Enum):

    FROM = "FROM"
    UNTIL = "UNTIL"
    DURATION = "DURATION"
FROM = 'FROM' class-attribute
UNTIL = 'UNTIL' class-attribute
DURATION = 'DURATION' class-attribute
Period

Bases: BaseModel

Source code in edsnlp/pipelines/misc/dates/models.py
class Period(BaseModel):
    FROM: Optional[Span] = None
    UNTIL: Optional[Span] = None
    DURATION: Optional[Span] = None

    class Config:
        arbitrary_types_allowed = True
FROM: Optional[Span] = None class-attribute
UNTIL: Optional[Span] = None class-attribute
DURATION: Optional[Span] = None class-attribute
Config
Source code in edsnlp/pipelines/misc/dates/models.py
class Config:
    arbitrary_types_allowed = True
arbitrary_types_allowed = True class-attribute
BaseDate

Bases: BaseModel

Source code in edsnlp/pipelines/misc/dates/models.py
class BaseDate(BaseModel):

    mode: Optional[Mode] = None

    @root_validator(pre=True)
    def validate_strings(cls, d: Dict[str, str]) -> Dict[str, str]:
        result = d.copy()

        for k, v in d.items():
            if v is not None and "_" in k:
                key, value = k.split("_")
                result.update({key: value})

        return result
mode: Optional[Mode] = None class-attribute
validate_strings(d)
Source code in edsnlp/pipelines/misc/dates/models.py
@root_validator(pre=True)
def validate_strings(cls, d: Dict[str, str]) -> Dict[str, str]:
    result = d.copy()

    for k, v in d.items():
        if v is not None and "_" in k:
            key, value = k.split("_")
            result.update({key: value})

    return result
AbsoluteDate

Bases: BaseDate

Source code in edsnlp/pipelines/misc/dates/models.py
class AbsoluteDate(BaseDate):

    year: Optional[int] = None
    month: Optional[int] = None
    day: Optional[int] = None
    hour: Optional[int] = None
    minute: Optional[int] = None
    second: Optional[int] = None

    def to_datetime(
        self,
        tz: Union[str, pendulum.tz.timezone] = "Europe/Paris",
        **kwargs,
    ) -> Optional[pendulum.datetime]:

        if self.year and self.month and self.day:

            d = self.dict(exclude_none=True)

            d.pop("mode", None)

            return pendulum.datetime(**d, tz=tz)

        return None

    def norm(self) -> str:

        year = str(self.year) if self.year else "????"
        month = f"{self.month:02}" if self.month else "??"
        day = f"{self.day:02}" if self.day else "??"

        norm = "-".join([year, month, day])

        if self.hour:
            norm += f" {self.hour:02}h"

        if self.minute:
            norm += f"{self.minute:02}m"

        if self.second:
            norm += f"{self.second:02}s"

        return norm

    @validator("year")
    def validate_year(cls, v):
        if v > 100:
            return v

        if v < 25:
            return 2000 + v
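        # NB: two-digit years from 25 to 100 fall through both branches
        # and implicitly return None, clearing the field.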
year: Optional[int] = None class-attribute
month: Optional[int] = None class-attribute
day: Optional[int] = None class-attribute
hour: Optional[int] = None class-attribute
minute: Optional[int] = None class-attribute
second: Optional[int] = None class-attribute
to_datetime(tz='Europe/Paris', **kwargs)
Source code in edsnlp/pipelines/misc/dates/models.py
def to_datetime(
    self,
    tz: Union[str, pendulum.tz.timezone] = "Europe/Paris",
    **kwargs,
) -> Optional[pendulum.datetime]:

    if self.year and self.month and self.day:

        d = self.dict(exclude_none=True)

        d.pop("mode", None)

        return pendulum.datetime(**d, tz=tz)

    return None
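
For instance (a sketch with hypothetical values; the timezone defaults to Europe/Paris):

AbsoluteDate(year=2020, month=8, day=12).to_datetime()
# DateTime(2020, 8, 12, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
AbsoluteDate(year=2020, month=8).to_datetime()  # None: the date is incomplete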
norm()
Source code in edsnlp/pipelines/misc/dates/models.py
def norm(self) -> str:

    year = str(self.year) if self.year else "????"
    month = f"{self.month:02}" if self.month else "??"
    day = f"{self.day:02}" if self.day else "??"

    norm = "-".join([year, month, day])

    if self.hour:
        norm += f" {self.hour:02}h"

    if self.minute:
        norm += f"{self.minute:02}m"

    if self.second:
        norm += f"{self.second:02}s"

    return norm
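
Missing components are rendered as placeholders. For instance (a sketch):

AbsoluteDate(year=2020, month=8).norm()  # "2020-08-??"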
validate_year(v)
Source code in edsnlp/pipelines/misc/dates/models.py
@validator("year")
def validate_year(cls, v):
    if v > 100:
        return v

    if v < 25:
        return 2000 + v
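    # NB: two-digit years from 25 to 100 fall through both branches
    # and implicitly return None, clearing the field.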
Relative

Bases: BaseDate

Source code in edsnlp/pipelines/misc/dates/models.py
class Relative(BaseDate):

    year: Optional[int] = None
    month: Optional[int] = None
    week: Optional[int] = None
    day: Optional[int] = None
    hour: Optional[int] = None
    minute: Optional[int] = None
    second: Optional[int] = None

    @root_validator(pre=True)
    def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]:
        """
        Units need to be handled separately.

        This validator modifies the key corresponding to the unit
        with the detected value

        Parameters
        ----------
        d : Dict[str, str]
            Original data

        Returns
        -------
        Dict[str, str]
            Transformed data
        """
        unit = d.get("unit")

        if unit:
            d[unit] = d.get("number")

        return d

    def to_datetime(self, **kwargs) -> pendulum.Duration:
        d = self.dict(exclude_none=True)

        direction = d.pop("direction", None)
        dir = -1 if direction == Direction.PAST else 1

        d.pop("mode", None)

        d = {f"{k}s": v for k, v in d.items()}

        td = dir * pendulum.duration(**d)
        return td
year: Optional[int] = None class-attribute
month: Optional[int] = None class-attribute
week: Optional[int] = None class-attribute
day: Optional[int] = None class-attribute
hour: Optional[int] = None class-attribute
minute: Optional[int] = None class-attribute
second: Optional[int] = None class-attribute
parse_unit(d)

Units need to be handled separately.

This validator modifies the key corresponding to the unit with the detected value

PARAMETER DESCRIPTION
d

Original data

TYPE: Dict[str, str]

RETURNS DESCRIPTION
Dict[str, str]

Transformed data

Source code in edsnlp/pipelines/misc/dates/models.py
@root_validator(pre=True)
def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]:
    """
    Units need to be handled separately.

    This validator modifies the key corresponding to the unit
    with the detected value

    Parameters
    ----------
    d : Dict[str, str]
        Original data

    Returns
    -------
    Dict[str, str]
        Transformed data
    """
    unit = d.get("unit")

    if unit:
        d[unit] = d.get("number")

    return d
to_datetime(**kwargs)
Source code in edsnlp/pipelines/misc/dates/models.py
def to_datetime(self, **kwargs) -> pendulum.Duration:
    d = self.dict(exclude_none=True)

    direction = d.pop("direction", None)
    dir = -1 if direction == Direction.PAST else 1

    d.pop("mode", None)

    d = {f"{k}s": v for k, v in d.items()}

    td = dir * pendulum.duration(**d)
    return td
RelativeDate

Bases: Relative

Source code in edsnlp/pipelines/misc/dates/models.py
class RelativeDate(Relative):
    direction: Direction = Direction.CURRENT

    def to_datetime(
        self, note_datetime: Optional[datetime] = None
    ) -> pendulum.Duration:
        td = super(RelativeDate, self).to_datetime()

        if note_datetime is not None:
            return note_datetime + td

        return td

    def norm(self) -> str:

        if self.direction == Direction.CURRENT:
            d = self.dict(exclude_none=True)
            d.pop("direction")

            (key,) = d.keys()

            norm = f"~0 {key}"
        else:
            td = self.to_datetime()
            norm = str(td)
            if td.in_seconds() > 0:
                norm = f"+{norm}"

        return norm

    @root_validator(pre=True)
    def handle_specifics(cls, d: Dict[str, str]) -> Dict[str, str]:
        """
        Specific patterns such as `aujourd'hui`, `hier`, etc,
        need to be handled separately.

        Parameters
        ----------
        d : Dict[str, str]
            Original data.

        Returns
        -------
        Dict[str, str]
            Modified data.
        """

        specific = d.get("specific")
        specific = specific_dict.get(specific)

        if specific:
            d.update(specific)

        return d
direction: Direction = Direction.CURRENT class-attribute
to_datetime(note_datetime=None)
Source code in edsnlp/pipelines/misc/dates/models.py
def to_datetime(
    self, note_datetime: Optional[datetime] = None
) -> pendulum.Duration:
    td = super(RelativeDate, self).to_datetime()

    if note_datetime is not None:
        return note_datetime + td

    return td
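
For instance (a sketch: the groupdict keys mimic the named groups the regex matcher would produce for « il y a trois jours »):

import pendulum

d = RelativeDate.parse_obj(
    {"direction_PAST": "il y a", "number_03": "trois", "unit_day": "jours"}
)
d.to_datetime()  # a negative pendulum duration of 3 days
d.to_datetime(note_datetime=pendulum.datetime(2020, 8, 12))  # 2020-08-09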
norm()
Source code in edsnlp/pipelines/misc/dates/models.py
def norm(self) -> str:

    if self.direction == Direction.CURRENT:
        d = self.dict(exclude_none=True)
        d.pop("direction")

        (key,) = d.keys()

        norm = f"~0 {key}"
    else:
        td = self.to_datetime()
        norm = str(td)
        if td.in_seconds() > 0:
            norm = f"+{norm}"

    return norm
handle_specifics(d)

Specific patterns such as aujourd'hui, hier, etc, need to be handled separately.

PARAMETER DESCRIPTION
d

Original data.

TYPE: Dict[str, str]

RETURNS DESCRIPTION
Dict[str, str]

Modified data.

Source code in edsnlp/pipelines/misc/dates/models.py
@root_validator(pre=True)
def handle_specifics(cls, d: Dict[str, str]) -> Dict[str, str]:
    """
    Specific patterns such as `aujourd'hui`, `hier`, etc,
    need to be handled separately.

    Parameters
    ----------
    d : Dict[str, str]
        Original data.

    Returns
    -------
    Dict[str, str]
        Modified data.
    """

    specific = d.get("specific")
    specific = specific_dict.get(specific)

    if specific:
        d.update(specific)

    return d
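
For instance (a sketch): for « hier », the matcher yields specific='minus1', which this validator expands through specific_dict into direction=PAST and day=1:

RelativeDate.parse_obj({"specific": "minus1"}).to_datetime()  # -1 day duration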
Duration

Bases: Relative

Source code in edsnlp/pipelines/misc/dates/models.py
class Duration(Relative):
    mode: Mode = Mode.DURATION

    def norm(self) -> str:

        td = self.to_datetime()
        return f"during {td}"
mode: Mode = Mode.DURATION class-attribute
norm()
Source code in edsnlp/pipelines/misc/dates/models.py
def norm(self) -> str:

    td = self.to_datetime()
    return f"during {td}"
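
For instance (a sketch, assuming pendulum renders the duration in words):

Duration(day=3).norm()  # "during 3 days"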

factory

DEFAULT_CONFIG = dict(absolute=None, relative=None, duration=None, false_positive=None, detect_periods=False, on_ents_only=False, attr='LOWER') module-attribute
create_component(nlp, name, absolute, relative, duration, false_positive, on_ents_only, detect_periods, attr)
Source code in edsnlp/pipelines/misc/dates/factory.py
@deprecated_factory("dates", "eds.dates", default_config=DEFAULT_CONFIG)
@Language.factory("eds.dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    absolute: Optional[List[str]],
    relative: Optional[List[str]],
    duration: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: Union[bool, List[str]],
    detect_periods: bool,
    attr: str,
):
    return Dates(
        nlp,
        absolute=absolute,
        relative=relative,
        duration=duration,
        false_positive=false_positive,
        on_ents_only=on_ents_only,
        detect_periods=detect_periods,
        attr=attr,
    )

patterns

duration
cue_pattern = '(pendant|durant|pdt)' module-attribute
duration_pattern = [cue_pattern + '.{,3}' + numbers.number_pattern + '\\s*' + units.unit_pattern] module-attribute
relative
specific = {'minus1': ('hier', dict(direction='PAST', day=1)), 'minus2': ('avant[-\\s]hier', dict(direction='PAST', day=2)), 'plus1': ('demain', dict(direction='FUTURE', day=1)), 'plus2': ('après[-\\s]demain', dict(direction='FUTURE', day=2))} module-attribute
specific_pattern = make_pattern([f'(?P<specific_{k}>{p})' for (k, (p, _)) in specific.items()]) module-attribute
specific_dict = {k: v for (k, (_, v)) in specific.items()} module-attribute
relative_pattern = ['(?<=' + mode_pattern + '.{,3})?' + p for p in relative_pattern] module-attribute
make_specific_pattern(mode='forward')
Source code in edsnlp/pipelines/misc/dates/patterns/relative.py
def make_specific_pattern(mode: str = "forward"):

    if mode == "forward":
        p = directions.preceding_direction_pattern
        p += r"\s+"
        p += numbers.number_pattern
        p += r"\s*"
        p += units.unit_pattern
    elif mode == "backward":
        p = numbers.number_pattern
        p += r"\s*"
        p += units.unit_pattern
        p += r"\s+"
        p += directions.following_direction_pattern
    else:
        p = directions.preceding_direction_pattern
        p += r"\s+"
        p += numbers.number_pattern
        p += r"\s*"
        p += units.unit_pattern
        p += r"\s+"
        p += directions.following_direction_pattern

    return p
false_positive
false_positive_pattern = make_pattern(['(\\d+' + delimiter_pattern + '){3,}\\d+(?!:\\d\\d)\\b', '\\d\\/\\d']) module-attribute
absolute
no_year_pattern = [day + raw_delimiter_with_spaces_pattern + month + time_pattern + post_num_pattern for day in [ante_num_pattern + numeric_day_pattern, letter_day_pattern] for month in [numeric_month_pattern + post_num_pattern, letter_month_pattern]] module-attribute
no_day_pattern = [letter_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern, ante_num_pattern + lz_numeric_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern] module-attribute
full_year_pattern = ante_num_pattern + fy_pattern + post_num_pattern module-attribute
absolute_pattern = ['(?<=' + mode_pattern + '.{,3})?' + p for p in absolute_pattern] module-attribute
current
current_patterns: List[str] = ['(?P<year_0>cette\\s+ann[ée]e)(?![-\\s]l[àa])', "(?P<day_0>ce\\s+jour|aujourd['\\s]?hui)", '(?P<week_0>cette\\s+semaine|ces\\sjours[-\\s]ci)', '(?P<month_0>ce\\smois([-\\s]ci)?)'] module-attribute
current_pattern = make_pattern(current_patterns, with_breaks=True) module-attribute
atomic
delimiters
raw_delimiters = ['\\/', '\\-'] module-attribute
delimiters = raw_delimiters + ['\\.', '[^\\S\\r\\n]+'] module-attribute
raw_delimiter_pattern = make_pattern(raw_delimiters) module-attribute
raw_delimiter_with_spaces_pattern = make_pattern(raw_delimiters + ['[^\\S\\r\\n]+']) module-attribute
delimiter_pattern = make_pattern(delimiters) module-attribute
ante_num_pattern = f'(?<!.(?:{raw_delimiter_pattern})|[0-9][.,])' module-attribute
post_num_pattern = f'(?!{raw_delimiter_pattern})' module-attribute
directions
preceding_directions = ['(?P<direction_PAST>depuis|depuis\\s+le|il\\s+y\\s+a)', '(?P<direction_FUTURE>dans)'] module-attribute
following_directions = ['(?P<direction_FUTURE>prochaine?s?|suivante?s?|plus\\s+tard)', '(?P<direction_PAST>derni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\\s+t[ôo]t)'] module-attribute
preceding_direction_pattern = make_pattern(preceding_directions, with_breaks=True) module-attribute
following_direction_pattern = make_pattern(following_directions, with_breaks=True) module-attribute
units
units = ['(?P<unit_year>ans?|ann[ée]es?)', '(?P<unit_semester>semestres?)', '(?P<unit_trimester>trimestres?)', '(?P<unit_month>mois)', '(?P<unit_week>semaines?)', '(?P<unit_day>jours?|journ[ée]es?)', '(?P<unit_hour>h|heures?)', '(?P<unit_minute>min|minutes?)', '(?P<unit_second>sec|secondes?|s)'] module-attribute
unit_pattern = make_pattern(units, with_breaks=True) module-attribute
time
hour_pattern = '(?<!\\d)(?P<hour>0?[1-9]|1\\d|2[0-3])(?!\\d)' module-attribute
lz_hour_pattern = '(?<!\\d)(?P<hour>0[1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
minute_pattern = '(?<!\\d)(?P<minute>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_minute_pattern = '(?<!\\d)(?P<minute>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
second_pattern = '(?<!\\d)(?P<second>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_second_pattern = '(?<!\\d)(?P<second>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
time_pattern = '(\\s.{,3}' + f'{hour_pattern}[h:]({lz_minute_pattern})?' + f'((:|m|min){lz_second_pattern})?' + ')?' module-attribute
numbers
letter_numbers = ["(?P<number_01>l'|le|la|une?|ce|cette|cet)", '(?P<number_02>deux)', '(?P<number_03>trois)', '(?P<number_04>quatre)', '(?P<number_05>cinq)', '(?P<number_06>six)', '(?P<number_07>sept)', '(?P<number_08>huit)', '(?P<number_09>neuf)', '(?P<number_10>dix)', '(?P<number_11>onze)', '(?P<number_12>douze)', '(?P<number_12>treize)', '(?P<number_13>quatorze)', '(?P<number_14>quinze)', '(?P<number_15>seize)', '(?P<number_16>dix[-\\s]sept)', '(?P<number_17>dix[-\\s]huit)', '(?P<number_18>dix[-\\s]neuf)', '(?P<number_20>vingt)', '(?P<number_21>vingt[-\\s]et[-\\s]un)', '(?P<number_22>vingt[-\\s]deux)', '(?P<number_23>vingt[-\\s]trois)', '(?P<number_24>vingt[-\\s]quatre)', '(?P<number_25>vingt[-\\s]cinq)', '(?P<number_26>vingt[-\\s]six)', '(?P<number_27>vingt[-\\s]sept)', '(?P<number_28>vingt[-\\s]huit)', '(?P<number_29>vingt[-\\s]neuf)', '(?P<number_30>trente)'] module-attribute
numeric_numbers = [str(i) for i in range(1, 100)] module-attribute
letter_number_pattern = make_pattern(letter_numbers, with_breaks=True) module-attribute
numeric_number_pattern = make_pattern(numeric_numbers, name='number') module-attribute
number_pattern = f'({letter_number_pattern}|{numeric_number_pattern})' module-attribute
modes
modes = ['(?P<mode_FROM>depuis|depuis\\s+le|[àa]\\s+partir\\s+d[eu]|du)', "(?P<mode_UNTIL>jusqu'[àa]u?|au)"] module-attribute
mode_pattern = make_pattern(modes, with_breaks=True) module-attribute
months
letter_months = ['(?P<month_01>janvier|janv\\.?)', '(?P<month_02>f[ée]vrier|f[ée]v\\.?)', '(?P<month_03>mars|mar\\.?)', '(?P<month_04>avril|avr\\.?)', '(?P<month_05>mai)', '(?P<month_06>juin)', '(?P<month_07>juillet|juill?\\.?)', '(?P<month_08>ao[uû]t)', '(?P<month_09>septembre|sept?\\.?)', '(?P<month_10>octobre|oct\\.?)', '(?P<month_11>novembre|nov\\.?)', '(?P<month_12>d[ée]cembre|d[ée]c\\.?)'] module-attribute
letter_month_pattern = make_pattern(letter_months, with_breaks=True) module-attribute
numeric_month_pattern = f'(?P<month>{numeric_month_pattern})' module-attribute
lz_numeric_month_pattern = f'(?P<month>{lz_numeric_month_pattern})' module-attribute
month_pattern = f'({letter_month_pattern}|{numeric_month_pattern})' module-attribute
days
letter_days = ['(?P<day_01>premier|1\\s*er)', '(?P<day_02>deux)', '(?P<day_03>trois)', '(?P<day_04>quatre)', '(?P<day_05>cinq)', '(?P<day_06>six)', '(?P<day_07>sept)', '(?P<day_08>huit)', '(?P<day_09>neuf)', '(?P<day_10>dix)', '(?P<day_11>onze)', '(?P<day_12>douze)', '(?P<day_13>treize)', '(?P<day_14>quatorze)', '(?P<day_15>quinze)', '(?P<day_16>seize)', '(?P<day_17>dix\\-?\\s*sept)', '(?P<day_18>dix\\-?\\s*huit)', '(?P<day_19>dix\\-?\\s*neuf)', '(?P<day_20>vingt)', '(?P<day_21>vingt\\-?\\s*et\\-?\\s*un)', '(?P<day_22>vingt\\-?\\s*deux)', '(?P<day_23>vingt\\-?\\s*trois)', '(?P<day_24>vingt\\-?\\s*quatre)', '(?P<day_25>vingt\\-?\\s*cinq)', '(?P<day_26>vingt\\-?\\s*six)', '(?P<day_27>vingt\\-?\\s*sept)', '(?P<day_28>vingt\\-?\\s*huit)', '(?P<day_29>vingt\\-?\\s*neuf)', '(?P<day_30>trente)', '(?P<day_31>trente\\-?\\s*et\\-?\\s*un)'] module-attribute
letter_day_pattern = make_pattern(letter_days) module-attribute
nlz_numeric_day_pattern = '(?<!\\d)([1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
numeric_day_pattern = f'(?P<day>{numeric_day_pattern})' module-attribute
lz_numeric_day_pattern = f'(?P<day>{lz_numeric_day_pattern})' module-attribute
day_pattern = f'({letter_day_pattern}|{numeric_day_pattern})' module-attribute
years
year_patterns: List[str] = ['19\\d\\d'] + [str(year) for year in range(2000, date.today().year + 2)] module-attribute
full_year_pattern = '(?<!\\d)' + full_year_pattern + '(?!\\d)' module-attribute
year_pattern = '(?<!\\d)' + year_pattern + '(?!\\d)' module-attribute

measures

patterns

CompositeSize

Bases: CompositeMeasure

Composite size measure. Supports the following units:

  • mm
  • cm
  • dm
  • m

Source code in edsnlp/pipelines/misc/measures/patterns.py
class CompositeSize(CompositeMeasure):
    """
    Composite size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    mm = property(make_multi_getter("mm"))
    cm = property(make_multi_getter("cm"))
    dm = property(make_multi_getter("dm"))
    m = property(make_multi_getter("m"))
mm = property(make_multi_getter('mm')) class-attribute
cm = property(make_multi_getter('cm')) class-attribute
dm = property(make_multi_getter('dm')) class-attribute
m = property(make_multi_getter('m')) class-attribute
Size

Bases: SimpleMeasure

Size measure. Supports the following units:

  • mm
  • cm
  • dm
  • m

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.size")
class Size(SimpleMeasure):
    """
    Size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    COMPOSITE = CompositeSize
    UNITS = {
        "mm": {"prefix": "mill?im", "abbr": "mm", "value": 1},
        "cm": {"prefix": "centim", "abbr": "cm", "value": 10},
        "dm": {"prefix": "decim", "abbr": "dm", "value": 100},
        "m": {"prefix": "metre", "abbr": "m", "value": 1000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mm = property(make_simple_getter("mm"))
    cm = property(make_simple_getter("cm"))
    dm = property(make_simple_getter("dm"))
    m = property(make_simple_getter("m"))
COMPOSITE = CompositeSize class-attribute
UNITS = {'mm': {'prefix': 'mill?im', 'abbr': 'mm', 'value': 1}, 'cm': {'prefix': 'centim', 'abbr': 'cm', 'value': 10}, 'dm': {'prefix': 'decim', 'abbr': 'dm', 'value': 100}, 'm': {'prefix': 'metre', 'abbr': 'm', 'value': 1000}} class-attribute
mm = property(make_simple_getter('mm')) class-attribute
cm = property(make_simple_getter('cm')) class-attribute
dm = property(make_simple_getter('dm')) class-attribute
m = property(make_simple_getter('m')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
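
A sketch of unit conversion through the generated properties (assuming make_simple_getter scales the stored value with _get_scale_to and the UNITS table):

s = Size(2.5, "cm")
s.mm                   # 25.0
s.m                    # 0.025
s == Size(25.0, "mm")  # True: __eq__ compares in the other measure's unit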
Weight

Bases: SimpleMeasure

Weight measure. Supports the following units:

  • mg
  • cg
  • dg
  • g
  • kg

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.weight")
class Weight(SimpleMeasure):
    """
    Weight measure. Supports the following units:
    - mg
    - cg
    - dg
    - g
    - kg
    """

    COMPOSITE = None
    UNITS = {
        "mg": {"prefix": "mill?ig", "abbr": "mg", "value": 1},
        "cg": {"prefix": "centig", "abbr": "cg", "value": 10},
        "dg": {"prefix": "decig", "abbr": "dg", "value": 100},
        "g": {"prefix": "gram", "abbr": "g", "value": 1000},
        "kg": {"prefix": "kilo", "abbr": "kg", "value": 1000000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mg = property(make_simple_getter("mg"))
    cg = property(make_simple_getter("cg"))
    dg = property(make_simple_getter("dg"))
    g = property(make_simple_getter("g"))
    kg = property(make_simple_getter("kg"))
COMPOSITE = None class-attribute
UNITS = {'mg': {'prefix': 'mill?ig', 'abbr': 'mg', 'value': 1}, 'cg': {'prefix': 'centig', 'abbr': 'cg', 'value': 10}, 'dg': {'prefix': 'decig', 'abbr': 'dg', 'value': 100}, 'g': {'prefix': 'gram', 'abbr': 'g', 'value': 1000}, 'kg': {'prefix': 'kilo', 'abbr': 'kg', 'value': 1000000}} class-attribute
mg = property(make_simple_getter('mg')) class-attribute
cg = property(make_simple_getter('cg')) class-attribute
dg = property(make_simple_getter('dg')) class-attribute
g = property(make_simple_getter('g')) class-attribute
kg = property(make_simple_getter('kg')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
Angle

Bases: SimpleMeasure

Angle measure. Supports the following units:

  • h

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.angle")
class Angle(SimpleMeasure):
    """
    Angle measure. Supports the following units:
    - h
    """

    COMPOSITE = None
    UNITS = {
        "h": {"prefix": "heur", "abbr": "h", "value": 1},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        if infix:
            result = float(int_part) + int(dec_part) / 60.0
            return cls(result, unit)
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    h = property(make_simple_getter("h"))
COMPOSITE = None class-attribute
UNITS = {'h': {'prefix': 'heur', 'abbr': 'h', 'value': 1}} class-attribute
h = property(make_simple_getter('h')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    if infix:
        result = float(int_part) + int(dec_part) / 60.0
        return cls(result, unit)
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
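
A sketch: the infix flag handles clock-style notation, where the decimal part counts minutes (eg « 3h30 »):

Angle.parse(int_part="3", dec_part="30", unit="h", infix=True)   # Angle(3.5, 'h')
Angle.parse(int_part="3", dec_part="5", unit="h", infix=False)   # Angle(3.5, 'h')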

factory

DEFAULT_CONFIG = dict(attr='NORM', ignore_excluded=False, measures=['eds.measures.size', 'eds.measures.weight', 'eds.measures.angle']) module-attribute
create_component(nlp, name, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/factory.py
@Language.factory("eds.measures", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    measures: Union[str, List[str], Dict[str, Dict]],
    attr: str,
    ignore_excluded: bool,
):
    return Measures(
        nlp,
        measures=measures,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

measures

Measure

Bases: abc.ABC

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measure(abc.ABC):
    INTEGER = r"(?:[0-9]+)"
    CONJUNCTIONS = "et|ou"
    COMPOSERS = r"[x*]|par"

    UNITS = {}
    COMPOSITE = None

    @abc.abstractmethod
    def __iter__(self) -> Iterable["SimpleMeasure"]:
        """
        Iterate over the items of the measure (only one for SimpleMeasure)

        Returns
        -------
        iterable : Iterable["SimpleMeasure"]
        """

    @abc.abstractmethod
    def __getitem__(self, item) -> "SimpleMeasure":
        """
        Access items of the measure (only one for SimpleMeasure)

        Parameters
        ----------
        item : int

        Returns
        -------
        measure : SimpleMeasure
        """
INTEGER = '(?:[0-9]+)' class-attribute
CONJUNCTIONS = 'et|ou' class-attribute
COMPOSERS = '[x*]|par' class-attribute
UNITS = {} class-attribute
COMPOSITE = None class-attribute
__iter__()

Iterate over the items of the measure (only one for SimpleMeasure)

RETURNS DESCRIPTION
iterable

TYPE: Iterable["SimpleMeasure"]

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __iter__(self) -> Iterable["SimpleMeasure"]:
    """
    Iterate over the items of the measure (only one for SimpleMeasure)

    Returns
    -------
    iterable : Iterable["SimpleMeasure"]
    """
__getitem__(item)

Access items of the measure (only one for SimpleMeasure)

PARAMETER DESCRIPTION
item

TYPE: int

RETURNS DESCRIPTION
measure

TYPE: SimpleMeasure

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __getitem__(self, item) -> "SimpleMeasure":
    """
    Access items of the measure (only one for SimpleMeasure)

    Parameters
    ----------
    item : int

    Returns
    -------
    measure : SimpleMeasure
    """
SimpleMeasure

Bases: Measure

Source code in edsnlp/pipelines/misc/measures/measures.py
class SimpleMeasure(Measure):
    def __init__(self, value, unit):
        """
        The SimpleMeasure class contains the value and unit
        for a single non-composite measure

        Parameters
        ----------
        value : float
        unit : str
        """
        super().__init__()
        self.value = value
        self.unit = unit

    @classmethod
    @abc.abstractmethod
    def parse(
        self, int_part: str, dec_part: str, unit: str, infix: bool
    ) -> "SimpleMeasure":
        """
        Class method to create an instance from the match groups

        int_part : str
            The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
        dec_part : str
            The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
        unit : str
            The normalized variant of the unit (eg "m" for 12 metre 50)
        infix : bool
            Whether the unit was before (True) or after (False) the decimal part
        """

    def _get_scale_to(self, unit: str):
        return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]

    def __iter__(self):
        return iter((self,))

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        return [self][item]

    def __str__(self):
        return f"{self.value}{self.unit}"

    def __repr__(self):
        return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"

    def __eq__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) == other.value

    def __lt__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) < other.value

    def __le__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) <= other.value
value = value instance-attribute
unit = unit instance-attribute
__init__(value, unit)

The SimpleMeasure class contains the value and unit for a single non-composite measure

PARAMETER DESCRIPTION
value

TYPE: float

unit

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, value, unit):
    """
    The SimpleMeasure class contains the value and unit
    for a single non-composite measure

    Parameters
    ----------
    value : float
    unit : str
    """
    super().__init__()
    self.value = value
    self.unit = unit
parse(int_part, dec_part, unit, infix)

Class method to create an instance from the match groups

int_part : str
    The integer part of the match (eg 12 in "12 metres 50" or "12.50metres")
dec_part : str
    The decimal part of the match (eg 50 in "12 metres 50" or "12.50metres")
unit : str
    The normalized variant of the unit (eg "m" for "12 metre 50")
infix : bool
    Whether the unit was before (True) or after (False) the decimal part

Source code in edsnlp/pipelines/misc/measures/measures.py
@classmethod
@abc.abstractmethod
def parse(
    self, int_part: str, dec_part: str, unit: str, infix: bool
) -> "SimpleMeasure":
    """
    Class method to create an instance from the match groups

    int_part : str
        The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
    dec_part : str
        The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
    unit : str
        The normalized variant of the unit (eg "m" for 12 metre 50)
    infix : bool
        Whether the unit was before (True) or after (False) the decimal part
    """
_get_scale_to(unit)
Source code in edsnlp/pipelines/misc/measures/measures.py
def _get_scale_to(self, unit: str):
    return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter((self,))
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    return [self][item]
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return f"{self.value}{self.unit}"
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"
__eq__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __eq__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) == other.value
__lt__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __lt__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) < other.value
__le__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __le__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) <= other.value
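
The comparison operators above delegate to the scaled getters, so measures can be compared across units. A sketch, assuming the weight measure whose UNITS table opens this page is importable as Weight:

from edsnlp.pipelines.misc.measures.patterns import Weight

a = Weight(1500.0, "g")
b = Weight(1.0, "kg")

print(a.kg)    # 1.5 -- the getter rescales with _get_scale_to("kg")
print(a == b)  # False: a.kg (1.5) != b.value (1.0)
print(b <= a)  # True: b.g (1000.0) <= a.value (1500.0)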
CompositeMeasure

Bases: Measure

The CompositeMeasure class contains a sequence of multiple SimpleMeasure instances

PARAMETER DESCRIPTION
measures

TYPE: List[SimpleMeasure]

Source code in edsnlp/pipelines/misc/measures/measures.py
class CompositeMeasure(Measure):
    """
    The CompositeMeasure class contains a sequence
    of multiple SimpleMeasure instances

    Parameters
    ----------
    measures : List[SimpleMeasure]
    """

    def __init__(self, measures: Iterable["SimpleMeasure"]):
        super().__init__()
        self.measures = list(measures)

    def __iter__(self):
        return iter(self.measures)

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        res = self.measures[item]
        return res

    def __str__(self):
        return " x ".join(map(str, self.measures))

    def __repr__(self):
        return f"{self.__class__.__name__}({repr(self.measures)})"
measures = list(measures) instance-attribute
__init__(measures)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, measures: Iterable["SimpleMeasure"]):
    super().__init__()
    self.measures = list(measures)
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter(self.measures)
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    res = self.measures[item]
    return res
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return " x ".join(map(str, self.measures))
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({repr(self.measures)})"
Measures

Bases: BaseComponent

Matcher component to extract measures. A measure is most often composed of a number and a unit, like

> 1,26 cm

The unit can also be positioned in place of the decimal dot/comma

> 1 cm 26

Some measures can be composite

> 1,26 cm x 2,34 mm

And sometimes they are factorized

> Les trois kystes mesurent 1, 2 et 3cm.

The recognized measures are stored in the "measures" SpanGroup. Each span has a Measure object stored in the "value" extension attribute.

PARAMETER DESCRIPTION
nlp

The SpaCy object.

TYPE: Language

measures

The registry names of the measures to extract

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('NORM')

TYPE: str

ignore_excluded

Whether to exclude pollution patterns when matching in the text

TYPE: bool

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measures(BaseComponent):
    """
    Matcher component to extract measures.
    A measure is most often composed of a number and a unit like
    > 1,26 cm
    The unit can also be positioned in place of the decimal dot/comma
    > 1 cm 26
    Some measures can be composite
    > 1,26 cm x 2,34 mm
    And sometimes they are factorized
    > Les trois kystes mesurent 1, 2 et 3cm.

    The recognized measures are stored in the "measures" SpanGroup.
    Each span has a `Measure` object stored in the "value" extension attribute.

    Parameters
    ----------
    nlp : Language
        The SpaCy object.
    measures : List[str]
        The registry names of the measures to extract
    attr : str
        Whether to match on the text ('TEXT') or on the normalized text ('NORM')
    ignore_excluded : bool
        Whether to exclude pollution patterns when matching in the text
    """

    def __init__(
        self,
        nlp: Language,
        measures: List[str],
        attr: str,
        ignore_excluded: bool,
    ):

        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.extraction_regexes = {}
        self.measures: Dict[str, Measure] = {}
        for name in measures:
            cls: Measure = spacy.registry.misc.get(name)
            self.measures[name] = cls
            regexes = make_patterns(cls)
            self.regex_matcher.add(name, regexes["trigger"])
            self.extraction_regexes[name] = regexes["extraction"]

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Measures, Measures).set_extensions()
        if not Span.has_extension("value"):
            Span.set_extension("value", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds measures to document's "measures" SpanGroup.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

        # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
        # while keeping the corresponding groupdicts
        matches = {
            match: matches[match]
            for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
        }

        measures = []
        for match, groupdict in matches.items():
            measure_name = match.label_
            extraction_regex = self.extraction_regexes[measure_name]

            parsed_values = []

            shared_unit_part = next(
                (key for key, val in groupdict.items() if val is not None), None
            )
            for sub_match in regex.finditer(extraction_regex, match.text):
                sub_groupdict = dict(sub_match.groupdict())

                # Integer part of the match
                int_part = sub_groupdict.pop("int_part", 0)

                # Decimal part of the match, if any
                dec_part = sub_groupdict.pop("dec_part", 0) or 0

                # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
                # the unit must be infix: we extract it now using non empty groupdict
                # entries
                infix_unit_part = next(
                    (key for key, val in sub_groupdict.items() if val is not None),
                    None,
                )
                unit_part = infix_unit_part or shared_unit_part

                # Create one SimpleMeasure per submatch inside each match...
                parsed_values.append(
                    self.measures[measure_name].parse(
                        int_part=int_part,
                        dec_part=dec_part,
                        unit=unit_part,
                        infix=infix_unit_part is not None,
                    )
                )

            # ... and compose these measures together if there is more than one
            measure = Span(doc, start=match.start, end=match.end, label=measure_name)
            measure._.value = (
                parsed_values[0]
                if len(parsed_values) == 1
                else self.measures[measure_name].COMPOSITE(parsed_values)
                if self.measures[measure_name].COMPOSITE is not None
                else parsed_values[-1]
            )
            measures.append(match)

        doc.spans["measures"] = sorted(measures)

        return doc
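
An end-to-end usage sketch (the example text is invented and factory registration is assumed):

import spacy
import edsnlp.components  # noqa: F401 -- registers the eds.* factories (assumed)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # recommended, since the default attr is "NORM"
nlp.add_pipe("eds.measures")

doc = nlp("Le nodule mesure 1,26 cm x 2,34 mm.")
for span in doc.spans["measures"]:
    print(span, "->", repr(span._.value))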
regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute
extraction_regexes = {} instance-attribute
measures: Dict[str, Measure] = {} instance-attribute
__init__(nlp, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(
    self,
    nlp: Language,
    measures: List[str],
    attr: str,
    ignore_excluded: bool,
):

    self.regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.extraction_regexes = {}
    self.measures: Dict[str, Measure] = {}
    for name in measures:
        cls: Measure = spacy.registry.misc.get(name)
        self.measures[name] = cls
        regexes = make_patterns(cls)
        self.regex_matcher.add(name, regexes["trigger"])
        self.extraction_regexes[name] = regexes["extraction"]

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/measures/measures.py
@staticmethod
def set_extensions() -> None:
    super(Measures, Measures).set_extensions()
    if not Span.has_extension("value"):
        Span.set_extension("value", default=None)
__call__(doc)

Adds measures to document's "measures" SpanGroup.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/misc/measures/measures.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds measures to document's "measures" SpanGroup.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

    # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
    # while keeping the corresponding groupdicts
    matches = {
        match: matches[match]
        for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
    }

    measures = []
    for match, groupdict in matches.items():
        measure_name = match.label_
        extraction_regex = self.extraction_regexes[measure_name]

        parsed_values = []

        shared_unit_part = next(
            (key for key, val in groupdict.items() if val is not None), None
        )
        for sub_match in regex.finditer(extraction_regex, match.text):
            sub_groupdict = dict(sub_match.groupdict())

            # Integer part of the match
            int_part = sub_groupdict.pop("int_part", 0)

            # Decimal part of the match, if any
            dec_part = sub_groupdict.pop("dec_part", 0) or 0

            # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
            # the unit must be infix: we extract it now using non empty groupdict
            # entries
            infix_unit_part = next(
                (key for key, val in sub_groupdict.items() if val is not None),
                None,
            )
            unit_part = infix_unit_part or shared_unit_part

            # Create one SimpleMeasure per submatch inside each match...
            parsed_values.append(
                self.measures[measure_name].parse(
                    int_part=int_part,
                    dec_part=dec_part,
                    unit=unit_part,
                    infix=infix_unit_part is not None,
                )
            )

        # ... and compose these measures together if there is more than one
        measure = Span(doc, start=match.start, end=match.end, label=measure_name)
        measure._.value = (
            parsed_values[0]
            if len(parsed_values) == 1
            else self.measures[measure_name].COMPOSITE(parsed_values)
            if self.measures[measure_name].COMPOSITE is not None
            else parsed_values[-1]
        )
        measures.append(match)

    doc.spans["measures"] = sorted(measures)

    return doc
disj_capture(regexes, capture=True)
Source code in edsnlp/pipelines/misc/measures/measures.py
def disj_capture(regexes, capture=True):
    return "|".join(
        ("(?P<{key}>{forms})" if capture else "{forms}").format(
            key=key, forms="|".join(forms)
        )
        for key, forms in regexes.items()
    )
rightmost_largest_sort_key(span)
Source code in edsnlp/pipelines/misc/measures/measures.py
def rightmost_largest_sort_key(span):
    return span.end, (len(span))
make_patterns(measure)

Build recognition and extraction patterns for a given Measure class

PARAMETER DESCRIPTION
measure

The measure to build recognition and extraction patterns for

TYPE: 'Measure'

RETURNS DESCRIPTION
trigger

TYPE: List[str]

extraction

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def make_patterns(measure: "Measure") -> Dict[str, Union[List[str], str]]:
    """
    Build recognition and extraction patterns for a given Measure class

    Parameters
    ----------
    measure: Measure class
        The measure to build recognition and extraction patterns for

    Returns
    -------
    trigger : List[str]
    extraction : str
    """
    unit_prefix_reg = disj_capture(
        {key: [entry["prefix"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_abbreviation_reg = disj_capture(
        {key: [entry["abbr"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_reg = rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"

    number_reg = rf"(?:{measure.INTEGER}(?:[,.]{measure.INTEGER})?)"
    infix_measure_reg = rf"(?:{measure.INTEGER}{unit_reg}{measure.INTEGER})"

    # Simple measure
    simple_measure_reg = rf"{number_reg}\s*{unit_reg}"
    trigger = [
        simple_measure_reg,
        infix_measure_reg,
        # Factorized measures separated by a conjunction
        rf"{number_reg}(?=(?:\s*[,]\s*{number_reg})*\s*"
        rf"(?:{measure.CONJUNCTIONS})\s*{number_reg}\s*{unit_reg})",
    ]
    if measure.COMPOSITE:
        # Factorized composite measures (3 x 2cm)
        trigger.append(
            rf"(?<![a-z]){number_reg}"
            rf"(?:\s*(?:{measure.COMPOSERS})\s*{number_reg})*\s*{unit_reg}"
        )
        # Expanded composite measures (3cm x 2cm)
        trigger.append(
            rf"(?<![a-z])(?:{infix_measure_reg}|{simple_measure_reg})"
            rf"(\s*(?:{measure.COMPOSERS})\s*"
            rf"(?:{infix_measure_reg}|{simple_measure_reg}))*"
        )

    unit_reg_capture = (
        rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"
    )

    return {
        "trigger": trigger,
        "extraction": rf"(?P<int_part>{measure.INTEGER})\s*(?:[,.]|"
        rf"{unit_reg_capture})?\s*(?P<dec_part>{measure.INTEGER})?",
    }
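
To see what the function produces, one can inspect the patterns generated for a given measure class (a sketch; import paths assumed):

from edsnlp.pipelines.misc.measures.measures import make_patterns
from edsnlp.pipelines.misc.measures.patterns import Angle

patterns = make_patterns(Angle)
# "trigger" is a list of recognition regexes; since Angle.COMPOSITE is None,
# the two composite trigger variants are not generated
print(patterns["trigger"])
# "extraction" is a single regex with int_part / dec_part named groups
print(patterns["extraction"])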
make_simple_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_simple_getter(name):
    def getter(self):
        """
        Get a scaled numerical value of a measure

        Parameters
        ----------
        self

        Returns
        -------
        float
        """
        return self.value * self._get_scale_to(name)

    return getter
make_multi_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_multi_getter(name: str) -> Callable[["CompositeMeasure"], Tuple[float]]:
    def getter(self) -> Tuple[float]:
        """
        Get a scaled numerical values of a multi-measure

        Parameters
        ----------
        self

        Returns
        -------
        float
        """
        return tuple(getattr(measure, name) for measure in self.measures)

    return getter
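
A sketch of how these getters are meant to be attached, mirroring the class-attribute declarations shown earlier on this page (CompositeWeight is a hypothetical name):

from edsnlp.pipelines.misc.measures.measures import CompositeMeasure, make_multi_getter
from edsnlp.pipelines.misc.measures.patterns import Weight

class CompositeWeight(CompositeMeasure):
    # one scaled value per sub-measure, returned as a tuple
    kg = property(make_multi_getter("kg"))

cw = CompositeWeight([Weight(1.0, "kg"), Weight(500.0, "g")])
print(cw.kg)  # (1.0, 0.5)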

sections

patterns

These section titles were extracted from a work performed by Ivan Lerner at AP-HP, who supplied a number of documents annotated for section titles.

The section titles were reviewed by Gilles Chatellier, who gave meaningful insights.

See the sections/section-dataset notebook for details.

allergies = ['allergies'] module-attribute
antecedents = ['antecedents', 'antecedents medicaux et chirurgicaux', 'antecedents personnels', 'antecedents medicaux', 'antecedents chirurgicaux', 'atcd'] module-attribute
antecedents_familiaux = ['antecedents familiaux'] module-attribute
traitements_entree = ['attitude therapeutique initiale', "traitement a l'entree", 'traitement actuel', 'traitement en cours', "traitements a l'entree"] module-attribute
conclusion = ['au total', 'conclusion', 'conclusion de sortie', 'syntese medicale / conclusion', 'synthese', 'synthese medicale', 'synthese medicale/conclusion', 'conclusion medicale'] module-attribute
conclusion_entree = ["conclusion a l'entree"] module-attribute
habitus = ['contexte familial et social', 'habitus', 'mode de vie', 'mode de vie - scolarite', 'situation sociale, mode de vie'] module-attribute
correspondants = ['correspondants'] module-attribute
diagnostic = ['diagnostic retenu'] module-attribute
donnees_biometriques_entree = ["donnees biometriques et parametres vitaux a l'entree", "parametres vitaux et donnees biometriques a l'entree"] module-attribute
examens = ['examen clinique', "examen clinique a l'entree"] module-attribute
examens_complementaires = ['examen(s) complementaire(s)', 'examens complementaires', "examens complementaires a l'entree", 'examens complementaires realises pendant le sejour', 'examens para-cliniques'] module-attribute
facteurs_de_risques = ['facteurs de risque', 'facteurs de risques'] module-attribute
histoire_de_la_maladie = ['histoire de la maladie', 'histoire de la maladie - explorations', 'histoire de la maladie actuelle', 'histoire du poids', 'histoire recente', 'histoire recente de la maladie', 'rappel clinique', 'resume', 'resume clinique'] module-attribute
actes = ['intervention'] module-attribute
motif = ['motif', "motif d'hospitalisation", "motif de l'hospitalisation", 'motif medical'] module-attribute
prescriptions = ['prescriptions de sortie', 'prescriptions medicales de sortie'] module-attribute
traitements_sortie = ['traitement de sortie'] module-attribute
sections = {'allergies': allergies, 'antécédents': antecedents, 'antécédents familiaux': antecedents_familiaux, 'traitements entrée': traitements_entree, 'conclusion': conclusion, 'conclusion entrée': conclusion_entree, 'habitus': habitus, 'correspondants': correspondants, 'diagnostic': diagnostic, 'données biométriques entrée': donnees_biometriques_entree, 'examens': examens, 'examens complémentaires': examens_complementaires, 'facteurs de risques': facteurs_de_risques, 'histoire de la maladie': histoire_de_la_maladie, 'actes': actes, 'motif': motif, 'prescriptions': prescriptions, 'traitements sortie': traitements_sortie} module-attribute

sections

Sections

Bases: GenericMatcher

Divides the document into sections.

By default, we are using a dataset of documents annotated for section titles, using the work done by Ivan Lerner, reviewed by Gilles Chatellier.

Detected sections are :

  • allergies ;
  • antécédents ;
  • antécédents familiaux ;
  • traitements entrée ;
  • conclusion ;
  • conclusion entrée ;
  • habitus ;
  • correspondants ;
  • diagnostic ;
  • données biométriques entrée ;
  • examens ;
  • examens complémentaires ;
  • facteurs de risques ;
  • histoire de la maladie ;
  • actes ;
  • motif ;
  • prescriptions ;
  • traitements sortie.

The component looks for section titles within the document, and stores them in the section_title extension.

For ease-of-use, the component also populates a section extension, which contains a list of spans corresponding to the "sections" of the document. These span from the start of one section title to the next, which can introduce obvious bias should an intermediate section title go undetected.

PARAMETER DESCRIPTION
nlp

spaCy pipeline object.

TYPE: Language

sections

Dictionary of terms to look for.

TYPE: Dict[str, List[str]]

attr

Default attribute to match on.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/misc/sections/sections.py
class Sections(GenericMatcher):
    """
    Divides the document into sections.

    By default, we are using a dataset of documents annotated for section titles,
    using the work done by Ivan Lerner, reviewed by Gilles Chatellier.

    Detected sections are :

    - allergies ;
    - antécédents ;
    - antécédents familiaux ;
    - traitements entrée ;
    - conclusion ;
    - conclusion entrée ;
    - habitus ;
    - correspondants ;
    - diagnostic ;
    - données biométriques entrée ;
    - examens ;
    - examens complémentaires ;
    - facteurs de risques ;
    - histoire de la maladie ;
    - actes ;
    - motif ;
    - prescriptions ;
    - traitements sortie.

    The component looks for section titles within the document,
    and stores them in the `section_title` extension.

    For ease-of-use, the component also populates a `section` extension,
    which contains a list of spans corresponding to the "sections" of the
    document. These span from the start of one section title to the next,
    which can introduce obvious bias should an intermediate section title
    go undetected.

    Parameters
    ----------
    nlp : Language
        spaCy pipeline object.
    sections : Dict[str, List[str]]
        Dictionary of terms to look for.
    attr : str
        Default attribute to match on.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        sections: Dict[str, List[str]],
        add_patterns: bool,
        attr: str,
        ignore_excluded: bool,
    ):

        logger.warning(
            "The component Sections is still in Beta. Use at your own risks."
        )

        if sections is None:
            sections = patterns.sections
        sections = dict(sections)

        self.add_patterns = add_patterns
        if add_patterns:
            for k, v in sections.items():
                sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

        super().__init__(
            nlp,
            terms=None,
            regex=sections,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.set_extensions()

        if not nlp.has_pipe("normalizer") and not nlp.has_pipe("eds.normalizer"):
            logger.warning("You should add pipe `eds.normalizer`")

    @staticmethod
    def set_extensions():

        if not Span.has_extension("section_title"):
            Span.set_extension("section_title", default=None)

        if not Span.has_extension("section"):
            Span.set_extension("section", default=None)

    # noinspection PyProtectedMember
    def __call__(self, doc: Doc) -> Doc:
        """
        Divides the doc into sections

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for sections
        """
        titles = filter_spans(self.process(doc))

        if self.add_patterns:
            # Remove preceding newline
            titles = [
                Span(doc, title.start + 1, title.end - 1, label=title.label_)
                for title in titles
            ]

        sections = []

        for t1, t2 in zip(titles[:-1], titles[1:]):
            section = Span(doc, t1.start, t2.start, label=t1.label)
            section._.section_title = t1
            sections.append(section)

        if titles:
            t = titles[-1]
            section = Span(doc, t.start, len(doc), label=t.label)
            section._.section_title = t
            sections.append(section)

        doc.spans["sections"] = sections
        doc.spans["section_titles"] = titles

        return doc
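
A usage sketch (the note below is invented; factory registration is assumed):

import spacy
import edsnlp.components  # noqa: F401 -- registers the eds.* factories (assumed)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.sections")

text = "\nMotif :\nHospitalisation pour douleur thoracique.\n\nAntécédents :\nDiabète de type 2.\n"
doc = nlp(text)

for section in doc.spans["sections"]:
    print(section.label_, "->", section._.section_title)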
add_patterns = add_patterns instance-attribute
__init__(nlp, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/sections.py
def __init__(
    self,
    nlp: Language,
    sections: Dict[str, List[str]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):

    logger.warning(
        "The component Sections is still in Beta. Use at your own risks."
    )

    if sections is None:
        sections = patterns.sections
    sections = dict(sections)

    self.add_patterns = add_patterns
    if add_patterns:
        for k, v in sections.items():
            sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

    super().__init__(
        nlp,
        terms=None,
        regex=sections,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.set_extensions()

    if not nlp.has_pipe("normalizer") and not nlp.has_pipe("eds.normalizer"):
        logger.warning("You should add pipe `eds.normalizer`")
set_extensions()
Source code in edsnlp/pipelines/misc/sections/sections.py
@staticmethod
def set_extensions():

    if not Span.has_extension("section_title"):
        Span.set_extension("section_title", default=None)

    if not Span.has_extension("section"):
        Span.set_extension("section", default=None)
__call__(doc)

Divides the doc into sections

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for sections

Source code in edsnlp/pipelines/misc/sections/sections.py
def __call__(self, doc: Doc) -> Doc:
    """
    Divides the doc into sections

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for sections
    """
    titles = filter_spans(self.process(doc))

    if self.add_patterns:
        # Remove preceding newline
        titles = [
            Span(doc, title.start + 1, title.end - 1, label=title.label_)
            for title in titles
        ]

    sections = []

    for t1, t2 in zip(titles[:-1], titles[1:]):
        section = Span(doc, t1.start, t2.start, label=t1.label)
        section._.section_title = t1
        sections.append(section)

    if titles:
        t = titles[-1]
        section = Span(doc, t.start, len(doc), label=t.label)
        section._.section_title = t
        sections.append(section)

    doc.spans["sections"] = sections
    doc.spans["section_titles"] = titles

    return doc

factory

DEFAULT_CONFIG = dict(sections=None, add_patterns=True, attr='NORM', ignore_excluded=True) module-attribute
create_component(nlp, name, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/factory.py
@deprecated_factory("sections", "eds.sections", default_config=DEFAULT_CONFIG)
@Language.factory("eds.sections", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    sections: Optional[Dict[str, List[str]]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):
    return Sections(
        nlp,
        sections=sections,
        add_patterns=add_patterns,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
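
The section dictionary can be overridden through the config; a sketch extending the defaults with a custom section (import path assumed from the patterns module above):

import spacy
import edsnlp.components  # noqa: F401 -- registers the eds.* factories (assumed)
from edsnlp.pipelines.misc.sections.patterns import sections as default_sections

custom_sections = dict(default_sections)
custom_sections["vaccinations"] = ["vaccinations", "statut vaccinal"]

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sections", config=dict(sections=custom_sections))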

consultation_dates

patterns

consultation_mention = ['rendez-vous pris', 'consultation', 'consultation.{1,8}examen', 'examen clinique', 'de compte rendu', "date de l'examen", 'examen realise le', 'date de la visite'] module-attribute
town_mention = ['paris', 'kremlin.bicetre', 'creteil', 'boulogne.billancourt', 'villejuif', 'clamart', 'bobigny', 'clichy', 'ivry.sur.seine', 'issy.les.moulineaux', 'draveil', 'limeil', 'champcueil', 'roche.guyon', 'bondy', 'colombes', 'hendaye', 'herck.sur.mer', 'labruyere', 'garches', 'sevran', 'hyeres'] module-attribute
document_date_mention = ['imprime le', 'signe electroniquement', 'signe le', 'saisi le', 'dicte le', 'tape le', 'date de reference', 'date\\s*:', 'dactylographie le', 'date du rapport'] module-attribute

consultation_dates

ConsultationDates

Bases: GenericMatcher

Class to extract consultation dates from "CR-CONS" documents.

The pipeline populates the doc.spans['consultation_dates'] list.

For each extraction s in this list, the corresponding date is available as s._.consultation_date.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

consultation_mention

List of RegEx for consultation mentions.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables if False

TYPE: Union[List[str], bool]

town_mention

List of RegEx for all AP-HP hospitals' town mentions.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables if False

TYPE: Union[List[str], bool]

document_date_mention

List of RegEx for the document date.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables if False

TYPE: Union[List[str], bool]
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
class ConsultationDates(GenericMatcher):
    """
    Class to extract consultation dates from "CR-CONS" documents.

    The pipeline populates the `#!python doc.spans['consultation_dates']` list.

    For each extraction `s` in this list, the corresponding date is available
    as `s._.consultation_date`.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    consultation_mention : Union[List[str], bool]
        List of RegEx for consultation mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if True, disables if False

    town_mention : Union[List[str], bool]
        List of RegEx for all AP-HP hospitals' towns mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if True, disables if False
    document_date_mention : Union[List[str], bool]
        List of RegEx for document date.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False
    """

    def __init__(
        self,
        nlp: Language,
        consultation_mention: Union[List[str], bool],
        town_mention: Union[List[str], bool],
        document_date_mention: Union[List[str], bool],
        attr: str,
        **kwargs,
    ):

        logger.warning("This pipeline is still in beta")
        logger.warning(
            "This pipeline should ONLY be used on notes "
            "where `note_class_source_value == 'CR-CONS'`"
        )
        logger.warning(
            """This pipeline requires to use the normalizer pipeline with:
        lowercase=True,
        accents=True,
        quotes=True"""
        )

        if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

            config = dict(**DEFAULT_CONFIG)
            config["on_ents_only"] = "consultation_mentions"

            self.date_matcher = Dates(nlp, **config)

        else:
            self.date_matcher = None

        if not consultation_mention:
            consultation_mention = []
        elif consultation_mention is True:
            consultation_mention = consult_regex.consultation_mention

        if not document_date_mention:
            document_date_mention = []
        elif document_date_mention is True:
            document_date_mention = consult_regex.document_date_mention

        if not town_mention:
            town_mention = []
        elif town_mention is True:
            town_mention = consult_regex.town_mention

        regex = dict(
            consultation_mention=consultation_mention,
            town_mention=town_mention,
            document_date_mention=document_date_mention,
        )

        super().__init__(
            nlp,
            regex=regex,
            terms=dict(),
            attr=attr,
            ignore_excluded=False,
            **kwargs,
        )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Span.has_extension("consultation_date"):
            Span.set_extension("consultation_date", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Finds entities

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: Doc
            spaCy Doc object with additional
        `doc.spans['consultation_dates']` `SpanGroup`
        """

        ents = self.process(doc)

        doc.spans["consultation_mentions"] = ents
        doc.spans["consultation_dates"] = []

        if self.date_matcher is not None:
            doc = self.date_matcher(doc)

        for mention in ents:
            # Looking for a date
            # - In the same sentence
            # - No more than 10 tokens after the consultation mention
            matching_dates = [
                date
                for date in doc.spans["dates"]
                if (
                    (mention.sent == date.sent)
                    and (date.start > mention.start)
                    and (date.start - mention.end <= 10)
                )
            ]

            if matching_dates:
                # We keep the first mention of a date
                kept_date = min(matching_dates, key=lambda d: d.start)
                span = doc[mention.start : kept_date.end]
                span.label_ = mention.label_
                span._.consultation_date = kept_date._.date

                doc.spans["consultation_dates"].append(span)

        del doc.spans["consultation_mentions"]

        return doc
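
A usage sketch on a consultation note (text invented; eds.sentences is assumed to be available, since the date/mention matching above relies on sentence boundaries):

import spacy
import edsnlp.components  # noqa: F401 -- registers the eds.* factories (assumed)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # lowercase, accents and quotes normalisation
nlp.add_pipe("eds.sentences")   # mention.sent / date.sent need boundaries
nlp.add_pipe("eds.consultation_dates")

doc = nlp("Consultation du 03/10/2021 pour un suivi post-opératoire.")
for span in doc.spans["consultation_dates"]:
    print(span, "->", span._.consultation_date)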
date_matcher = Dates(nlp, **config) instance-attribute
__init__(nlp, consultation_mention, town_mention, document_date_mention, attr, **kwargs)
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __init__(
    self,
    nlp: Language,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
    attr: str,
    **kwargs,
):

    logger.warning("This pipeline is still in beta")
    logger.warning(
        "This pipeline should ONLY be used on notes "
        "where `note_class_source_value == 'CR-CONS'`"
    )
    logger.warning(
        """This pipeline requires to use the normalizer pipeline with:
    lowercase=True,
    accents=True,
    quotes=True"""
    )

    if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

        config = dict(**DEFAULT_CONFIG)
        config["on_ents_only"] = "consultation_mentions"

        self.date_matcher = Dates(nlp, **config)

    else:
        self.date_matcher = None

    if not consultation_mention:
        consultation_mention = []
    elif consultation_mention is True:
        consultation_mention = consult_regex.consultation_mention

    if not document_date_mention:
        document_date_mention = []
    elif document_date_mention is True:
        document_date_mention = consult_regex.document_date_mention

    if not town_mention:
        town_mention = []
    elif town_mention is True:
        town_mention = consult_regex.town_mention

    regex = dict(
        consultation_mention=consultation_mention,
        town_mention=town_mention,
        document_date_mention=document_date_mention,
    )

    super().__init__(
        nlp,
        regex=regex,
        terms=dict(),
        attr=attr,
        ignore_excluded=False,
        **kwargs,
    )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
@staticmethod
def set_extensions() -> None:
    if not Span.has_extension("consultation_date"):
        Span.set_extension("consultation_date", default=None)
__call__(doc)

Finds entities

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object with additional doc.spans['consultation_dates'] SpanGroup

Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Finds entities

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: Doc
        spaCy Doc object with additional
        `doc.spans['consultation_dates']` `SpanGroup`
    """

    ents = self.process(doc)

    doc.spans["consultation_mentions"] = ents
    doc.spans["consultation_dates"] = []

    if self.date_matcher is not None:
        doc = self.date_matcher(doc)

    for mention in ents:
        # Looking for a date
        # - In the same sentence
        # - No more than 10 tokens after the consultation mention
        matching_dates = [
            date
            for date in doc.spans["dates"]
            if (
                (mention.sent == date.sent)
                and (date.start > mention.start)
                and (date.start - mention.end <= 10)
            )
        ]

        if matching_dates:
            # We keep the first mention of a date
            kept_date = min(matching_dates, key=lambda d: d.start)
            span = doc[mention.start : kept_date.end]
            span.label_ = mention.label_
            span._.consultation_date = kept_date._.date

            doc.spans["consultation_dates"].append(span)

    del doc.spans["consultation_mentions"]

    return doc

factory

DEFAULT_CONFIG = dict(consultation_mention=True, town_mention=False, document_date_mention=False, attr='NORM') module-attribute
create_component(nlp, name, attr, consultation_mention, town_mention, document_date_mention)
Source code in edsnlp/pipelines/misc/consultation_dates/factory.py
@deprecated_factory(
    "consultation_dates",
    "eds.consultation_dates",
    default_config=DEFAULT_CONFIG,
)
@Language.factory("eds.consultation_dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
):
    return ConsultationDates(
        nlp,
        attr=attr,
        consultation_mention=consultation_mention,
        document_date_mention=document_date_mention,
        town_mention=town_mention,
    )
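
As elsewhere, the optional cue lists can be toggled or replaced through the config; a sketch (settings illustrative):

import spacy
import edsnlp.components  # noqa: F401 -- registers the eds.* factories (assumed)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.sentences")
nlp.add_pipe(
    "eds.consultation_dates",
    config=dict(town_mention=True, document_date_mention=True),
)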