Skip to content

edsnlp.pipelines.misc.dates.dates

eds.dates pipeline.

PERIOD_PROXIMITY_THRESHOLD = 3 module-attribute

Dates

Bases: BaseComponent

Tags and normalizes dates, using the open-source dateparser library.

The pipeline uses spaCy's filter_spans function. It filters out false positives, and introduces a hierarchy between patterns. For instance, in case of ambiguity, the pipeline will decide that a date is a date without a year rather than a date without a day.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: spacy.language.Language

absolute

List of regular expressions for absolute dates.

TYPE: Union[List[str], str]

relative

List of regular expressions for relative dates (eg hier, la semaine prochaine).

TYPE: Union[List[str], str]

duration

List of regular expressions for durations (eg pendant trois mois).

TYPE: Union[List[str], str]

false_positive

List of regular expressions for false positive (eg phone numbers, etc).

TYPE: Union[List[str], str]

on_ents_only

Whether to look for dates in the whole document or in specific sentences:

  • If True: Only look in the sentences of each entity in doc.ents
  • If False: Look in the whole document
  • If given a string key or list of string: Only look in the sentences of each entity in doc.spans[key]

TYPE: Union[bool, str, List[str]]

detect_periods

Whether to detect periods (experimental)

TYPE: bool

attr

spaCy attribute to use

TYPE: str

Source code in edsnlp/pipelines/misc/dates/dates.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
class Dates(BaseComponent):
    """
    Tags and normalizes dates found in a spaCy document.

    Candidate dates are matched with regular expressions, then passed through
    `filter_spans`, which filters out false positives and introduces a
    hierarchy between patterns.
    For instance, in case of ambiguity, the pipeline will decide that a date is a
    date without a year rather than a date without a day.

    Parameters
    ----------
    nlp : spacy.language.Language
        Language pipeline object
    absolute : Union[List[str], str]
        List of regular expressions for absolute dates.
    relative : Union[List[str], str]
        List of regular expressions for relative dates
        (eg `hier`, `la semaine prochaine`).
    duration : Union[List[str], str]
        List of regular expressions for durations
        (eg `pendant trois mois`).
    false_positive : Union[List[str], str]
        List of regular expressions for false positive (eg phone numbers, etc).
    on_ents_only : Union[bool, str, List[str]]
        Whether to look for dates in the whole document or in specific sentences:

        - If `True`: Only look in the sentences of each entity in doc.ents
        - If False: Look in the whole document
        - If given a string `key` or list of string: Only look in the sentences of
          each entity in `#!python doc.spans[key]`
    detect_periods : bool
        Whether to detect periods (experimental)
    attr : str
        spaCy attribute to use
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        absolute: Optional[Union[List[str], str]],
        relative: Optional[Union[List[str], str]],
        duration: Optional[Union[List[str], str]],
        false_positive: Optional[Union[List[str], str]],
        on_ents_only: Union[bool, str, List[str]],
        detect_periods: bool,
        attr: str,
    ):

        self.nlp = nlp

        # `None` selects the default patterns shipped in the `patterns` module.
        if absolute is None:
            absolute = patterns.absolute_pattern
        if relative is None:
            relative = patterns.relative_pattern
        if duration is None:
            duration = patterns.duration_pattern
        if false_positive is None:
            false_positive = patterns.false_positive_pattern

        # A single expression may be given as a bare string: wrap it in a list.
        if isinstance(absolute, str):
            absolute = [absolute]
        if isinstance(relative, str):
            relative = [relative]
        if isinstance(duration, str):
            # Previously assigned to `relative`, which discarded the relative
            # patterns and left `duration` as a bare string.
            duration = [duration]
        if isinstance(false_positive, str):
            false_positive = [false_positive]

        self.on_ents_only = on_ents_only
        self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

        # The pattern key becomes the label that `process` and `parse`
        # later dispatch on.
        self.regex_matcher.add("false_positive", false_positive)
        self.regex_matcher.add("absolute", absolute)
        self.regex_matcher.add("relative", relative)
        self.regex_matcher.add("duration", duration)

        self.detect_periods = detect_periods

        if detect_periods:
            logger.warning("The period extractor is experimental.")

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """
        Set extensions for the dates pipeline.

        Registers the `datetime`, `date` and `period` extensions on `Span`
        (default `None`), skipping any that is already registered.
        """

        for name in ("datetime", "date", "period"):
            if not Span.has_extension(name):
                Span.set_extension(name, default=None)

    def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]:
        """
        Find dates in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        dates:
            list of `(span, groupdict)` tuples, one per retained match;
            matches labelled `false_positive` are removed.
        """

        # Work on a local copy: the configured value must not be rewritten
        # as a side effect of processing a document.
        on_ents_only = self.on_ents_only

        if on_ents_only:

            if isinstance(on_ents_only, bool):
                ents = doc.ents
            else:
                if isinstance(on_ents_only, str):
                    keys = [on_ents_only]
                else:
                    keys = on_ents_only
                ents = [ent for key in keys for ent in doc.spans[key]]

            # Each sentence is matched once, even if it holds several entities.
            dates = []
            for sent in {ent.sent for ent in ents}:
                dates.extend(
                    self.regex_matcher(
                        sent,
                        as_spans=True,
                        return_groupdict=True,
                    )
                )

        else:
            dates = self.regex_matcher(
                doc,
                as_spans=True,
                return_groupdict=True,
            )

        dates = filter_spans(dates)
        # Each item is a (span, groupdict) pair; the label lives on the span.
        dates = [date for date in dates if date[0].label_ != "false_positive"]

        return dates

    def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
        """
        Parse dates using the groupdict returned by the matcher.

        Parameters
        ----------
        dates : List[Tuple[Span, Dict[str, str]]]
            List of tuples containing the spans and groupdict
            returned by the matcher.

        Returns
        -------
        List[Span]
            List of processed spans, with the date parsed.
        """

        for span, groupdict in dates:
            if span.label_ == "relative":
                parsed = RelativeDate.parse_obj(groupdict)
            elif span.label_ == "absolute":
                parsed = AbsoluteDate.parse_obj(groupdict)
            else:
                # Any other label is handled as a duration.
                parsed = Duration.parse_obj(groupdict)

            span._.date = parsed

        return [span for span, _ in dates]

    def process_periods(self, dates: List[Span]) -> List[Span]:
        """
        Experimental period detection.

        Dates are sorted by start offset and examined as consecutive pairs.
        A pair becomes a period when the two dates are close enough and
        carry different modes.

        Parameters
        ----------
        dates : List[Span]
            List of detected dates.

        Returns
        -------
        List[Span]
            List of detected periods.
        """

        if len(dates) < 2:
            return []

        periods = []
        seen = set()

        dates = sorted(dates, key=lambda d: d.start)

        for d1, d2 in zip(dates[:-1], dates[1:]):

            # Pairs involving a duration bypass the checks below; otherwise
            # skip when `d1` already belongs to a period or a mode is missing.
            if d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION:
                pass
            elif d1 in seen or d1._.date.mode is None or d2._.date.mode is None:
                continue

            # Gap, in tokens, between the end of `d1` and the start of `d2`.
            # The previous expression (`d1.end - d2.start`) is never positive
            # for sorted, non-overlapping spans, so the threshold had no effect.
            if (
                d2.start - d1.end < PERIOD_PROXIMITY_THRESHOLD
                and d1._.date.mode != d2._.date.mode
            ):

                period = Span(d1.doc, d1.start, d2.end, label="period")

                # If one date is a duration,
                # the other may not have a registered mode.
                m1 = d1._.date.mode or Mode.FROM
                m2 = d2._.date.mode or Mode.FROM

                period._.period = Period.parse_obj(
                    {
                        m1.value: d1,
                        m2.value: d2,
                    }
                )

                seen.add(d1)
                seen.add(d2)

                periods.append(period)

        return periods

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags dates.

        Stores the parsed date spans in `doc.spans["dates"]` and, when period
        detection is enabled, the periods in `doc.spans["periods"]`.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        doc : Doc
            spaCy Doc object, annotated for dates
        """
        dates = self.process(doc)
        dates = self.parse(dates)

        doc.spans["dates"] = dates

        if self.detect_periods:
            doc.spans["periods"] = self.process_periods(dates)

        return doc

nlp = nlp instance-attribute

on_ents_only = on_ents_only instance-attribute

regex_matcher = RegexMatcher(attr=attr, alignment_mode='strict') instance-attribute

detect_periods = detect_periods instance-attribute

__init__(nlp, absolute, relative, duration, false_positive, on_ents_only, detect_periods, attr)

Source code in edsnlp/pipelines/misc/dates/dates.py
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def __init__(
    self,
    nlp: Language,
    absolute: Optional[Union[List[str], str]],
    relative: Optional[Union[List[str], str]],
    duration: Optional[Union[List[str], str]],
    false_positive: Optional[Union[List[str], str]],
    on_ents_only: Union[bool, str, List[str]],
    detect_periods: bool,
    attr: str,
):
    """
    Build the regex matcher used to detect dates.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    absolute, relative, duration, false_positive :
        Regular expression(s) for each kind of match, as a list or a single
        string. `None` selects the default from the `patterns` module.
    on_ents_only : Union[bool, str, List[str]]
        Stored as-is on the instance.
    detect_periods : bool
        Whether to detect periods (experimental); logs a warning when enabled.
    attr : str
        spaCy attribute handed to the `RegexMatcher`.
    """

    self.nlp = nlp

    if absolute is None:
        absolute = patterns.absolute_pattern
    if relative is None:
        relative = patterns.relative_pattern
    if duration is None:
        duration = patterns.duration_pattern
    if false_positive is None:
        false_positive = patterns.false_positive_pattern

    # A single expression may be given as a bare string: wrap it in a list.
    if isinstance(absolute, str):
        absolute = [absolute]
    if isinstance(relative, str):
        relative = [relative]
    if isinstance(duration, str):
        # Previously assigned to `relative`, which discarded the relative
        # patterns and left `duration` as a bare string.
        duration = [duration]
    if isinstance(false_positive, str):
        false_positive = [false_positive]

    self.on_ents_only = on_ents_only
    self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

    # The pattern key becomes the label of the matched spans.
    self.regex_matcher.add("false_positive", false_positive)
    self.regex_matcher.add("absolute", absolute)
    self.regex_matcher.add("relative", relative)
    self.regex_matcher.add("duration", duration)

    self.detect_periods = detect_periods

    if detect_periods:
        logger.warning("The period extractor is experimental.")

    self.set_extensions()

set_extensions()

Set extensions for the dates pipeline.

Source code in edsnlp/pipelines/misc/dates/dates.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
@staticmethod
def set_extensions() -> None:
    """
    Register the `Span` extensions used by the dates pipeline.

    `datetime`, `date` and `period` are each declared with a default of
    `None`, unless an extension of that name already exists.
    """

    for extension in ("datetime", "date", "period"):
        if Span.has_extension(extension):
            continue
        Span.set_extension(extension, default=None)

process(doc)

Find dates in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
dates

list of date spans

Source code in edsnlp/pipelines/misc/dates/dates.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]:
    """
    Find dates in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    dates:
        list of `(span, groupdict)` tuples, one per retained match;
        matches labelled `false_positive` are removed.
    """

    # Work on a local copy: the configured value must not be rewritten
    # as a side effect of processing a document.
    on_ents_only = self.on_ents_only

    if on_ents_only:

        if isinstance(on_ents_only, bool):
            ents = doc.ents
        else:
            if isinstance(on_ents_only, str):
                keys = [on_ents_only]
            else:
                keys = on_ents_only
            ents = [ent for key in keys for ent in doc.spans[key]]

        # Each sentence is matched once, even if it holds several entities.
        dates = []
        for sent in {ent.sent for ent in ents}:
            dates.extend(
                self.regex_matcher(
                    sent,
                    as_spans=True,
                    return_groupdict=True,
                )
            )

    else:
        dates = self.regex_matcher(
            doc,
            as_spans=True,
            return_groupdict=True,
        )

    dates = filter_spans(dates)
    # Each item is a (span, groupdict) pair; the label lives on the span.
    dates = [date for date in dates if date[0].label_ != "false_positive"]

    return dates

parse(dates)

Parse dates using the groupdict returned by the matcher.

PARAMETER DESCRIPTION
dates

List of tuples containing the spans and groupdict returned by the matcher.

TYPE: List[Tuple[Span, Dict[str, str]]]

RETURNS DESCRIPTION
List[Span]

List of processed spans, with the date parsed.

Source code in edsnlp/pipelines/misc/dates/dates.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
    """
    Attach a parsed date object to every matched span.

    The span label selects the model fed with the matcher's groupdict:
    `relative` and `absolute` have dedicated models, any other label is
    parsed as a duration. The result is stored in `span._.date`.

    Parameters
    ----------
    dates : List[Tuple[Span, Dict[str, str]]]
        `(span, groupdict)` pairs as returned by the matcher.

    Returns
    -------
    List[Span]
        The spans, in input order, with `_.date` set.
    """

    models = {"relative": RelativeDate, "absolute": AbsoluteDate}

    spans = []
    for match, groups in dates:
        model = models.get(match.label_, Duration)
        match._.date = model.parse_obj(groups)
        spans.append(match)

    return spans

process_periods(dates)

Experimental period detection.

PARAMETER DESCRIPTION
dates

List of detected dates.

TYPE: List[Span]

RETURNS DESCRIPTION
List[Span]

List of detected periods.

Source code in edsnlp/pipelines/misc/dates/dates.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def process_periods(self, dates: List[Span]) -> List[Span]:
    """
    Experimental period detection.

    Dates are sorted by start offset and examined as consecutive pairs.
    A pair becomes a period when the two dates are close enough and
    carry different modes.

    Parameters
    ----------
    dates : List[Span]
        List of detected dates.

    Returns
    -------
    List[Span]
        List of detected periods.
    """

    if len(dates) < 2:
        return []

    periods = []
    seen = set()

    dates = sorted(dates, key=lambda d: d.start)

    for d1, d2 in zip(dates[:-1], dates[1:]):

        # Pairs involving a duration bypass the checks below; otherwise
        # skip when `d1` already belongs to a period or a mode is missing.
        if d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION:
            pass
        elif d1 in seen or d1._.date.mode is None or d2._.date.mode is None:
            continue

        # Gap, in tokens, between the end of `d1` and the start of `d2`.
        # The previous expression (`d1.end - d2.start`) is never positive
        # for sorted, non-overlapping spans, so the threshold had no effect.
        if (
            d2.start - d1.end < PERIOD_PROXIMITY_THRESHOLD
            and d1._.date.mode != d2._.date.mode
        ):

            period = Span(d1.doc, d1.start, d2.end, label="period")

            # If one date is a duration,
            # the other may not have a registered mode.
            m1 = d1._.date.mode or Mode.FROM
            m2 = d2._.date.mode or Mode.FROM

            period._.period = Period.parse_obj(
                {
                    m1.value: d1,
                    m2.value: d2,
                }
            )

            seen.add(d1)
            seen.add(d2)

            periods.append(period)

    return periods

__call__(doc)

Tags dates.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for dates

TYPE: Doc

Source code in edsnlp/pipelines/misc/dates/dates.py
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def __call__(self, doc: Doc) -> Doc:
    """
    Annotate a document with its dates.

    The parsed date spans are written to `doc.spans["dates"]`. When period
    detection is enabled, the detected periods are written to
    `doc.spans["periods"]` as well.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    doc : Doc
        The same Doc object, annotated for dates
    """
    spans = self.parse(self.process(doc))
    doc.spans["dates"] = spans

    if not self.detect_periods:
        return doc

    doc.spans["periods"] = self.process_periods(spans)
    return doc
Back to top