Skip to content

edsnlp.pipelines.misc.dates

dates

eds.dates pipeline.

PERIOD_PROXIMITY_THRESHOLD = 3 module-attribute

Dates

Bases: BaseComponent

Tags and normalizes dates, using the open-source dateparser library.

The pipeline uses spaCy's filter_spans function. It filters out false positives and introduces a hierarchy between patterns. For instance, in case of ambiguity, the pipeline will decide that a date is a date without a year rather than a date without a day.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: spacy.language.Language

absolute

List of regular expressions for absolute dates.

TYPE: Union[List[str], str]

relative

List of regular expressions for relative dates (eg hier, la semaine prochaine).

TYPE: Union[List[str], str]

duration

List of regular expressions for durations (eg pendant trois mois).

TYPE: Union[List[str], str]

false_positive

List of regular expressions for false positive (eg phone numbers, etc).

TYPE: Union[List[str], str]

on_ents_only

Whether to look for dates in the whole document or in specific sentences:

  • If True: Only look in the sentences of each entity in doc.ents
  • If False: Look in the whole document
  • If given a string key or list of string: Only look in the sentences of each entity in doc.spans[key]

TYPE: Union[bool, str, List[str]]

detect_periods

Whether to detect periods (experimental)

TYPE: bool

attr

spaCy attribute to use

TYPE: str

Source code in edsnlp/pipelines/misc/dates/dates.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
class Dates(BaseComponent):
    """
    Tags and normalizes dates found by regular-expression matching.

    Matches are de-duplicated with `filter_spans`, which filters out false
    positives and introduces a hierarchy between patterns.
    For instance, in case of ambiguity, the pipeline will decide that a date is a
    date without a year rather than a date without a day.

    Parameters
    ----------
    nlp : spacy.language.Language
        Language pipeline object
    absolute : Union[List[str], str]
        List of regular expressions for absolute dates.
    relative : Union[List[str], str]
        List of regular expressions for relative dates
        (eg `hier`, `la semaine prochaine`).
    duration : Union[List[str], str]
        List of regular expressions for durations
        (eg `pendant trois mois`).
    false_positive : Union[List[str], str]
        List of regular expressions for false positives (eg phone numbers, etc).
    on_ents_only : Union[bool, str, List[str]]
        Whether to look for dates in the whole document or in specific sentences:

        - If `True`: Only look in the sentences of each entity in doc.ents
        - If False: Look in the whole document
        - If given a string `key` or list of string: Only look in the sentences of
          each entity in `#!python doc.spans[key]`
    detect_periods : bool
        Whether to detect periods (experimental)
    attr : str
        spaCy attribute to use
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        absolute: Optional[List[str]],
        relative: Optional[List[str]],
        duration: Optional[List[str]],
        false_positive: Optional[List[str]],
        on_ents_only: Union[bool, List[str]],
        detect_periods: bool,
        attr: str,
    ):

        self.nlp = nlp

        # Fall back on the packaged patterns for every family left unset.
        if absolute is None:
            absolute = patterns.absolute_pattern
        if relative is None:
            relative = patterns.relative_pattern
        if duration is None:
            duration = patterns.duration_pattern
        if false_positive is None:
            false_positive = patterns.false_positive_pattern

        # A single pattern may be given as a bare string: wrap it in a list.
        if isinstance(absolute, str):
            absolute = [absolute]
        if isinstance(relative, str):
            relative = [relative]
        if isinstance(duration, str):
            # Fixed: this used to overwrite `relative` with the duration pattern.
            duration = [duration]
        if isinstance(false_positive, str):
            false_positive = [false_positive]

        self.on_ents_only = on_ents_only
        self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

        self.regex_matcher.add("false_positive", false_positive)
        self.regex_matcher.add("absolute", absolute)
        self.regex_matcher.add("relative", relative)
        self.regex_matcher.add("duration", duration)

        self.detect_periods = detect_periods

        if detect_periods:
            logger.warning("The period extractor is experimental.")

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """
        Set extensions for the dates pipeline.

        Registers the `datetime`, `date` and `period` extensions on `Span`
        (default `None`) unless they are already registered.
        """

        if not Span.has_extension("datetime"):
            Span.set_extension("datetime", default=None)

        if not Span.has_extension("date"):
            Span.set_extension("date", default=None)

        if not Span.has_extension("period"):
            Span.set_extension("period", default=None)

    def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]:
        """
        Find dates in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        dates:
            list of `(span, groupdict)` pairs, one per retained match.
            Matches labelled `false_positive` are dropped.
        """

        if self.on_ents_only:

            if isinstance(self.on_ents_only, bool):
                ents = doc.ents
            else:
                # Normalise to a list of span-group keys locally, without
                # rewriting the component's configuration on every call.
                if isinstance(self.on_ents_only, str):
                    keys = [self.on_ents_only]
                else:
                    keys = self.on_ents_only
                ents = [ent for key in keys for ent in doc.spans[key]]

            # Each sentence is matched once, even if it holds several entities.
            dates = chain.from_iterable(
                self.regex_matcher(
                    sent,
                    as_spans=True,
                    return_groupdict=True,
                )
                for sent in {ent.sent for ent in ents}
            )

        else:
            dates = self.regex_matcher(
                doc,
                as_spans=True,
                return_groupdict=True,
            )

        dates = filter_spans(dates)
        # False positives take part in the filtering above so that they can
        # shadow overlapping matches; they are only discarded afterwards.
        dates = [date for date in dates if date[0].label_ != "false_positive"]

        return dates

    def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
        """
        Parse dates using the groupdict returned by the matcher.

        Parameters
        ----------
        dates : List[Tuple[Span, Dict[str, str]]]
            List of tuples containing the spans and groupdict
            returned by the matcher.

        Returns
        -------
        List[Span]
            List of processed spans, with the date parsed
            and stored in `span._.date`.
        """

        for span, groupdict in dates:
            if span.label_ == "relative":
                parsed = RelativeDate.parse_obj(groupdict)
            elif span.label_ == "absolute":
                parsed = AbsoluteDate.parse_obj(groupdict)
            else:
                parsed = Duration.parse_obj(groupdict)

            span._.date = parsed

        return [span for span, _ in dates]

    def process_periods(self, dates: List[Span]) -> List[Span]:
        """
        Experimental period detection.

        Consecutive dates (in document order) are merged into a period when
        they are fewer than `PERIOD_PROXIMITY_THRESHOLD` tokens apart and
        carry different modes.

        Parameters
        ----------
        dates : List[Span]
            List of detected dates.

        Returns
        -------
        List[Span]
            List of detected periods.
        """

        if len(dates) < 2:
            return []

        periods = []
        seen = set()

        dates = sorted(dates, key=lambda d: d.start)

        for d1, d2 in zip(dates[:-1], dates[1:]):

            has_duration = (
                d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION
            )

            # Without a duration, both dates need an explicit mode and the
            # first one must not already belong to a period.
            if not has_duration and (
                d1 in seen or d1._.date.mode is None or d2._.date.mode is None
            ):
                continue

            # Fixed: the gap is `d2.start - d1.end`. The previous expression,
            # `d1.end - d2.start`, is never positive for sorted, non-overlapping
            # spans, so the proximity threshold was never enforced.
            if (
                d2.start - d1.end < PERIOD_PROXIMITY_THRESHOLD
                and d1._.date.mode != d2._.date.mode
            ):

                period = Span(d1.doc, d1.start, d2.end, label="period")

                # If one date is a duration,
                # the other may not have a registered mode.
                m1 = d1._.date.mode or Mode.FROM
                m2 = d2._.date.mode or Mode.FROM

                period._.period = Period.parse_obj(
                    {
                        m1.value: d1,
                        m2.value: d2,
                    }
                )

                seen.add(d1)
                seen.add(d2)

                periods.append(period)

        return periods

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags dates.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        doc : Doc
            spaCy Doc object, annotated for dates
        """
        dates = self.process(doc)
        dates = self.parse(dates)

        doc.spans["dates"] = dates

        if self.detect_periods:
            doc.spans["periods"] = self.process_periods(dates)

        return doc
nlp = nlp instance-attribute
on_ents_only = on_ents_only instance-attribute
regex_matcher = RegexMatcher(attr=attr, alignment_mode='strict') instance-attribute
detect_periods = detect_periods instance-attribute
__init__(nlp, absolute, relative, duration, false_positive, on_ents_only, detect_periods, attr)
Source code in edsnlp/pipelines/misc/dates/dates.py
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def __init__(
    self,
    nlp: Language,
    absolute: Optional[List[str]],
    relative: Optional[List[str]],
    duration: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: Union[bool, List[str]],
    detect_periods: bool,
    attr: str,
):
    """
    Build the regex matcher for the four pattern families.

    Any family passed as `None` falls back on the packaged patterns, and a
    bare string is accepted in place of a one-element list.
    """

    self.nlp = nlp

    if absolute is None:
        absolute = patterns.absolute_pattern
    if relative is None:
        relative = patterns.relative_pattern
    if duration is None:
        duration = patterns.duration_pattern
    if false_positive is None:
        false_positive = patterns.false_positive_pattern

    if isinstance(absolute, str):
        absolute = [absolute]
    if isinstance(relative, str):
        relative = [relative]
    if isinstance(duration, str):
        # Fixed: this used to overwrite `relative` with the duration pattern.
        duration = [duration]
    if isinstance(false_positive, str):
        false_positive = [false_positive]

    self.on_ents_only = on_ents_only
    self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

    self.regex_matcher.add("false_positive", false_positive)
    self.regex_matcher.add("absolute", absolute)
    self.regex_matcher.add("relative", relative)
    self.regex_matcher.add("duration", duration)

    self.detect_periods = detect_periods

    if detect_periods:
        logger.warning("The period extractor is experimental.")

    self.set_extensions()
set_extensions()

Set extensions for the dates pipeline.

Source code in edsnlp/pipelines/misc/dates/dates.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
@staticmethod
def set_extensions() -> None:
    """
    Register the `Span` extensions used by the dates pipeline.

    `datetime`, `date` and `period` are each registered with a `None`
    default, unless an extension of that name already exists.
    """

    for extension in ("datetime", "date", "period"):
        if Span.has_extension(extension):
            continue
        Span.set_extension(extension, default=None)
process(doc)

Find dates in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
dates

list of date spans

Source code in edsnlp/pipelines/misc/dates/dates.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]:
    """
    Find dates in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    dates:
        list of `(span, groupdict)` pairs, one per retained match.
        Matches labelled `false_positive` are dropped.
    """

    if self.on_ents_only:

        if isinstance(self.on_ents_only, bool):
            ents = doc.ents
        else:
            # Normalise to a list of span-group keys locally, without
            # rewriting the component's configuration on every call.
            if isinstance(self.on_ents_only, str):
                keys = [self.on_ents_only]
            else:
                keys = self.on_ents_only
            ents = [ent for key in keys for ent in doc.spans[key]]

        # Each sentence is matched once, even if it holds several entities.
        dates = chain.from_iterable(
            self.regex_matcher(
                sent,
                as_spans=True,
                return_groupdict=True,
            )
            for sent in {ent.sent for ent in ents}
        )

    else:
        dates = self.regex_matcher(
            doc,
            as_spans=True,
            return_groupdict=True,
        )

    dates = filter_spans(dates)
    # False positives take part in the filtering above so that they can
    # shadow overlapping matches; they are only discarded afterwards.
    dates = [date for date in dates if date[0].label_ != "false_positive"]

    return dates
parse(dates)

Parse dates using the groupdict returned by the matcher.

PARAMETER DESCRIPTION
dates

List of tuples containing the spans and groupdict returned by the matcher.

TYPE: List[Tuple[Span, Dict[str, str]]]

RETURNS DESCRIPTION
List[Span]

List of processed spans, with the date parsed.

Source code in edsnlp/pipelines/misc/dates/dates.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
    """
    Turn each matcher groupdict into a date model attached to its span.

    Parameters
    ----------
    dates : List[Tuple[Span, Dict[str, str]]]
        `(span, groupdict)` pairs, as returned by the matcher.

    Returns
    -------
    List[Span]
        The spans, in input order, each with `span._.date` set.
    """

    parsed_spans = []

    for match, groups in dates:
        label = match.label_

        # Anything that is neither relative nor absolute is read as a duration.
        if label == "relative":
            model = RelativeDate
        elif label == "absolute":
            model = AbsoluteDate
        else:
            model = Duration

        match._.date = model.parse_obj(groups)
        parsed_spans.append(match)

    return parsed_spans
process_periods(dates)

Experimental period detection.

PARAMETER DESCRIPTION
dates

List of detected dates.

TYPE: List[Span]

RETURNS DESCRIPTION
List[Span]

List of detected periods.

Source code in edsnlp/pipelines/misc/dates/dates.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def process_periods(self, dates: List[Span]) -> List[Span]:
    """
    Experimental period detection.

    Consecutive dates (in document order) are merged into a period when
    they are fewer than `PERIOD_PROXIMITY_THRESHOLD` tokens apart and
    carry different modes.

    Parameters
    ----------
    dates : List[Span]
        List of detected dates.

    Returns
    -------
    List[Span]
        List of detected periods.
    """

    if len(dates) < 2:
        return []

    periods = []
    seen = set()

    dates = sorted(dates, key=lambda d: d.start)

    for d1, d2 in zip(dates[:-1], dates[1:]):

        has_duration = (
            d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION
        )

        # Without a duration, both dates need an explicit mode and the
        # first one must not already belong to a period.
        if not has_duration and (
            d1 in seen or d1._.date.mode is None or d2._.date.mode is None
        ):
            continue

        # Fixed: the gap is `d2.start - d1.end`. The previous expression,
        # `d1.end - d2.start`, is never positive for sorted, non-overlapping
        # spans, so the proximity threshold was never enforced.
        if (
            d2.start - d1.end < PERIOD_PROXIMITY_THRESHOLD
            and d1._.date.mode != d2._.date.mode
        ):

            period = Span(d1.doc, d1.start, d2.end, label="period")

            # If one date is a duration,
            # the other may not have a registered mode.
            m1 = d1._.date.mode or Mode.FROM
            m2 = d2._.date.mode or Mode.FROM

            period._.period = Period.parse_obj(
                {
                    m1.value: d1,
                    m2.value: d2,
                }
            )

            seen.add(d1)
            seen.add(d2)

            periods.append(period)

    return periods
__call__(doc)

Tags dates.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for dates

TYPE: Doc

Source code in edsnlp/pipelines/misc/dates/dates.py
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def __call__(self, doc: Doc) -> Doc:
    """
    Annotate a document with its dates and, optionally, its periods.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    doc : Doc
        The same Doc, with `doc.spans["dates"]` filled and, when period
        detection is enabled, `doc.spans["periods"]` as well.
    """
    parsed = self.parse(self.process(doc))
    doc.spans["dates"] = parsed

    if not self.detect_periods:
        return doc

    doc.spans["periods"] = self.process_periods(parsed)
    return doc

models

Direction

Bases: Enum

Source code in edsnlp/pipelines/misc/dates/models.py
12
13
14
15
16
class Direction(Enum):
    """Direction attached to a relative date (see `RelativeDate.direction`)."""

    # Default direction when no direction pattern is involved is CURRENT.
    FUTURE = "FUTURE"
    # PAST is the only value that makes `Relative.to_datetime` negate the duration.
    PAST = "PAST"
    # CURRENT dates are normalised as "~0 <unit>" by `RelativeDate.norm`.
    CURRENT = "CURRENT"
FUTURE = 'FUTURE' class-attribute
PAST = 'PAST' class-attribute
CURRENT = 'CURRENT' class-attribute

Mode

Bases: Enum

Source code in edsnlp/pipelines/misc/dates/models.py
19
20
21
22
23
class Mode(Enum):
    """
    Role of a date within a period.

    The member values double as the field names of `Period`.
    """

    FROM = "FROM"
    UNTIL = "UNTIL"
    # Default mode of the `Duration` model.
    DURATION = "DURATION"
FROM = 'FROM' class-attribute
UNTIL = 'UNTIL' class-attribute
DURATION = 'DURATION' class-attribute

Period

Bases: BaseModel

Source code in edsnlp/pipelines/misc/dates/models.py
26
27
28
29
30
31
32
class Period(BaseModel):
    """
    Spans making up a detected period, keyed by the `Mode` value of each span.

    Every field is optional and defaults to `None`.
    """

    FROM: Optional[Span] = None
    UNTIL: Optional[Span] = None
    DURATION: Optional[Span] = None

    class Config:
        # Lets pydantic accept `Span` fields, which are not pydantic-native types.
        arbitrary_types_allowed = True
FROM: Optional[Span] = None class-attribute
UNTIL: Optional[Span] = None class-attribute
DURATION: Optional[Span] = None class-attribute
Config
Source code in edsnlp/pipelines/misc/dates/models.py
31
32
class Config:
    # Lets pydantic accept fields whose type it has no built-in validator for
    # (here the `Span` fields of the enclosing `Period` model).
    arbitrary_types_allowed = True
arbitrary_types_allowed = True class-attribute

BaseDate

Bases: BaseModel

Source code in edsnlp/pipelines/misc/dates/models.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
class BaseDate(BaseModel):
    """Common base of the date models; carries the optional `mode`."""

    mode: Optional[Mode] = None

    @root_validator(pre=True)
    def validate_strings(cls, d: Dict[str, str]) -> Dict[str, str]:
        """
        Expand `<field>_<value>` keys into `field: value` entries.

        Every key that contains an underscore and maps to a non-`None` value
        is split on the underscore; the prefix becomes a key of the returned
        copy and the suffix its value. The input mapping is left untouched.
        """
        expanded = d.copy()

        for name, matched in d.items():
            if matched is None or "_" not in name:
                continue
            field, literal = name.split("_")
            expanded[field] = literal

        return expanded
mode: Optional[Mode] = None class-attribute
validate_strings(d)
Source code in edsnlp/pipelines/misc/dates/models.py
39
40
41
42
43
44
45
46
47
48
@root_validator(pre=True)
def validate_strings(cls, d: Dict[str, str]) -> Dict[str, str]:
    """
    Expand `<field>_<value>` keys into `field: value` entries.

    Every key that contains an underscore and maps to a non-`None` value
    is split on the underscore; the prefix becomes a key of the returned
    copy and the suffix its value. The input mapping is left untouched.
    """
    expanded = d.copy()

    for name, matched in d.items():
        if matched is None or "_" not in name:
            continue
        field, literal = name.split("_")
        expanded[field] = literal

    return expanded

AbsoluteDate

Bases: BaseDate

Source code in edsnlp/pipelines/misc/dates/models.py
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
class AbsoluteDate(BaseDate):
    """Absolute date, possibly partial: every component is optional."""

    year: Optional[int] = None
    month: Optional[int] = None
    day: Optional[int] = None
    hour: Optional[int] = None
    minute: Optional[int] = None
    second: Optional[int] = None

    def to_datetime(
        self,
        tz: Union[str, pendulum.tz.timezone] = "Europe/Paris",
        **kwargs,
    ) -> Optional[pendulum.datetime]:
        """
        Convert to a timezone-aware datetime.

        Parameters
        ----------
        tz : Union[str, pendulum.tz.timezone]
            Timezone handed to `pendulum.datetime`.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        Optional[pendulum.datetime]
            The datetime, or `None` unless year, month and day are all set.
        """

        if self.year and self.month and self.day:

            d = self.dict(exclude_none=True)

            # `mode` is not a datetime component.
            d.pop("mode", None)

            return pendulum.datetime(**d, tz=tz)

        return None

    def norm(self) -> str:
        """
        Render the date as `YYYY-MM-DD`, with `?` for missing components.

        Time components that are set are appended as `HHh`, `MMm` and `SSs`.
        """

        year = str(self.year) if self.year else "????"
        month = f"{self.month:02}" if self.month else "??"
        day = f"{self.day:02}" if self.day else "??"

        norm = "-".join([year, month, day])

        # Compare against None: 0 is a legitimate hour, minute or second
        # and used to be silently dropped by a truthiness test.
        if self.hour is not None:
            norm += f" {self.hour:02}h"

        if self.minute is not None:
            norm += f"{self.minute:02}m"

        if self.second is not None:
            norm += f"{self.second:02}s"

        return norm

    @validator("year")
    def validate_year(cls, v):
        """
        Expand two-digit years.

        Values of 100 and above are kept as is, values below 25 are read as
        20xx and the remaining two-digit values as 19xx.
        """
        if v is None:
            return v

        if v >= 100:
            return v

        if v < 25:
            return 2000 + v

        # Fixed: this case used to fall through and return None,
        # discarding the year altogether.
        return 1900 + v
year: Optional[int] = None class-attribute
month: Optional[int] = None class-attribute
day: Optional[int] = None class-attribute
hour: Optional[int] = None class-attribute
minute: Optional[int] = None class-attribute
second: Optional[int] = None class-attribute
to_datetime(tz='Europe/Paris', **kwargs)
Source code in edsnlp/pipelines/misc/dates/models.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def to_datetime(
    self,
    tz: Union[str, pendulum.tz.timezone] = "Europe/Paris",
    **kwargs,
) -> Optional[pendulum.datetime]:
    """
    Convert to a timezone-aware datetime.

    Returns `None` unless year, month and day are all set. Extra keyword
    arguments are accepted and ignored.
    """

    if not (self.year and self.month and self.day):
        return None

    components = self.dict(exclude_none=True)
    # `mode` is not a datetime component.
    components.pop("mode", None)

    return pendulum.datetime(**components, tz=tz)
norm()
Source code in edsnlp/pipelines/misc/dates/models.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def norm(self) -> str:
    """
    Render the date as `YYYY-MM-DD`, with `?` for missing components.

    Time components that are set are appended as `HHh`, `MMm` and `SSs`.
    """

    year = str(self.year) if self.year else "????"
    month = f"{self.month:02}" if self.month else "??"
    day = f"{self.day:02}" if self.day else "??"

    norm = "-".join([year, month, day])

    # Compare against None: 0 is a legitimate hour, minute or second
    # and used to be silently dropped by a truthiness test.
    if self.hour is not None:
        norm += f" {self.hour:02}h"

    if self.minute is not None:
        norm += f"{self.minute:02}m"

    if self.second is not None:
        norm += f"{self.second:02}s"

    return norm
validate_year(v)
Source code in edsnlp/pipelines/misc/dates/models.py
 95
 96
 97
 98
 99
100
101
@validator("year")
def validate_year(cls, v):
    """
    Expand two-digit years.

    Values of 100 and above are kept as is, values below 25 are read as
    20xx and the remaining two-digit values as 19xx.
    """
    if v is None:
        return v

    if v >= 100:
        return v

    if v < 25:
        return 2000 + v

    # Fixed: this case used to fall through and return None,
    # discarding the year altogether.
    return 1900 + v

Relative

Bases: BaseDate

Source code in edsnlp/pipelines/misc/dates/models.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
class Relative(BaseDate):
    """Quantity of time expressed as a number of units (base of relative dates and durations)."""

    year: Optional[int] = None
    month: Optional[int] = None
    week: Optional[int] = None
    day: Optional[int] = None
    hour: Optional[int] = None
    minute: Optional[int] = None
    second: Optional[int] = None

    @root_validator(pre=True)
    def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]:
        """
        Store the detected number under the key named by the detected unit.

        Parameters
        ----------
        d : Dict[str, str]
            Original data

        Returns
        -------
        Dict[str, str]
            The same mapping, updated in place when a unit is present.
        """
        unit_name = d.get("unit")

        if not unit_name:
            return d

        d[unit_name] = d.get("number")
        return d

    def to_datetime(self, **kwargs) -> pendulum.Duration:
        """
        Build the signed duration described by the fields that are set.

        The duration is negated when the `direction` field is
        `Direction.PAST`. Keyword arguments are accepted and ignored.
        """
        fields = self.dict(exclude_none=True)

        sign = -1 if fields.pop("direction", None) == Direction.PAST else 1
        fields.pop("mode", None)

        # `pendulum.duration` expects plural keywords (`days`, `weeks`, ...).
        plural = {f"{unit}s": amount for unit, amount in fields.items()}

        return sign * pendulum.duration(**plural)
year: Optional[int] = None class-attribute
month: Optional[int] = None class-attribute
week: Optional[int] = None class-attribute
day: Optional[int] = None class-attribute
hour: Optional[int] = None class-attribute
minute: Optional[int] = None class-attribute
second: Optional[int] = None class-attribute
parse_unit(d)

Units need to be handled separately.

This validator modifies the key corresponding to the unit with the detected value

PARAMETER DESCRIPTION
d

Original data

TYPE: Dict[str, str]

RETURNS DESCRIPTION
Dict[str, str]

Transformed data

Source code in edsnlp/pipelines/misc/dates/models.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
@root_validator(pre=True)
def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]:
    """
    Store the detected number under the key named by the detected unit.

    Parameters
    ----------
    d : Dict[str, str]
        Original data

    Returns
    -------
    Dict[str, str]
        The same mapping, updated in place when a unit is present.
    """
    unit_name = d.get("unit")

    if not unit_name:
        return d

    d[unit_name] = d.get("number")
    return d
to_datetime(**kwargs)
Source code in edsnlp/pipelines/misc/dates/models.py
139
140
141
142
143
144
145
146
147
148
149
150
def to_datetime(self, **kwargs) -> pendulum.Duration:
    """
    Build the signed duration described by the fields that are set.

    The duration is negated when the `direction` field is
    `Direction.PAST`. Keyword arguments are accepted and ignored.
    """
    fields = self.dict(exclude_none=True)

    sign = -1 if fields.pop("direction", None) == Direction.PAST else 1
    fields.pop("mode", None)

    # `pendulum.duration` expects plural keywords (`days`, `weeks`, ...).
    plural = {f"{unit}s": amount for unit, amount in fields.items()}

    return sign * pendulum.duration(**plural)

RelativeDate

Bases: Relative

Source code in edsnlp/pipelines/misc/dates/models.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
class RelativeDate(Relative):
    """Relative date: a quantity of time with a direction."""

    direction: Direction = Direction.CURRENT

    def to_datetime(
        self, note_datetime: Optional[datetime] = None
    ) -> pendulum.Duration:
        """
        Return the signed offset, or the resolved date when a reference is given.

        Parameters
        ----------
        note_datetime : Optional[datetime]
            Reference datetime. When provided, the result is
            `note_datetime + offset` rather than the bare offset.
        """
        td = super(RelativeDate, self).to_datetime()

        if note_datetime is not None:
            return note_datetime + td

        return td

    def norm(self) -> str:
        """
        Render the relative date as a string.

        Dates with the `CURRENT` direction give `~0 <unit>`. Other dates give
        the string form of the offset, prefixed with `+` when it is positive.
        """

        if self.direction == Direction.CURRENT:
            d = self.dict(exclude_none=True)
            d.pop("direction")
            # Fixed: a date carrying a mode used to leave two keys here and
            # make the single-key unpacking below raise ValueError.
            d.pop("mode", None)

            (key,) = d.keys()

            norm = f"~0 {key}"
        else:
            td = self.to_datetime()
            norm = str(td)
            if td.in_seconds() > 0:
                norm = f"+{norm}"

        return norm

    @root_validator(pre=True)
    def handle_specifics(cls, d: Dict[str, str]) -> Dict[str, str]:
        """
        Specific patterns such as `aujourd'hui`, `hier`, etc,
        need to be handled separately.

        Parameters
        ----------
        d : Dict[str, str]
            Original data.

        Returns
        -------
        Dict[str, str]
            Modified data: the same mapping, updated in place with the
            `specific_dict` entry selected by its `specific` key, if any.
        """

        specific = d.get("specific")
        specific = specific_dict.get(specific)

        if specific:
            d.update(specific)

        return d
direction: Direction = Direction.CURRENT class-attribute
to_datetime(note_datetime=None)
Source code in edsnlp/pipelines/misc/dates/models.py
156
157
158
159
160
161
162
163
164
def to_datetime(
    self, note_datetime: Optional[datetime] = None
) -> pendulum.Duration:
    """
    Return the signed offset, or the resolved date when a reference is given.

    When `note_datetime` is provided, the result is `note_datetime + offset`
    rather than the bare offset.
    """
    offset = super(RelativeDate, self).to_datetime()

    return offset if note_datetime is None else note_datetime + offset
norm()
Source code in edsnlp/pipelines/misc/dates/models.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def norm(self) -> str:
    """
    Render the relative date as a string.

    Dates with the `CURRENT` direction give `~0 <unit>`. Other dates give
    the string form of the offset, prefixed with `+` when it is positive.
    """

    if self.direction == Direction.CURRENT:
        d = self.dict(exclude_none=True)
        d.pop("direction")
        # Fixed: a date carrying a mode used to leave two keys here and
        # make the single-key unpacking below raise ValueError.
        d.pop("mode", None)

        (key,) = d.keys()

        norm = f"~0 {key}"
    else:
        td = self.to_datetime()
        norm = str(td)
        if td.in_seconds() > 0:
            norm = f"+{norm}"

    return norm
handle_specifics(d)

Specific patterns such as aujourd'hui, hier, etc, need to be handled separately.

PARAMETER DESCRIPTION
d

Original data.

TYPE: Dict[str, str]

RETURNS DESCRIPTION
Dict[str, str]

Modified data.

Source code in edsnlp/pipelines/misc/dates/models.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
@root_validator(pre=True)
def handle_specifics(cls, d: Dict[str, str]) -> Dict[str, str]:
    """
    Apply the preset fields of specific expressions
    (`aujourd'hui`, `hier`, etc).

    Parameters
    ----------
    d : Dict[str, str]
        Original data.

    Returns
    -------
    Dict[str, str]
        The same mapping, updated in place with the `specific_dict` entry
        selected by its `specific` key, if there is one.
    """

    preset = specific_dict.get(d.get("specific"))

    if not preset:
        return d

    d.update(preset)
    return d

Duration

Bases: Relative

Source code in edsnlp/pipelines/misc/dates/models.py
209
210
211
212
213
214
215
class Duration(Relative):
    """Quantity of time whose mode defaults to `Mode.DURATION`."""

    mode: Mode = Mode.DURATION

    def norm(self) -> str:
        """Render the duration as `during <duration>`."""
        return f"during {self.to_datetime()}"
mode: Mode = Mode.DURATION class-attribute
norm()
Source code in edsnlp/pipelines/misc/dates/models.py
212
213
214
215
def norm(self) -> str:
    """Render the duration as `during <duration>`."""
    return f"during {self.to_datetime()}"

factory

DEFAULT_CONFIG = dict(absolute=None, relative=None, duration=None, false_positive=None, detect_periods=False, on_ents_only=False, attr='LOWER') module-attribute

create_component(nlp, name, absolute, relative, duration, false_positive, on_ents_only, detect_periods, attr)

Source code in edsnlp/pipelines/misc/dates/factory.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
@deprecated_factory("dates", "eds.dates", default_config=DEFAULT_CONFIG)
@Language.factory("eds.dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    absolute: Optional[List[str]],
    relative: Optional[List[str]],
    duration: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: Union[bool, List[str]],
    detect_periods: bool,
    attr: str,
):
    """
    Factory for the `eds.dates` component (also registered as `dates`).

    Every argument except `name` is forwarded unchanged to `Dates`;
    `name` is not used.
    """
    return Dates(
        nlp,
        absolute=absolute,
        relative=relative,
        duration=duration,
        false_positive=false_positive,
        on_ents_only=on_ents_only,
        detect_periods=detect_periods,
        attr=attr,
    )

patterns

duration

cue_pattern = '(pendant|durant|pdt)' module-attribute
duration_pattern = [cue_pattern + '.{,3}' + numbers.number_pattern + '\\s*' + units.unit_pattern] module-attribute

relative

specific = {'minus1': ('hier', dict(direction='PAST', day=1)), 'minus2': ('avant[-\\s]hier', dict(direction='PAST', day=2)), 'plus1': ('demain', dict(direction='FUTURE', day=1)), 'plus2': ('après[-\\s]demain', dict(direction='FUTURE', day=2))} module-attribute
specific_pattern = make_pattern(['(?P<specific_{k}>{p})' for (k, (p, _)) in specific.items()]) module-attribute
specific_dict = {k: v for (k, (_, v)) in specific.items()} module-attribute
relative_pattern = ['(?<=' + mode_pattern + '.{,3})?' + p for p in relative_pattern] module-attribute
make_specific_pattern(mode='forward')
Source code in edsnlp/pipelines/misc/dates/patterns/relative.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def make_specific_pattern(mode: str = "forward"):
    """
    Assemble the regular expression for a relative date expression.

    The expression is always built around a number followed by a unit
    (optionally separated by whitespace). Depending on ``mode``, a
    direction cue is required before it, after it, or on both sides.

    Parameters
    ----------
    mode : str
        ``"forward"``: preceding direction, then number and unit.
        ``"backward"``: number and unit, then following direction.
        Any other value: preceding direction, number and unit, then
        following direction.

    Returns
    -------
    str
        The concatenated pattern.
    """
    # Core shared by every mode: <number> <unit>
    pattern = numbers.number_pattern + r"\s*" + units.unit_pattern

    # Every mode except "backward" requires a leading direction cue.
    if mode != "backward":
        pattern = directions.preceding_direction_pattern + r"\s+" + pattern

    # Every mode except "forward" requires a trailing direction cue.
    if mode != "forward":
        pattern = pattern + r"\s+" + directions.following_direction_pattern

    return pattern

false_positive

false_positive_pattern = make_pattern(['(\\d+' + delimiter_pattern + '){3,}\\d+(?!:\\d\\d)\\b', '\\d\\/\\d']) module-attribute

absolute

no_year_pattern = [day + raw_delimiter_with_spaces_pattern + month + time_pattern + post_num_pattern for day in [ante_num_pattern + numeric_day_pattern, letter_day_pattern] for month in [numeric_month_pattern + post_num_pattern, letter_month_pattern]] module-attribute
no_day_pattern = [letter_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern, ante_num_pattern + lz_numeric_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern] module-attribute
full_year_pattern = ante_num_pattern + fy_pattern + post_num_pattern module-attribute
absolute_pattern = ['(?<=' + mode_pattern + '.{,3})?' + p for p in absolute_pattern] module-attribute

current

current_patterns: List[str] = ['(?P<year_0>cette\\s+ann[ée]e)(?![-\\s]l[àa])', "(?P<day_0>ce\\s+jour|aujourd['\\s]?hui)", '(?P<week_0>cette\\s+semaine|ces\\sjours[-\\s]ci)', '(?P<month_0>ce\\smois([-\\s]ci)?)'] module-attribute
current_pattern = make_pattern(current_patterns, with_breaks=True) module-attribute

atomic

delimiters
raw_delimiters = ['\\/', '\\-'] module-attribute
delimiters = raw_delimiters + ['\\.', '[^\\S\\r\\n]+'] module-attribute
raw_delimiter_pattern = make_pattern(raw_delimiters) module-attribute
raw_delimiter_with_spaces_pattern = make_pattern(raw_delimiters + ['[^\\S\\r\\n]+']) module-attribute
delimiter_pattern = make_pattern(delimiters) module-attribute
ante_num_pattern = '(?<!.(?:{raw_delimiter_pattern})|[0-9][.,])' module-attribute
post_num_pattern = '(?!{raw_delimiter_pattern})' module-attribute
directions
preceding_directions = ['(?P<direction_PAST>depuis|depuis\\s+le|il\\s+y\\s+a)', '(?P<direction_FUTURE>dans)'] module-attribute
following_directions = ['(?P<direction_FUTURE>prochaine?s?|suivante?s?|plus\\s+tard)', '(?P<direction_PAST>derni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\\s+t[ôo]t)'] module-attribute
preceding_direction_pattern = make_pattern(preceding_directions, with_breaks=True) module-attribute
following_direction_pattern = make_pattern(following_directions, with_breaks=True) module-attribute
units
units = ['(?P<unit_year>ans?|ann[ée]es?)', '(?P<unit_semester>semestres?)', '(?P<unit_trimester>trimestres?)', '(?P<unit_month>mois)', '(?P<unit_week>semaines?)', '(?P<unit_day>jours?|journ[ée]es?)', '(?P<unit_hour>h|heures?)', '(?P<unit_minute>min|minutes?)', '(?P<unit_second>sec|secondes?|s)'] module-attribute
unit_pattern = make_pattern(units, with_breaks=True) module-attribute
time
hour_pattern = '(?<!\\d)(?P<hour>0?[1-9]|1\\d|2[0-3])(?!\\d)' module-attribute
lz_hour_pattern = '(?<!\\d)(?P<hour>0[1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
minute_pattern = '(?<!\\d)(?P<minute>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_minute_pattern = '(?<!\\d)(?P<minute>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
second_pattern = '(?<!\\d)(?P<second>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_second_pattern = '(?<!\\d)(?P<second>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
time_pattern = '(\\s.{,3}' + '{hour_pattern}[h:]({lz_minute_pattern})?' + '((:|m|min){lz_second_pattern})?' + ')?' module-attribute
numbers
letter_numbers = ["(?P<number_01>l'|le|la|une?|ce|cette|cet)", '(?P<number_02>deux)', '(?P<number_03>trois)', '(?P<number_04>quatre)', '(?P<number_05>cinq)', '(?P<number_06>six)', '(?P<number_07>sept)', '(?P<number_08>huit)', '(?P<number_09>neuf)', '(?P<number_10>dix)', '(?P<number_11>onze)', '(?P<number_12>douze)', '(?P<number_13>treize)', '(?P<number_14>quatorze)', '(?P<number_15>quinze)', '(?P<number_16>seize)', '(?P<number_17>dix[-\\s]sept)', '(?P<number_18>dix[-\\s]huit)', '(?P<number_19>dix[-\\s]neuf)', '(?P<number_20>vingt)', '(?P<number_21>vingt[-\\s]et[-\\s]un)', '(?P<number_22>vingt[-\\s]deux)', '(?P<number_23>vingt[-\\s]trois)', '(?P<number_24>vingt[-\\s]quatre)', '(?P<number_25>vingt[-\\s]cinq)', '(?P<number_26>vingt[-\\s]six)', '(?P<number_27>vingt[-\\s]sept)', '(?P<number_28>vingt[-\\s]huit)', '(?P<number_29>vingt[-\\s]neuf)', '(?P<number_30>trente)'] module-attribute
numeric_numbers = [str(i) for i in range(1, 100)] module-attribute
letter_number_pattern = make_pattern(letter_numbers, with_breaks=True) module-attribute
numeric_number_pattern = make_pattern(numeric_numbers, name='number') module-attribute
number_pattern = '({letter_number_pattern}|{numeric_number_pattern})' module-attribute
modes
modes = ['(?P<mode_FROM>depuis|depuis\\s+le|[àa]\\s+partir\\s+d[eu]|du)', "(?P<mode_UNTIL>jusqu'[àa]u?|au)"] module-attribute
mode_pattern = make_pattern(modes, with_breaks=True) module-attribute
months
letter_months = ['(?P<month_01>janvier|janv\\.?)', '(?P<month_02>f[ée]vrier|f[ée]v\\.?)', '(?P<month_03>mars|mar\\.?)', '(?P<month_04>avril|avr\\.?)', '(?P<month_05>mai)', '(?P<month_06>juin)', '(?P<month_07>juillet|juill?\\.?)', '(?P<month_08>ao[uû]t)', '(?P<month_09>septembre|sept?\\.?)', '(?P<month_10>octobre|oct\\.?)', '(?P<month_11>novembre|nov\\.?)', '(?P<month_12>d[ée]cembre|d[ée]c\\.?)'] module-attribute
letter_month_pattern = make_pattern(letter_months, with_breaks=True) module-attribute
numeric_month_pattern = '(?P<month>{numeric_month_pattern})' module-attribute
lz_numeric_month_pattern = '(?P<month>{lz_numeric_month_pattern})' module-attribute
month_pattern = '({letter_month_pattern}|{numeric_month_pattern})' module-attribute
days
letter_days = ['(?P<day_01>premier|1\\s*er)', '(?P<day_02>deux)', '(?P<day_03>trois)', '(?P<day_04>quatre)', '(?P<day_05>cinq)', '(?P<day_06>six)', '(?P<day_07>sept)', '(?P<day_08>huit)', '(?P<day_09>neuf)', '(?P<day_10>dix)', '(?P<day_11>onze)', '(?P<day_12>douze)', '(?P<day_13>treize)', '(?P<day_14>quatorze)', '(?P<day_15>quinze)', '(?P<day_16>seize)', '(?P<day_17>dix\\-?\\s*sept)', '(?P<day_18>dix\\-?\\s*huit)', '(?P<day_19>dix\\-?\\s*neuf)', '(?P<day_20>vingt)', '(?P<day_21>vingt\\-?\\s*et\\-?\\s*un)', '(?P<day_22>vingt\\-?\\s*deux)', '(?P<day_23>vingt\\-?\\s*trois)', '(?P<day_24>vingt\\-?\\s*quatre)', '(?P<day_25>vingt\\-?\\s*cinq)', '(?P<day_26>vingt\\-?\\s*six)', '(?P<day_27>vingt\\-?\\s*sept)', '(?P<day_28>vingt\\-?\\s*huit)', '(?P<day_29>vingt\\-?\\s*neuf)', '(?P<day_30>trente)', '(?P<day_31>trente\\-?\\s*et\\-?\\s*un)'] module-attribute
letter_day_pattern = make_pattern(letter_days) module-attribute
nlz_numeric_day_pattern = '(?<!\\d)([1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
numeric_day_pattern = '(?P<day>{numeric_day_pattern})' module-attribute
lz_numeric_day_pattern = '(?P<day>{lz_numeric_day_pattern})' module-attribute
day_pattern = '({letter_day_pattern}|{numeric_day_pattern})' module-attribute
years
year_patterns: List[str] = ['19\\d\\d'] + [str(year) for year in range(2000, date.today().year + 2)] module-attribute
full_year_pattern = '(?<!\\d)' + full_year_pattern + '(?!\\d)' module-attribute
year_pattern = '(?<!\\d)' + year_pattern + '(?!\\d)' module-attribute
Back to top