Skip to content

edsnlp.pipelines.core.advanced

advanced

AdvancedRegex

Bases: GenericMatcher

Allows additional matching in the surrounding context of the main match group, for qualification/filtering.

PARAMETER DESCRIPTION
nlp

spaCy Language object.

TYPE: Language

regex_config

Configuration for the main expression.

TYPE: Dict[str, Any]

window

Number of tokens to consider before and after the main expression.

TYPE: int

attr

Attribute to match on, e.g. TEXT, NORM, etc.

TYPE: str

verbose

Verbosity level, useful for debugging.

TYPE: int

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/core/advanced/advanced.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class AdvancedRegex(GenericMatcher):
    """
    Allows additional matching in the surrounding context of the main match group,
    for qualification/filtering.

    Parameters
    ----------
    nlp : Language
        spaCy `Language` object.
    regex_config : Dict[str, Any]
        Configuration for the main expression.
    window : int
        Number of tokens to consider before and after the main expression.
    attr : str
        Attribute to match on, e.g. `TEXT`, `NORM`, etc.
    verbose : int
        Verbosity level, useful for debugging.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        regex_config: Dict[str, Any],
        window: int,
        attr: str,
        verbose: int,
        ignore_excluded: bool,
    ):
        self.regex_config = _check_regex_config(regex_config)
        self.window = window
        self.verbose = verbose

        super().__init__(
            nlp=nlp,
            terms=dict(),
            # Pass the validated configuration (not the raw argument) so the
            # matcher and the post-processing steps share the same object.
            regex=self.regex_config,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.ignore_excluded = ignore_excluded

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """Declare the custom spaCy extensions used by the component (idempotent)."""
        if not Doc.has_extension("my_ents"):
            Doc.set_extension("my_ents", default=[])

        # Span-level slots filled during post-processing.
        for extension in (
            "matcher_name",
            "before_extract",
            "after_extract",
            "window",
            "before_snippet",
            "after_snippet",
        ):
            if not Span.has_extension(extension):
                Span.set_extension(extension, default=None)

    def process(self, doc: Doc) -> List[Span]:
        """
        Process the document, looking for named entities.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        List[Span]
            List of detected spans.
        """
        ents = super().process(doc)
        return self._postprocessing_pipeline(ents)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        Doc
            spaCy Doc object, annotated for extracted terms.
        """
        ents = self.process(doc)

        # Keep the highest-priority spans; the losers are kept aside in
        # doc.spans["discarded"] instead of being dropped silently.
        ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

    def _postprocessing_pipeline(self, ents: List[Span]) -> List[Span]:
        """Add context windows, apply exclusion filters, then run extraction."""
        # add a window within the sentence around entities
        ents = [self._add_window(ent) for ent in ents]

        # Remove entities based on the snippet located just before and after
        ents = filter(self._exclude_filter, ents)

        # Extract information from the entity's context via regex
        ents = [self._snippet_extraction(ent) for ent in ents]

        return ents

    def _add_window(self, ent: Span) -> Span:
        """Attach sentence-bounded context windows and snippets to ``ent``."""
        sent = ent.sent
        left = max(ent.start - self.window, sent.start)
        right = min(ent.end + self.window, sent.end)

        ent._.window = ent.doc[left:right]

        # include the entity in the snippets so that we can extract
        # the number when it is attached to the word, e.g. "3PA"
        ent._.before_snippet = ent.doc[left : ent.end]
        ent._.after_snippet = ent.doc[ent.start : right]
        return ent

    def get_text(self, span: Span, label) -> str:
        """Get the span's text, using the label-specific ``attr`` if provided."""
        attr = self.regex_config[label].get("attr", self.attr)

        return get_text(
            doclike=span,
            attr=attr,
            ignore_excluded=self.ignore_excluded,
        )

    def _exclude_filter(self, ent: Span) -> bool:
        """
        Return ``False`` when a label-specific exclusion pattern matches the
        snippet before or after the entity, ``True`` otherwise.
        """
        label = ent.label_
        config = self.regex_config[label]

        for side, snippet in (
            ("before", ent._.before_snippet),
            ("after", ent._.after_snippet),
        ):
            pattern = config.get(f"{side}_exclude", None)
            if pattern is None:
                continue
            t = self.get_text(snippet, label)
            if re.search(pattern, t) is not None:
                if self.verbose:
                    logger.info(
                        f"excluded ({side}) string: {t} - pattern {pattern}"
                    )
                return False

        return True

    def _snippet_extraction(self, ent: Span) -> Span:
        """
        Populate ``ent._.before_extract`` and ``ent._.after_extract`` with the
        first capturing group of each configured pattern (``None`` if no match).
        """
        label = ent.label_
        config = self.regex_config[label]

        for side in ("before", "after"):
            patterns = config.get(f"{side}_extract", [])
            # A single pattern may be given as a plain string.
            if isinstance(patterns, str):
                patterns = [patterns]

            snippet = getattr(ent._, f"{side}_snippet")
            t = self.get_text(snippet, label)

            extracted = []
            for pattern in patterns:
                match = re.search(pattern, t)
                extracted.append(match.group(1) if match else None)
            setattr(ent._, f"{side}_extract", extracted)

        return ent
regex_config = _check_regex_config(regex_config) instance-attribute
window = window instance-attribute
verbose = verbose instance-attribute
ignore_excluded = ignore_excluded instance-attribute
__init__(nlp, regex_config, window, attr, verbose, ignore_excluded)
Source code in edsnlp/pipelines/core/advanced/advanced.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    nlp: Language,
    regex_config: Dict[str, Any],
    window: int,
    attr: str,
    verbose: int,
    ignore_excluded: bool,
):
    """
    Initialise the matcher: validate the regex configuration, then delegate
    the actual pattern registration to ``GenericMatcher``.
    """
    self.regex_config = _check_regex_config(regex_config)
    self.window = window
    self.verbose = verbose

    super().__init__(
        nlp=nlp,
        terms=dict(),
        # Pass the validated configuration rather than the raw argument.
        regex=self.regex_config,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.ignore_excluded = ignore_excluded

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/core/advanced/advanced.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@staticmethod
def set_extensions() -> None:
    """Declare the custom Doc/Span extensions used by the component (idempotent)."""
    if not Doc.has_extension("my_ents"):
        Doc.set_extension("my_ents", default=[])

    # All span-level slots default to None and are filled during post-processing.
    for extension in (
        "matcher_name",
        "before_extract",
        "after_extract",
        "window",
        "before_snippet",
        "after_snippet",
    ):
        if not Span.has_extension(extension):
            Span.set_extension(extension, default=None)
process(doc)

Process the document, looking for named entities.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
List[Span]

List of detected spans.

Source code in edsnlp/pipelines/core/advanced/advanced.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def process(self, doc: Doc) -> List[Span]:
    """
    Process the document, looking for named entities.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    List[Span]
        List of detected spans.
    """
    matches = super().process(doc)
    return self._postprocessing_pipeline(matches)
__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/core/advanced/advanced.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    Doc
        spaCy Doc object, annotated for extracted terms.
    """
    extracted = self.process(doc)

    # Resolve overlaps between pre-existing entities and the new matches;
    # spans that lose the arbitration are kept in doc.spans["discarded"].
    candidates = list(doc.ents) + extracted
    kept, discarded = filter_spans(candidates, return_discarded=True)

    doc.ents = kept

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
_postprocessing_pipeline(ents)
Source code in edsnlp/pipelines/core/advanced/advanced.py
129
130
131
132
133
134
135
136
137
138
139
def _postprocessing_pipeline(self, ents: List[Span]):
    """Add context windows, apply exclusion filters, then run snippet extraction."""
    # Attach a sentence-bounded window around each entity.
    windowed = [self._add_window(ent) for ent in ents]

    # Drop entities whose surrounding snippets match an exclusion pattern.
    kept = [ent for ent in windowed if self._exclude_filter(ent)]

    # Extract information from each remaining entity's context via regex.
    return [self._snippet_extraction(ent) for ent in kept]
_add_window(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def _add_window(self, ent: Span) -> Span:
    """Attach sentence-bounded context windows and snippets to ``ent``."""
    sent = ent.sent
    left = max(ent.start - self.window, sent.start)
    right = min(ent.end + self.window, sent.end)

    ent._.window = ent.doc[left:right]

    # Include the entity itself in both snippets so that we can extract
    # the number when it is attached to the word, e.g. "3PA".
    ent._.before_snippet = ent.doc[left : ent.end]
    ent._.after_snippet = ent.doc[ent.start : right]
    return ent
get_text(span, label)
Source code in edsnlp/pipelines/core/advanced/advanced.py
158
159
160
161
162
163
164
165
def get_text(self, span: Span, label) -> str:
    """Return the span's text, honouring a label-specific ``attr`` override."""
    label_config = self.regex_config[label]
    return get_text(
        doclike=span,
        attr=label_config.get("attr", self.attr),
        ignore_excluded=self.ignore_excluded,
    )
_exclude_filter(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def _exclude_filter(self, ent: Span) -> bool:
    """
    Decide whether the entity should be kept.

    Returns ``False`` when a label-specific ``before_exclude`` or
    ``after_exclude`` pattern matches the corresponding snippet,
    ``True`` otherwise.
    """
    label = ent.label_
    config = self.regex_config[label]

    for side, snippet in (
        ("before", ent._.before_snippet),
        ("after", ent._.after_snippet),
    ):
        pattern = config.get(f"{side}_exclude", None)
        if pattern is None:
            continue
        t = self.get_text(snippet, label)
        if re.search(pattern, t) is not None:
            if self.verbose:
                logger.info(
                    f"excluded ({side}) string: {t} - pattern {pattern}"
                )
            return False

    return True
_snippet_extraction(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def _snippet_extraction(self, ent: Span) -> Span:
    """
    Populate ``ent._.before_extract`` and ``ent._.after_extract`` with the
    first capturing group of each configured pattern (``None`` if no match).
    """
    label = ent.label_
    config = self.regex_config[label]

    for side in ("before", "after"):
        patterns = config.get(f"{side}_extract", [])
        # A single pattern may be given as a plain string.
        if isinstance(patterns, str):
            patterns = [patterns]

        snippet = getattr(ent._, f"{side}_snippet")
        t = self.get_text(snippet, label)

        extracted = []
        for pattern in patterns:
            match = re.search(pattern, t)
            # _check_regex_config guarantees exactly one capturing group.
            extracted.append(match.group(1) if match else None)
        setattr(ent._, f"{side}_extract", extracted)

    return ent

_check_regex_config(regex_config)

Source code in edsnlp/pipelines/core/advanced/advanced.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
def _check_regex_config(regex_config):
    for k, v in regex_config.items():
        if type(v) is not dict:
            raise TypeError(
                f"The value of the key {k} is of type {type(v)}, but a dict is expected"
            )

        single_group_regex_keys = ["before_extract", "after_extract"]

        for single_group_regex_key in single_group_regex_keys:
            if single_group_regex_key in v:
                # ensure it is a list
                if type(v[single_group_regex_key]) is not list:
                    v[single_group_regex_key] = [v[single_group_regex_key]]

                for i, regex in enumerate(v[single_group_regex_key]):
                    n_groups = re.compile(regex).groups

                    if n_groups == 0:
                        # Adding grouping parenthesis
                        v[single_group_regex_key][i] = r"(" + regex + r")"
                    elif n_groups != 1:
                        # Accepting only 1 group per regex
                        raise ValueError(
                            f"The RegEx for {repr(k)} ({repr(regex)}) "
                            f"stored in {repr(single_group_regex_key)} "
                            f"contains {n_groups} capturing groups, 1 expected"
                        )

    return regex_config

factory

DEFAULT_CONFIG = dict(window=10, verbose=0, ignore_excluded=False, attr='NORM') module-attribute

create_component(nlp, name, regex_config, window, verbose, ignore_excluded, attr)

Source code in edsnlp/pipelines/core/advanced/factory.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
@deprecated_factory(
    "advanced-regex", "eds.advanced-regex", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.advanced-regex", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex_config: Dict[str, Any],
    window: int,
    verbose: int,
    ignore_excluded: bool,
    attr: str,
):
    """Build an ``AdvancedRegex`` pipeline component from the factory config."""
    return AdvancedRegex(
        nlp=nlp,
        regex_config=regex_config,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
        attr=attr,
    )
Back to top