Skip to content

edsnlp.matchers.regex

RegexMatcher

Bases: object

Simple RegExp matcher.

PARAMETER DESCRIPTION
alignment_mode

How spans should be aligned with tokens. Possible values are "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), and "expand" (span of all tokens at least partially covered by the character span). Defaults to "expand".

TYPE: str

attr

Default attribute to match on, by default "TEXT". Can be overridden in the add method.

TYPE: str

ignore_excluded

Whether to skip excluded tokens during matching. Can be overridden in the add method.

TYPE: bool

Source code in edsnlp/matchers/regex.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
class RegexMatcher(object):
    """
    Simple RegExp matcher.

    Patterns are registered under a key with `add` (or in bulk with
    `build_patterns`). `match` and `__call__` then run every registered
    pattern against the text returned by `get_text` for a Doc-like object.

    Parameters
    ----------
    alignment_mode : str
        How spans should be aligned with tokens.
        Possible values are `strict` (character indices must be aligned
        with token boundaries), `contract` (span of all tokens completely
        within the character span), `expand` (span of all tokens at least
        partially covered by the character span).
        Defaults to `expand`.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.
    ignore_excluded : bool
        Whether to skip excluded tokens during matching.
        Can be overridden in the `add` method.
    """

    def __init__(
        self,
        alignment_mode: str = "expand",
        attr: str = "TEXT",
        ignore_excluded: bool = False,
    ):
        self.alignment_mode = alignment_mode
        # Registry of (key, compiled patterns, attr, ignore_excluded,
        # alignment_mode) tuples, in insertion order.
        self.regex = []

        self.default_attr = attr

        self.ignore_excluded = ignore_excluded

    def build_patterns(self, regex: Patterns):
        """
        Build patterns and adds them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        regex : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
            A value may be a single pattern string, a list of pattern
            strings, or a dictionary with a `regex` entry and optional
            `attr` and `alignment_mode` entries. A falsy value adds nothing.
        """
        if not regex:
            regex = dict()

        for key, patterns in regex.items():
            if isinstance(patterns, dict):
                attr = patterns.get("attr")
                alignment_mode = patterns.get("alignment_mode")
                patterns = patterns.get("regex")
            else:
                # `None` lets `add` fall back to the matcher-level defaults.
                attr = None
                alignment_mode = None

            if isinstance(patterns, str):
                patterns = [patterns]

            self.add(
                key=key, patterns=patterns, attr=attr, alignment_mode=alignment_mode
            )

    def add(
        self,
        key: str,
        patterns: Union[str, List[str]],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
        alignment_mode: Optional[str] = None,
    ):
        """
        Add a pattern to the registry.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : Union[str, List[str]]
            List of patterns to add. A single pattern string is also
            accepted and is treated as a one-element list.
        attr : str, optional
            Attribute to use for matching.
            By default uses the `default_attr` attribute
        ignore_excluded : bool, optional
            Whether to skip excluded tokens during matching.
            By default uses the `ignore_excluded` attribute.
        alignment_mode : str, optional
            Overwrite alignment mode.
            By default uses the `alignment_mode` attribute.
        """

        if attr is None:
            attr = self.default_attr

        if ignore_excluded is None:
            ignore_excluded = self.ignore_excluded

        if alignment_mode is None:
            alignment_mode = self.alignment_mode

        if isinstance(patterns, str):
            # A bare string would otherwise be iterated character by
            # character, registering one regex per character.
            patterns = [patterns]

        patterns = [re.compile(pattern) for pattern in patterns]

        self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))

    def remove(
        self,
        key: str,
    ):
        """
        Remove a pattern from the registry.

        Every entry registered under `key` is removed.

        Parameters
        ----------
        key : str
            key of the pattern to remove.

        Raises
        ------
        ValueError
            If the key is not present in the registered patterns.
        """
        n = len(self.regex)
        self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
        # An unchanged length means no entry carried this key.
        if len(self.regex) == n:
            raise ValueError(f"`{key}` is not referenced in the matcher")

    def __len__(self):
        """Return the number of distinct keys in the registry."""
        return len({entry[0] for entry in self.regex})

    def match(
        self,
        doclike: Union[Doc, Span],
    ) -> Tuple[Span, re.Match]:
        """
        Iterates on the matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object to match on.

        Yields
        ------
        span:
            A match.
        match:
            The `re.Match` object the span was created from.
        """

        for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
            # The text is fetched per entry because it depends on the
            # attribute and exclusion settings registered with that entry.
            text = get_text(doclike, attr, ignore_excluded)

            for pattern in patterns:
                for match in pattern.finditer(text):
                    logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                    span = create_span(
                        doclike=doclike,
                        start_char=match.start(),
                        end_char=match.end(),
                        key=key,
                        attr=attr,
                        alignment_mode=alignment_mode,
                        ignore_excluded=ignore_excluded,
                    )

                    # Matches for which no span was created are dropped.
                    if span is None:
                        continue

                    yield span, match

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
        return_groupdict=False,
    ) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Returns matches as spans. When false, each match is yielded as
            a `(label, start, end)` tuple whose token indices are relative
            to the first token of `doclike`.
        return_groupdict:
            Whether to also yield the `groupdict()` of the regex match.

        Yields
        ------
        span:
            A match.
        groupdict:
            Additional information coming from the named patterns
            in the regular expression.
            Only yielded when `return_groupdict` is true.
        """
        for span, match in self.match(doclike):
            if not as_spans:
                # Shift token indices so they are relative to `doclike`.
                offset = doclike[0].i
                span = (span.label, span.start - offset, span.end - offset)
            if return_groupdict:
                yield span, match.groupdict()
            else:
                yield span

build_patterns(regex)

Build patterns and adds them for matching. Helper function for pipelines using this matcher.

PARAMETER DESCRIPTION
regex

Dictionary of label/terms, or label/dictionary of terms/attribute.

TYPE: Patterns

Source code in edsnlp/matchers/regex.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def build_patterns(self, regex: Patterns):
    """
    Register every entry of a pattern dictionary through `add`.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    regex : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
        A falsy value registers nothing.
    """
    for label, spec in (regex or dict()).items():
        # `None` lets `add` fall back to the matcher-level defaults.
        attr = alignment_mode = None

        if isinstance(spec, dict):
            attr = spec.get("attr")
            alignment_mode = spec.get("alignment_mode")
            spec = spec.get("regex")

        # A single pattern string is wrapped into a one-element list.
        expressions = [spec] if isinstance(spec, str) else spec

        self.add(
            key=label,
            patterns=expressions,
            attr=attr,
            alignment_mode=alignment_mode,
        )

add(key, patterns, attr=None, ignore_excluded=None, alignment_mode=None)

Add a pattern to the registry.

PARAMETER DESCRIPTION
key

Key of the new/updated pattern.

TYPE: str

patterns

List of patterns to add.

TYPE: List[str]

attr

Attribute to use for matching. By default uses the default_attr attribute

TYPE: str, optional DEFAULT: None

ignore_excluded

Whether to skip excluded tokens during matching.

TYPE: bool, optional DEFAULT: None

alignment_mode

Overwrite alignment mode.

TYPE: str, optional DEFAULT: None

Source code in edsnlp/matchers/regex.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def add(
    self,
    key: str,
    patterns: Union[str, List[str]],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
    alignment_mode: Optional[str] = None,
):
    """
    Add a pattern to the registry.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : Union[str, List[str]]
        List of patterns to add. A single pattern string is also
        accepted and is treated as a one-element list.
    attr : str, optional
        Attribute to use for matching.
        By default uses the `default_attr` attribute
    ignore_excluded : bool, optional
        Whether to skip excluded tokens during matching.
        By default uses the `ignore_excluded` attribute.
    alignment_mode : str, optional
        Overwrite alignment mode.
        By default uses the `alignment_mode` attribute.
    """

    if attr is None:
        attr = self.default_attr

    if ignore_excluded is None:
        ignore_excluded = self.ignore_excluded

    if alignment_mode is None:
        alignment_mode = self.alignment_mode

    if isinstance(patterns, str):
        # A bare string would otherwise be iterated character by
        # character, registering one regex per character.
        patterns = [patterns]

    patterns = [re.compile(pattern) for pattern in patterns]

    self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))

remove(key)

Remove a pattern from the registry.

PARAMETER DESCRIPTION
key

key of the pattern to remove.

TYPE: str

RAISES DESCRIPTION
ValueError

If the key is not present in the registered patterns.

Source code in edsnlp/matchers/regex.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def remove(
    self,
    key: str,
):
    """
    Drop every registry entry stored under a key.

    Parameters
    ----------
    key : str
        key of the pattern to remove.

    Raises
    ------
    ValueError
        If the key is not present in the registered patterns.
    """
    size_before = len(self.regex)
    kept = [
        (label, compiled, attr, skip_excluded, mode)
        for label, compiled, attr, skip_excluded, mode in self.regex
        if label != key
    ]
    # The registry is replaced before the check, even when nothing changed.
    self.regex = kept
    if len(kept) == size_before:
        raise ValueError(f"`{key}` is not referenced in the matcher")

match(doclike)

Iterates on the matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object to match on.

TYPE: Union[Doc, Span]

YIELDS DESCRIPTION
span

A match.

Source code in edsnlp/matchers/regex.py
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
def match(
    self,
    doclike: Union[Doc, Span],
) -> Tuple[Span, re.Match]:
    """
    Generate every regex match found in a Doc-like object.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object to match on.

    Yields
    ------
    span:
        The span built by `create_span` for a match.
    match:
        The `re.Match` object the span was built from.
    """
    for entry in self.regex:
        key, patterns, attr, ignore_excluded, alignment_mode = entry

        # The text is fetched per entry because it depends on the
        # attribute and exclusion settings registered with that entry.
        text = get_text(doclike, attr, ignore_excluded)

        # Arguments shared by every span created for this entry.
        shared = dict(
            doclike=doclike,
            key=key,
            attr=attr,
            alignment_mode=alignment_mode,
            ignore_excluded=ignore_excluded,
        )

        for pattern in patterns:
            for match in pattern.finditer(text):
                logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                span = create_span(
                    start_char=match.start(),
                    end_char=match.end(),
                    **shared,
                )

                # Matches for which no span was created are dropped.
                if span is not None:
                    yield span, match

__call__(doclike, as_spans=False, return_groupdict=False)

Performs matching. Yields matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object.

TYPE: Union[Doc, Span]

as_spans

Whether to return matches as spans. When false, each match is returned as a (label, start, end) tuple.

DEFAULT: False

YIELDS DESCRIPTION
span

A match.

groupdict

Additional information coming from the named patterns in the regular expression.

Source code in edsnlp/matchers/regex.py
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
    return_groupdict=False,
) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
    """
    Run the matcher on a Doc-like object and yield its matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Returns matches as spans. When false, each match is yielded as
        a `(label, start, end)` tuple whose token indices are relative
        to the first token of `doclike`.
    return_groupdict:
        Whether to also yield the `groupdict()` of the regex match.

    Yields
    ------
    span:
        A match.
    groupdict:
        Additional information coming from the named patterns
        in the regular expression.
        Only yielded when `return_groupdict` is true.
    """
    for found, regex_match in self.match(doclike):
        result = found
        if not as_spans:
            # Shift token indices so they are relative to `doclike`.
            shift = doclike[0].i
            result = (found.label, found.start - shift, found.end - shift)

        yield (result, regex_match.groupdict()) if return_groupdict else result

create_span(doclike, start_char, end_char, key, attr, alignment_mode, ignore_excluded)

spaCy only allows strict alignment mode for char_span on Spans. This method circumvents this.

PARAMETER DESCRIPTION
doclike

Doc or Span.

TYPE: Union[Doc, Span]

start_char

Character index within the Doc-like object.

TYPE: int

end_char

Character index of the end, within the Doc-like object.

TYPE: int

key

The key used to match.

TYPE: str

alignment_mode

The alignment mode.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

RETURNS DESCRIPTION
span

A span matched on the Doc-like object.

Source code in edsnlp/matchers/regex.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def create_span(
    doclike: Union[Doc, Span],
    start_char: int,
    end_char: int,
    key: str,
    attr: str,
    alignment_mode: str,
    ignore_excluded: bool,
) -> Span:
    """
    Build a labelled span from character indices relative to a Doc-like.

    spaCy only allows strict alignment mode for char_span on Spans.
    This method circumvents this by always calling `char_span` on the
    underlying `Doc`.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        `Doc` or `Span`.
    start_char : int
        Character index within the Doc-like object.
    end_char : int
        Character index of the end, within the Doc-like object.
    key : str
        The key used to match. Used as the label of the span.
    attr : str
        The attribute that was matched on.
    alignment_mode : str
        The alignment mode.
    ignore_excluded : bool
        Whether to skip excluded tokens.

    Returns
    -------
    span:
        A span matched on the Doc-like object, exactly as returned by
        `Doc.char_span`. NOTE(review): `RegexMatcher.match` checks this
        result for `None`, so it is presumably `None` when the indices
        cannot be aligned - confirm against spaCy.
    """

    if isinstance(doclike, Doc):
        doc = doclike
    else:
        doc = doclike.doc

    # Handle the simple case immediately: only the character offset of
    # the first token of `doclike` has to be added.
    if attr in {"TEXT", "LOWER"} and not ignore_excluded:
        shift = doclike[0].idx
        return doc.char_span(
            start_char + shift,
            end_char + shift,
            label=key,
            alignment_mode=alignment_mode,
        )

    if not ignore_excluded:
        first = doclike[0].idx
    else:
        # If doclike is a Span, we need to get the clean
        # index of the first included token
        original, clean = alignment(
            doc=doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )
        first_included = get_first_included(doclike)
        first = clean[bisect_left(original, first_included.idx)]

    def _shifted(char_index: int) -> int:
        # Move the index to `first`, then add the `offset` correction
        # computed at that position.
        position = first + char_index
        return position + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=position,
        )

    start_char = _shifted(start_char)
    end_char = _shifted(end_char)

    return doc.char_span(
        start_char,
        end_char,
        label=key,
        alignment_mode=alignment_mode,
    )
Back to top