Skip to content

edsnlp.matchers.regex

RegexMatcher

Bases: object

Simple RegExp matcher.

PARAMETER DESCRIPTION
alignment_mode

How spans should be aligned with tokens. Possible values are strict (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to expand.

TYPE: str

attr

Default attribute to match on, by default "TEXT". Can be overridden in the add method.

TYPE: str

flags

Additional flags provided to the re module. Can be overridden in the add method.

TYPE: Union[re.RegexFlag, int]

ignore_excluded

Whether to skip exclusions

TYPE: bool

span_from_group

If set to False, will create spans based on the regex's full match. If set to True, will use the first matching capturing group as a span (and fall back to using the full match if no capturing group is matching)

TYPE: bool

Source code in edsnlp/matchers/regex.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
class RegexMatcher(object):
    """
    Simple RegExp matcher.

    Parameters
    ----------
    alignment_mode : str
        How spans should be aligned with tokens.
        Possible values are `strict` (character indices must be aligned
        with token boundaries), "contract" (span of all tokens completely
        within the character span), "expand" (span of all tokens at least
        partially covered by the character span).
        Defaults to `expand`.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.
    flags : Union[re.RegexFlag, int]
        Additional flags provided to the `re` module.
        Can be overridden in the `add` method.
    ignore_excluded : bool
        Whether to skip exclusions
    span_from_group : bool
        If set to `False`, will create spans based on the regex's full match.
        If set to `True`, will use the first matching capturing group as a span
        (and fall back to using the full match if no capturing group is matching)
    """

    def __init__(
        self,
        alignment_mode: str = "expand",
        attr: str = "TEXT",
        ignore_excluded: bool = False,
        flags: Union[re.RegexFlag, int] = 0,  # No additional flags
        span_from_group: bool = False,
    ):
        self.alignment_mode = alignment_mode
        # Each entry: (key, compiled patterns, attr, ignore_excluded, alignment_mode)
        self.regex = []

        self.default_attr = attr

        self.flags = flags
        self.span_from_group = span_from_group

        self.ignore_excluded = ignore_excluded

        self.set_extensions()

    @staticmethod
    def set_extensions():
        # Register the `normalized_variant` Span extension once (idempotent).
        if not Span.has_extension("normalized_variant"):
            Span.set_extension("normalized_variant", getter=get_normalized_variant)

    def build_patterns(self, regex: Patterns):
        """
        Build patterns and add them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        regex : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """
        if not regex:
            regex = dict()

        for key, patterns in regex.items():
            if isinstance(patterns, dict):
                attr = patterns.get("attr")
                alignment_mode = patterns.get("alignment_mode")
                flags = patterns.get("flags")
                patterns = patterns.get("regex")
            else:
                attr = None
                alignment_mode = None
                flags = None

            if isinstance(patterns, str):
                patterns = [patterns]

            self.add(
                key=key,
                patterns=patterns,
                attr=attr,
                alignment_mode=alignment_mode,
                flags=flags,
            )

    def add(
        self,
        key: str,
        patterns: List[str],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
        alignment_mode: Optional[str] = None,
        flags: Optional[re.RegexFlag] = None,
    ):
        """
        Add a pattern to the registry.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[str]
            List of patterns to add.
        attr : str, optional
            Attribute to use for matching.
            By default uses the `default_attr` attribute
        ignore_excluded : bool, optional
            Whether to skip excluded tokens during matching.
        alignment_mode : str, optional
            Overwrite alignment mode.
        flags : re.RegexFlag, optional
            Regex flags used to compile the patterns.
            By default uses the matcher's `flags`.
        """

        if attr is None:
            attr = self.default_attr

        if ignore_excluded is None:
            ignore_excluded = self.ignore_excluded

        if alignment_mode is None:
            alignment_mode = self.alignment_mode

        if flags is None:
            flags = self.flags

        patterns = [compile_regex(pattern, flags) for pattern in patterns]

        self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))

    def remove(
        self,
        key: str,
    ):
        """
        Remove a pattern from the registry.

        Parameters
        ----------
        key : str
            key of the pattern to remove.

        Raises
        ------
        ValueError
            If the key is not present in the registered patterns.
        """
        n = len(self.regex)
        self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
        if len(self.regex) == n:
            raise ValueError(f"`{key}` is not referenced in the matcher")

    def __len__(self):
        """Return the number of distinct keys registered in the matcher."""
        return len(set([regex[0] for regex in self.regex]))

    def match(
        self,
        doclike: Union[Doc, Span],
    ) -> Tuple[Span, re.Match]:
        """
        Iterates on the matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object to match on.

        Yields
        -------
        span:
            A match.
        """

        for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
            text = get_text(doclike, attr, ignore_excluded)

            for pattern in patterns:
                for match in pattern.finditer(text):
                    logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                    start_char, end_char = span_from_match(
                        match=match,
                        span_from_group=self.span_from_group,
                    )

                    span = create_span(
                        doclike=doclike,
                        start_char=start_char,
                        end_char=end_char,
                        key=key,
                        attr=attr,
                        alignment_mode=alignment_mode,
                        ignore_excluded=ignore_excluded,
                    )

                    # create_span may return None when the char span
                    # cannot be aligned on token boundaries.
                    if span is None:
                        continue

                    yield span, match

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
        return_groupdict=False,
    ) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Returns matches as spans.

        Yields
        ------
        span:
            A match.
        groupdict:
            Additional information coming from the named patterns
            in the regular expression.
        """
        for span, match in self.match(doclike):
            if not as_spans:
                # (label, start, end) token offsets relative to `doclike`,
                # mirroring spaCy's Matcher output format.
                offset = doclike[0].i
                span = (span.label, span.start - offset, span.end - offset)
            if return_groupdict:
                yield span, match.groupdict()
            else:
                yield span

alignment_mode = alignment_mode instance-attribute

regex = [] instance-attribute

default_attr = attr instance-attribute

flags = flags instance-attribute

span_from_group = span_from_group instance-attribute

ignore_excluded = ignore_excluded instance-attribute

__init__(alignment_mode='expand', attr='TEXT', ignore_excluded=False, flags=0, span_from_group=False)

Source code in edsnlp/matchers/regex.py
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def __init__(
    self,
    alignment_mode: str = "expand",
    attr: str = "TEXT",
    ignore_excluded: bool = False,
    flags: Union[re.RegexFlag, int] = 0,  # No additional flags
    span_from_group: bool = False,
):
    """Initialise the matcher and register the required Span extensions."""
    self.regex = []
    self.default_attr = attr
    self.alignment_mode = alignment_mode
    self.ignore_excluded = ignore_excluded
    self.flags = flags
    self.span_from_group = span_from_group
    self.set_extensions()

set_extensions()

Source code in edsnlp/matchers/regex.py
214
215
216
217
@staticmethod
def set_extensions():
    # Register the `normalized_variant` Span extension once; the getter
    # rebuilds the span text with excluded tokens removed.
    if not Span.has_extension("normalized_variant"):
        Span.set_extension("normalized_variant", getter=get_normalized_variant)

build_patterns(regex)

Build patterns and add them for matching. Helper function for pipelines using this matcher.

PARAMETER DESCRIPTION
regex

Dictionary of label/terms, or label/dictionary of terms/attribute.

TYPE: Patterns

Source code in edsnlp/matchers/regex.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
def build_patterns(self, regex: Patterns):
    """
    Build patterns and add them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    regex : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """
    regex = regex or dict()

    for key, patterns in regex.items():
        # Per-key overrides default to None so `add` falls back to
        # the matcher-level settings.
        attr = alignment_mode = flags = None
        if isinstance(patterns, dict):
            attr = patterns.get("attr")
            alignment_mode = patterns.get("alignment_mode")
            flags = patterns.get("flags")
            patterns = patterns.get("regex")

        if isinstance(patterns, str):
            patterns = [patterns]

        self.add(
            key=key,
            patterns=patterns,
            attr=attr,
            alignment_mode=alignment_mode,
            flags=flags,
        )

add(key, patterns, attr=None, ignore_excluded=None, alignment_mode=None, flags=None)

Add a pattern to the registry.

PARAMETER DESCRIPTION
key

Key of the new/updated pattern.

TYPE: str

patterns

List of patterns to add.

TYPE: List[str]

attr

Attribute to use for matching. By default uses the default_attr attribute

TYPE: str, optional DEFAULT: None

ignore_excluded

Whether to skip excluded tokens during matching.

TYPE: bool, optional DEFAULT: None

alignment_mode

Overwrite alignment mode.

TYPE: str, optional DEFAULT: None

Source code in edsnlp/matchers/regex.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
def add(
    self,
    key: str,
    patterns: List[str],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
    alignment_mode: Optional[str] = None,
    flags: Optional[re.RegexFlag] = None,
):
    """
    Add a pattern to the registry.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[str]
        List of patterns to add.
    attr : str, optional
        Attribute to use for matching.
        By default uses the `default_attr` attribute
    ignore_excluded : bool, optional
        Whether to skip excluded tokens during matching.
    alignment_mode : str, optional
        Overwrite alignment mode.
    flags : re.RegexFlag, optional
        Regex flags used to compile the patterns.
        By default uses the matcher's `flags`.
    """

    if attr is None:
        attr = self.default_attr

    if ignore_excluded is None:
        ignore_excluded = self.ignore_excluded

    if alignment_mode is None:
        alignment_mode = self.alignment_mode

    if flags is None:
        flags = self.flags

    patterns = [compile_regex(pattern, flags) for pattern in patterns]

    self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))

remove(key)

Remove a pattern from the registry.

PARAMETER DESCRIPTION
key

key of the pattern to remove.

TYPE: str

RAISES DESCRIPTION
ValueError

If the key is not present in the registered patterns.

Source code in edsnlp/matchers/regex.py
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
def remove(
    self,
    key: str,
):
    """
    Remove a pattern from the registry.

    Parameters
    ----------
    key : str
        key of the pattern to remove.

    Raises
    ------
    ValueError
        If the key is not present in the registered patterns.
    """
    before = len(self.regex)
    self.regex = [entry for entry in self.regex if entry[0] != key]
    # Nothing was filtered out: the key was never registered.
    if len(self.regex) == before:
        raise ValueError(f"`{key}` is not referenced in the matcher")

__len__()

Source code in edsnlp/matchers/regex.py
319
320
def __len__(self):
    """Return the number of distinct keys registered in the matcher."""
    return len({entry[0] for entry in self.regex})

match(doclike)

Iterates on the matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object to match on.

TYPE: Union[Doc, Span]

YIELDS DESCRIPTION
span

A match.

Source code in edsnlp/matchers/regex.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
def match(
    self,
    doclike: Union[Doc, Span],
) -> Tuple[Span, re.Match]:
    """
    Iterates on the matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object to match on.

    Yields
    -------
    span:
        A match.
    """

    for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
        # Extract the text once per key, honouring its attr/exclusion settings.
        text = get_text(doclike, attr, ignore_excluded)

        for pattern in patterns:
            for m in pattern.finditer(text):
                logger.trace(f"Matched a regex from {key}: {repr(m.group())}")

                start_char, end_char = span_from_match(
                    match=m,
                    span_from_group=self.span_from_group,
                )

                span = create_span(
                    doclike=doclike,
                    start_char=start_char,
                    end_char=end_char,
                    key=key,
                    attr=attr,
                    alignment_mode=alignment_mode,
                    ignore_excluded=ignore_excluded,
                )

                # create_span may return None when the char span cannot
                # be aligned on token boundaries.
                if span is not None:
                    yield span, m

__call__(doclike, as_spans=False, return_groupdict=False)

Performs matching. Yields matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object.

TYPE: Union[Doc, Span]

as_spans

Returns matches as spans.

DEFAULT: False

YIELDS DESCRIPTION
span

A match.

groupdict

Additional information coming from the named patterns in the regular expression.

Source code in edsnlp/matchers/regex.py
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
    return_groupdict=False,
) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
    """
    Performs matching. Yields matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Returns matches as spans.

    Yields
    ------
    span:
        A match.
    groupdict:
        Additional information coming from the named patterns
        in the regular expression.
    """
    for span, match in self.match(doclike):
        if as_spans:
            result = span
        else:
            # (label, start, end) token offsets relative to `doclike`,
            # mirroring spaCy's Matcher output format.
            shift = doclike[0].i
            result = (span.label, span.start - shift, span.end - shift)

        if return_groupdict:
            yield result, match.groupdict()
        else:
            yield result

get_first_included(doclike)

Source code in edsnlp/matchers/regex.py
13
14
15
16
17
18
@lru_cache(32)
def get_first_included(doclike: Union[Doc, Span]) -> Token:
    """
    Return the first token of `doclike` that is not tagged "EXCLUDED".

    Results are memoised (LRU cache, 32 entries), so repeated calls on
    the same Doc/Span are cheap.

    Raises
    ------
    IndexError
        If every token in the provided Doc-like object is excluded.
    """
    for token in doclike:
        if token.tag_ != "EXCLUDED":
            return token
    raise IndexError("The provided Span does not include any token")

get_normalized_variant(doclike)

Source code in edsnlp/matchers/regex.py
21
22
23
24
25
26
def get_normalized_variant(doclike) -> str:
    """
    Rebuild the text of `doclike` without excluded tokens, collapsing
    runs of whitespace to single spaces and stripping trailing spaces.
    """
    text = "".join(t.text + t.whitespace_ for t in doclike if not t._.excluded)
    return re.sub(r"\s+", " ", text.rstrip(" "))

spans_generator(match)

Iterates over every group, and then yields the full match

PARAMETER DESCRIPTION
match

A match object

TYPE: re.Match

YIELDS DESCRIPTION
Tuple[int, int]

A tuple containing the start and end of the group or match

Source code in edsnlp/matchers/regex.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def spans_generator(match: re.Match) -> Tuple[int, int]:
    """
    Iterates over every capturing group, and then yields the full match.

    Non-participating groups yield `(-1, -1)`, following `re`'s convention.

    Parameters
    ----------
    match : re.Match
        A match object

    Yields
    ------
    Tuple[int, int]
        A tuple containing the start and end of the group or match
    """
    n_groups = len(match.groups())
    for idx in range(1, n_groups + 1):
        yield match.span(idx)
    yield match.span(0)

span_from_match(match, span_from_group)

Return the span (as a (start, end) tuple) of the full match. If span_from_group=True, returns the span of the first matching capturing group instead, falling back to the full match when no capturing group is matching.

PARAMETER DESCRIPTION
match

The Match object

TYPE: re.Match

span_from_group

Whether to work on groups or on the full match

TYPE: bool

RETURNS DESCRIPTION
Tuple[int, int]

A tuple containing the start and end of the group or match

Source code in edsnlp/matchers/regex.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def span_from_match(
    match: re.Match,
    span_from_group: bool,
) -> Tuple[int, int]:
    """
    Return the span (as a (start, end) tuple) of the full match.
    If `span_from_group=True`, returns the span of the first matching
    capturing group instead, falling back to the full match when no
    capturing group participated in the match.

    Parameters
    ----------
    match : re.Match
        The Match object
    span_from_group : bool
        Whether to work on groups or on the full match

    Returns
    -------
    Tuple[int, int]
        A tuple containing the start and end of the group or match
    """
    if not span_from_group:
        start_char, end_char = match.start(), match.end()
    else:
        # spans_generator yields groups first, then the full match, so
        # the first non-negative span is the first matching group.
        start_char, end_char = next(filter(lambda x: x[0] >= 0, spans_generator(match)))
    return start_char, end_char

create_span(doclike, start_char, end_char, key, attr, alignment_mode, ignore_excluded)

spaCy only allows strict alignment mode for char_span on Spans. This method circumvents this.

PARAMETER DESCRIPTION
doclike

Doc or Span.

TYPE: Union[Doc, Span]

start_char

Character index within the Doc-like object.

TYPE: int

end_char

Character index of the end, within the Doc-like object.

TYPE: int

key

The key used to match.

TYPE: str

alignment_mode

The alignment mode.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Returns

span

A span matched on the Doc-like object.

Source code in edsnlp/matchers/regex.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def create_span(
    doclike: Union[Doc, Span],
    start_char: int,
    end_char: int,
    key: str,
    attr: str,
    alignment_mode: str,
    ignore_excluded: bool,
) -> Span:
    """
    spaCy only allows strict alignment mode for char_span on Spans.
    This method circumvents this.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        `Doc` or `Span`.
    start_char : int
        Character index within the Doc-like object.
    end_char : int
        Character index of the end, within the Doc-like object.
    key : str
        The key used to match.
    attr : str
        The attribute the match was performed on (compared against
        "TEXT"/"LOWER" for the fast path).
    alignment_mode : str
        The alignment mode.
    ignore_excluded : bool
        Whether to skip excluded tokens.

    Returns
    -------
    span:
        A span matched on the Doc-like object, or None if `char_span`
        could not align the characters on token boundaries.
    """

    doc = doclike if isinstance(doclike, Doc) else doclike.doc

    # Handle the simple case immediately: TEXT/LOWER keep character
    # offsets identical to the original text, so a plain shift by the
    # first token's index is enough.
    if attr in {"TEXT", "LOWER"} and not ignore_excluded:
        off = doclike[0].idx
        return doc.char_span(
            start_char + off,
            end_char + off,
            label=key,
            alignment_mode=alignment_mode,
        )

    # If doclike is a Span, we need to get the clean
    # index of the first included token
    if ignore_excluded:
        original, clean = alignment(
            doc=doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        first_included = get_first_included(doclike)
        # NOTE(review): assumes `original` is sorted by character index,
        # as required by bisect — confirm against `alignment`'s contract.
        i = bisect_left(original, first_included.idx)
        first = clean[i]

    else:
        first = doclike[0].idx

    # Translate clean-text character indices back to indices in the
    # original document via the `offset` helper, then build the span.
    start_char = (
        first
        + start_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + start_char,
        )
    )

    end_char = (
        first
        + end_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + end_char,
        )
    )

    span = doc.char_span(
        start_char,
        end_char,
        label=key,
        alignment_mode=alignment_mode,
    )

    return span