edsnlp.matchers

phrase

PatternDict = Dict[str, Union[str, Dict[str, str]]] module-attribute

EDSPhraseMatcher

Bases: object

PhraseMatcher that matches "over" excluded tokens, ie a pattern can still match when excluded (eg pollution) tokens are interleaved.

PARAMETERS
vocab : Vocab
    spaCy vocabulary to match on.
attr : str
    Default attribute to match on, by default "TEXT".
    Can be overridden in the add method.
    To match on a custom attribute, prepend the attribute name with _.
ignore_excluded : bool, optional
    Whether to ignore excluded tokens, by default True.
exclude_newlines : bool, optional
    Whether to exclude newlines, by default False.
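A minimal usage sketch (not part of the rendered docs): it assumes the excluded custom token extension is registered, which importing edsnlp normally takes care of.

import spacy

from edsnlp.matchers.phrase import EDSPhraseMatcher

nlp = spacy.blank("fr")
matcher = EDSPhraseMatcher(nlp.vocab, attr="LOWER")

# build_patterns tokenizes the expressions with nlp.pipe
matcher.build_patterns(nlp, {"fever": ["fièvre", "hyperthermie"]})

doc = nlp("Le patient présente une fièvre aiguë.")
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)  # fièvre fever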

Source code in edsnlp/matchers/phrase.py
class EDSPhraseMatcher(object):
    """
    PhraseMatcher that matches "over" excluded tokens.

    Parameters
    ----------
    vocab : Vocab
        spaCy vocabulary to match on.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.

        To match on a custom attribute, prepend the attribute name with `_`.
    ignore_excluded : bool, optional
        Whether to ignore excluded tokens, by default True
    exclude_newlines : bool, optional
        Whether to exclude new lines, by default False
    """

    def __init__(
        self,
        vocab: Vocab,
        attr: str = "TEXT",
        ignore_excluded: bool = True,
        exclude_newlines: bool = False,
    ):
        self.matcher = Matcher(vocab, validate=True)
        self.attr = attr
        self.ignore_excluded = ignore_excluded

        self.exclusion_attribute = (
            "excluded_or_space" if exclude_newlines else "excluded"
        )

    @staticmethod
    def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
        if custom_attr:
            return getattr(token._, attr)
        else:
            attr = ATTRIBUTES.get(attr)
            return getattr(token, attr)

    def create_pattern(
        self,
        match_pattern: Doc,
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> List[PatternDict]:
        """
        Create a pattern

        Parameters
        ----------
        match_pattern : Doc
            A spaCy Doc object to use as a match model.
        attr : str, optional
            Overwrite attribute to match on.
        ignore_excluded : bool, optional
            Whether to skip excluded tokens.

        Returns
        -------
        List[PatternDict]
            A spaCy rule-based pattern.
        """

        ignore_excluded = ignore_excluded or self.ignore_excluded

        attr = attr or self.attr
        custom_attr = attr.startswith("_")

        if custom_attr:
            attr = attr.lstrip("_").lower()

            pattern = []

            for token in match_pattern:
                pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
                if ignore_excluded and token.whitespace_:
                    # If the token is followed by a whitespace,
                    # we let it match on a pollution
                    pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

            return pattern
        else:
            pattern = []

            for token in match_pattern:
                pattern.append({attr: self.get_attr(token, attr, False)})
                if ignore_excluded and token.whitespace_:
                    # If the token is followed by a whitespace,
                    # we let it match on a pollution
                    pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

            return pattern

    def build_patterns(self, nlp: Language, terms: Patterns):
        """
        Build patterns and add them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        nlp : Language
            The instance of the spaCy language class.
        terms : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """

        if not terms:
            terms = dict()

        for key, expressions in terms.items():
            if isinstance(expressions, dict):
                attr = expressions.get("attr")
                expressions = expressions.get("patterns")
            else:
                attr = None
            if isinstance(expressions, str):
                expressions = [expressions]
            patterns = list(nlp.pipe(expressions))
            self.add(key, patterns, attr)

    def add(
        self,
        key: str,
        patterns: List[Doc],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> None:
        """
        Add a pattern.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[Doc]
            List of patterns to add.
        attr : str, optional
            Overwrite the attribute to match on for this specific pattern.
        ignore_excluded : bool, optional
            Overwrite the parameter for this specific pattern.
        """

        patterns = [
            self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
            for pattern in patterns
        ]
        self.matcher.add(key, patterns)

    def remove(
        self,
        key: str,
    ) -> None:
        """
        Remove a pattern.

        Parameters
        ----------
        key : str
            Key of the pattern to remove.

        Raises
        ------
        ValueError
            If the key is not present in the registry.
        """
        self.matcher.remove(key)

    def __len__(self):
        return len(self.matcher)

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
    ) -> Generator:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Whether to return matches as spans.

        Yields
        -------
        match: Span
            A match.
        """
        if len(self.matcher):
            for match in self.matcher(doclike, as_spans=as_spans):
                yield match
matcher = Matcher(vocab, validate=True) instance-attribute
attr = attr instance-attribute
ignore_excluded = ignore_excluded instance-attribute
exclusion_attribute = 'excluded_or_space' if exclude_newlines else 'excluded' instance-attribute
__init__(vocab, attr='TEXT', ignore_excluded=True, exclude_newlines=False)
Source code in edsnlp/matchers/phrase.py
def __init__(
    self,
    vocab: Vocab,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
    exclude_newlines: bool = False,
):
    self.matcher = Matcher(vocab, validate=True)
    self.attr = attr
    self.ignore_excluded = ignore_excluded

    self.exclusion_attribute = (
        "excluded_or_space" if exclude_newlines else "excluded"
    )
get_attr(token, attr, custom_attr=False)
Source code in edsnlp/matchers/phrase.py
@staticmethod
def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
    if custom_attr:
        return getattr(token._, attr)
    else:
        attr = ATTRIBUTES.get(attr)
        return getattr(token, attr)
create_pattern(match_pattern, attr=None, ignore_excluded=None)

Create a pattern

PARAMETERS
match_pattern : Doc
    A spaCy Doc object to use as a match model.
attr : str, optional
    Overwrite the attribute to match on. Defaults to None.
ignore_excluded : bool, optional
    Whether to skip excluded tokens. Defaults to None.

RETURNS
List[PatternDict]
    A spaCy rule-based pattern.
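For intuition, here is roughly what create_pattern produces for a two-token expression with attr="LOWER" and ignore_excluded=True (a sketch; the exact output depends on the tokenizer):

# Hypothetical result of matcher.create_pattern(nlp("fièvre aiguë"), attr="LOWER")
[
    {"LOWER": "fièvre"},
    {"_": {"excluded": True}, "OP": "*"},  # optional run of excluded tokens
    {"LOWER": "aiguë"},
]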

Source code in edsnlp/matchers/phrase.py
def create_pattern(
    self,
    match_pattern: Doc,
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> List[PatternDict]:
    """
    Create a pattern

    Parameters
    ----------
    match_pattern : Doc
        A spaCy Doc object to use as a match model.
    attr : str, optional
        Overwrite attribute to match on.
    ignore_excluded : bool, optional
        Whether to skip excluded tokens.

    Returns
    -------
    List[PatternDict]
        A spaCy rule-based pattern.
    """

    ignore_excluded = ignore_excluded or self.ignore_excluded

    attr = attr or self.attr
    custom_attr = attr.startswith("_")

    if custom_attr:
        attr = attr.lstrip("_").lower()

        pattern = []

        for token in match_pattern:
            pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
            if ignore_excluded and token.whitespace_:
                # If the token is followed by a whitespace,
                # we let it match on a pollution
                pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

        return pattern
    else:
        pattern = []

        for token in match_pattern:
            pattern.append({attr: self.get_attr(token, attr, False)})
            if ignore_excluded and token.whitespace_:
                # If the token is followed by a whitespace,
                # we let it match on a pollution
                pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

        return pattern
build_patterns(nlp, terms)

Build patterns and add them for matching. Helper function for pipelines using this matcher.

PARAMETERS
nlp : Language
    The instance of the spaCy language class.
terms : Patterns
    Dictionary of label/terms, or label/dictionary of terms/attribute.
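The terms mapping accepts several shapes, dispatched by the isinstance checks in the source below. An illustrative sketch:

terms = {
    "covid": ["covid", "coronavirus"],                  # label -> list of terms
    "dyspnea": "dyspnée",                               # label -> single term
    "fever": {"patterns": ["fièvre"], "attr": "NORM"},  # label -> dict overriding attr
}
matcher.build_patterns(nlp, terms)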

Source code in edsnlp/matchers/phrase.py
def build_patterns(self, nlp: Language, terms: Patterns):
    """
    Build patterns and add them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    nlp : Language
        The instance of the spaCy language class.
    terms : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """

    if not terms:
        terms = dict()

    for key, expressions in terms.items():
        if isinstance(expressions, dict):
            attr = expressions.get("attr")
            expressions = expressions.get("patterns")
        else:
            attr = None
        if isinstance(expressions, str):
            expressions = [expressions]
        patterns = list(nlp.pipe(expressions))
        self.add(key, patterns, attr)
add(key, patterns, attr=None, ignore_excluded=None)

Add a pattern.

PARAMETERS
key : str
    Key of the new/updated pattern.
patterns : List[Doc]
    List of patterns to add.
attr : str, optional
    Overwrite the attribute to match on for this specific pattern. Defaults to None.
ignore_excluded : bool, optional
    Overwrite the parameter for this specific pattern. Defaults to None.
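Since add expects pre-tokenized Doc objects, patterns are typically produced with nlp.pipe, as in this sketch:

patterns = list(nlp.pipe(["insuffisance cardiaque", "insuffisance respiratoire"]))
matcher.add("insufficiency", patterns, attr="LOWER")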

Source code in edsnlp/matchers/phrase.py
def add(
    self,
    key: str,
    patterns: List[Doc],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> None:
    """
    Add a pattern.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[Doc]
        List of patterns to add.
    attr : str, optional
        Overwrite the attribute to match on for this specific pattern.
    ignore_excluded : bool, optional
        Overwrite the parameter for this specific pattern.
    """

    patterns = [
        self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
        for pattern in patterns
    ]
    self.matcher.add(key, patterns)
remove(key)

Remove a pattern.

PARAMETERS
key : str
    Key of the pattern to remove.

RAISES
ValueError
    If the key is not present in the registry.

Source code in edsnlp/matchers/phrase.py
def remove(
    self,
    key: str,
) -> None:
    """
    Remove a pattern.

    Parameters
    ----------
    key : str
        Key of the pattern to remove.

    Raises
    ------
    ValueError
        If the key is not present in the registry.
    """
    self.matcher.remove(key)
__len__()
Source code in edsnlp/matchers/phrase.py
def __len__(self):
    return len(self.matcher)
__call__(doclike, as_spans=False)

Performs matching. Yields matches.

PARAMETERS
doclike : Union[Doc, Span]
    spaCy Doc or Span object.
as_spans : bool
    Whether to return matches as spans. Defaults to False.

YIELDS
match : Span
    A match.

Source code in edsnlp/matchers/phrase.py
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
) -> Generator:
    """
    Performs matching. Yields matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Whether to return matches as spans.

    Yields
    -------
    match: Span
        A match.
    """
    if len(self.matcher):
        for match in self.matcher(doclike, as_spans=as_spans):
            yield match

get_normalized_variant(doclike)

Source code in edsnlp/matchers/phrase.py
def get_normalized_variant(doclike: Union[Span, Doc]) -> str:
    tokens = [t.text + t.whitespace_ for t in doclike if not t._.excluded]
    variant = "".join(tokens)
    variant = variant.rstrip(" ")
    variant = re.sub(r"\s+", " ", variant)
    return variant
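A sketch of the intended behaviour, assuming a document in which the middle pollution token has been marked as excluded (eg by the eds.normalizer pipeline):

# doc = nlp("fièvre NBNbWbWbNbWbNB aiguë"), middle token excluded
get_normalized_variant(doc)  # -> "fièvre aiguë"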

phrase_matcher_factory(attr, ignore_excluded, exclude_newlines)

Source code in edsnlp/matchers/phrase.py
@registry.misc("edsnlp.factories.phrasematcher.v1")
def phrase_matcher_factory(
    attr: str,
    ignore_excluded: bool,
    exclude_newlines: bool,
):
    return partial(
        EDSPhraseMatcher,
        attr=attr,
        ignore_excluded=ignore_excluded,
        exclude_newlines=exclude_newlines,
    )
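The factory lives in spaCy's misc registry, so configurations can reference it by name; calling the registered function returns a partial that only needs the vocabulary. A hedged sketch, assuming edsnlp.matchers.phrase has been imported so the factory is registered:

from spacy.util import registry

make_matcher = registry.misc.get("edsnlp.factories.phrasematcher.v1")(
    attr="NORM",
    ignore_excluded=True,
    exclude_newlines=False,
)
matcher = make_matcher(vocab=nlp.vocab)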

regex

RegexMatcher

Bases: object

Simple RegExp matcher.

PARAMETERS
alignment_mode : str
    How spans should be aligned with tokens. Possible values are "strict"
    (character indices must be aligned with token boundaries), "contract"
    (span of all tokens completely within the character span) and "expand"
    (span of all tokens at least partially covered by the character span).
    Defaults to "expand".
attr : str
    Default attribute to match on, by default "TEXT".
    Can be overridden in the add method.
ignore_excluded : bool
    Whether to skip exclusions.
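A minimal usage sketch:

import spacy

from edsnlp.matchers.regex import RegexMatcher

nlp = spacy.blank("fr")
matcher = RegexMatcher(attr="TEXT", alignment_mode="expand")
matcher.add("date", [r"\d{2}/\d{2}/\d{4}"])

doc = nlp("Consultation du 03/09/2021")
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)  # 03/09/2021 date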

Source code in edsnlp/matchers/regex.py
class RegexMatcher(object):
    """
    Simple RegExp matcher.

    Parameters
    ----------
    alignment_mode : str
        How spans should be aligned with tokens.
        Possible values are `strict` (character indices must be aligned
        with token boundaries), `contract` (span of all tokens completely
        within the character span) and `expand` (span of all tokens at least
        partially covered by the character span).
        Defaults to `expand`.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.
    ignore_excluded : bool
        Whether to skip exclusions.
    """

    def __init__(
        self,
        alignment_mode: str = "expand",
        attr: str = "TEXT",
        ignore_excluded: bool = False,
    ):
        self.alignment_mode = alignment_mode
        self.regex = []

        self.default_attr = attr

        self.ignore_excluded = ignore_excluded

    def build_patterns(self, regex: Patterns):
        """
        Build patterns and add them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        regex : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """
        if not regex:
            regex = dict()

        for key, patterns in regex.items():
            if isinstance(patterns, dict):
                attr = patterns.get("attr")
                alignment_mode = patterns.get("alignment_mode")
                patterns = patterns.get("regex")
            else:
                attr = None
                alignment_mode = None

            if isinstance(patterns, str):
                patterns = [patterns]

            self.add(
                key=key, patterns=patterns, attr=attr, alignment_mode=alignment_mode
            )

    def add(
        self,
        key: str,
        patterns: List[str],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
        alignment_mode: Optional[str] = None,
    ):
        """
        Add a pattern to the registry.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[str]
            List of patterns to add.
        attr : str, optional
            Attribute to use for matching.
            By default, uses the `default_attr` attribute.
        ignore_excluded : bool, optional
            Whether to skip excluded tokens during matching.
        alignment_mode : str, optional
            Overwrite alignment mode.
        """

        if attr is None:
            attr = self.default_attr

        if ignore_excluded is None:
            ignore_excluded = self.ignore_excluded

        if alignment_mode is None:
            alignment_mode = self.alignment_mode

        patterns = [compile_regex(pattern) for pattern in patterns]

        self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))

    def remove(
        self,
        key: str,
    ):
        """
        Remove a pattern from the registry.

        Parameters
        ----------
        key : str
            Key of the pattern to remove.

        Raises
        ------
        ValueError
            If the key is not present in the registered patterns.
        """
        n = len(self.regex)
        self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
        if len(self.regex) == n:
            raise ValueError(f"`{key}` is not referenced in the matcher")

    def __len__(self):
        return len(set([regex[0] for regex in self.regex]))

    def match(
        self,
        doclike: Union[Doc, Span],
    ) -> Tuple[Span, re.Match]:
        """
        Iterates over the matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object to match on.

        Yields
        -------
        span:
            A match.
        """

        for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
            text = get_text(doclike, attr, ignore_excluded)

            for pattern in patterns:
                for match in pattern.finditer(text):
                    logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                    span = create_span(
                        doclike=doclike,
                        start_char=match.start(),
                        end_char=match.end(),
                        key=key,
                        attr=attr,
                        alignment_mode=alignment_mode,
                        ignore_excluded=ignore_excluded,
                    )

                    if span is None:
                        continue

                    yield span, match

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
        return_groupdict=False,
    ) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Returns matches as spans.

        Yields
        ------
        span:
            A match.
        groupdict:
            Additional information coming from the named patterns
            in the regular expression.
        """
        for span, match in self.match(doclike):
            if not as_spans:
                offset = doclike[0].i
                span = (span.label, span.start - offset, span.end - offset)
            if return_groupdict:
                yield span, match.groupdict()
            else:
                yield span
alignment_mode = alignment_mode instance-attribute
regex = [] instance-attribute
default_attr = attr instance-attribute
ignore_excluded = ignore_excluded instance-attribute
__init__(alignment_mode='expand', attr='TEXT', ignore_excluded=False)
Source code in edsnlp/matchers/regex.py
def __init__(
    self,
    alignment_mode: str = "expand",
    attr: str = "TEXT",
    ignore_excluded: bool = False,
):
    self.alignment_mode = alignment_mode
    self.regex = []

    self.default_attr = attr

    self.ignore_excluded = ignore_excluded
build_patterns(regex)

Build patterns and add them for matching. Helper function for pipelines using this matcher.

PARAMETERS
regex : Patterns
    Dictionary of label/terms, or label/dictionary of terms/attribute.

Source code in edsnlp/matchers/regex.py
def build_patterns(self, regex: Patterns):
    """
    Build patterns and add them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    regex : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """
    if not regex:
        regex = dict()

    for key, patterns in regex.items():
        if isinstance(patterns, dict):
            attr = patterns.get("attr")
            alignment_mode = patterns.get("alignment_mode")
            patterns = patterns.get("regex")
        else:
            attr = None
            alignment_mode = None

        if isinstance(patterns, str):
            patterns = [patterns]

        self.add(
            key=key, patterns=patterns, attr=attr, alignment_mode=alignment_mode
        )
add(key, patterns, attr=None, ignore_excluded=None, alignment_mode=None)

Add a pattern to the registry.

PARAMETERS
key : str
    Key of the new/updated pattern.
patterns : List[str]
    List of patterns to add.
attr : str, optional
    Attribute to use for matching. By default, uses the default_attr attribute.
ignore_excluded : bool, optional
    Whether to skip excluded tokens during matching.
alignment_mode : str, optional
    Overwrite the alignment mode.
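A sketch using named groups (the "weight" label and regex are invented for illustration); it pairs with the return_groupdict example under __call__ below:

matcher.add(
    "weight",
    [r"(?P<value>\d+(?:[.,]\d+)?)\s*(?P<unit>kg|g)"],
    alignment_mode="expand",
)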

Source code in edsnlp/matchers/regex.py
def add(
    self,
    key: str,
    patterns: List[str],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
    alignment_mode: Optional[str] = None,
):
    """
    Add a pattern to the registry.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[str]
        List of patterns to add.
    attr : str, optional
        Attribute to use for matching.
        By default, uses the `default_attr` attribute.
    ignore_excluded : bool, optional
        Whether to skip excluded tokens during matching.
    alignment_mode : str, optional
        Overwrite alignment mode.
    """

    if attr is None:
        attr = self.default_attr

    if ignore_excluded is None:
        ignore_excluded = self.ignore_excluded

    if alignment_mode is None:
        alignment_mode = self.alignment_mode

    patterns = [compile_regex(pattern) for pattern in patterns]

    self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))
remove(key)

Remove a pattern from the registry.

PARAMETERS
key : str
    Key of the pattern to remove.

RAISES
ValueError
    If the key is not present in the registered patterns.

Source code in edsnlp/matchers/regex.py
def remove(
    self,
    key: str,
):
    """
    Remove a pattern from the registry.

    Parameters
    ----------
    key : str
        Key of the pattern to remove.

    Raises
    ------
    ValueError
        If the key is not present in the registered patterns.
    """
    n = len(self.regex)
    self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
    if len(self.regex) == n:
        raise ValueError(f"`{key}` is not referenced in the matcher")
__len__()
Source code in edsnlp/matchers/regex.py
def __len__(self):
    return len(set([regex[0] for regex in self.regex]))
match(doclike)

Iterates over the matches.

PARAMETERS
doclike : Union[Doc, Span]
    spaCy Doc or Span object to match on.

YIELDS
span : Tuple[Span, re.Match]
    A match, together with the underlying re.Match object.

Source code in edsnlp/matchers/regex.py
def match(
    self,
    doclike: Union[Doc, Span],
) -> Tuple[Span, re.Match]:
    """
    Iterates over the matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object to match on.

    Yields
    -------
    span:
        A match.
    """

    for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
        text = get_text(doclike, attr, ignore_excluded)

        for pattern in patterns:
            for match in pattern.finditer(text):
                logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                span = create_span(
                    doclike=doclike,
                    start_char=match.start(),
                    end_char=match.end(),
                    key=key,
                    attr=attr,
                    alignment_mode=alignment_mode,
                    ignore_excluded=ignore_excluded,
                )

                if span is None:
                    continue

                yield span, match
__call__(doclike, as_spans=False, return_groupdict=False)

Performs matching. Yields matches.

PARAMETERS
doclike : Union[Doc, Span]
    spaCy Doc or Span object.
as_spans : bool
    Whether to return matches as spans. Defaults to False.
return_groupdict : bool
    Whether to also yield the groupdict of the named patterns. Defaults to False.

YIELDS
span : Span
    A match.
groupdict : Dict[str, Any]
    Additional information coming from the named patterns in the regular expression.
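Continuing the named-group sketch from add above:

doc = nlp("Poids : 72,5 kg")
for span, groupdict in matcher(doc, as_spans=True, return_groupdict=True):
    print(span.text, groupdict)  # 72,5 kg {'value': '72,5', 'unit': 'kg'}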

Source code in edsnlp/matchers/regex.py
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
    return_groupdict=False,
) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
    """
    Performs matching. Yields matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Returns matches as spans.

    Yields
    ------
    span:
        A match.
    groupdict:
        Additional information coming from the named patterns
        in the regular expression.
    """
    for span, match in self.match(doclike):
        if not as_spans:
            offset = doclike[0].i
            span = (span.label, span.start - offset, span.end - offset)
        if return_groupdict:
            yield span, match.groupdict()
        else:
            yield span

get_first_included(doclike)

Source code in edsnlp/matchers/regex.py
@lru_cache(32)
def get_first_included(doclike: Union[Doc, Span]) -> Token:
    for token in doclike:
        if not token._.excluded:
            return token
    raise IndexError("The provided Span does not include any token")

create_span(doclike, start_char, end_char, key, attr, alignment_mode, ignore_excluded)

spaCy only allows the strict alignment mode when calling char_span on a Span. This function works around that limitation.

PARAMETERS
doclike : Union[Doc, Span]
    Doc or Span.
start_char : int
    Character index within the Doc-like object.
end_char : int
    Character index of the end, within the Doc-like object.
key : str
    The key used to match.
attr : str
    The attribute used for matching.
alignment_mode : str
    The alignment mode.
ignore_excluded : bool
    Whether to skip excluded tokens.

RETURNS
span : Span
    A span matched on the Doc-like object.
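The alignment_mode values follow spaCy's Doc.char_span; create_span extends them to Span inputs and to normalized or exclusion-filtered text. A quick illustration with plain spaCy:

import spacy

nlp = spacy.blank("fr")
doc = nlp("pneumopathie interstitielle")

# "expand" keeps any token partially covered by the character span
print(doc.char_span(0, 6, alignment_mode="expand"))    # pneumopathie
# "contract" keeps only tokens fully inside the span: here, none
print(doc.char_span(0, 6, alignment_mode="contract"))  # None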

Source code in edsnlp/matchers/regex.py
def create_span(
    doclike: Union[Doc, Span],
    start_char: int,
    end_char: int,
    key: str,
    attr: str,
    alignment_mode: str,
    ignore_excluded: bool,
) -> Span:
    """
    spaCy only allows the strict alignment mode when calling `char_span` on a `Span`.
    This function works around that limitation.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        `Doc` or `Span`.
    start_char : int
        Character index within the Doc-like object.
    end_char : int
        Character index of the end, within the Doc-like object.
    key : str
        The key used to match.
    attr : str
        The attribute used for matching.
    alignment_mode : str
        The alignment mode.
    ignore_excluded : bool
        Whether to skip excluded tokens.

    Returns
    -------
    span:
        A span matched on the Doc-like object.
    """

    doc = doclike if isinstance(doclike, Doc) else doclike.doc

    # Handle the simple case immediately
    if attr in {"TEXT", "LOWER"} and not ignore_excluded:
        off = doclike[0].idx
        return doc.char_span(
            start_char + off,
            end_char + off,
            label=key,
            alignment_mode=alignment_mode,
        )

    # If doclike is a Span, we need to get the clean
    # index of the first included token
    if ignore_excluded:
        original, clean = alignment(
            doc=doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        first_included = get_first_included(doclike)
        i = bisect_left(original, first_included.idx)
        first = clean[i]

    else:
        first = doclike[0].idx

    start_char = (
        first
        + start_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + start_char,
        )
    )

    end_char = (
        first
        + end_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + end_char,
        )
    )

    span = doc.char_span(
        start_char,
        end_char,
        label=key,
        alignment_mode=alignment_mode,
    )

    return span

utils

ListOrStr = Union[List[str], str] module-attribute

DictOrPattern = Union[Dict[str, ListOrStr], ListOrStr] module-attribute

Patterns = Dict[str, DictOrPattern] module-attribute

ATTRIBUTES = {'LOWER': 'lower_', 'TEXT': 'text', 'NORM': 'norm_', 'SHAPE': 'shape_'} module-attribute

offset

token_length(token, custom, attr)
Source code in edsnlp/matchers/utils/offset.py
def token_length(token: Token, custom: bool, attr: str):
    if custom:
        text = getattr(token._, attr)
    else:
        text = getattr(token, attr)
    return len(text)
alignment(doc, attr='TEXT', ignore_excluded=True)

Align different representations of a Doc object.

PARAMETERS
doc : Doc
    spaCy Doc object.
attr : str, optional
    Attribute to use, by default "TEXT".
ignore_excluded : bool, optional
    Whether to remove excluded tokens, by default True.

RETURNS
Tuple[List[int], List[int]]
    An alignment tuple: original and clean lists.
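A worked sketch, assuming a doc whose middle pollution token has been marked as excluded:

# doc.text == "fièvre NBNbWbWb aiguë", middle token excluded
original, clean = alignment(doc, attr="TEXT", ignore_excluded=True)
# original == [0, 16]  character offsets of the kept tokens in doc.text
# clean    == [0, 7]   offsets of the same tokens in the cleaned text "fièvre aiguë"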

Source code in edsnlp/matchers/utils/offset.py
@lru_cache(maxsize=32)
def alignment(
    doc: Doc,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
) -> Tuple[List[int], List[int]]:
    """
    Align different representations of a `Doc` object.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object
    attr : str, optional
        Attribute to use, by default `"TEXT"`
    ignore_excluded : bool, optional
        Whether to remove excluded tokens, by default True

    Returns
    -------
    Tuple[List[int], List[int]]
        An alignment tuple: original and clean lists.
    """
    assert isinstance(doc, Doc)

    attr = attr.upper()
    attr = ATTRIBUTES.get(attr, attr)

    custom = attr.startswith("_")

    if custom:
        attr = attr[1:].lower()

    # Define the length function
    length = partial(token_length, custom=custom, attr=attr)

    original = []
    clean = []

    cursor = 0

    for token in doc:

        if not ignore_excluded or not token._.excluded:

            # The token is not excluded, we add its extremities to the list
            original.append(token.idx)

            # We add the cursor
            clean.append(cursor)
            cursor += length(token)

            if token.whitespace_:
                cursor += 1

    return original, clean
offset(doc, attr, ignore_excluded, index)

Compute offset between the original text and a given representation (defined by the couple attr, ignore_excluded).

The alignment itself is computed with alignment.

PARAMETERS
doc : Doc
    The spaCy Doc object.
attr : str
    The attribute used by the RegexMatcher (eg NORM).
ignore_excluded : bool
    Whether the RegexMatcher ignores excluded tokens.
index : int
    The index in the pre-processed text.

RETURNS
int
    The offset. To get the character index in the original document, compute:
    original = index + offset(doc, attr, ignore_excluded, index)
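Continuing the alignment sketch above: a match starting at index 7 of the cleaned text ("aiguë") maps back to the raw text as follows:

i = 7  # start of "aiguë" in the cleaned text
i + offset(doc, attr="TEXT", ignore_excluded=True, index=i)  # -> 16, its index in doc.text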

Source code in edsnlp/matchers/utils/offset.py
def offset(
    doc: Doc,
    attr: str,
    ignore_excluded: bool,
    index: int,
) -> int:
    """
    Compute offset between the original text and a given representation
    (defined by the couple `attr`, `ignore_excluded`).

    The alignment itself is computed with
    [`alignment`][edsnlp.matchers.utils.offset.alignment].

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object
    attr : str
        The attribute used by the [`RegexMatcher`][edsnlp.matchers.regex.RegexMatcher]
        (eg `NORM`)
    ignore_excluded : bool
        Whether the RegexMatcher ignores excluded tokens.
    index : int
        The index in the pre-processed text.

    Returns
    -------
    int
        The offset. To get the character index in the original document,
        just do: `#!python original = index + offset(doc, attr, ignore_excluded, index)`
    """
    original, clean = alignment(
        doc=doc,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    # We use bisect to efficiently find the correct rightmost-lower index
    i = bisect_left(clean, index)
    i = min(i, len(original) - 1)

    return original[i] - clean[i]

text

get_text(doclike, attr, ignore_excluded)

Get text using a custom attribute, possibly ignoring excluded tokens.

PARAMETERS
doclike : Union[Doc, Span]
    Doc or Span to get text from.
attr : str
    Attribute to use.
ignore_excluded : bool
    Whether to skip excluded tokens.

RETURNS
str
    Extracted text.
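With the same sketch document as in the alignment example (middle pollution token excluded):

get_text(doc, attr="LOWER", ignore_excluded=True)  # -> "fièvre aiguë"
get_text(doc, attr="TEXT", ignore_excluded=False)  # -> "fièvre NBNbWbWb aiguë"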

Source code in edsnlp/matchers/utils/text.py
@lru_cache(32)
def get_text(
    doclike: Union[Doc, Span],
    attr: str,
    ignore_excluded: bool,
) -> str:
    """
    Get text using a custom attribute, possibly ignoring excluded tokens.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        Doc or Span to get text from.
    attr : str
        Attribute to use.
    ignore_excluded : bool
        Whether to skip excluded tokens.

    Returns
    -------
    str
        Extracted text.
    """

    attr = attr.upper()

    if not ignore_excluded:
        if attr == "TEXT":
            return doclike.text
        elif attr == "LOWER":
            return doclike.text.lower()
        else:
            tokens = doclike
    else:
        tokens = [t for t in doclike if not t._.excluded]

    attr = ATTRIBUTES.get(attr, attr)

    if attr.startswith("_"):
        attr = attr[1:].lower()
        return "".join([getattr(t._, attr) + t.whitespace_ for t in tokens])
    else:
        return "".join([getattr(t, attr) + t.whitespace_ for t in tokens])