Skip to content

edsnlp.matchers.phrase

PatternDict = Dict[str, Union[str, Dict[str, str]]] module-attribute

EDSPhraseMatcher

Bases: object

PhraseMatcher that matches "over" excluded tokens.

PARAMETER DESCRIPTION
vocab

spaCy vocabulary to match on.

TYPE: Vocab

attr

Default attribute to match on, by default "TEXT". Can be overridden in the add method.

To match on a custom attribute, prepend the attribute name with _.

TYPE: str

ignore_excluded

Whether to ignore excluded tokens, by default True

TYPE: bool, optional

exclude_newlines

Whether to exclude new lines, by default False

TYPE: bool, optional

Source code in edsnlp/matchers/phrase.py
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
class EDSPhraseMatcher(object):
    """
    PhraseMatcher that matches "over" excluded tokens.

    Parameters
    ----------
    vocab : Vocab
        spaCy vocabulary to match on.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.

        To match on a custom attribute, prepend the attribute name with `_`.
    ignore_excluded : bool, optional
        Whether to ignore excluded tokens, by default True
    exclude_newlines : bool, optional
        Whether to exclude new lines, by default False
    """

    def __init__(
        self,
        vocab: Vocab,
        attr: str = "TEXT",
        ignore_excluded: bool = True,
        exclude_newlines: bool = False,
    ):
        self.matcher = Matcher(vocab, validate=True)
        self.attr = attr
        self.ignore_excluded = ignore_excluded

        # When newlines must be skipped as well, match on the broader
        # `excluded_or_space` custom attribute instead of `excluded`.
        self.exclusion_attribute = (
            "excluded_or_space" if exclude_newlines else "excluded"
        )

    @staticmethod
    def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
        """
        Return the value of `attr` on `token`.

        Custom attributes are read from the `token._` extension namespace;
        standard ones are first mapped through the `ATTRIBUTES` table to the
        corresponding spaCy token property name.
        """
        if custom_attr:
            return getattr(token._, attr)
        else:
            attr = ATTRIBUTES.get(attr)
            return getattr(token, attr)

    def create_pattern(
        self,
        match_pattern: Doc,
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> List[PatternDict]:
        """
        Create a pattern

        Parameters
        ----------
        match_pattern : Doc
            A spaCy doc object, to use as match model.
        attr : str, optional
            Overwrite attribute to match on.
        ignore_excluded : bool, optional
            Whether to skip excluded tokens.

        Returns
        -------
        List[PatternDict]
            A spaCy rule-based pattern.
        """

        # Fall back to the instance default only when the caller did not
        # provide a value. The previous `ignore_excluded or
        # self.ignore_excluded` expression silently discarded an explicit
        # `ignore_excluded=False` whenever the instance default was True.
        if ignore_excluded is None:
            ignore_excluded = self.ignore_excluded

        attr = attr or self.attr
        custom_attr = attr.startswith("_")

        if custom_attr:
            # Custom extension attributes are registered lower-case and
            # without the leading underscore.
            attr = attr.lstrip("_").lower()

        pattern = []

        for token in match_pattern:
            if custom_attr:
                pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
            else:
                pattern.append({attr: self.get_attr(token, attr, False)})

            if ignore_excluded and token.whitespace_:
                # If the token is followed by a whitespace,
                # we let it match on a pollution
                pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

        return pattern

    def build_patterns(self, nlp: Language, terms: Patterns):
        """
        Build patterns and adds them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        nlp : Language
            The instance of the spaCy language class.
        terms : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """

        if not terms:
            terms = dict()

        for key, expressions in terms.items():
            if isinstance(expressions, dict):
                # Nested form: {"attr": ..., "patterns": [...]}
                attr = expressions.get("attr")
                expressions = expressions.get("patterns")
            else:
                attr = None
            if isinstance(expressions, str):
                expressions = [expressions]
            # Tokenize every expression with the pipeline's tokenizer.
            patterns = list(nlp.pipe(expressions))
            self.add(key, patterns, attr)

    def add(
        self,
        key: str,
        patterns: List[Doc],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> None:
        """
        Add a pattern.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[Doc]
            List of patterns to add.
        attr : str, optional
            Overwrite the attribute to match on for this specific pattern.
        ignore_excluded : bool, optional
            Overwrite the parameter for this specific pattern.
        """

        patterns = [
            self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
            for pattern in patterns
        ]
        self.matcher.add(key, patterns)

    def remove(
        self,
        key: str,
    ) -> None:
        """
        Remove a pattern.

        Parameters
        ----------
        key : str
            key of the pattern to remove.

        Raises
        ------
        ValueError
            Should the key not be contained in the registry.
        """
        self.matcher.remove(key)

    def __len__(self):
        # Number of rules registered in the underlying matcher.
        return len(self.matcher)

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
    ) -> Generator:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Whether to return matches as spans.

        Yields
        -------
        match: Span
            A match.
        """
        # Guard: calling an empty spaCy Matcher would be wasteful; with no
        # rules there is nothing to yield.
        if len(self.matcher):
            for match in self.matcher(doclike, as_spans=as_spans):
                yield match

matcher = Matcher(vocab, validate=True) instance-attribute

attr = attr instance-attribute

ignore_excluded = ignore_excluded instance-attribute

exclusion_attribute = 'excluded_or_space' if exclude_newlines else 'excluded' instance-attribute

__init__(vocab, attr='TEXT', ignore_excluded=True, exclude_newlines=False)

Source code in edsnlp/matchers/phrase.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def __init__(
    self,
    vocab: Vocab,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
    exclude_newlines: bool = False,
):
    """Store matching defaults and wrap a validating spaCy `Matcher`."""
    self.matcher = Matcher(vocab, validate=True)
    self.attr = attr
    self.ignore_excluded = ignore_excluded

    # Pick the token flag used to skip pollution; the broader flag also
    # covers whitespace/newline tokens.
    if exclude_newlines:
        self.exclusion_attribute = "excluded_or_space"
    else:
        self.exclusion_attribute = "excluded"

get_attr(token, attr, custom_attr=False)

Source code in edsnlp/matchers/phrase.py
75
76
77
78
79
80
81
@staticmethod
def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
    """Read the matching value of `attr` from `token`.

    Custom attributes come from the `token._` extension namespace; standard
    ones are translated through `ATTRIBUTES` to a spaCy token property.
    """
    if not custom_attr:
        return getattr(token, ATTRIBUTES.get(attr))
    return getattr(token._, attr)

create_pattern(match_pattern, attr=None, ignore_excluded=None)

Create a pattern

PARAMETER DESCRIPTION
match_pattern

A spaCy doc object, to use as match model.

TYPE: Doc

attr

Overwrite attribute to match on.

TYPE: str, optional DEFAULT: None

ignore_excluded

Whether to skip excluded tokens.

TYPE: Optional[bool] DEFAULT: None

RETURNS DESCRIPTION
List[PatternDict]

A spaCy rule-based pattern.

Source code in edsnlp/matchers/phrase.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def create_pattern(
    self,
    match_pattern: Doc,
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> List[PatternDict]:
    """
    Create a pattern

    Parameters
    ----------
    match_pattern : Doc
        A spaCy doc object, to use as match model.
    attr : str, optional
        Overwrite attribute to match on.
    ignore_excluded : bool, optional
        Whether to skip excluded tokens.

    Returns
    -------
    List[PatternDict]
        A spaCy rule-based pattern.
    """

    # Fall back to the instance default only when the caller did not
    # provide a value. The previous `ignore_excluded or
    # self.ignore_excluded` expression silently discarded an explicit
    # `ignore_excluded=False` whenever the instance default was True.
    if ignore_excluded is None:
        ignore_excluded = self.ignore_excluded

    attr = attr or self.attr
    custom_attr = attr.startswith("_")

    if custom_attr:
        # Custom extension attributes are registered lower-case and
        # without the leading underscore.
        attr = attr.lstrip("_").lower()

    pattern = []

    for token in match_pattern:
        if custom_attr:
            pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
        else:
            pattern.append({attr: self.get_attr(token, attr, False)})

        if ignore_excluded and token.whitespace_:
            # If the token is followed by a whitespace,
            # we let it match on a pollution
            pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

    return pattern

build_patterns(nlp, terms)

Builds patterns and adds them for matching. Helper function for pipelines using this matcher.

PARAMETER DESCRIPTION
nlp

The instance of the spaCy language class.

TYPE: Language

terms

Dictionary of label/terms, or label/dictionary of terms/attribute.

TYPE: Patterns

Source code in edsnlp/matchers/phrase.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def build_patterns(self, nlp: Language, terms: Patterns):
    """
    Build patterns and add them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    nlp : Language
        The instance of the spaCy language class.
    terms : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """

    for key, expressions in (terms or {}).items():
        attr = None
        if isinstance(expressions, dict):
            # Nested form: {"attr": ..., "patterns": [...]}
            attr = expressions.get("attr")
            expressions = expressions.get("patterns")
        if isinstance(expressions, str):
            expressions = [expressions]
        # Tokenize every expression with the pipeline's tokenizer.
        self.add(key, list(nlp.pipe(expressions)), attr)

add(key, patterns, attr=None, ignore_excluded=None)

Add a pattern.

PARAMETER DESCRIPTION
key

Key of the new/updated pattern.

TYPE: str

patterns

List of patterns to add.

TYPE: List[str]

attr

Overwrite the attribute to match on for this specific pattern.

TYPE: str, optional DEFAULT: None

ignore_excluded

Overwrite the parameter for this specific pattern.

TYPE: bool, optional DEFAULT: None

Source code in edsnlp/matchers/phrase.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def add(
    self,
    key: str,
    patterns: List[Doc],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> None:
    """
    Add a pattern to the matcher.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[Doc]
        List of patterns to add.
    attr : str, optional
        Overwrite the attribute to match on for this specific pattern.
    ignore_excluded : bool, optional
        Overwrite the parameter for this specific pattern.
    """

    compiled = []
    for pattern in patterns:
        compiled.append(
            self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
        )
    self.matcher.add(key, compiled)

remove(key)

Remove a pattern.

PARAMETER DESCRIPTION
key

Key of the pattern to remove.

TYPE: str

RAISES DESCRIPTION
ValueError

Should the key not be contained in the registry.

Source code in edsnlp/matchers/phrase.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def remove(
    self,
    key: str,
) -> None:
    """
    Remove a pattern from the matcher.

    Parameters
    ----------
    key : str
        Key of the pattern to remove.

    Raises
    ------
    ValueError
        Should the key not be contained in the registry.
    """
    # Delegate entirely to the underlying spaCy Matcher, which raises
    # on unknown keys.
    self.matcher.remove(key)

__len__()

Source code in edsnlp/matchers/phrase.py
211
212
def __len__(self):
    """Return the number of rules registered in the underlying matcher."""
    return len(self.matcher)

__call__(doclike, as_spans=False)

Performs matching. Yields matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object.

TYPE: Union[Doc, Span]

as_spans

Whether to return matches as spans.

DEFAULT: False

YIELDS DESCRIPTION
match

A match.

Source code in edsnlp/matchers/phrase.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
) -> Generator:
    """
    Perform matching, yielding matches one by one.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Whether to return matches as spans.

    Yields
    -------
    match: Span
        A match.
    """
    # With no registered rules there is nothing to match; skip the
    # underlying Matcher call entirely.
    if not len(self.matcher):
        return
    yield from self.matcher(doclike, as_spans=as_spans)

get_normalized_variant(doclike)

Source code in edsnlp/matchers/phrase.py
15
16
17
18
19
20
def get_normalized_variant(doclike: Union[Span, Doc]) -> str:
    """Return the text of `doclike` with `_.excluded` tokens dropped,
    trailing spaces removed, and runs of whitespace collapsed to one space."""
    kept = "".join(
        token.text + token.whitespace_
        for token in doclike
        if not token._.excluded
    )
    return re.sub(r"\s+", " ", kept.rstrip(" "))

phrase_matcher_factory(attr, ignore_excluded, exclude_newlines)

Source code in edsnlp/matchers/phrase.py
27
28
29
30
31
32
33
34
35
36
37
38
@registry.misc("edsnlp.factories.phrasematcher.v1")
def phrase_matcher_factory(
    attr: str,
    ignore_excluded: bool,
    exclude_newlines: bool,
):
    """Return an `EDSPhraseMatcher` constructor pre-bound with the given
    configuration (registered as a spaCy misc factory)."""
    config = dict(
        attr=attr,
        ignore_excluded=ignore_excluded,
        exclude_newlines=exclude_newlines,
    )
    return partial(EDSPhraseMatcher, **config)
Back to top