edsnlp.pipelines.core.matcher.matcher

GenericTermMatcher

Bases: str, Enum

Source code in edsnlp/pipelines/core/matcher/matcher.py
class GenericTermMatcher(str, Enum):
    exact = "exact"
    simstring = "simstring"

exact = 'exact' class-attribute

simstring = 'simstring' class-attribute
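
Because GenericTermMatcher subclasses str, its members compare equal to their plain-string values, so configurations may pass either the enum member or the bare string. A minimal sketch:

from edsnlp.pipelines.core.matcher.matcher import GenericTermMatcher

# str-based enum members are interchangeable with plain strings
assert GenericTermMatcher.simstring == "simstring"
assert GenericTermMatcher("exact") is GenericTermMatcher.exact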

GenericMatcher

Bases: BaseComponent

Provides a generic matcher component.

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

terms

A dictionary of terms, mapping each label to one or more phrases (see the sketch after this parameter list).

TYPE: Optional[Patterns]

regex

A dictionary of regular expressions, mapping each label to one or more patterns.

TYPE: Optional[Patterns]

attr

The default attribute to use for matching. Can be overridden using the terms and regex configurations.

TYPE: str

ignore_excluded

Whether to skip excluded tokens (requires an upstream pipeline to mark excluded tokens).

TYPE: bool

term_matcher

The phrase matcher to use. One of exact or simstring.

TYPE: GenericTermMatcher

term_matcher_config

Parameters passed to the underlying matcher class.

TYPE: Dict[str, Any]
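
In practice, the component is usually added through a factory rather than instantiated directly. A minimal sketch, assuming the eds.matcher factory name under which edsnlp registers this component (the terms and regex values are purely illustrative):

import spacy

nlp = spacy.blank("fr")

# Each label maps to one or more expressions (the Patterns shape)
nlp.add_pipe(
    "eds.matcher",
    config=dict(
        terms=dict(covid=["covid", "coronavirus"]),
        regex=dict(date=r"\d{2}/\d{2}/\d{4}"),
        attr="LOWER",
    ),
)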

Source code in edsnlp/pipelines/core/matcher/matcher.py
class GenericMatcher(BaseComponent):
    """
    Provides a generic matcher component.

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    terms : Optional[Patterns]
        A dictionary of terms.
    regex : Optional[Patterns]
        A dictionary of regular expressions.
    attr : str
        The default attribute to use for matching.
        Can be overridden using the `terms` and `regex` configurations.
    ignore_excluded : bool
        Whether to skip excluded tokens (requires an upstream
        pipeline to mark excluded tokens).
    term_matcher : GenericTermMatcher
        The phrase matcher to use.
        One of `exact` or `simstring`.
    term_matcher_config : Dict[str, Any]
        Parameters passed to the underlying matcher class.
    """

    def __init__(
        self,
        nlp: Language,
        terms: Optional[Patterns],
        regex: Optional[Patterns],
        attr: str,
        ignore_excluded: bool,
        term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
        term_matcher_config: Dict[str, Any] = None,
    ):

        self.nlp = nlp

        self.attr = attr

        if term_matcher == GenericTermMatcher.exact:
            self.phrase_matcher = EDSPhraseMatcher(
                self.nlp.vocab,
                attr=attr,
                ignore_excluded=ignore_excluded,
                **(term_matcher_config or {}),
            )
        elif term_matcher == GenericTermMatcher.simstring:
            self.phrase_matcher = SimstringMatcher(
                self.nlp.vocab,
                attr=attr,
                ignore_excluded=ignore_excluded,
                **(term_matcher_config or {}),
            )
        else:
            raise ValueError(
                f"Algorithm {repr(term_matcher)} does not belong to"
                f" known matcher [exact, simstring]."
            )

        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
        self.regex_matcher.build_patterns(regex=regex)

        self.set_extensions()

    def process(self, doc: Doc) -> List[Span]:
        """
        Find matching spans in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object.

        Returns
        -------
        spans:
            List of Spans returned by the matchers.
        """

        matches = self.phrase_matcher(doc, as_spans=True)
        regex_matches = self.regex_matcher(doc, as_spans=True)

        spans = list(matches) + list(regex_matches)

        return spans

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to the document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """
        matches = self.process(doc)

        for span in matches:
            if span.label_ not in doc.spans:
                doc.spans[span.label_] = []
            doc.spans[span.label_].append(span)

        ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

nlp = nlp instance-attribute

attr = attr instance-attribute

phrase_matcher = EDSPhraseMatcher(self.nlp.vocab, attr=attr, ignore_excluded=ignore_excluded, **(term_matcher_config or {})) instance-attribute

regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute

__init__(nlp, terms, regex, attr, ignore_excluded, term_matcher=GenericTermMatcher.exact, term_matcher_config=None)

Source code in edsnlp/pipelines/core/matcher/matcher.py
def __init__(
    self,
    nlp: Language,
    terms: Optional[Patterns],
    regex: Optional[Patterns],
    attr: str,
    ignore_excluded: bool,
    term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
    term_matcher_config: Dict[str, Any] = None,
):

    self.nlp = nlp

    self.attr = attr

    if term_matcher == GenericTermMatcher.exact:
        self.phrase_matcher = EDSPhraseMatcher(
            self.nlp.vocab,
            attr=attr,
            ignore_excluded=ignore_excluded,
            **(term_matcher_config or {}),
        )
    elif term_matcher == GenericTermMatcher.simstring:
        self.phrase_matcher = SimstringMatcher(
            self.nlp.vocab,
            attr=attr,
            ignore_excluded=ignore_excluded,
            **(term_matcher_config or {}),
        )
    else:
        raise ValueError(
            f"Algorithm {repr(term_matcher)} does not belong to"
            f" known matcher [exact, simstring]."
        )

    self.regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
    self.regex_matcher.build_patterns(regex=regex)

    self.set_extensions()
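
When term_matcher is simstring, term_matcher_config is forwarded as keyword arguments to SimstringMatcher. A hedged sketch; measure and threshold are assumed SimstringMatcher options shown for illustration, not confirmed by this page:

import spacy

nlp = spacy.blank("fr")

nlp.add_pipe(
    "eds.matcher",
    config=dict(
        terms=dict(diabetes=["diabete", "diabetique"]),
        term_matcher="simstring",
        # Assumed SimstringMatcher parameters, for illustration only
        term_matcher_config=dict(measure="dice", threshold=0.75),
    ),
)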

process(doc)

Find matching spans in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
spans

List of Spans returned by the matchers.

Source code in edsnlp/pipelines/core/matcher/matcher.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find matching spans in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object.

    Returns
    -------
    spans:
        List of Spans returned by the matchers.
    """

    matches = self.phrase_matcher(doc, as_spans=True)
    regex_matches = self.regex_matcher(doc, as_spans=True)

    spans = list(matches) + list(regex_matches)

    return spans
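
process can also be called directly to retrieve the raw, unfiltered spans without mutating the document. A minimal sketch, reusing the nlp object configured above:

doc = nlp.make_doc("Le patient a le covid.")
matcher = nlp.get_pipe("eds.matcher")
spans = matcher.process(doc)  # phrase matches + regex matches, before filtering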

__call__(doc)

Adds spans to the document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/core/matcher/matcher.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to the document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """
    matches = self.process(doc)

    for span in matches:
        if span.label_ not in doc.spans:
            doc.spans[span.label_] = []
        doc.spans[span.label_].append(span)

    ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
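
Running the pipeline applies __call__: overlapping matches are resolved with filter_spans, the kept spans are written to doc.ents, and the discarded ones accumulate in doc.spans["discarded"]. Each match is also grouped under doc.spans by its label. A minimal sketch, reusing the nlp object configured above:

doc = nlp("Le patient a le covid.")

print([(ent.text, ent.label_) for ent in doc.ents])
# e.g. [('covid', 'covid')]
print(len(doc.spans["discarded"]))  # matches filtered out of doc.ents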