Skip to content

edsnlp.pipelines.core.matcher.matcher

GenericMatcher

Bases: BaseComponent

Provides a generic matcher component.

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language


terms

A dictionary of terms.

TYPE: Optional[Patterns]

regex

A dictionary of regular expressions.

TYPE: Optional[Patterns]

attr

The default attribute to use for matching. Can be overridden using the terms and regex configurations.

TYPE: str

ignore_excluded

Whether to skip excluded tokens (requires an upstream pipeline to mark excluded tokens).

TYPE: bool

Source code in edsnlp/pipelines/core/matcher/matcher.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
class GenericMatcher(BaseComponent):
    """
    Provides a generic matcher component.

    Combines an exact-phrase matcher and a regular-expression matcher,
    and annotates documents with the spans they find.

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    terms : Optional[Patterns]
        A dictionary of terms.
    regex : Optional[Patterns]
        A dictionary of regular expressions.
    attr : str
        The default attribute to use for matching.
        Can be overridden using the `terms` and `regex` configurations.
    ignore_excluded : bool
        Whether to skip excluded tokens (requires an upstream
        pipeline to mark excluded tokens).
    """

    def __init__(
        self,
        nlp: Language,
        terms: Optional[Patterns],
        regex: Optional[Patterns],
        attr: str,
        ignore_excluded: bool,
    ):

        self.nlp = nlp

        self.attr = attr

        # Exact-phrase matcher over the chosen token attribute.
        self.phrase_matcher = EDSPhraseMatcher(
            self.nlp.vocab,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )
        # Regular-expression matcher over the same attribute.
        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        # Pre-compile patterns once at construction time.
        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
        self.regex_matcher.build_patterns(regex=regex)

        self.set_extensions()

    def process(self, doc: Doc) -> List[Span]:
        """
        Find matching spans in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object.

        Returns
        -------
        spans:
            List of Spans returned by the matchers.
        """

        matches = self.phrase_matcher(doc, as_spans=True)
        regex_matches = self.regex_matcher(doc, as_spans=True)

        spans = list(matches) + list(regex_matches)

        return spans

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """
        matches = self.process(doc)

        # Keep every match, grouped by label, in doc.spans.
        for span in matches:
            if span.label_ not in doc.spans:
                doc.spans[span.label_] = []
            doc.spans[span.label_].append(span)

        # doc.ents cannot hold overlapping spans: keep a filtered subset
        # and stash the overlapping leftovers in the "discarded" group.
        ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

nlp = nlp instance-attribute

attr = attr instance-attribute

phrase_matcher = EDSPhraseMatcher(self.nlp.vocab, attr=attr, ignore_excluded=ignore_excluded) instance-attribute

regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute

__init__(nlp, terms, regex, attr, ignore_excluded)

Source code in edsnlp/pipelines/core/matcher/matcher.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def __init__(
    self,
    nlp: Language,
    terms: Optional[Patterns],
    regex: Optional[Patterns],
    attr: str,
    ignore_excluded: bool,
):
    """Initialise both matchers and pre-compile all patterns."""

    self.nlp = nlp
    self.attr = attr

    # Exact-phrase matcher: build it, then compile its term patterns.
    phrase_matcher = EDSPhraseMatcher(
        nlp.vocab,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
    phrase_matcher.build_patterns(nlp=nlp, terms=terms)
    self.phrase_matcher = phrase_matcher

    # Regular-expression matcher: same attribute and exclusion policy.
    regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
    regex_matcher.build_patterns(regex=regex)
    self.regex_matcher = regex_matcher

    self.set_extensions()

process(doc)

Find matching spans in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
spans

List of Spans returned by the matchers.

Source code in edsnlp/pipelines/core/matcher/matcher.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def process(self, doc: Doc) -> List[Span]:
    """
    Find matching spans in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object.

    Returns
    -------
    spans:
        List of Spans returned by the matchers.
    """

    # Run both matchers and concatenate their results,
    # phrase matches first.
    phrase_spans = self.phrase_matcher(doc, as_spans=True)
    regex_spans = self.regex_matcher(doc, as_spans=True)

    return [*phrase_spans, *regex_spans]

__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/core/matcher/matcher.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """
    spans = self.process(doc)

    # Record every match, grouped by its label, in doc.spans.
    for span in spans:
        label = span.label_
        if label not in doc.spans:
            doc.spans[label] = []
        doc.spans[label].append(span)

    # doc.ents rejects overlaps: keep a non-overlapping subset and
    # collect the rest into the "discarded" span group.
    kept, discarded = filter_spans(
        list(doc.ents) + spans, return_discarded=True
    )
    doc.ents = kept

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc