Skip to content

edsnlp.pipelines.qualifiers.base

Qualifier

Bases: BaseComponent

Implements the NegEx algorithm.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; we can also add a key for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

**terms

Terms to look for.

TYPE: Dict[str, Optional[List[str]]]

Source code in edsnlp/pipelines/qualifiers/base.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class Qualifier(BaseComponent):
    """
    Implements the NegEx algorithm.

    Base class for qualifier pipelines: compiles the term patterns and
    provides the matching helpers. Subclasses must implement ``process``.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    explain : bool
        Whether to keep track of cues for each entity.
    **terms : Dict[str, Optional[List[str]]]
        Terms to look for.
    """

    # Default term lists, meant to be overridden by subclasses.
    defaults = dict()

    def __init__(
        self,
        nlp: Language,
        attr: str,
        on_ents_only: bool,
        explain: bool,
        **terms: Dict[str, Optional[List[str]]],
    ):
        # Matching on NORM only makes sense if the normalizer lowercases;
        # warn the user otherwise.
        if attr.upper() == "NORM":
            check_normalizer(nlp)

        self.phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

        self.on_ents_only = on_ents_only
        self.explain = explain

    def get_defaults(
        self, **kwargs: Dict[str, Optional[List[str]]]
    ) -> Dict[str, List[str]]:
        """
        Merge terms with their defaults. Null keys are replaced with defaults.

        Returns
        -------
        Dict[str, List[str]]
            Merged dictionary
        """
        # Drop None values so that the corresponding defaults are kept
        kwargs = {k: v for k, v in kwargs.items() if v is not None}

        # Overlay the provided terms on top of the class defaults
        terms = self.defaults.copy()
        terms.update(kwargs)

        return terms

    def get_matches(self, doc: Doc) -> List[Span]:
        """
        Extract matches.

        Parameters
        ----------
        doc : Doc
            spaCy `Doc` object.

        Returns
        -------
        List[Span]
            List of detected spans
        """
        if self.on_ents_only:
            # Restrict matching to sentences containing at least one entity.
            sents = {ent.sent for ent in doc.ents}
            matches = chain.from_iterable(
                self.phrase_matcher(sent, as_spans=True) for sent in sents
            )
        else:
            matches = self.phrase_matcher(doc, as_spans=True)

        return list(matches)

    def __call__(self, doc: Doc) -> Doc:
        # Delegates to `process`, which subclasses are expected to implement.
        return self.process(doc)

defaults = dict() class-attribute

phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr) instance-attribute

on_ents_only = on_ents_only instance-attribute

explain = explain instance-attribute

__init__(nlp, attr, on_ents_only, explain, **terms)

Source code in edsnlp/pipelines/qualifiers/base.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def __init__(
    self,
    nlp: Language,
    attr: str,
    on_ents_only: bool,
    explain: bool,
    **terms: Dict[str, Optional[List[str]]],
):
    """Build the qualifier: validate the normalizer setup and compile patterns."""

    # Matching on the NORM attribute only works well with a lowercasing
    # normalizer; emit a warning if that is not the case.
    if attr.upper() == "NORM":
        check_normalizer(nlp)

    matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
    matcher.build_patterns(nlp=nlp, terms=terms)
    self.phrase_matcher = matcher

    self.explain = explain
    self.on_ents_only = on_ents_only

get_defaults(**kwargs)

Merge terms with their defaults. Null keys are replaced with defaults.

RETURNS DESCRIPTION
Dict[str, List[str]]

Merged dictionary

Source code in edsnlp/pipelines/qualifiers/base.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def get_defaults(
    self, **kwargs: Dict[str, Optional[List[str]]]
) -> Dict[str, List[str]]:
    """
    Merge terms with their defaults. Null keys are replaced with defaults.

    Returns
    -------
    Dict[str, List[str]]
        Merged dictionary
    """
    # Keep only explicitly-provided (non-None) term lists...
    provided = {key: value for key, value in kwargs.items() if value is not None}

    # ...and overlay them on top of the class-level defaults.
    return {**self.defaults, **provided}

get_matches(doc)

Extract matches.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
List[Span]

List of detected spans

Source code in edsnlp/pipelines/qualifiers/base.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def get_matches(self, doc: Doc) -> List[Span]:
    """
    Extract matches.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object.

    Returns
    -------
    List[Span]
        List of detected spans
    """
    if not self.on_ents_only:
        # Full-document matching.
        return list(self.phrase_matcher(doc, as_spans=True))

    # Only scan the sentences that contain at least one detected entity.
    sentences = {ent.sent for ent in doc.ents}
    spans = chain.from_iterable(
        self.phrase_matcher(sentence, as_spans=True) for sentence in sentences
    )
    return list(spans)

__call__(doc)

Source code in edsnlp/pipelines/qualifiers/base.py
114
115
def __call__(self, doc: Doc) -> Doc:
    # Delegates to `process`, which subclasses are expected to implement
    # (it is not defined on this base class).
    return self.process(doc)

check_normalizer(nlp)

Source code in edsnlp/pipelines/qualifiers/base.py
12
13
14
15
16
17
18
19
20
21
22
def check_normalizer(nlp: Language) -> None:
    """
    Warn if the pipeline's normalizer is incompatible with the NORM attribute.

    Looks up the ``normalizer`` component in the pipeline; if it exists with
    lowercasing disabled, matching on NORM will perform poorly, so a warning
    is logged.

    Parameters
    ----------
    nlp : Language
        The spaCy pipeline to inspect.
    """
    # `nlp.pipeline` is a list of (name, component) pairs.
    components = dict(nlp.pipeline)
    normalizer = components.get("normalizer")

    if normalizer and not normalizer.lowercase:
        logger.warning(
            "You have chosen the NORM attribute, but disabled lowercasing "
            "in your normalisation pipeline. "
            "This WILL hurt performance : you might want to use the "
            "LOWER attribute instead."
        )
Back to top