edsnlp.pipelines.ner.scores.base_score

Score

Bases: ContextualMatcher

Matcher component to extract a numeric score

Source code in edsnlp/pipelines/ner/scores/base_score.py
class Score(ContextualMatcher):
    """Matcher component to extract a numeric score"""

    def __init__(
        self,
        nlp: Language,
        score_name: str,
        regex: List[str],
        attr: str,
        value_extract: Union[str, Dict[str, str], List[Dict[str, str]]],
        score_normalization: Union[str, Callable[[Union[str, None]], Any]],
        window: int,
        ignore_excluded: bool,
        ignore_space_tokens: bool,
        flags: Union[re.RegexFlag, int],
    ):
        """
        Parameters
        ----------
        nlp : Language
            The spaCy object.
        score_name : str
            The name of the extracted score
        regex : List[str]
            A list of regexes to identify the score
        attr : str
            Whether to match on the text ('TEXT') or on the normalized text ('NORM')
        value_extract : Union[str, Dict[str, str], List[Dict[str, str]]]
            Regex with capturing group to get the score value
        score_normalization : Union[str, Callable[[Union[str, None]], Any]]
            Function that takes the "raw" value extracted from the `value_extract`
            regex and should return:

            - None if no score could be extracted
            - The desired score value otherwise
        window : int
            Number of tokens to include after the score's mention to find the
            score's value
        ignore_excluded : bool
            Whether to ignore excluded spans when matching
        ignore_space_tokens : bool
            Whether to ignore space tokens when matching
        flags : Union[re.RegexFlag, int]
            Regex flags to use when matching
        """
        if isinstance(value_extract, str):
            value_extract = dict(
                name="value",
                regex=value_extract,
                window=window,
            )

        if isinstance(value_extract, dict):
            value_extract = [value_extract]

        value_exists = False
        for i, extract in enumerate(value_extract):
            extract["window"] = extract.get("window", window)
            if extract.get("name", None) == "value":
                value_exists = True
                extract["replace_entity"] = True
                extract["reduce_mode"] = "keep_first"
            value_extract[i] = extract

        assert value_exists, "You should provide a `value` regex in the `assign` dict."

        patterns = dict(
            source=score_name,
            regex=regex,
            assign=value_extract,
        )

        super().__init__(
            nlp=nlp,
            name=score_name,
            patterns=patterns,
            assign_as_span=False,
            alignment_mode="expand",
            ignore_excluded=ignore_excluded,
            ignore_space_tokens=ignore_space_tokens,
            attr=attr,
            regex_flags=flags,
            include_assigned=False,
        )

        self.score_name = score_name

        if isinstance(score_normalization, str):
            self.score_normalization = registry.get("misc", score_normalization)
        else:
            self.score_normalization = score_normalization

        self.set_extensions()

    @classmethod
    def set_extensions(cls) -> None:
        super(Score, Score).set_extensions()
        if not Span.has_extension("score_name"):
            Span.set_extension("score_name", default=None)
        if not Span.has_extension("score_value"):
            Span.set_extension("score_value", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        ents = self.process(doc)
        ents = self.score_filtering(ents)

        ents, discarded = filter_spans(
            list(doc.ents) + list(ents), return_discarded=True
        )

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

    def score_filtering(self, ents: List[Span]) -> List[Span]:
        """
        Extracts, if available, the value of the score.
        Normalizes the score via the provided `self.score_normalization` method.

        Parameters
        ----------
        ents: List[Span]
            List of spaCy's spans extracted by the score matcher

        Returns
        -------
        ents: List[Span]
            List of spaCy's spans for which a value could be extracted and
            normalized, with the `score_name` and `score_value` extensions set
        """

        for ent in ents:
            value = ent._.assigned.get("value", None)
            if value is None:
                continue
            normalized_value = self.score_normalization(value)
            if normalized_value is not None:
                ent._.score_name = self.score_name
                ent._.score_value = normalized_value

                yield ent
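
A minimal end-to-end sketch, assuming EDS-NLP is installed; the score name, trigger regex, capturing group and normalization callable below are illustrative placeholders, not a real clinical score.

import spacy

from edsnlp.pipelines.ner.scores.base_score import Score

nlp = spacy.blank("eds")


def normalize(raw):
    # Return None when the captured text cannot be parsed as a score.
    try:
        return int(raw)
    except (TypeError, ValueError):
        return None


score = Score(
    nlp=nlp,
    score_name="example_score",     # illustrative name
    regex=[r"score"],               # mentions that trigger the matcher
    attr="TEXT",
    value_extract=r"(\d+)",         # capturing group holding the raw value
    score_normalization=normalize,  # a callable, or a name registered in "misc"
    window=7,
    ignore_excluded=False,
    ignore_space_tokens=False,
    flags=0,
)

doc = score(nlp("Le score est de 4."))
for ent in doc.ents:
    print(ent.text, ent._.score_name, ent._.score_value)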

__init__(nlp, score_name, regex, attr, value_extract, score_normalization, window, ignore_excluded, ignore_space_tokens, flags)

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

score_name

The name of the extracted score

TYPE: str

regex

A list of regexes to identify the score

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('NORM')

TYPE: str

value_extract

Regex with capturing group to get the score value

TYPE: Union[str, Dict[str, str], List[Dict[str, str]]]

score_normalization

Function that takes the "raw" value extracted from the value_extract regex and should return:

  • None if no score could be extracted
  • The desired score value otherwise

TYPE: Union[str, Callable[[Union[str, None]], Any]]

window

Number of tokens to include after the score's mention to find the score's value

TYPE: int

ignore_excluded

Whether to ignore excluded spans when matching

TYPE: bool

ignore_space_tokens

Whether to ignore space tokens when matching

TYPE: bool

flags

Regex flags to use when matching

TYPE: Union[re.RegexFlag, int]

Source code in edsnlp/pipelines/ner/scores/base_score.py
def __init__(
    self,
    nlp: Language,
    score_name: str,
    regex: List[str],
    attr: str,
    value_extract: Union[str, Dict[str, str], List[Dict[str, str]]],
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    window: int,
    ignore_excluded: bool,
    ignore_space_tokens: bool,
    flags: Union[re.RegexFlag, int],
):
    """
    Parameters
    ----------
    nlp : Language
        The spaCy object.
    score_name : str
        The name of the extracted score
    regex : List[str]
        A list of regexes to identify the score
    attr : str
        Whether to match on the text ('TEXT') or on the normalized text ('NORM')
    value_extract : Union[str, Dict[str, str], List[Dict[str, str]]]
        Regex with capturing group to get the score value
    score_normalization : Union[str, Callable[[Union[str, None]], Any]]
        Function that takes the "raw" value extracted from the `value_extract`
        regex and should return:

        - None if no score could be extracted
        - The desired score value otherwise
    window : int
        Number of tokens to include after the score's mention to find the
        score's value
    ignore_excluded : bool
        Whether to ignore excluded spans when matching
    ignore_space_tokens : bool
        Whether to ignore space tokens when matching
    flags : Union[re.RegexFlag, int]
        Regex flags to use when matching
    """
    if isinstance(value_extract, str):
        value_extract = dict(
            name="value",
            regex=value_extract,
            window=window,
        )

    if isinstance(value_extract, dict):
        value_extract = [value_extract]

    value_exists = False
    for i, extract in enumerate(value_extract):
        extract["window"] = extract.get("window", window)
        if extract.get("name", None) == "value":
            value_exists = True
            extract["replace_entity"] = True
            extract["reduce_mode"] = "keep_first"
        value_extract[i] = extract

    assert value_exists, "You should provide a `value` regex in the `assign` dict."

    patterns = dict(
        source=score_name,
        regex=regex,
        assign=value_extract,
    )

    super().__init__(
        nlp=nlp,
        name=score_name,
        patterns=patterns,
        assign_as_span=False,
        alignment_mode="expand",
        ignore_excluded=ignore_excluded,
        ignore_space_tokens=ignore_space_tokens,
        attr=attr,
        regex_flags=flags,
        include_assigned=False,
    )

    self.score_name = score_name

    if isinstance(score_normalization, str):
        self.score_normalization = registry.get("misc", score_normalization)
    else:
        self.score_normalization = score_normalization

    self.set_extensions()
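
The two less obvious parameters are illustrated below. This is a sketch under assumptions: the registered name score_normalization.example, the bounds check and the extra "unit" pattern are illustrative. When score_normalization is given as a string, it is resolved through registry.get("misc", ...), so the function must be registered beforehand; value_extract may also be given as a list of assign patterns, provided one entry is named "value".

from spacy import registry


# Register a normalization function so it can be referenced by name
# (the name "score_normalization.example" is illustrative).
@registry.misc("score_normalization.example")
def normalize_example(raw_value):
    try:
        value = int(raw_value)
    except (TypeError, ValueError):
        return None
    # Return None to discard implausible values (bounds are illustrative).
    return value if 0 <= value <= 10 else None


# value_extract as a list of assign patterns: the entry named "value" is
# automatically given replace_entity=True and reduce_mode="keep_first";
# entries without a window fall back to the `window` argument.
value_extract = [
    dict(name="value", regex=r"(\d+)", window=7),
    dict(name="unit", regex=r"(points?)"),  # extra, purely illustrative pattern
]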

__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

TYPE: Doc

Source code in edsnlp/pipelines/ner/scores/base_score.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    ents = self.process(doc)
    ents = self.score_filtering(ents)

    ents, discarded = filter_spans(
        list(doc.ents) + list(ents), return_discarded=True
    )

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
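
Continuing the sketch from the class overview (the nlp and score objects, and the sentence, are illustrative), the call keeps non-overlapping entities in doc.ents and stores the spans dropped by filter_spans in doc.spans["discarded"]:

doc = score(nlp("Le score est de 4."))

print(doc.ents)                # entities kept after filter_spans
print(doc.spans["discarded"])  # overlapping spans that were dropped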

score_filtering(ents)

Extracts, if available, the value of the score. Normalizes the score via the provided self.score_normalization method.

PARAMETER DESCRIPTION
ents

List of spaCy's spans extracted by the score matcher

TYPE: List[Span]

RETURNS DESCRIPTION
ents

List of spaCy's spans for which a value could be extracted and normalized, with the score_name and score_value extensions set

TYPE: List[Span]

Source code in edsnlp/pipelines/ner/scores/base_score.py
def score_filtering(self, ents: List[Span]) -> List[Span]:
    """
    Extracts, if available, the value of the score.
    Normalizes the score via the provided `self.score_normalization` method.

    Parameters
    ----------
    ents: List[Span]
        List of spaCy's spans extracted by the score matcher

    Returns
    -------
    ents: List[Span]
        List of spaCy's spans for which a value could be extracted and
        normalized, with the `score_name` and `score_value` extensions set
    """

    for ent in ents:
        value = ent._.assigned.get("value", None)
        if value is None:
            continue
        normalized_value = self.score_normalization(value)
        if normalized_value is not None:
            ent._.score_name = self.score_name
            ent._.score_value = normalized_value

            yield ent
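
A minimal normalization sketch matching the contract score_filtering relies on; the decimal-comma handling is an illustrative assumption about how the score is written:

def normalize_decimal_score(raw):
    # Return None when the captured text is not a usable score:
    # score_filtering then skips the entity instead of yielding it.
    try:
        value = float(raw.replace(",", "."))  # accept French decimal commas
    except (AttributeError, ValueError):
        return None
    # Otherwise return the normalized value; it is stored in ent._.score_value.
    return value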