Skip to content

edsnlp.pipelines.ner.scores.tnm.tnm

eds.tnm pipeline.

PERIOD_PROXIMITY_THRESHOLD = 3 module-attribute

TNM

Bases: BaseComponent

Tags and normalizes TNM mentions.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: spacy.language.Language

pattern

List of regular expressions for TNM mentions.

TYPE: Optional[Union[List[str], str]]

attr

spaCy attribute to use

TYPE: str

Source code in edsnlp/pipelines/ner/scores/tnm/tnm.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
class TNM(BaseComponent):
    """
    Tags and normalizes TNM mentions.

    Parameters
    ----------
    nlp : spacy.language.Language
        Language pipeline object
    pattern : Optional[Union[List[str], str]]
        Regular expression, or list of regular expressions, for TNM mentions.
        When ``None``, ``patterns.tnm_pattern`` is used.
    attr : str
        spaCy attribute to use
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        pattern: Optional[Union[List[str], str]],
        attr: str,
    ):

        self.nlp = nlp

        if pattern is None:
            pattern = patterns.tnm_pattern

        # A single expression is accepted for convenience; the matcher is
        # always fed a list.
        if isinstance(pattern, str):
            pattern = [pattern]

        self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")
        self.regex_matcher.add("tnm", pattern)

        self.set_extensions()

    @classmethod
    def set_extensions(cls) -> None:
        """
        Set extensions for the TNM pipeline.

        Registers the ``value`` extension on ``Span`` (default ``None``)
        unless it has already been declared.
        """

        if not Span.has_extension("value"):
            Span.set_extension("value", default=None)

    def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]:
        """
        Find TNM mentions in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        spans:
            Matches retained by ``filter_spans``, each paired with the
            groupdict returned by the regex matcher.
        """

        spans = self.regex_matcher(
            doc,
            as_spans=True,
            return_groupdict=True,
        )

        spans = filter_spans(spans)

        return spans

    def parse(self, spans: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
        """
        Parse TNM mentions using the groupdict returned by the matcher.

        Each span gets a ``models.TNM`` object in ``span._.value`` and the
        result of its ``norm()`` method in ``span.kb_id_``.

        Parameters
        ----------
        spans : List[Tuple[Span, Dict[str, str]]]
            List of tuples containing the spans and groupdict
            returned by the matcher.

        Returns
        -------
        List[Span]
            List of processed spans, with the TNM value parsed.
        """

        for span, groupdict in spans:
            try:
                span._.value = models.TNM.parse_obj(groupdict)
            except ValidationError:
                # The groupdict failed validation: fall back on a TNM object
                # built from an empty mapping rather than dropping the span.
                span._.value = models.TNM.parse_obj({})

            span.kb_id_ = span._.value.norm()

        return [span for span, _ in spans]

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags TNM mentions.

        The parsed spans are stored in ``doc.spans["tnm"]`` and merged into
        ``doc.ents`` through ``filter_spans``; the spans rejected by that
        merge are appended to ``doc.spans["discarded"]``.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        doc : Doc
            spaCy Doc object, annotated for TNM
        """
        # `process` already runs `filter_spans` on the matches, so no second
        # filtering pass is needed before parsing.
        spans = self.process(doc)

        spans = self.parse(spans)

        doc.spans["tnm"] = spans

        ents, discarded = filter_spans(list(doc.ents) + spans, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

nlp = nlp instance-attribute

regex_matcher = RegexMatcher(attr=attr, alignment_mode='strict') instance-attribute

__init__(nlp, pattern, attr)

Source code in edsnlp/pipelines/ner/scores/tnm/tnm.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def __init__(
    self,
    nlp: Language,
    pattern: Optional[Union[List[str], str]],
    attr: str,
):
    """
    Store the pipeline object and build the TNM regex matcher.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pattern : Optional[Union[List[str], str]]
        Regular expression, or list of regular expressions, for TNM
        mentions. ``None`` selects ``patterns.tnm_pattern``.
    attr : str
        spaCy attribute handed to the regex matcher
    """
    self.nlp = nlp

    # Fall back on the packaged pattern, then normalise to a list.
    tnm_patterns = patterns.tnm_pattern if pattern is None else pattern
    if isinstance(tnm_patterns, str):
        tnm_patterns = [tnm_patterns]

    self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")
    self.regex_matcher.add("tnm", tnm_patterns)

    self.set_extensions()

set_extensions()

Set extensions for the TNM pipeline.

Source code in edsnlp/pipelines/ner/scores/tnm/tnm.py
52
53
54
55
56
57
58
59
@classmethod
def set_extensions(cls) -> None:
    """
    Declare the ``Span`` extension used by the TNM pipeline.

    The ``value`` extension is registered with a default of ``None``;
    nothing happens when it is already declared.
    """
    if Span.has_extension("value"):
        return

    Span.set_extension("value", default=None)

process(doc)

Find TNM mentions in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
spans

list of tnm spans

Source code in edsnlp/pipelines/ner/scores/tnm/tnm.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]:
    """
    Find TNM mentions in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    spans:
        Matches retained by ``filter_spans``, each paired with the
        groupdict returned by the regex matcher (the shape consumed by
        ``parse``).
    """

    # `return_groupdict=True` makes the matcher yield (span, groupdict)
    # pairs rather than bare spans.
    matches = self.regex_matcher(
        doc,
        as_spans=True,
        return_groupdict=True,
    )

    return filter_spans(matches)

parse(spans)

Parse TNM mentions using the groupdict returned by the matcher.

PARAMETER DESCRIPTION
spans

List of tuples containing the spans and groupdict returned by the matcher.

TYPE: List[Tuple[Span, Dict[str, str]]]

RETURNS DESCRIPTION
List[Span]

List of processed spans, with the TNM value parsed.

Source code in edsnlp/pipelines/ner/scores/tnm/tnm.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def parse(self, spans: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
    """
    Attach a parsed TNM value to every matched span.

    For each pair, ``span._.value`` receives a ``models.TNM`` object built
    from the groupdict, and ``span.kb_id_`` receives the result of that
    object's ``norm()`` method.

    Parameters
    ----------
    spans : List[Tuple[Span, Dict[str, str]]]
        List of tuples containing the spans and groupdict
        returned by the matcher.

    Returns
    -------
    List[Span]
        The spans of the input pairs, in the same order, once annotated.
    """
    annotated = []

    for tnm_span, groups in spans:
        try:
            tnm_span._.value = models.TNM.parse_obj(groups)
        except ValidationError:
            # Validation failed: use a TNM object built from an empty mapping.
            tnm_span._.value = models.TNM.parse_obj({})

        tnm_span.kb_id_ = tnm_span._.value.norm()
        annotated.append(tnm_span)

    return annotated

__call__(doc)

Tags TNM mentions.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for TNM

TYPE: Doc

Source code in edsnlp/pipelines/ner/scores/tnm/tnm.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def __call__(self, doc: Doc) -> Doc:
    """
    Tags TNM mentions.

    The parsed spans are stored in ``doc.spans["tnm"]`` and merged into
    ``doc.ents`` through ``filter_spans``; the spans rejected by that merge
    are appended to ``doc.spans["discarded"]``.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    doc : Doc
        spaCy Doc object, annotated for TNM
    """
    # `process` already runs `filter_spans` on the matches, so no second
    # filtering pass is needed before parsing.
    spans = self.process(doc)

    spans = self.parse(spans)

    doc.spans["tnm"] = spans

    ents, discarded = filter_spans(list(doc.ents) + spans, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc