
edsnlp.pipelines.core.normalizer.pollution

Pollution

Bases: BaseComponent

Tags pollution tokens.

Populates two spaCy attributes:

  • Token._.excluded : set to True on every token that belongs to a pollution span
  • doc.spans["pollutions"] : the list of detected pollution spans
PARAMETERS

nlp : Language
    Language pipeline object

pollution : Dict[str, Union[str, List[str]]]
    Dictionary containing regular expressions of pollution.
Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
class Pollution(BaseComponent):
    """
    Tags pollution tokens.

    Populates a number of spaCy extensions :

    - `Token._.pollution` : indicates whether the token is a pollution
    - `Doc._.clean` : lists non-pollution tokens
    - `Doc._.clean_` : original text with pollutions removed.
    - `Doc._.char_clean_span` : method to create a Span using character
      indices extracted using the cleaned text.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Dict[str, Union[str, List[str]]]
        Dictionary containing regular expressions of pollution.
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        pollution: Optional[Dict[str, Union[str, List[str]]]],
    ):
        self.nlp = nlp

        if pollution is None:
            pollution = patterns.pollution

        self.pollution = pollution

        for k, v in self.pollution.items():
            if isinstance(v, str):
                self.pollution[k] = [v]

        self.regex_matcher = RegexMatcher()
        self.build_patterns()

    def build_patterns(self) -> None:
        """
        Builds the patterns for phrase matching.
        """

        # efficiently build spaCy matcher patterns
        for k, v in self.pollution.items():
            self.regex_matcher.add(k, v)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find pollution spans in the document

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        pollution:
            list of pollution spans
        """

        pollutions = self.regex_matcher(doc, as_spans=True)
        pollutions = filter_spans(pollutions)

        return pollutions

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags pollutions.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for pollutions.
        """
        pollutions = self.process(doc)

        for pollution in pollutions:
            for token in pollution:
                token._.excluded = True

        doc.spans["pollutions"] = pollutions

        return doc
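
A minimal usage sketch (not the official pipeline registration path): it instantiates Pollution directly with a made-up pattern dictionary, and registers the excluded extension by hand, which the edsnlp normalizer would normally do.

import spacy
from spacy.tokens import Token

from edsnlp.pipelines.core.normalizer.pollution.pollution import Pollution

# Normally registered elsewhere in edsnlp; added here so the sketch
# is self-contained.
if not Token.has_extension("excluded"):
    Token.set_extension("excluded", default=False)

nlp = spacy.blank("fr")

# Hypothetical pattern: treat page footers as pollution.
component = Pollution(nlp, pollution={"page": r"Page \d+ sur \d+"})

doc = component(nlp("Motif : douleur thoracique. Page 1 sur 3"))
print([token.text for token in doc if token._.excluded])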
build_patterns()

Builds the regular expression patterns for matching.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
def build_patterns(self) -> None:
    """
    Builds the patterns for phrase matching.
    """

    # efficiently build spaCy matcher patterns
    for k, v in self.pollution.items():
        self.regex_matcher.add(k, v)
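
To make the registration step concrete, a small sketch (with a made-up pattern dictionary) of the normalization performed in __init__ before build_patterns runs: bare strings are wrapped into one-element lists, and each list is then registered under its key as a match label.

# Hypothetical pollution dictionary; keys become match labels.
pollution = {
    "page": r"Page \d+",                    # a bare string...
    "footer": [r"Imprimé le .*", r"© .*"],  # ...or a list of regexes
}

# The wrapping done in Pollution.__init__:
for k, v in pollution.items():
    if isinstance(v, str):
        pollution[k] = [v]

# build_patterns then registers each entry, e.g.:
# self.regex_matcher.add("page", [r"Page \d+"])
print(pollution)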
process(doc)

Find pollution spans in the document.

PARAMETERS

doc : Doc
    spaCy Doc object

RETURNS

pollution : List[Span]
    list of pollution spans

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find pollution spans in the document

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    pollution:
        list of pollution spans
    """

    pollutions = self.regex_matcher(doc, as_spans=True)
    pollutions = filter_spans(pollutions)

    return pollutions
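
process can also be called on its own to inspect candidate spans before any token is tagged. A short sketch, reusing the nlp and component objects from the usage example above:

doc = nlp("Compte rendu initial. Page 2 sur 3 Suite du compte rendu.")

for span in component.process(doc):
    # each span carries the label of the pattern that matched
    print(span.label_, span.start_char, span.end_char, repr(span.text))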
__call__(doc)

Tags pollutions.

PARAMETERS

doc : Doc
    spaCy Doc object

RETURNS

doc : Doc
    spaCy Doc object, annotated for pollutions.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
def __call__(self, doc: Doc) -> Doc:
    """
    Tags pollutions.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for pollutions.
    """
    pollutions = self.process(doc)

    for pollution in pollutions:
        for token in pollution:
            token._.excluded = True

    doc.spans["pollutions"] = pollutions

    return doc
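
Downstream code can read either the excluded flag or doc.spans["pollutions"]. As one possible convention (not an API of this component), a hedged sketch that rebuilds a cleaned string from the non-excluded tokens, again reusing the objects from the usage example above:

doc = component(nlp("Examen sans particularité. Page 3 sur 3"))

# Drop pollution tokens, keep the original whitespace of the rest.
clean = "".join(token.text_with_ws for token in doc if not token._.excluded)

print(clean)
print(doc.spans["pollutions"])  # the spans tagged by __call__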