Skip to content

edsnlp.pipelines.core.normalizer.pollution

pollution

Pollution

Bases: BaseComponent

Tags pollution tokens.

Populates a number of spaCy extensions :

  • Token._.pollution : indicates whether the token is a pollution
  • Doc._.clean : lists non-pollution tokens
  • Doc._.clean_ : original text with pollutions removed.
  • Doc._.char_clean_span : method to create a Span using character indices extracted using the cleaned text.
PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

pollution

Dictionary containing regular expressions of pollution.

TYPE: Dict[str, Union[str, List[str]]]

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
class Pollution(BaseComponent):
    """
    Tags pollution tokens.

    Populates a number of spaCy extensions :

    - `Token._.pollution` : indicates whether the token is a pollution
    - `Doc._.clean` : lists non-pollution tokens
    - `Doc._.clean_` : original text with pollutions removed.
    - `Doc._.char_clean_span` : method to create a Span using character
      indices extracted using the cleaned text.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Dict[str, Union[str, List[str]]]
        Dictionary containing regular expressions of pollution.
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        pollution: Optional[Dict[str, Union[str, List[str]]]],
    ):

        self.nlp = nlp
        # Register the tag value used to mark excluded (pollution) tokens.
        self.nlp.vocab.strings.add("EXCLUDED")

        if pollution is None:
            pollution = patterns.pollution

        # Build a *fresh* dict, normalising bare strings to one-element lists.
        # The previous implementation mutated ``pollution`` in place, which
        # silently altered the module-level default ``patterns.pollution``
        # (and any caller-supplied mapping) for every subsequent instance.
        self.pollution = {
            key: [value] if isinstance(value, str) else value
            for key, value in pollution.items()
        }

        self.regex_matcher = RegexMatcher()
        self.build_patterns()

    def build_patterns(self) -> None:
        """
        Builds the patterns for phrase matching.
        """

        # efficiently build spaCy matcher patterns
        for k, v in self.pollution.items():
            self.regex_matcher.add(k, v)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find pollution spans in the document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        pollution:
            list of pollution spans
        """

        pollutions = self.regex_matcher(doc, as_spans=True)
        # Keep only the longest, non-overlapping matches.
        pollutions = filter_spans(pollutions)

        return pollutions

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags pollutions.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for pollutions.
        """
        excluded_hash = doc.vocab.strings["EXCLUDED"]
        pollutions = self.process(doc)

        # Mark every token inside a pollution span as excluded.
        for pollution in pollutions:

            for token in pollution:
                token._.excluded = True
                token.tag = excluded_hash

        doc.spans["pollutions"] = pollutions

        return doc
nlp = nlp instance-attribute
pollution = pollution instance-attribute
regex_matcher = RegexMatcher() instance-attribute
__init__(nlp, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __init__(
    self,
    nlp: Language,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    """
    Initialise the pollution component.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Optional[Dict[str, Union[str, List[str]]]]
        Dictionary of pollution regular expressions; falls back to the
        module-level defaults when ``None``.
    """

    self.nlp = nlp
    # Register the tag value used to mark excluded (pollution) tokens.
    self.nlp.vocab.strings.add("EXCLUDED")

    if pollution is None:
        pollution = patterns.pollution

    # Build a *fresh* dict, normalising bare strings to one-element lists.
    # Mutating ``pollution`` in place (as the previous code did) silently
    # altered the module-level default ``patterns.pollution`` and any
    # caller-owned mapping for every subsequent instance.
    self.pollution = {
        key: [value] if isinstance(value, str) else value
        for key, value in pollution.items()
    }

    self.regex_matcher = RegexMatcher()
    self.build_patterns()
build_patterns()

Builds the patterns for phrase matching.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
55
56
57
58
59
60
61
62
def build_patterns(self) -> None:
    """Register every pollution pattern on the regex matcher."""
    for label, expressions in self.pollution.items():
        self.regex_matcher.add(label, expressions)
process(doc)

Find pollution spans in the document

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
pollution

list of pollution spans

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def process(self, doc: Doc) -> List[Span]:
    """
    Find pollution spans in the document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    pollution:
        list of pollution spans
    """
    matches = self.regex_matcher(doc, as_spans=True)
    # Keep only the longest, non-overlapping matches.
    return filter_spans(matches)
__call__(doc)

Tags pollutions.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for pollutions.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def __call__(self, doc: Doc) -> Doc:
    """
    Tags pollutions.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for pollutions.
    """
    tag_hash = doc.vocab.strings["EXCLUDED"]
    spans = self.process(doc)

    # Mark every token inside a pollution span as excluded.
    for span in spans:
        for token in span:
            token._.excluded = True
            token.tag = tag_hash

    doc.spans["pollutions"] = spans

    return doc

factory

DEFAULT_CONFIG = dict(pollution=None) module-attribute

create_component(nlp, name, pollution)

Source code in edsnlp/pipelines/core/normalizer/pollution/factory.py
14
15
16
17
18
19
20
21
22
23
24
@deprecated_factory("pollution", "eds.pollution", default_config=DEFAULT_CONFIG)
@Language.factory("eds.pollution", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    """
    Build a `Pollution` component.

    Registered under both the namespaced ``eds.pollution`` factory name and
    the legacy ``pollution`` name (kept for backward compatibility via
    ``deprecated_factory``); both produce the same component.

    Parameters
    ----------
    nlp : Language
        Language pipeline object.
    name : str
        Name of the component instance in the pipeline (unused here but
        required by the spaCy factory protocol).
    pollution : Optional[Dict[str, Union[str, List[str]]]]
        Dictionary of pollution regular expressions; ``None`` selects the
        component's built-in defaults.
    """
    return Pollution(
        nlp,
        pollution=pollution,
    )

patterns

information = "(?s)(=====+\\s*)?(L\\s*e\\s*s\\sdonnées\\s*administratives,\\s*sociales\\s*|I?nfo\\s*rmation\\s*aux?\\s*patients?|L[’']AP-HP\\s*collecte\\s*vos\\s*données\\s*administratives|L[’']Assistance\\s*Publique\\s*-\\s*Hôpitaux\\s*de\\s*Paris\\s*\\(?AP-HP\\)?\\s*a\\s*créé\\s*une\\s*base\\s*de\\s*données).{,2000}https?:\\/\\/recherche\\.aphp\\.fr\\/eds\\/droit-opposition[\\s\\.]*" module-attribute

bars = '(?i)([nbw]|_|-|=){5,}' module-attribute

pollution = dict(information=information, bars=bars) module-attribute

Back to top