Skip to content

edsnlp.pipelines.core.normalizer.pollution

pollution

Pollution

Bases: BaseComponent

Tags pollution tokens.

Populates a number of spaCy extensions :

  • Token._.pollution : indicates whether the token is a pollution
  • Doc._.clean : lists non-pollution tokens
  • Doc._.clean_ : original text with pollutions removed.
  • Doc._.char_clean_span : method to create a Span using character indices extracted using the cleaned text.
PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

pollution

Dictionary containing regular expressions of pollution.

TYPE: Dict[str, Union[str, List[str]]]

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
class Pollution(BaseComponent):
    """
    Tags pollution tokens.

    Populates a number of spaCy extensions :

    - `Token._.pollution` : indicates whether the token is a pollution
    - `Doc._.clean` : lists non-pollution tokens
    - `Doc._.clean_` : original text with pollutions removed.
    - `Doc._.char_clean_span` : method to create a Span using character
      indices extracted using the cleaned text.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Dict[str, Union[str, List[str]]]
        Dictionary containing regular expressions of pollution.
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        pollution: Optional[Dict[str, Union[str, List[str]]]],
    ):

        self.nlp = nlp
        # Register the tag value used to mark excluded (pollution) tokens.
        self.nlp.vocab.strings.add("EXCLUDED")

        if pollution is None:
            pollution = patterns.pollution

        # Build a *fresh* dict, normalising bare strings to one-element lists.
        # The previous implementation mutated ``pollution`` in place, which
        # silently altered the module-level default ``patterns.pollution``
        # (and any caller-supplied mapping) for every subsequent instance.
        self.pollution = {
            key: [value] if isinstance(value, str) else value
            for key, value in pollution.items()
        }

        self.regex_matcher = RegexMatcher()
        self.build_patterns()

    def build_patterns(self) -> None:
        """
        Builds the patterns for phrase matching.
        """

        # efficiently build spaCy matcher patterns
        for k, v in self.pollution.items():
            self.regex_matcher.add(k, v)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find pollution spans in the document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        pollution:
            list of pollution spans
        """

        pollutions = self.regex_matcher(doc, as_spans=True)
        # Keep only the longest, non-overlapping matches.
        pollutions = filter_spans(pollutions)

        return pollutions

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags pollutions.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for pollutions.
        """
        excluded_hash = doc.vocab.strings["EXCLUDED"]
        pollutions = self.process(doc)

        # Mark every token inside a pollution span as excluded.
        for pollution in pollutions:

            for token in pollution:
                token._.excluded = True
                token.tag = excluded_hash

        doc.spans["pollutions"] = pollutions

        return doc
nlp = nlp instance-attribute
pollution = pollution instance-attribute
regex_matcher = RegexMatcher() instance-attribute
__init__(nlp, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __init__(
    self,
    nlp: Language,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    """
    Initialise the pollution component.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Optional[Dict[str, Union[str, List[str]]]]
        Dictionary of pollution regular expressions; falls back to the
        module-level defaults when ``None``.
    """

    self.nlp = nlp
    # Register the tag value used to mark excluded (pollution) tokens.
    self.nlp.vocab.strings.add("EXCLUDED")

    if pollution is None:
        pollution = patterns.pollution

    # Build a *fresh* dict, normalising bare strings to one-element lists.
    # Mutating ``pollution`` in place (as the previous code did) silently
    # altered the module-level default ``patterns.pollution`` and any
    # caller-owned mapping for every subsequent instance.
    self.pollution = {
        key: [value] if isinstance(value, str) else value
        for key, value in pollution.items()
    }

    self.regex_matcher = RegexMatcher()
    self.build_patterns()
build_patterns()

Builds the patterns for phrase matching.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
55
56
57
58
59
60
61
62
def build_patterns(self) -> None:
    """Register every pollution pattern on the regex matcher."""
    for label, expressions in self.pollution.items():
        self.regex_matcher.add(label, expressions)
process(doc)

Find pollution spans in the document

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
pollution

list of pollution spans

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def process(self, doc: Doc) -> List[Span]:
    """
    Find pollution spans in the document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    pollution:
        list of pollution spans
    """
    matches = self.regex_matcher(doc, as_spans=True)
    # Keep only the longest, non-overlapping matches.
    return filter_spans(matches)
__call__(doc)

Tags pollutions.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for pollutions.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def __call__(self, doc: Doc) -> Doc:
    """
    Tags pollutions.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for pollutions.
    """
    tag_hash = doc.vocab.strings["EXCLUDED"]
    spans = self.process(doc)

    # Mark every token inside a pollution span as excluded.
    for span in spans:
        for token in span:
            token._.excluded = True
            token.tag = tag_hash

    doc.spans["pollutions"] = spans

    return doc

factory

DEFAULT_CONFIG = dict(pollution=None) module-attribute

create_component(nlp, name, pollution)

Source code in edsnlp/pipelines/core/normalizer/pollution/factory.py
14
15
16
17
18
19
20
21
22
23
24
@deprecated_factory("pollution", "eds.pollution", default_config=DEFAULT_CONFIG)
@Language.factory("eds.pollution", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    """
    Build a `Pollution` component.

    Registered under both the namespaced ``eds.pollution`` factory name and
    the legacy ``pollution`` name (kept for backward compatibility via
    ``deprecated_factory``); both produce the same component.

    Parameters
    ----------
    nlp : Language
        Language pipeline object.
    name : str
        Name of the component instance in the pipeline (unused here but
        required by the spaCy factory protocol).
    pollution : Optional[Dict[str, Union[str, List[str]]]]
        Dictionary of pollution regular expressions; ``None`` selects the
        component's built-in defaults.
    """
    return Pollution(
        nlp,
        pollution=pollution,
    )

patterns

information = "(?s)(=====+\\s*)?(L\\s*e\\s*s\\sdonnées\\s*administratives,\\s*sociales\\s*|I?nfo\\s*rmation\\s*aux?\\s*patients?|L[’']AP-HP\\s*collecte\\s*vos\\s*données\\s*administratives|L[’']Assistance\\s*Publique\\s*-\\s*Hôpitaux\\s*de\\s*Paris\\s*\\(?AP-HP\\)?\\s*a\\s*créé\\s*une\\s*base\\s*de\\s*données).{,2000}https?:\\/\\/recherche\\.aphp\\.fr\\/eds\\/droit-opposition[\\s\\.]*" module-attribute

bars = '(?i)([nbw]|_|-|=){5,}' module-attribute

pollution = dict(information=information, bars=bars) module-attribute

Back to top