Skip to content

edsnlp.pipelines.core.normalizer

factory

DEFAULT_CONFIG = dict(accents=True, lowercase=True, quotes=True, pollution=True) module-attribute

create_component(nlp, name, accents, lowercase, quotes, pollution)

Source code in edsnlp/pipelines/core/normalizer/factory.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@deprecated_factory("normalizer", "eds.normalizer", default_config=DEFAULT_CONFIG)
@Language.factory("eds.normalizer", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    accents: Union[bool, Dict[str, Any]],
    lowercase: Union[bool, Dict[str, Any]],
    quotes: Union[bool, Dict[str, Any]],
    pollution: Union[bool, Dict[str, Any]],
):

    if accents:
        config = dict(**accents_config)
        if isinstance(accents, dict):
            config.update(accents)
        accents = registry.get("factories", "eds.accents")(nlp, "eds.accents", **config)

    if quotes:
        config = dict(**quotes_config)
        if isinstance(quotes, dict):
            config.update(quotes)
        quotes = registry.get("factories", "eds.quotes")(nlp, "eds.quotes", **config)

    if pollution:
        config = dict(**pollution_config)
        if isinstance(pollution, dict):
            config.update(pollution)
        pollution = registry.get("factories", "eds.pollution")(
            nlp, "eds.pollution", **config
        )

    normalizer = Normalizer(
        lowercase=lowercase,
        accents=accents or None,
        quotes=quotes or None,
        pollution=pollution or None,
    )

    return normalizer

normalizer

Normalizer

Bases: object

Normalisation pipeline. Modifies the NORM attribute, acting on four dimensions :

  • lowercase: using the default NORM
  • accents: deterministic and fixed-length normalisation of accents.
  • quotes: deterministic and fixed-length normalisation of quotation marks.
  • pollution: removal of pollutions.
PARAMETER DESCRIPTION
lowercase

Whether to remove case.

TYPE: bool

accents

Optional Accents object.

TYPE: Optional[Accents]

quotes

Optional Quotes object.

TYPE: Optional[Quotes]

pollution

Optional Pollution object.

TYPE: Optional[Pollution]

Source code in edsnlp/pipelines/core/normalizer/normalizer.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class Normalizer(object):
    """
    Normalisation pipeline component. Rewrites the `NORM` attribute
    along four axes :

    - `lowercase`: keep spaCy's default lowercased `NORM`
    - `accents`: deterministic and fixed-length normalisation of accents.
    - `quotes`: deterministic and fixed-length normalisation of quotation marks.
    - `pollution`: removal of pollutions.

    Parameters
    ----------
    lowercase : bool
        Whether to remove case.
    accents : Optional[Accents]
        Optional `Accents` object.
    quotes : Optional[Quotes]
        Optional `Quotes` object.
    pollution : Optional[Pollution]
        Optional `Pollution` object.
    """

    def __init__(
        self,
        lowercase: bool,
        accents: Optional[Accents],
        quotes: Optional[Quotes],
        pollution: Optional[Pollution],
    ):
        # Each sub-component may be None, meaning "step disabled".
        self.lowercase = lowercase
        self.accents = accents
        self.quotes = quotes
        self.pollution = pollution

    def __call__(self, doc: Doc) -> Doc:
        """
        Run every enabled normalisation step on the document, in order.

        Parameters
        ----------
        doc : Doc
            spaCy `Doc` object

        Returns
        -------
        Doc
            Doc object with `NORM` attribute modified
        """
        if not self.lowercase:
            # Lowercasing is opt-out: restore the original casing in NORM.
            remove_lowercase(doc)

        for step in (self.accents, self.quotes, self.pollution):
            if step is not None:
                step(doc)

        return doc
lowercase = lowercase instance-attribute
accents = accents instance-attribute
quotes = quotes instance-attribute
pollution = pollution instance-attribute
__init__(lowercase, accents, quotes, pollution)
Source code in edsnlp/pipelines/core/normalizer/normalizer.py
33
34
35
36
37
38
39
40
41
42
43
def __init__(
    self,
    lowercase: bool,
    accents: Optional[Accents],
    quotes: Optional[Quotes],
    pollution: Optional[Pollution],
):
    """Store the case flag and the optional sub-components (None = disabled)."""
    self.lowercase, self.accents = lowercase, accents
    self.quotes, self.pollution = quotes, pollution
__call__(doc)

Apply the normalisation pipeline, one component at a time.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
Doc

Doc object with NORM attribute modified

Source code in edsnlp/pipelines/core/normalizer/normalizer.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def __call__(self, doc: Doc) -> Doc:
    """
    Run every enabled normalisation step on the document, in order.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object

    Returns
    -------
    Doc
        Doc object with `NORM` attribute modified
    """
    if not self.lowercase:
        # Lowercasing is opt-out: restore the original casing in NORM.
        remove_lowercase(doc)

    for step in (self.accents, self.quotes, self.pollution):
        if step is not None:
            step(doc)

    return doc

pollution

pollution

Pollution

Bases: BaseComponent

Tags pollution tokens.

Populates a number of spaCy extensions :

  • Token._.pollution : indicates whether the token is a pollution
  • Doc._.clean : lists non-pollution tokens
  • Doc._.clean_ : original text with pollutions removed.
  • Doc._.char_clean_span : method to create a Span using character indices extracted using the cleaned text.
PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

pollution

Dictionary containing regular expressions of pollution.

TYPE: Dict[str, Union[str, List[str]]]

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
class Pollution(BaseComponent):
    """
    Tags pollution tokens.

    Populates a number of spaCy extensions :

    - `Token._.pollution` : indicates whether the token is a pollution
    - `Doc._.clean` : lists non-pollution tokens
    - `Doc._.clean_` : original text with pollutions removed.
    - `Doc._.char_clean_span` : method to create a Span using character
      indices extracted using the cleaned text.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Dict[str, Union[str, List[str]]]
        Dictionary containing regular expressions of pollution.
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        pollution: Optional[Dict[str, Union[str, List[str]]]],
    ):
        self.nlp = nlp
        # Make sure the "EXCLUDED" tag is interned in the string store.
        self.nlp.vocab.strings.add("EXCLUDED")

        # Fall back on the default patterns when none are provided.
        self.pollution = patterns.pollution if pollution is None else pollution

        # Normalise in place: every label maps to a *list* of regexes.
        for label, expressions in self.pollution.items():
            if isinstance(expressions, str):
                self.pollution[label] = [expressions]

        self.regex_matcher = RegexMatcher()
        self.build_patterns()

    def build_patterns(self) -> None:
        """
        Register every pollution regex with the matcher, keyed by label.
        """
        for label, expressions in self.pollution.items():
            self.regex_matcher.add(label, expressions)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find pollution spans in the document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        pollution:
            list of pollution spans, with overlaps resolved
        """
        candidates = self.regex_matcher(doc, as_spans=True)
        return filter_spans(candidates)

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags pollutions.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for pollutions.
        """
        excluded_hash = doc.vocab.strings["EXCLUDED"]
        spans = self.process(doc)

        for span in spans:
            for token in span:
                # Flag the token as excluded and re-tag it with EXCLUDED.
                token._.excluded = True
                token.tag = excluded_hash

        doc.spans["pollutions"] = spans

        return doc
nlp = nlp instance-attribute
pollution = pollution instance-attribute
regex_matcher = RegexMatcher() instance-attribute
__init__(nlp, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __init__(
    self,
    nlp: Language,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    """Store patterns (defaults if None), normalise them, and build the matcher."""
    self.nlp = nlp
    # Make sure the "EXCLUDED" tag is interned in the string store.
    self.nlp.vocab.strings.add("EXCLUDED")

    # Fall back on the default patterns when none are provided.
    self.pollution = patterns.pollution if pollution is None else pollution

    # Normalise in place: every label maps to a *list* of regexes.
    for label, expressions in self.pollution.items():
        if isinstance(expressions, str):
            self.pollution[label] = [expressions]

    self.regex_matcher = RegexMatcher()
    self.build_patterns()
build_patterns()

Builds the patterns for phrase matching.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
55
56
57
58
59
60
61
62
def build_patterns(self) -> None:
    """
    Register every pollution regex with the matcher, keyed by label.
    """
    for label, expressions in self.pollution.items():
        self.regex_matcher.add(label, expressions)
process(doc)

Find pollution spans in the document, resolving overlapping matches.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
pollution

list of pollution spans

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def process(self, doc: Doc) -> List[Span]:
    """
    Find pollution spans in the document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    pollution:
        list of pollution spans, with overlaps resolved
    """
    candidates = self.regex_matcher(doc, as_spans=True)
    return filter_spans(candidates)
__call__(doc)

Tags pollutions.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for pollutions.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def __call__(self, doc: Doc) -> Doc:
    """
    Tags pollutions.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for pollutions.
    """
    excluded_hash = doc.vocab.strings["EXCLUDED"]
    spans = self.process(doc)

    for span in spans:
        for token in span:
            # Flag the token as excluded and re-tag it with EXCLUDED.
            token._.excluded = True
            token.tag = excluded_hash

    doc.spans["pollutions"] = spans

    return doc

factory

DEFAULT_CONFIG = dict(pollution=None) module-attribute
create_component(nlp, name, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/factory.py
14
15
16
17
18
19
20
21
22
23
24
@deprecated_factory("pollution", "eds.pollution", default_config=DEFAULT_CONFIG)
@Language.factory("eds.pollution", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    return Pollution(
        nlp,
        pollution=pollution,
    )

patterns

information = "(?s)(=====+\\s*)?(L\\s*e\\s*s\\sdonnées\\s*administratives,\\s*sociales\\s*|I?nfo\\s*rmation\\s*aux?\\s*patients?|L[’']AP-HP\\s*collecte\\s*vos\\s*données\\s*administratives|L[’']Assistance\\s*Publique\\s*-\\s*Hôpitaux\\s*de\\s*Paris\\s*\\(?AP-HP\\)?\\s*a\\s*créé\\s*une\\s*base\\s*de\\s*données).{,2000}https?:\\/\\/recherche\\.aphp\\.fr\\/eds\\/droit-opposition[\\s\\.]*" module-attribute
bars = '(?i)([nbw]|_|-|=){5,}' module-attribute
pollution = dict(information=information, bars=bars) module-attribute

lowercase

factory

remove_lowercase(doc)

Restore the original case in the NORM custom attribute. Should always be applied first.

PARAMETER DESCRIPTION
doc

The spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
Doc

The document, with case put back in NORM.

Source code in edsnlp/pipelines/core/normalizer/lowercase/factory.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
@Language.component("remove-lowercase")
@Language.component("eds.remove-lowercase")
def remove_lowercase(doc: Doc):
    """
    Add case on the `NORM` custom attribute. Should always be applied first.

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object.

    Returns
    -------
    Doc
        The document, with case put back in `NORM`.
    """

    for token in doc:
        token.norm_ = token.text

    return doc

quotes

quotes

Quotes

Bases: object

We normalise quotes, following this source: https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html.

PARAMETER DESCRIPTION
quotes

List of quotation characters and their transcription.

TYPE: List[Tuple[str, str]]

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class Quotes(object):
    """
    Replace every quotation-mark and apostrophe variant by a plain ASCII
    equivalent, following this
    `source <https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html>`_.

    Parameters
    ----------
    quotes : List[Tuple[str, str]]
        List of quotation characters and their transcription.
    """

    def __init__(self, quotes: Optional[List[Tuple[str, str]]]) -> None:
        if quotes is None:
            quotes = quotes_and_apostrophes

        # Build a single char -> char table: every character of each group
        # maps to that group's one-character replacement.
        sources = "".join(group for group, _ in quotes)
        targets = "".join(rep * len(group) for group, rep in quotes)
        self.translation_table = str.maketrans(sources, targets)

    def __call__(self, doc: Doc) -> Doc:
        """
        Normalises quotes.

        Parameters
        ----------
        doc : Doc
            Document to process.

        Returns
        -------
        Doc
            Same document, with quotes normalised.
        """
        table = self.translation_table
        for tok in doc:
            # One C-level pass per token over the NORM attribute.
            tok.norm_ = tok.norm_.translate(table)

        return doc
translation_table = str.maketrans(''.join(quote_group for (quote_group, _) in quotes), ''.join(rep * len(quote_group) for (quote_group, rep) in quotes)) instance-attribute
__init__(quotes)
Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
19
20
21
22
23
24
25
26
def __init__(self, quotes: Optional[List[Tuple[str, str]]]) -> None:
    """Build the char-level translation table (default list when None)."""
    if quotes is None:
        quotes = quotes_and_apostrophes

    # Every character of each group maps to the group's single replacement.
    sources = "".join(group for group, _ in quotes)
    targets = "".join(rep * len(group) for group, rep in quotes)
    self.translation_table = str.maketrans(sources, targets)
__call__(doc)

Normalises quotes.

PARAMETER DESCRIPTION
doc

Document to process.

TYPE: Doc

RETURNS DESCRIPTION
Doc

Same document, with quotes normalised.

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def __call__(self, doc: Doc) -> Doc:
    """
    Normalises quotes.

    Parameters
    ----------
    doc : Doc
        Document to process.

    Returns
    -------
    Doc
        Same document, with quotes normalised.
    """
    table = self.translation_table
    for tok in doc:
        # One C-level pass per token over the NORM attribute.
        tok.norm_ = tok.norm_.translate(table)

    return doc

factory

DEFAULT_CONFIG = dict(quotes=None) module-attribute
create_component(nlp, name, quotes)
Source code in edsnlp/pipelines/core/normalizer/quotes/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("quotes", "eds.quotes", default_config=DEFAULT_CONFIG)
@Language.factory("eds.quotes", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    quotes: Optional[List[Tuple[str, str]]],
):
    return Quotes(
        quotes=quotes,
    )

patterns

quotes: List[str] = ['"', '〃', 'ײ', '᳓', '″', '״', '‶', '˶', 'ʺ', '“', '”', '˝', '‟'] module-attribute
apostrophes: List[str] = ['`', '΄', ''', 'ˈ', 'ˊ', 'ᑊ', 'ˋ', 'ꞌ', 'ᛌ', '𖽒', '𖽑', '‘', '’', 'י', '՚', '‛', '՝', '`', '`', '′', '׳', '´', 'ʹ', '˴', 'ߴ', '‵', 'ߵ', 'ʹ', 'ʻ', 'ʼ', '´', '᾽', 'ʽ', '῾', 'ʾ', '᾿'] module-attribute
quotes_and_apostrophes: List[Tuple[str, str]] = [(''.join(quotes), '"'), (''.join(apostrophes), "'")] module-attribute

accents

accents

Accents

Bases: object

Normalises accents, using a same-length strategy.

PARAMETER DESCRIPTION
accents

List of accentuated characters and their transcription.

TYPE: List[Tuple[str, str]]

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class Accents(object):
    """
    Strip accents from the `NORM` attribute using a same-length
    (character-for-character) substitution strategy.

    Parameters
    ----------
    accents : List[Tuple[str, str]]
        List of accentuated characters and their transcription.
    """

    def __init__(self, accents: Optional[List[Tuple[str, str]]]) -> None:
        if accents is None:
            accents = patterns.accents

        # Build a single char -> char table: every character of each group
        # maps to that group's one-character replacement.
        sources = "".join(group for group, _ in accents)
        targets = "".join(rep * len(group) for group, rep in accents)
        self.translation_table = str.maketrans(sources, targets)

    def __call__(self, doc: Doc) -> Doc:
        """
        Remove accents from spacy `NORM` attribute.

        Parameters
        ----------
        doc : Doc
            The spaCy `Doc` object.

        Returns
        -------
        Doc
            The document, with accents removed in `Token.norm_`.
        """
        table = self.translation_table
        for tok in doc:
            # One C-level pass per token over the NORM attribute.
            tok.norm_ = tok.norm_.translate(table)

        return doc
translation_table = str.maketrans(''.join(accent_group for (accent_group, _) in accents), ''.join(rep * len(accent_group) for (accent_group, rep) in accents)) instance-attribute
__init__(accents)
Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
18
19
20
21
22
23
24
25
def __init__(self, accents: Optional[List[Tuple[str, str]]]) -> None:
    """Build the char-level translation table (default patterns when None)."""
    if accents is None:
        accents = patterns.accents

    # Every character of each group maps to the group's single replacement.
    sources = "".join(group for group, _ in accents)
    targets = "".join(rep * len(group) for group, rep in accents)
    self.translation_table = str.maketrans(sources, targets)
__call__(doc)

Remove accents from spacy NORM attribute.

PARAMETER DESCRIPTION
doc

The spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
Doc

The document, with accents removed in Token.norm_.

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __call__(self, doc: Doc) -> Doc:
    """
    Remove accents from spacy `NORM` attribute.

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object.

    Returns
    -------
    Doc
        The document, with accents removed in `Token.norm_`.
    """
    table = self.translation_table
    for tok in doc:
        # One C-level pass per token over the NORM attribute.
        tok.norm_ = tok.norm_.translate(table)

    return doc

factory

DEFAULT_CONFIG = dict(accents=None) module-attribute
create_component(nlp, name, accents)
Source code in edsnlp/pipelines/core/normalizer/accents/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("accents", "eds.accents", default_config=DEFAULT_CONFIG)
@Language.factory("eds.accents", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    accents: Optional[List[Tuple[str, str]]],
):
    return Accents(
        accents=accents,
    )

patterns

accents: List[Tuple[str, str]] = [('ç', 'c'), ('àáâä', 'a'), ('èéêë', 'e'), ('ìíîï', 'i'), ('òóôö', 'o'), ('ùúûü', 'u')] module-attribute
Back to top