Skip to content

edsnlp.pipelines.core.normalizer.factory

create_component(nlp, name='eds.normalizer', accents=True, lowercase=True, quotes=True, spaces=True, pollution=True)

Normalisation pipeline. Modifies the NORM attribute, acting on five dimensions:

  • lowercase: using the default NORM
  • accents: deterministic and fixed-length normalisation of accents.
  • quotes: deterministic and fixed-length normalisation of quotation marks.
  • spaces: "removal" of space tokens (via the tag_ attribute).
  • pollution: "removal" of pollution tokens (via the tag_ attribute).
PARAMETER DESCRIPTION
lowercase

Whether to remove case.

TYPE: bool DEFAULT: True

accents

Accents configuration object

TYPE: Union[bool, Dict[str, Any]] DEFAULT: True

quotes

Quotes configuration object

TYPE: Union[bool, Dict[str, Any]] DEFAULT: True

spaces

Spaces configuration object

TYPE: Union[bool, Dict[str, Any]] DEFAULT: True

pollution

Optional Pollution configuration object.

TYPE: Union[bool, Dict[str, Any]] DEFAULT: True

Source code in edsnlp/pipelines/core/normalizer/factory.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@deprecated_factory(
    "normalizer",
    "eds.normalizer",
    default_config=DEFAULT_CONFIG,
    assigns=["token.norm", "token.tag"],
)
@Language.factory(
    "eds.normalizer", default_config=DEFAULT_CONFIG, assigns=["token.norm", "token.tag"]
)
def create_component(
    nlp: Language,
    name: str = "eds.normalizer",
    accents: Union[bool, Dict[str, Any]] = True,
    lowercase: Union[bool, Dict[str, Any]] = True,
    quotes: Union[bool, Dict[str, Any]] = True,
    spaces: Union[bool, Dict[str, Any]] = True,
    pollution: Union[bool, Dict[str, Any]] = True,
) -> Normalizer:
    """
    Build the `eds.normalizer` component.

    The normalisation pipeline modifies the `NORM` attribute along
    five dimensions:

    - `lowercase`: using the default `NORM`
    - `accents`: deterministic and fixed-length normalisation of accents.
    - `quotes`: deterministic and fixed-length normalisation of quotation marks.
    - `spaces`: "removal" of space tokens (via the `tag_` attribute).
    - `pollution`: "removal" of pollution tokens (via the `tag_` attribute).

    For `accents`, `quotes`, `spaces` and `pollution`, any truthy value
    enables the sub-component. When the value is a dictionary, its entries
    are laid over the module-level defaults before the sub-component is
    instantiated through the `factories` registry. A falsy value (including
    an empty dictionary) disables the sub-component, and `None` is handed
    to `Normalizer` in its place.

    Parameters
    ----------
    nlp : Language
        The pipeline object, forwarded to each sub-component factory.
    name : str
        Name of the component. It is not used inside this function.
    accents : Union[bool, Dict[str, Any]]
        `Accents` configuration object
    lowercase : Union[bool, Dict[str, Any]]
        Whether to remove case. Passed to `Normalizer` unchanged.
    quotes : Union[bool, Dict[str, Any]]
        `Quotes` configuration object
    spaces : Union[bool, Dict[str, Any]]
        `Spaces` configuration object
    pollution : Union[bool, Dict[str, Any]]
        Optional `Pollution` configuration object.

    Returns
    -------
    Normalizer
        The assembled normalizer.
    """

    def _overlay(defaults, override):
        # Copy the defaults so the shared module-level mapping is never
        # mutated, then apply the user's overrides if a dict was given.
        merged = dict(**defaults)
        if isinstance(override, dict):
            merged.update(override)
        return merged

    if accents:
        accents_kwargs = _overlay(accents_config, accents)
        make_accents = registry.get("factories", "eds.accents")
        accents = make_accents(nlp, "eds.accents", **accents_kwargs)

    if quotes:
        quotes_kwargs = _overlay(quotes_config, quotes)
        make_quotes = registry.get("factories", "eds.quotes")
        quotes = make_quotes(nlp, "eds.quotes", **quotes_kwargs)

    if spaces:
        spaces_kwargs = _overlay(spaces_config, spaces)
        make_spaces = registry.get("factories", "eds.spaces")
        spaces = make_spaces(nlp, "eds.spaces", **spaces_kwargs)

    if pollution:
        # Unlike the others, the pollution factory takes its settings as a
        # single `pollution` keyword rather than as unpacked keyword arguments.
        pollution_settings = _overlay(pollution_config["pollution"], pollution)
        make_pollution = registry.get("factories", "eds.pollution")
        pollution = make_pollution(nlp, "eds.pollution", pollution=pollution_settings)

    # Disabled sub-components are still falsy here; normalise them to None.
    return Normalizer(
        lowercase=lowercase,
        accents=accents if accents else None,
        quotes=quotes if quotes else None,
        pollution=pollution if pollution else None,
        spaces=spaces if spaces else None,
    )