Skip to content

edsnlp.pipelines.core.normalizer.factory

DEFAULT_CONFIG = dict(accents=True, lowercase=True, quotes=True, pollution=True) module-attribute

create_component(nlp, name, accents, lowercase, quotes, pollution)

Source code in edsnlp/pipelines/core/normalizer/factory.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
@deprecated_factory(
    "normalizer",
    "eds.normalizer",
    default_config=DEFAULT_CONFIG,
    assigns=["token.norm", "token.tag"],
)
@Language.factory(
    "eds.normalizer", default_config=DEFAULT_CONFIG, assigns=["token.norm", "token.tag"]
)
def create_component(
    nlp: Language,
    name: str,
    accents: Union[bool, Dict[str, Any]],
    lowercase: Union[bool, Dict[str, Any]],
    quotes: Union[bool, Dict[str, Any]],
    pollution: Union[bool, Dict[str, Any]],
):

    if accents:
        config = dict(**accents_config)
        if isinstance(accents, dict):
            config.update(accents)
        accents = registry.get("factories", "eds.accents")(nlp, "eds.accents", **config)

    if quotes:
        config = dict(**quotes_config)
        if isinstance(quotes, dict):
            config.update(quotes)
        quotes = registry.get("factories", "eds.quotes")(nlp, "eds.quotes", **config)

    if pollution:
        config = dict(**pollution_config["pollution"])
        if isinstance(pollution, dict):
            config.update(pollution)
        pollution = registry.get("factories", "eds.pollution")(
            nlp, "eds.pollution", pollution=config
        )

    normalizer = Normalizer(
        lowercase=lowercase,
        accents=accents or None,
        quotes=quotes or None,
        pollution=pollution or None,
    )

    return normalizer