Skip to content

edsnlp.pipelines.core.normalizer.accents

patterns

accents: List[Tuple[str, str]] = [('ç', 'c'), ('àáâä', 'a'), ('èéêë', 'e'), ('ìíîï', 'i'), ('òóôö', 'o'), ('ùúûü', 'u')] module-attribute

accents

Accents

Bases: object

Normalises accents, using a same-length strategy.

PARAMETER DESCRIPTION
accents

List of accented character groups, each paired with its replacement character.

TYPE: List[Tuple[str, str]]

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class Accents:
    """
    Normalises accents, using a same-length strategy.

    Every accented character is replaced by exactly one character, so the
    normalised text keeps the length of the original text.

    Parameters
    ----------
    accents : Optional[List[Tuple[str, str]]]
        List of accented character groups and their replacement, e.g.
        ``("èéêë", "e")``. Each replacement must be a single character.
        If ``None`` (the default), ``patterns.accents`` is used.

    Raises
    ------
    ValueError
        If a non-empty group is paired with a replacement that is not
        exactly one character long.
    """

    def __init__(self, accents: Optional[List[Tuple[str, str]]] = None) -> None:
        if accents is None:
            accents = patterns.accents

        # Validate each pair on its own: `str.maketrans` only compares the
        # *total* lengths of its two arguments, so two malformed pairs could
        # cancel out and silently produce a shifted mapping.
        for accent_group, rep in accents:
            if accent_group and len(rep) != 1:
                raise ValueError(
                    f"Replacement for accents {accent_group!r} must be a single "
                    f"character to preserve text length, got {rep!r}."
                )

        # The replacement is repeated once per character of its group so that
        # both arguments line up position by position.
        self.translation_table = str.maketrans(
            "".join(accent_group for accent_group, _ in accents),
            "".join(rep * len(accent_group) for accent_group, rep in accents),
        )

    def __call__(self, doc: Doc) -> Doc:
        """
        Remove accents from spacy `NORM` attribute.

        Parameters
        ----------
        doc : Doc
            The spaCy `Doc` object.

        Returns
        -------
        Doc
            The document, with accents removed in `Token.norm_`.
        """

        for token in doc:
            token.norm_ = token.norm_.translate(self.translation_table)

        return doc
translation_table = str.maketrans(''.join(accent_group for (accent_group, _) in accents), ''.join(rep * len(accent_group) for (accent_group, rep) in accents)) instance-attribute
__init__(accents)
Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
18
19
20
21
22
23
24
25
def __init__(self, accents: Optional[List[Tuple[str, str]]] = None) -> None:
    """
    Build the translation table used to strip accents.

    Parameters
    ----------
    accents : Optional[List[Tuple[str, str]]]
        List of accented character groups and their replacement. Each
        replacement must be a single character. If ``None`` (the default),
        ``patterns.accents`` is used.

    Raises
    ------
    ValueError
        If a non-empty group is paired with a replacement that is not
        exactly one character long.
    """
    if accents is None:
        accents = patterns.accents

    # Validate each pair on its own: `str.maketrans` only compares the
    # *total* lengths of its two arguments, so two malformed pairs could
    # cancel out and silently produce a shifted mapping.
    for accent_group, rep in accents:
        if accent_group and len(rep) != 1:
            raise ValueError(
                f"Replacement for accents {accent_group!r} must be a single "
                f"character to preserve text length, got {rep!r}."
            )

    # The replacement is repeated once per character of its group so that
    # both arguments line up position by position.
    self.translation_table = str.maketrans(
        "".join(accent_group for accent_group, _ in accents),
        "".join(rep * len(accent_group) for accent_group, rep in accents),
    )
__call__(doc)

Remove accents from the spaCy NORM attribute of each token.

PARAMETER DESCRIPTION
doc

The spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
Doc

The document, with accents removed in Token.norm_.

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __call__(self, doc: Doc) -> Doc:
    """
    Strip accents from the `NORM` attribute of every token.

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object to normalise.

    Returns
    -------
    Doc
        The same document, with `Token.norm_` passed through the
        translation table.
    """

    for tok in doc:
        # Character-for-character translation of the current norm.
        unaccented = tok.norm_.translate(self.translation_table)
        tok.norm_ = unaccented

    return doc

factory

DEFAULT_CONFIG = dict(accents=None) module-attribute

create_component(nlp, name, accents)

Source code in edsnlp/pipelines/core/normalizer/accents/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("accents", "eds.accents", default_config=DEFAULT_CONFIG)
@Language.factory("eds.accents", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    accents: Optional[List[Tuple[str, str]]],
):
    """
    Create the `eds.accents` pipeline component.

    Parameters
    ----------
    nlp : Language
        The pipeline object; not used by this factory.
    name : str
        The component name; not used by this factory.
    accents : Optional[List[Tuple[str, str]]]
        Forwarded unchanged to `Accents`.

    Returns
    -------
    Accents
        The accent-normalising component.
    """
    # NOTE(review): "accents" looks like a deprecated alias for
    # "eds.accents" - confirm against `deprecated_factory`.
    component = Accents(accents=accents)
    return component
Back to top