Skip to content

edsnlp.pipelines.core.normalizer.quotes

quotes

Quotes

Bases: object

We normalise quotes, following this [source](https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html).

PARAMETER DESCRIPTION
quotes

List of quotation characters and their transcription. If None, the default `quotes_and_apostrophes` patterns are used.

TYPE: Optional[List[Tuple[str, str]]]

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class Quotes(object):
    """
    Pipeline component that normalises quotation marks and apostrophes.

    Every character listed in ``quotes`` is replaced by its transcription in
    the ``norm_`` attribute of each token. The character lists follow this
    `source <https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html>`_.

    Parameters
    ----------
    quotes : Optional[List[Tuple[str, str]]]
        List of ``(characters, transcription)`` pairs: every character of the
        first string is replaced by the second string. A transcription may
        have any length (an empty one removes the characters). If ``None``,
        the module-level ``quotes_and_apostrophes`` default is used.
    """

    def __init__(self, quotes: Optional[List[Tuple[str, str]]] = None) -> None:
        if quotes is None:
            quotes = quotes_and_apostrophes

        # Build the table one character at a time instead of pairing two
        # equal-length strings: the two-string form of ``str.maketrans``
        # raises ``ValueError`` as soon as a transcription is empty or longer
        # than one character. Single-character transcriptions are still
        # stored as code points, so the table is unchanged for them.
        self.translation_table = {
            ord(char): ord(rep) if len(rep) == 1 else rep
            for quote_group, rep in quotes
            for char in quote_group
        }

    def __call__(self, doc: Doc) -> Doc:
        """
        Normalises quotes.

        Parameters
        ----------
        doc : Doc
            Document to process.

        Returns
        -------
        Doc
            Same document, with the ``norm_`` of every token translated
            through ``self.translation_table``.
        """

        table = self.translation_table

        # Only ``norm_`` is rewritten; the document object itself is returned.
        for token in doc:
            token.norm_ = token.norm_.translate(table)

        return doc
translation_table = str.maketrans(''.join(quote_group for (quote_group, _) in quotes), ''.join(rep * len(quote_group) for (quote_group, rep) in quotes)) instance-attribute
__init__(quotes)
Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
19
20
21
22
23
24
25
26
def __init__(self, quotes: Optional[List[Tuple[str, str]]] = None) -> None:
    """
    Build the translation table used to normalise quotes.

    Parameters
    ----------
    quotes : Optional[List[Tuple[str, str]]]
        List of ``(characters, transcription)`` pairs: every character of the
        first string is replaced by the second string. A transcription may
        have any length (an empty one removes the characters). If ``None``,
        the module-level ``quotes_and_apostrophes`` default is used.
    """
    if quotes is None:
        quotes = quotes_and_apostrophes

    # Build the table one character at a time instead of pairing two
    # equal-length strings: the two-string form of ``str.maketrans`` raises
    # ``ValueError`` as soon as a transcription is empty or longer than one
    # character. Single-character transcriptions are still stored as code
    # points, so the table is unchanged for them.
    self.translation_table = {
        ord(char): ord(rep) if len(rep) == 1 else rep
        for quote_group, rep in quotes
        for char in quote_group
    }
__call__(doc)

Normalises quotes.

PARAMETER DESCRIPTION
doc

Document to process.

TYPE: Doc

RETURNS DESCRIPTION
Doc

Same document, with quotes normalised.

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def __call__(self, doc: Doc) -> Doc:
    """
    Normalises quotes.

    Parameters
    ----------
    doc : Doc
        Document to process.

    Returns
    -------
    Doc
        Same document, with the ``norm_`` of every token translated
        through ``self.translation_table``.
    """

    table = self.translation_table

    # Only ``norm_`` is rewritten; the document object itself is returned.
    for token in doc:
        token.norm_ = token.norm_.translate(table)

    return doc

patterns

quotes: List[str] = ['"', '〃', 'ײ', '᳓', '″', '״', '‶', '˶', 'ʺ', '“', '”', '˝', '‟'] module-attribute

apostrophes: List[str] = ['`', '΄', ''', 'ˈ', 'ˊ', 'ᑊ', 'ˋ', 'ꞌ', 'ᛌ', '𖽒', '𖽑', '‘', '’', 'י', '՚', '‛', '՝', '`', '`', '′', '׳', '´', 'ʹ', '˴', 'ߴ', '‵', 'ߵ', 'ʹ', 'ʻ', 'ʼ', '´', '᾽', 'ʽ', '῾', 'ʾ', '᾿'] module-attribute

quotes_and_apostrophes: List[Tuple[str, str]] = [(''.join(quotes), '"'), (''.join(apostrophes), "'")] module-attribute

factory

DEFAULT_CONFIG = dict(quotes=None) module-attribute

create_component(nlp, name, quotes)

Source code in edsnlp/pipelines/core/normalizer/quotes/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("quotes", "eds.quotes", default_config=DEFAULT_CONFIG)
@Language.factory("eds.quotes", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    quotes: Optional[List[Tuple[str, str]]],
):
    """
    Factory for the ``eds.quotes`` component.

    Parameters
    ----------
    nlp : Language
        Pipeline the component is created for (not used in this function).
    name : str
        Name given to the component (not used in this function).
    quotes : Optional[List[Tuple[str, str]]]
        List of quotation characters and their transcription, forwarded
        unchanged to ``Quotes``.

    Returns
    -------
    Quotes
        The component built from ``quotes``.
    """
    # NOTE(review): "quotes" is presumably kept as a deprecated alias of
    # "eds.quotes" by `deprecated_factory` - confirm against its definition.
    component = Quotes(quotes=quotes)
    return component
Back to top