edsnlp.language

EDSDefaults

Bases: FrenchDefaults

Defaults for the EDSLanguage class. Mostly identical to FrenchDefaults, but without tokenization info.

Source code in edsnlp/language.py
class EDSDefaults(FrenchDefaults):
    """
    Defaults for the EDSLanguage class
    Mostly identical to the FrenchDefaults, but
    without tokenization info
    """

    tokenizer_exceptions = {}
    infixes = []
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
    config = FrenchDefaults.config.merge(
        {
            "nlp": {"tokenizer": {"@tokenizers": "eds.tokenizer"}},
        }
    )
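
The merged config above only overrides the tokenizer entry; everything else is inherited from FrenchDefaults. A minimal check (a sketch; assumes spacy is installed and that importing edsnlp registers the "eds" language):

import spacy
import edsnlp  # noqa: F401 - assumed to register the "eds" language on import

nlp = spacy.blank("eds")
# the default tokenizer should come from the merged config above
print(nlp.config["nlp"]["tokenizer"])  # expected: {'@tokenizers': 'eds.tokenizer'}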

EDSLanguage

Bases: French

French clinical language. It ships with the EDSTokenizer, which better handles tokenization of French clinical documents.

Source code in edsnlp/language.py
@spacy.registry.languages("eds")
class EDSLanguage(French):
    """
    French clinical language.
    It is shipped with the `EDSTokenizer` tokenizer that better handles
    tokenization for French clinical documents
    """

    lang = "eds"
    Defaults = EDSDefaults
    default_config = Defaults
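
A short usage sketch (the sentence is hypothetical; assumes edsnlp is installed so the "eds" language is registered):

import spacy
import edsnlp  # noqa: F401

nlp = spacy.blank("eds")  # an EDSLanguage instance, tokenizing with eds.tokenizer
doc = nlp("Scanner ACR5 ce jour.")
print([token.text for token in doc])
# per the documented behavior, "ACR5" should come out as two tokens: "ACR" and "5"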

EDSTokenizer

Bases: DummyTokenizer

Source code in edsnlp/language.py
class EDSTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab) -> None:
        """
        Tokenizer class for French clinical documents.
        It better handles tokenization around:
        - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
        - newlines: "\n \n \n" -> ["\n", "\n", "\n"] instead of ["\n \n \n"]
        and should be around 5-6 times faster than its standard French counterpart.
        Parameters
        ----------
        vocab: Vocab
            The spacy vocabulary
        """
        self.vocab = vocab
        punct = "[:punct:]" + "\"'ˊ"〃ײ᳓″״‶˶ʺ“”˝"
        num_like = r"\d+(?:[.,]\d(?![.,]?[0-9])|(?![.,]?[0-9]))?"
        sep = rf"\d{punct}'\n[:space:]"
        default = rf"[^{sep}]+(?:['ˊ](?=[[:alpha:]]|$))?"
        exceptions = "|".join(TOKENIZER_EXCEPTIONS)
        acronym = r"[A-Z][A-Z0-9]*[.](?=[A-Z0-9])"
        self.word_regex = regex.compile(
            rf"""(?x)
        (
            {exceptions}    # tokenizer exceptions like M., Dr., etc
            |{acronym}      # acronyms
            |{num_like}     # numbers
            |[{punct}]      # punctuations
            |[\n\r\t]       # new lines or tabs
            |[^\S\r\n\t]+   # multi-spaces
            |{default}      # anything else: most often alpha-numerical words
        )                   # followed by
        ([^\S\r\n\t])?      # an optional space
        """
        )

    def __call__(self, text: str) -> Doc:
        """
        Tokenizes the text using the EDSTokenizer

        Parameters
        ----------
        text: str

        Returns
        -------
        Doc

        """
        last = 0
        words = []
        whitespaces = []
        for match in self.word_regex.finditer(text):
            begin, end = match.start(), match.end()
            if last != begin:
                logger.warning(
                    "Missed some characters during"
                    + f" tokenization between {last} and {begin}: "
                    + text[last - 10 : last]
                    + "|"
                    + text[last:begin]
                    + "|"
                    + text[begin : begin + 10],
                )
            last = end
            words.append(match.group(1))
            whitespaces.append(bool(match.group(2)))
        return Doc(self.vocab, words=words, spaces=whitespaces)
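
A direct usage sketch of the tokenizer outside a full pipeline (the sample text is made up; the expected tokens follow the docstring above):

from spacy.vocab import Vocab
from edsnlp.language import EDSTokenizer

tokenizer = EDSTokenizer(Vocab())
doc = tokenizer("ACR5\n \n \n")
print([token.text for token in doc])
# expected, per the docstring: ['ACR', '5', '\n', '\n', '\n']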

__init__(vocab)

Tokenizer class for French clinical documents.
It better handles tokenization around:
- numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
- newlines: "\n \n \n" -> ["\n", "\n", "\n"] instead of ["\n \n \n"]
and should be around 5-6 times faster than its standard French counterpart.

PARAMETER DESCRIPTION
vocab

The spacy vocabulary

TYPE: Vocab


__call__(text)

Tokenizes the text using the EDSTokenizer

PARAMETER DESCRIPTION
text

TYPE: str

RETURNS DESCRIPTION
Doc

The tokenized document
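
A sketch of the words/spaces contract used here (assumes every character of the input is matched by the regex; otherwise the warning above fires and reconstruction may fail):

from spacy.vocab import Vocab
from edsnlp.language import EDSTokenizer

tokenizer = EDSTokenizer(Vocab())
text = "Dr. Dupont, ACR5 !"  # hypothetical sample
doc = tokenizer(text)
# each word plus its trailing-space flag lets the Doc rebuild the input exactly
assert doc.text == text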

create_eds_tokenizer()

Creates a factory that returns new EDSTokenizer instances

RETURNS DESCRIPTION
EDSTokenizer
Source code in edsnlp/language.py
@spacy.registry.tokenizers("eds.tokenizer")
def create_eds_tokenizer():
    """
    Creates a factory that returns new EDSTokenizer instances

    Returns
    -------
    EDSTokenizer
    """

    def eds_tokenizer_factory(nlp):
        return EDSTokenizer(nlp.vocab)

    return eds_tokenizer_factory
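
A sketch showing how the registered factory can be resolved by hand and used to swap the tokenizer of an existing pipeline (variable names are illustrative; assumes importing edsnlp populates the registry):

import spacy
import edsnlp  # noqa: F401

make_factory = spacy.registry.tokenizers.get("eds.tokenizer")  # -> create_eds_tokenizer
nlp = spacy.blank("fr")
nlp.tokenizer = make_factory()(nlp)  # the inner factory builds an EDSTokenizer from nlp.vocab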