
edsnlp.language

__all__ = ['EDSLanguage'] module-attribute

EDSDefaults

Bases: FrenchDefaults

Defaults for the EDSLanguage class. Mostly identical to the FrenchDefaults, but without tokenization info.

Source code in edsnlp/language.py
class EDSDefaults(FrenchDefaults):
    """
    Defaults for the EDSLanguage class
    Mostly identical to the FrenchDefaults, but
    without tokenization info
    """

    tokenizer_exceptions = {}
    infixes = []
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
    config = FrenchDefaults.config.merge(
        {
            "nlp": {"tokenizer": {"@tokenizers": "eds.tokenizer"}},
        }
    )

tokenizer_exceptions = {} class-attribute

infixes = [] class-attribute

lex_attr_getters = LEX_ATTRS class-attribute

syntax_iterators = SYNTAX_ITERATORS class-attribute

stop_words = STOP_WORDS class-attribute

config = FrenchDefaults.config.merge({'nlp': {'tokenizer': {'@tokenizers': 'eds.tokenizer'}}}) class-attribute
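The merged config only overrides the tokenizer entry; everything else is inherited from FrenchDefaults. A minimal inspection sketch, assuming edsnlp is installed so the module can be imported:

from edsnlp.language import EDSDefaults

# The merged defaults should route tokenizer creation to the "eds.tokenizer"
# factory registered further down in this module.
print(EDSDefaults.config["nlp"]["tokenizer"])
# Expected: {'@tokenizers': 'eds.tokenizer'}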

EDSLanguage

Bases: French

French clinical language. It is shipped with the EDSTokenizer tokenizer, which better handles tokenization of French clinical documents.

Source code in edsnlp/language.py
@spacy.registry.languages("eds")
class EDSLanguage(French):
    """
    French clinical language.
    It is shipped with the `EDSTokenizer` tokenizer that better handles
    tokenization for French clinical documents
    """

    lang = "eds"
    Defaults = EDSDefaults
    default_config = Defaults

lang = 'eds' class-attribute

Defaults = EDSDefaults class-attribute

default_config = Defaults class-attribute
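A minimal usage sketch, assuming that importing edsnlp.language runs the @spacy.registry.languages("eds") decorator shown above:

import spacy
import edsnlp.language  # noqa: F401  -- registers the "eds" language

# spacy.blank looks the name up in the languages registry, so "eds" works like "fr"
nlp = spacy.blank("eds")
print(type(nlp).__name__)            # EDSLanguage
print(type(nlp.tokenizer).__name__)  # EDSTokenizer, via the eds.tokenizer factory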

EDSTokenizer

Bases: DummyTokenizer

Source code in edsnlp/language.py
class EDSTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab) -> None:
        """
        Tokenizer class for French clinical documents.
        It better handles tokenization around:
        - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
        - newlines: "\n \n \n" -> ["\n", "\n", "\n"] instead of ["\n \n \n"]
        and should be around 5-6 times faster than its standard French counterpart.

        Parameters
        ----------
        vocab: Vocab
            The spacy vocabulary
        """
        self.vocab = vocab
        punct = "[:punct:]" + "\"'ˊ"〃ײ᳓″״‶˶ʺ“”˝"
        num_like = r"[\d]+"
        default = rf"[^\d{punct}'\n ]+(?:['ˊ](?=[[:alpha:]]))?"
        self.word_regex = regex.compile(
            rf"({num_like}|[{punct}]|\n|[ ]+|{default})([ ])?"
        )

    def __call__(self, text: str) -> Doc:
        """
        Tokenizes the text using the EDSTokenizer

        Parameters
        ----------
        text: str

        Returns
        -------
        Doc

        """
        last = 0
        words = []
        whitespaces = []
        for match in self.word_regex.finditer(text):
            begin, end = match.start(), match.end()
            if last != begin:
                logger.warning(
                    "Missed some characters during"
                    + f" tokenization between {last} and {begin}: "
                    + text[last - 10 : last]
                    + "|"
                    + text[last:begin]
                    + "|"
                    + text[begin : begin + 10],
                )
            last = end
            words.append(match.group(1))
            whitespaces.append(bool(match.group(2)))
        return Doc(self.vocab, words=words, spaces=whitespaces)

vocab = vocab instance-attribute

word_regex = regex.compile('({num_like}|[{punct}]|\\n|[ ]+|{default})([ ])?') instance-attribute

__init__(vocab)

    Tokenizer class for French clinical documents.
    It better handles tokenization around:
    - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
    - newlines: "

" -> [" ", " ", " "] instead of ["

"] and should be around 5-6 times faster than its standard French counterpart.

    Parameters
    ----------
    vocab: Vocab
        The spacy vocabulary
Source code in edsnlp/language.py
def __init__(self, vocab: Vocab) -> None:
    """
    Tokenizer class for French clinical documents.
    It better handles tokenization around:
    - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
    - newlines: "\n \n \n" -> ["\n", "\n", "\n"] instead of ["\n \n \n"]
    and should be around 5-6 times faster than its standard French counterpart.

    Parameters
    ----------
    vocab: Vocab
        The spacy vocabulary
    """
    self.vocab = vocab
    punct = "[:punct:]" + "\"'ˊ"〃ײ᳓″״‶˶ʺ“”˝"
    num_like = r"[\d]+"
    default = rf"[^\d{punct}'\n ]+(?:['ˊ](?=[[:alpha:]]))?"
    self.word_regex = regex.compile(
        rf"({num_like}|[{punct}]|\n|[ ]+|{default})([ ])?"
    )
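A standalone construction sketch, assuming a bare spaCy Vocab is enough to use the tokenizer outside a full pipeline:

from spacy.vocab import Vocab
from edsnlp.language import EDSTokenizer

# Build the tokenizer directly from an empty vocabulary
tokenizer = EDSTokenizer(Vocab())

doc = tokenizer("ACR5\n \n \nsuite")
print([t.text for t in doc])
# Per the docstring above, "ACR5" splits into "ACR" and "5",
# and each "\n" becomes its own token.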

__call__(text)

Tokenizes the text using the EDSTokenizer

Parameters
----------
text : str

Returns
-------
Doc
Source code in edsnlp/language.py
def __call__(self, text: str) -> Doc:
    """
    Tokenizes the text using the EDSTokenizer

    Parameters
    ----------
    text: str

    Returns
    -------
    Doc

    """
    last = 0
    words = []
    whitespaces = []
    for match in self.word_regex.finditer(text):
        begin, end = match.start(), match.end()
        if last != begin:
            logger.warning(
                "Missed some characters during"
                + f" tokenization between {last} and {begin}: "
                + text[last - 10 : last]
                + "|"
                + text[last:begin]
                + "|"
                + text[begin : begin + 10],
            )
        last = end
        words.append(match.group(1))
        whitespaces.append(bool(match.group(2)))
    return Doc(self.vocab, words=words, spaces=whitespaces)
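A round-trip sketch, using the same direct construction as above: the second regex group (an optional trailing space) feeds the spaces argument of the Doc, so the input text can be rebuilt from the tokens.

from spacy.vocab import Vocab
from edsnlp.language import EDSTokenizer

tokenizer = EDSTokenizer(Vocab())

doc = tokenizer("Poids 70 kg")
print([(t.text, t.whitespace_) for t in doc])  # [('Poids', ' '), ('70', ' '), ('kg', '')]
print(doc.text == "Poids 70 kg")               # True: words + spaces round-trip the input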

create_eds_tokenizer()

Creates a factory that returns new EDSTokenizer instances

Returns
-------
EDSTokenizer
Source code in edsnlp/language.py
@spacy.registry.tokenizers("eds.tokenizer")
def create_eds_tokenizer():
    """
    Creates a factory that returns new EDSTokenizer instances

    Returns
    -------
    EDSTokenizer
    """

    def eds_tokenizer_factory(nlp):
        return EDSTokenizer(nlp.vocab)

    return eds_tokenizer_factory
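A sketch of pulling the factory out of spaCy's tokenizer registry and attaching it to another pipeline; spacy.registry.tokenizers is the same registry used by the decorator above:

import spacy
import edsnlp.language  # noqa: F401  -- runs the @spacy.registry.tokenizers decorator

nlp = spacy.blank("fr")
factory = spacy.registry.tokenizers.get("eds.tokenizer")
# create_eds_tokenizer() returns a callable that takes the nlp object
nlp.tokenizer = factory()(nlp)
print(type(nlp.tokenizer).__name__)  # EDSTokenizer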