
edsnlp.language

__all__ = ['EDSLanguage'] module-attribute

EDSDefaults

Bases: FrenchDefaults

Defaults for the EDSLanguage class. Mostly identical to the FrenchDefaults, but without tokenization info.

Source code in edsnlp/language.py
class EDSDefaults(FrenchDefaults):
    """
    Defaults for the EDSLanguage class
    Mostly identical to the FrenchDefaults, but
    without tokenization info
    """

    tokenizer_exceptions = {}
    infixes = []
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
    config = FrenchDefaults.config.merge(
        {
            "nlp": {"tokenizer": {"@tokenizers": "eds.tokenizer"}},
        }
    )

tokenizer_exceptions = {} class-attribute

infixes = [] class-attribute

lex_attr_getters = LEX_ATTRS class-attribute

syntax_iterators = SYNTAX_ITERATORS class-attribute

stop_words = STOP_WORDS class-attribute

config = FrenchDefaults.config.merge({'nlp': {'tokenizer': {'@tokenizers': 'eds.tokenizer'}}}) class-attribute
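The merged config only overrides the tokenizer entry; everything else is inherited from FrenchDefaults. A minimal inspection sketch, assuming edsnlp is installed so the module can be imported:

from edsnlp.language import EDSDefaults

# The merged defaults should route tokenizer creation to the "eds.tokenizer"
# factory registered further down in this module.
print(EDSDefaults.config["nlp"]["tokenizer"])
# Expected: {'@tokenizers': 'eds.tokenizer'}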

EDSLanguage

Bases: French

French clinical language. It is shipped with the EDSTokenizer tokenizer, which better handles tokenization of French clinical documents.

Source code in edsnlp/language.py
@spacy.registry.languages("eds")
class EDSLanguage(French):
    """
    French clinical language.
    It is shipped with the `EDSTokenizer` tokenizer that better handles
    tokenization for French clinical documents
    """

    lang = "eds"
    Defaults = EDSDefaults
    default_config = Defaults

lang = 'eds' class-attribute

Defaults = EDSDefaults class-attribute

default_config = Defaults class-attribute
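A minimal usage sketch, assuming that importing edsnlp.language runs the @spacy.registry.languages("eds") decorator shown above:

import spacy
import edsnlp.language  # noqa: F401  -- registers the "eds" language

# spacy.blank looks the name up in the languages registry, so "eds" works like "fr"
nlp = spacy.blank("eds")
print(type(nlp).__name__)            # EDSLanguage
print(type(nlp.tokenizer).__name__)  # EDSTokenizer, via the eds.tokenizer factory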

EDSTokenizer

Bases: DummyTokenizer

Source code in edsnlp/language.py
class EDSTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab) -> None:
        """
        Tokenizer class for French clinical documents.
        It better handles tokenization around:
        - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
        - newlines: "\n \n \n" -> ["\n", "\n", "\n"] instead of ["\n \n \n"]
        and should be around 5-6 times faster than its standard French counterpart.

        Parameters
        ----------
        vocab: Vocab
            The spacy vocabulary
        """
        self.vocab = vocab
        punct = "[:punct:]" + "\"'ˊ"〃ײ᳓″״‶˶ʺ“”˝"
        num_like = r"[\d]+"
        default = rf"[^\d{punct}'\n ]+(?:['ˊ](?=[[:alpha:]]))?"
        self.word_regex = regex.compile(
            rf"({num_like}|[{punct}]|\n|[ ]+|{default})([ ])?"
        )

    def __call__(self, text: str) -> Doc:
        """
        Tokenizes the text using the EDSTokenizer

        Parameters
        ----------
        text: str

        Returns
        -------
        Doc

        """
        last = 0
        words = []
        whitespaces = []
        for match in self.word_regex.finditer(text):
            begin, end = match.start(), match.end()
            if last != begin:
                logger.warning(
                    "Missed some characters during"
                    + f" tokenization between {last} and {begin}: "
                    + text[last - 10 : last]
                    + "|"
                    + text[last:begin]
                    + "|"
                    + text[begin : begin + 10],
                )
            last = end
            words.append(match.group(1))
            whitespaces.append(bool(match.group(2)))
        return Doc(self.vocab, words=words, spaces=whitespaces)

vocab = vocab instance-attribute

word_regex = regex.compile('({num_like}|[{punct}]|\\n|[ ]+|{default})([ ])?') instance-attribute

__init__(vocab)

    Tokenizer class for French clinical documents.
    It better handles tokenization around:
    - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
    - newlines: "

" -> [" ", " ", " "] instead of ["

"] and should be around 5-6 times faster than its standard French counterpart.

    Parameters
    ----------
    vocab: Vocab
        The spacy vocabulary
Source code in edsnlp/language.py
def __init__(self, vocab: Vocab) -> None:
    """
    Tokenizer class for French clinical documents.
    It better handles tokenization around:
    - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
    - newlines: "\n \n \n" -> ["\n", "\n", "\n"] instead of ["\n \n \n"]
    and should be around 5-6 times faster than its standard French counterpart.

    Parameters
    ----------
    vocab: Vocab
        The spacy vocabulary
    """
    self.vocab = vocab
    punct = "[:punct:]" + "\"'ˊ"〃ײ᳓″״‶˶ʺ“”˝"
    num_like = r"[\d]+"
    default = rf"[^\d{punct}'\n ]+(?:['ˊ](?=[[:alpha:]]))?"
    self.word_regex = regex.compile(
        rf"({num_like}|[{punct}]|\n|[ ]+|{default})([ ])?"
    )
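A standalone construction sketch, assuming a bare spaCy Vocab is enough to use the tokenizer outside a full pipeline:

from spacy.vocab import Vocab
from edsnlp.language import EDSTokenizer

# Build the tokenizer directly from an empty vocabulary
tokenizer = EDSTokenizer(Vocab())

doc = tokenizer("ACR5\n \n \nsuite")
print([t.text for t in doc])
# Per the docstring above, "ACR5" splits into "ACR" and "5",
# and each "\n" becomes its own token.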

__call__(text)

Tokenizes the text using the EDSTokenizer

Parameters
----------
text : str

Returns
-------
Doc
Source code in edsnlp/language.py
def __call__(self, text: str) -> Doc:
    """
    Tokenizes the text using the EDSTokenizer

    Parameters
    ----------
    text: str

    Returns
    -------
    Doc

    """
    last = 0
    words = []
    whitespaces = []
    for match in self.word_regex.finditer(text):
        begin, end = match.start(), match.end()
        if last != begin:
            logger.warning(
                "Missed some characters during"
                + f" tokenization between {last} and {begin}: "
                + text[last - 10 : last]
                + "|"
                + text[last:begin]
                + "|"
                + text[begin : begin + 10],
            )
        last = end
        words.append(match.group(1))
        whitespaces.append(bool(match.group(2)))
    return Doc(self.vocab, words=words, spaces=whitespaces)
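A round-trip sketch, using the same direct construction as above: the second regex group (an optional trailing space) feeds the spaces argument of the Doc, so the input text can be rebuilt from the tokens.

from spacy.vocab import Vocab
from edsnlp.language import EDSTokenizer

tokenizer = EDSTokenizer(Vocab())

doc = tokenizer("Poids 70 kg")
print([(t.text, t.whitespace_) for t in doc])  # [('Poids', ' '), ('70', ' '), ('kg', '')]
print(doc.text == "Poids 70 kg")               # True: words + spaces round-trip the input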

create_eds_tokenizer()

Creates a factory that returns new EDSTokenizer instances

Returns
-------
EDSTokenizer
Source code in edsnlp/language.py
@spacy.registry.tokenizers("eds.tokenizer")
def create_eds_tokenizer():
    """
    Creates a factory that returns new EDSTokenizer instances

    Returns
    -------
    EDSTokenizer
    """

    def eds_tokenizer_factory(nlp):
        return EDSTokenizer(nlp.vocab)

    return eds_tokenizer_factory
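A sketch of pulling the factory out of spaCy's tokenizer registry and attaching it to another pipeline; spacy.registry.tokenizers is the same registry used by the decorator above:

import spacy
import edsnlp.language  # noqa: F401  -- runs the @spacy.registry.tokenizers decorator

nlp = spacy.blank("fr")
factory = spacy.registry.tokenizers.get("eds.tokenizer")
# create_eds_tokenizer() returns a callable that takes the nlp object
nlp.tokenizer = factory()(nlp)
print(type(nlp.tokenizer).__name__)  # EDSTokenizer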