
edsnlp

EDS-NLP

__version__ = '0.4.4' module-attribute

BASE_DIR = Path(__file__).parent module-attribute

conjugator

conjugate_verb(verb, conjugator)

Conjugates the verb using an instance of mlconjug3, and formats the results in a pandas DataFrame.

PARAMETER DESCRIPTION
verb

Verb to conjugate.

TYPE: str

conjugator

mlconjug3 instance for conjugating.

TYPE: mlconjug3.Conjugator

RETURNS DESCRIPTION
pd.DataFrame

Normalized dataframe containing all conjugated forms for the verb.

Source code in edsnlp/conjugator.py
def conjugate_verb(
    verb: str,
    conjugator: mlconjug3.Conjugator,
) -> pd.DataFrame:
    """
    Conjugates the verb using an instance of mlconjug3,
    and formats the results in a pandas `DataFrame`.

    Parameters
    ----------
    verb : str
        Verb to conjugate.
    conjugator : mlconjug3.Conjugator
        mlconjug3 instance for conjugating.

    Returns
    -------
    pd.DataFrame
        Normalized dataframe containing all conjugated forms
        for the verb.
    """

    df = pd.DataFrame(
        conjugator.conjugate(verb).iterate(),
        columns=["mode", "tense", "person", "term"],
    )

    df.term = df.term.fillna(df.person)
    df.loc[df.person == df.term, "person"] = None

    df.insert(0, "verb", verb)

    return df

conjugate(verbs, language='fr')

Conjugate a list of verbs.

PARAMETER DESCRIPTION
verbs

List of verbs to conjugate

TYPE: Union[str, List[str]]

language

Language to conjugate. Defaults to French (fr).

TYPE: str DEFAULT: 'fr'

RETURNS DESCRIPTION
pd.DataFrame

Dataframe containing the conjugations for the provided verbs. Columns: verb, mode, tense, person, term

Source code in edsnlp/conjugator.py
def conjugate(
    verbs: Union[str, List[str]],
    language: str = "fr",
) -> pd.DataFrame:
    """
    Conjugate a list of verbs.

    Parameters
    ----------
    verbs : Union[str, List[str]]
        List of verbs to conjugate
    language: str
        Language to conjugate. Defaults to French (`fr`).

    Returns
    -------
    pd.DataFrame
        Dataframe containing the conjugations for the provided verbs.
        Columns: `verb`, `mode`, `tense`, `person`, `term`
    """
    if isinstance(verbs, str):
        verbs = [verbs]

    conjugator = mlconjug3.Conjugator(language=language)

    df = pd.concat([conjugate_verb(verb, conjugator=conjugator) for verb in verbs])

    df = df.reset_index(drop=True)

    return df
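
Example: a minimal usage sketch, assuming the optional mlconjug3 dependency is installed (the import path mirrors the source location shown above).

from edsnlp.conjugator import conjugate

# Conjugate a single French verb; a list of verbs works as well.
df = conjugate("aimer", language="fr")

# One row per conjugated form, with columns: verb, mode, tense, person, term.
print(df.query('mode == "Indicatif" and tense == "Présent"'))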

get_conjugated_verbs(verbs, matches, language='fr')

Get a list of conjugated verbs.

PARAMETER DESCRIPTION
verbs

List of verbs to conjugate.

TYPE: Union[str, List[str]]

matches

List of dictionaries describing the mode/tense/person combinations to keep.

TYPE: Union[List[Dict[str, str]], Dict[str, str]]

language

Language to conjugate. Defaults to French (fr).

TYPE: str, optional DEFAULT: 'fr'

RETURNS DESCRIPTION
List[str]

List of terms to look for.

Examples:

>>> get_conjugated_verbs(
        "aimer",
        dict(mode="Indicatif", tense="Présent", person="1p"),
    )
['aimons']
Source code in edsnlp/conjugator.py
def get_conjugated_verbs(
    verbs: Union[str, List[str]],
    matches: Union[List[Dict[str, str]], Dict[str, str]],
    language: str = "fr",
) -> List[str]:
    """
    Get a list of conjugated verbs.

    Parameters
    ----------
    verbs : Union[str, List[str]]
        List of verbs to conjugate.
    matches : Union[List[Dict[str, str]], Dict[str, str]]
        List of dictionary describing the mode/tense/persons to keep.
    language : str, optional
        Language to conjugate, by default `"fr"` (French)

    Returns
    -------
    List[str]
        List of terms to look for.

    Examples
    --------
    >>> get_conjugated_verbs(
            "aimer",
            dict(mode="Indicatif", tense="Présent", person="1p"),
        )
    ['aimons']
    """

    if isinstance(matches, dict):
        matches = [matches]

    terms = []

    df = conjugate(
        verbs=verbs,
        language=language,
    )

    for match in matches:
        q = " & ".join([f'{k} == "{v}"' for k, v in match.items()])
        terms.extend(df.query(q).term.unique())

    return list(set(terms))

extensions

components

matchers

phrase

PatternDict = Dict[str, Union[str, Dict[str, str]]] module-attribute
EDSPhraseMatcher

Bases: object

PhraseMatcher that matches "over" excluded tokens.

PARAMETER DESCRIPTION
vocab

spaCy vocabulary to match on.

TYPE: Vocab

attr

Default attribute to match on, by default "TEXT". Can be overridden in the add method.

To match on a custom attribute, prepend the attribute name with _.

TYPE: str

ignore_excluded

Whether to ignore excluded tokens, by default True

TYPE: bool, optional

exclude_newlines

Whether to exclude new lines, by default False

TYPE: bool, optional

Source code in edsnlp/matchers/phrase.py
class EDSPhraseMatcher(object):
    """
    PhraseMatcher that matches "over" excluded tokens.

    Parameters
    ----------
    vocab : Vocab
        spaCy vocabulary to match on.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.

        To match on a custom attribute, prepend the attribute name with `_`.
    ignore_excluded : bool, optional
        Whether to ignore excluded tokens, by default True
    exclude_newlines : bool, optional
        Whether to exclude new lines, by default False
    """

    def __init__(
        self,
        vocab: Vocab,
        attr: str = "TEXT",
        ignore_excluded: bool = True,
        exclude_newlines: bool = False,
    ):
        self.matcher = Matcher(vocab, validate=True)
        self.attr = attr
        self.ignore_excluded = ignore_excluded

        self.exclusion_attribute = (
            "excluded_or_space" if exclude_newlines else "excluded"
        )

    @staticmethod
    def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
        if custom_attr:
            return getattr(token._, attr)
        else:
            attr = ATTRIBUTES.get(attr)
            return getattr(token, attr)

    def create_pattern(
        self,
        match_pattern: Doc,
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> List[PatternDict]:
        """
        Create a pattern

        Parameters
        ----------
        match_pattern : Doc
            A spaCy doc object, to use as match model.
        attr : str, optional
            Overwrite attribute to match on.
        ignore_excluded: bool, optional
            Whether to skip excluded tokens.

        Returns
        -------
        List[PatternDict]
            A spaCy rule-based pattern.
        """

        ignore_excluded = ignore_excluded or self.ignore_excluded

        attr = attr or self.attr
        custom_attr = attr.startswith("_")

        if custom_attr:
            attr = attr.lstrip("_").lower()

            pattern = []

            for token in match_pattern:
                pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
                if ignore_excluded and token.whitespace_:
                    # If the token is followed by a whitespace,
                    # we let it match on a pollution
                    pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

            return pattern
        else:
            pattern = []

            for token in match_pattern:
                pattern.append({attr: self.get_attr(token, attr, False)})
                if ignore_excluded and token.whitespace_:
                    # If the token is followed by a whitespace,
                    # we let it match on a pollution
                    pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

            return pattern

    def build_patterns(self, nlp: Language, terms: Patterns):
        """
        Build patterns and adds them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        nlp : Language
            The instance of the spaCy language class.
        terms : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """

        if not terms:
            terms = dict()

        for key, expressions in terms.items():
            if isinstance(expressions, dict):
                attr = expressions.get("attr")
                expressions = expressions.get("patterns")
            else:
                attr = None
            if isinstance(expressions, str):
                expressions = [expressions]
            patterns = list(nlp.pipe(expressions))
            self.add(key, patterns, attr)

    def add(
        self,
        key: str,
        patterns: List[Doc],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
    ) -> None:
        """
        Add a pattern.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[str]
            List of patterns to add.
        attr : str, optional
            Overwrite the attribute to match on for this specific pattern.
        ignore_excluded : bool, optional
            Overwrite the parameter for this specific pattern.
        """

        patterns = [
            self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
            for pattern in patterns
        ]
        self.matcher.add(key, patterns)

    def remove(
        self,
        key: str,
    ) -> None:
        """
        Remove a pattern.

        Parameters
        ----------
        key : str
            key of the pattern to remove.

        Raises
        ------
        ValueError
            Should the key not be contained in the registry.
        """
        self.matcher.remove(key)

    def __len__(self):
        return len(self.matcher)

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
    ) -> Generator:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Whether to return matches as spans.

        Yields
        -------
        match: Span
            A match.
        """
        if len(self.matcher):
            for match in self.matcher(doclike, as_spans=as_spans):
                yield match
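
Example: a minimal usage sketch on a blank French pipeline. ignore_excluded is disabled here so that the sketch does not rely on the excluded custom attribute (typically set by EDS-NLP's normalisation components).

import spacy
from edsnlp.matchers.phrase import EDSPhraseMatcher

nlp = spacy.blank("fr")
doc = nlp("Le patient présente une pneumopathie à droite.")

matcher = EDSPhraseMatcher(nlp.vocab, attr="LOWER", ignore_excluded=False)
matcher.build_patterns(nlp, {"pneumonia": ["pneumopathie", "pneumonie"]})

for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)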
matcher = Matcher(vocab, validate=True) instance-attribute
attr = attr instance-attribute
ignore_excluded = ignore_excluded instance-attribute
exclusion_attribute = 'excluded_or_space' if exclude_newlines else 'excluded' instance-attribute
__init__(vocab, attr='TEXT', ignore_excluded=True, exclude_newlines=False)
Source code in edsnlp/matchers/phrase.py
def __init__(
    self,
    vocab: Vocab,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
    exclude_newlines: bool = False,
):
    self.matcher = Matcher(vocab, validate=True)
    self.attr = attr
    self.ignore_excluded = ignore_excluded

    self.exclusion_attribute = (
        "excluded_or_space" if exclude_newlines else "excluded"
    )
get_attr(token, attr, custom_attr=False)
Source code in edsnlp/matchers/phrase.py
@staticmethod
def get_attr(token: Token, attr: str, custom_attr: bool = False) -> str:
    if custom_attr:
        return getattr(token._, attr)
    else:
        attr = ATTRIBUTES.get(attr)
        return getattr(token, attr)
create_pattern(match_pattern, attr=None, ignore_excluded=None)

Create a pattern

PARAMETER DESCRIPTION
match_pattern

A spaCy doc object, to use as match model.

TYPE: Doc

attr

Overwrite attribute to match on.

TYPE: str, optional DEFAULT: None

ignore_excluded

Whether to skip excluded tokens.

TYPE: Optional[bool] DEFAULT: None

RETURNS DESCRIPTION
List[PatternDict]

A spaCy rule-based pattern.

Source code in edsnlp/matchers/phrase.py
def create_pattern(
    self,
    match_pattern: Doc,
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> List[PatternDict]:
    """
    Create a pattern

    Parameters
    ----------
    match_pattern : Doc
        A spaCy doc object, to use as match model.
    attr : str, optional
        Overwrite attribute to match on.
    ignore_excluded: bool, optional
        Whether to skip excluded tokens.

    Returns
    -------
    List[PatternDict]
        A spaCy rule-based pattern.
    """

    ignore_excluded = ignore_excluded or self.ignore_excluded

    attr = attr or self.attr
    custom_attr = attr.startswith("_")

    if custom_attr:
        attr = attr.lstrip("_").lower()

        pattern = []

        for token in match_pattern:
            pattern.append({"_": {attr: self.get_attr(token, attr, True)}})
            if ignore_excluded and token.whitespace_:
                # If the token is followed by a whitespace,
                # we let it match on a pollution
                pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

        return pattern
    else:
        pattern = []

        for token in match_pattern:
            pattern.append({attr: self.get_attr(token, attr, False)})
            if ignore_excluded and token.whitespace_:
                # If the token is followed by a whitespace,
                # we let it match on a pollution
                pattern.append({"_": {self.exclusion_attribute: True}, "OP": "*"})

        return pattern
build_patterns(nlp, terms)

Build patterns and add them for matching. Helper function for pipelines using this matcher.

PARAMETER DESCRIPTION
nlp

The instance of the spaCy language class.

TYPE: Language

terms

Dictionary of label/terms, or label/dictionary of terms/attribute.

TYPE: Patterns

Source code in edsnlp/matchers/phrase.py
def build_patterns(self, nlp: Language, terms: Patterns):
    """
    Build patterns and adds them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    nlp : Language
        The instance of the spaCy language class.
    terms : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """

    if not terms:
        terms = dict()

    for key, expressions in terms.items():
        if isinstance(expressions, dict):
            attr = expressions.get("attr")
            expressions = expressions.get("patterns")
        else:
            attr = None
        if isinstance(expressions, str):
            expressions = [expressions]
        patterns = list(nlp.pipe(expressions))
        self.add(key, patterns, attr)
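
The terms argument accepts, for each label, either a plain list of phrases or a dictionary that also overrides the matching attribute. A small sketch, reusing the nlp and matcher objects from the example above:

terms = {
    "diabetes": ["diabète", "diabétique"],
    "covid": dict(patterns=["covid", "covid-19"], attr="LOWER"),
}
matcher.build_patterns(nlp, terms)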
add(key, patterns, attr=None, ignore_excluded=None)

Add a pattern.

PARAMETER DESCRIPTION
key

Key of the new/updated pattern.

TYPE: str

patterns

List of patterns to add.

TYPE: List[str]

attr

Overwrite the attribute to match on for this specific pattern.

TYPE: str, optional DEFAULT: None

ignore_excluded

Overwrite the parameter for this specific pattern.

TYPE: bool, optional DEFAULT: None

Source code in edsnlp/matchers/phrase.py
def add(
    self,
    key: str,
    patterns: List[Doc],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
) -> None:
    """
    Add a pattern.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[str]
        List of patterns to add.
    attr : str, optional
        Overwrite the attribute to match on for this specific pattern.
    ignore_excluded : bool, optional
        Overwrite the parameter for this specific pattern.
    """

    patterns = [
        self.create_pattern(pattern, attr=attr, ignore_excluded=ignore_excluded)
        for pattern in patterns
    ]
    self.matcher.add(key, patterns)
remove(key)

Remove a pattern.

PARAMETER DESCRIPTION
key

key of the pattern to remove.

TYPE: str

RAISES DESCRIPTION
ValueError

Should the key not be contained in the registry.

Source code in edsnlp/matchers/phrase.py
def remove(
    self,
    key: str,
) -> None:
    """
    Remove a pattern.

    Parameters
    ----------
    key : str
        key of the pattern to remove.

    Raises
    ------
    ValueError
        Should the key not be contained in the registry.
    """
    self.matcher.remove(key)
__len__()
Source code in edsnlp/matchers/phrase.py
def __len__(self):
    return len(self.matcher)
__call__(doclike, as_spans=False)

Performs matching. Yields matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object.

TYPE: Union[Doc, Span]

as_spans

Whether to return matches as spans.

DEFAULT: False

YIELDS DESCRIPTION
match

A match.

Source code in edsnlp/matchers/phrase.py
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
) -> Generator:
    """
    Performs matching. Yields matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Whether to return matches as spans.

    Yields
    -------
    match: Span
        A match.
    """
    if len(self.matcher):
        for match in self.matcher(doclike, as_spans=as_spans):
            yield match
get_normalized_variant(doclike)
Source code in edsnlp/matchers/phrase.py
def get_normalized_variant(doclike: Union[Span, Doc]) -> str:
    tokens = [t.text + t.whitespace_ for t in doclike if not t._.excluded]
    variant = "".join(tokens)
    variant = variant.rstrip(" ")
    variant = re.sub(r"\s+", " ", variant)
    return variant
phrase_matcher_factory(attr, ignore_excluded, exclude_newlines)
Source code in edsnlp/matchers/phrase.py
@registry.misc("edsnlp.factories.phrasematcher.v1")
def phrase_matcher_factory(
    attr: str,
    ignore_excluded: bool,
    exclude_newlines: bool,
):
    return partial(
        EDSPhraseMatcher,
        attr=attr,
        ignore_excluded=ignore_excluded,
        exclude_newlines=exclude_newlines,
    )

regex

RegexMatcher

Bases: object

Simple RegExp matcher.

PARAMETER DESCRIPTION
alignment_mode

How spans should be aligned with tokens. Possible values are "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span) and "expand" (span of all tokens at least partially covered by the character span). Defaults to "expand".

TYPE: str

attr

Default attribute to match on, by default "TEXT". Can be overridden in the add method.

TYPE: str

ignore_excluded

Whether to skip excluded tokens, by default False.

TYPE: bool

Source code in edsnlp/matchers/regex.py
class RegexMatcher(object):
    """
    Simple RegExp matcher.

    Parameters
    ----------
    alignment_mode : str
        How spans should be aligned with tokens.
        Possible values are `strict` (character indices must be aligned
        with token boundaries), "contract" (span of all tokens completely
        within the character span), "expand" (span of all tokens at least
        partially covered by the character span).
        Defaults to `expand`.
    attr : str
        Default attribute to match on, by default "TEXT".
        Can be overridden in the `add` method.
    ignore_excluded : bool
        Whether to skip exclusions
    """

    def __init__(
        self,
        alignment_mode: str = "expand",
        attr: str = "TEXT",
        ignore_excluded: bool = False,
    ):
        self.alignment_mode = alignment_mode
        self.regex = []

        self.default_attr = attr

        self.ignore_excluded = ignore_excluded

    def build_patterns(self, regex: Patterns):
        """
        Build patterns and adds them for matching.
        Helper function for pipelines using this matcher.

        Parameters
        ----------
        regex : Patterns
            Dictionary of label/terms, or label/dictionary of terms/attribute.
        """
        if not regex:
            regex = dict()

        for key, patterns in regex.items():
            if isinstance(patterns, dict):
                attr = patterns.get("attr")
                alignment_mode = patterns.get("alignment_mode")
                patterns = patterns.get("regex")
            else:
                attr = None
                alignment_mode = None

            if isinstance(patterns, str):
                patterns = [patterns]

            self.add(
                key=key, patterns=patterns, attr=attr, alignment_mode=alignment_mode
            )

    def add(
        self,
        key: str,
        patterns: List[str],
        attr: Optional[str] = None,
        ignore_excluded: Optional[bool] = None,
        alignment_mode: Optional[str] = None,
    ):
        """
        Add a pattern to the registry.

        Parameters
        ----------
        key : str
            Key of the new/updated pattern.
        patterns : List[str]
            List of patterns to add.
        attr : str, optional
            Attribute to use for matching.
            By default uses the `default_attr` attribute
        ignore_excluded : bool, optional
            Whether to skip excluded tokens during matching.
        alignment_mode : str, optional
            Overwrite alignment mode.
        """

        if attr is None:
            attr = self.default_attr

        if ignore_excluded is None:
            ignore_excluded = self.ignore_excluded

        if alignment_mode is None:
            alignment_mode = self.alignment_mode

        patterns = [compile_regex(pattern) for pattern in patterns]

        self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))

    def remove(
        self,
        key: str,
    ):
        """
        Remove a pattern for the registry.

        Parameters
        ----------
        key : str
            key of the pattern to remove.

        Raises
        ------
        ValueError
            If the key is not present in the registered patterns.
        """
        n = len(self.regex)
        self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
        if len(self.regex) == n:
            raise ValueError(f"`{key}` is not referenced in the matcher")

    def __len__(self):
        return len(set([regex[0] for regex in self.regex]))

    def match(
        self,
        doclike: Union[Doc, Span],
    ) -> Tuple[Span, re.Match]:
        """
        Iterates on the matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object to match on.

        Yields
        -------
        span:
            A match.
        """

        for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
            text = get_text(doclike, attr, ignore_excluded)

            for pattern in patterns:
                for match in pattern.finditer(text):
                    logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                    span = create_span(
                        doclike=doclike,
                        start_char=match.start(),
                        end_char=match.end(),
                        key=key,
                        attr=attr,
                        alignment_mode=alignment_mode,
                        ignore_excluded=ignore_excluded,
                    )

                    if span is None:
                        continue

                    yield span, match

    def __call__(
        self,
        doclike: Union[Doc, Span],
        as_spans=False,
        return_groupdict=False,
    ) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
        """
        Performs matching. Yields matches.

        Parameters
        ----------
        doclike:
            spaCy Doc or Span object.
        as_spans:
            Returns matches as spans.

        Yields
        ------
        span:
            A match.
        groupdict:
            Additional information coming from the named patterns
            in the regular expression.
        """
        for span, match in self.match(doclike):
            if not as_spans:
                offset = doclike[0].i
                span = (span.label, span.start - offset, span.end - offset)
            if return_groupdict:
                yield span, match.groupdict()
            else:
                yield span
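
Example: a minimal usage sketch on a blank French pipeline.

import spacy
from edsnlp.matchers.regex import RegexMatcher

nlp = spacy.blank("fr")
doc = nlp("Poids : 76 kg, taille : 1m78.")

matcher = RegexMatcher(attr="TEXT", alignment_mode="expand")
matcher.add("weight", [r"\d+ ?kg"])

for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)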
alignment_mode = alignment_mode instance-attribute
regex = [] instance-attribute
default_attr = attr instance-attribute
ignore_excluded = ignore_excluded instance-attribute
__init__(alignment_mode='expand', attr='TEXT', ignore_excluded=False)
Source code in edsnlp/matchers/regex.py
def __init__(
    self,
    alignment_mode: str = "expand",
    attr: str = "TEXT",
    ignore_excluded: bool = False,
):
    self.alignment_mode = alignment_mode
    self.regex = []

    self.default_attr = attr

    self.ignore_excluded = ignore_excluded
build_patterns(regex)

Build patterns and add them for matching. Helper function for pipelines using this matcher.

PARAMETER DESCRIPTION
regex

Dictionary of label/terms, or label/dictionary of terms/attribute.

TYPE: Patterns

Source code in edsnlp/matchers/regex.py
def build_patterns(self, regex: Patterns):
    """
    Build patterns and adds them for matching.
    Helper function for pipelines using this matcher.

    Parameters
    ----------
    regex : Patterns
        Dictionary of label/terms, or label/dictionary of terms/attribute.
    """
    if not regex:
        regex = dict()

    for key, patterns in regex.items():
        if isinstance(patterns, dict):
            attr = patterns.get("attr")
            alignment_mode = patterns.get("alignment_mode")
            patterns = patterns.get("regex")
        else:
            attr = None
            alignment_mode = None

        if isinstance(patterns, str):
            patterns = [patterns]

        self.add(
            key=key, patterns=patterns, attr=attr, alignment_mode=alignment_mode
        )
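
As with the phrase matcher, each label maps either to a plain list of regular expressions or to a dictionary that also overrides attr and alignment_mode. A small sketch, reusing the matcher from the example above:

regex = {
    "weight": [r"\d+ ?kg"],
    "height": dict(regex=[r"\d+m\d{2}"], attr="TEXT", alignment_mode="expand"),
}
matcher.build_patterns(regex)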
add(key, patterns, attr=None, ignore_excluded=None, alignment_mode=None)

Add a pattern to the registry.

PARAMETER DESCRIPTION
key

Key of the new/updated pattern.

TYPE: str

patterns

List of patterns to add.

TYPE: List[str]

attr

Attribute to use for matching. By default uses the default_attr attribute

TYPE: str, optional DEFAULT: None

ignore_excluded

Whether to skip excluded tokens during matching.

TYPE: bool, optional DEFAULT: None

alignment_mode

Overwrite alignment mode.

TYPE: str, optional DEFAULT: None

Source code in edsnlp/matchers/regex.py
def add(
    self,
    key: str,
    patterns: List[str],
    attr: Optional[str] = None,
    ignore_excluded: Optional[bool] = None,
    alignment_mode: Optional[str] = None,
):
    """
    Add a pattern to the registry.

    Parameters
    ----------
    key : str
        Key of the new/updated pattern.
    patterns : List[str]
        List of patterns to add.
    attr : str, optional
        Attribute to use for matching.
        By default uses the `default_attr` attribute
    ignore_excluded : bool, optional
        Whether to skip excluded tokens during matching.
    alignment_mode : str, optional
        Overwrite alignment mode.
    """

    if attr is None:
        attr = self.default_attr

    if ignore_excluded is None:
        ignore_excluded = self.ignore_excluded

    if alignment_mode is None:
        alignment_mode = self.alignment_mode

    patterns = [compile_regex(pattern) for pattern in patterns]

    self.regex.append((key, patterns, attr, ignore_excluded, alignment_mode))
remove(key)

Remove a pattern from the registry.

PARAMETER DESCRIPTION
key

key of the pattern to remove.

TYPE: str

RAISES DESCRIPTION
ValueError

If the key is not present in the registered patterns.

Source code in edsnlp/matchers/regex.py
def remove(
    self,
    key: str,
):
    """
    Remove a pattern for the registry.

    Parameters
    ----------
    key : str
        key of the pattern to remove.

    Raises
    ------
    ValueError
        If the key is not present in the registered patterns.
    """
    n = len(self.regex)
    self.regex = [(k, p, a, i, am) for k, p, a, i, am in self.regex if k != key]
    if len(self.regex) == n:
        raise ValueError(f"`{key}` is not referenced in the matcher")
__len__()
Source code in edsnlp/matchers/regex.py
def __len__(self):
    return len(set([regex[0] for regex in self.regex]))
match(doclike)

Iterates on the matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object to match on.

TYPE: Union[Doc, Span]

YIELDS DESCRIPTION
span

A match.

Source code in edsnlp/matchers/regex.py
def match(
    self,
    doclike: Union[Doc, Span],
) -> Tuple[Span, re.Match]:
    """
    Iterates on the matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object to match on.

    Yields
    -------
    span:
        A match.
    """

    for key, patterns, attr, ignore_excluded, alignment_mode in self.regex:
        text = get_text(doclike, attr, ignore_excluded)

        for pattern in patterns:
            for match in pattern.finditer(text):
                logger.trace(f"Matched a regex from {key}: {repr(match.group())}")

                span = create_span(
                    doclike=doclike,
                    start_char=match.start(),
                    end_char=match.end(),
                    key=key,
                    attr=attr,
                    alignment_mode=alignment_mode,
                    ignore_excluded=ignore_excluded,
                )

                if span is None:
                    continue

                yield span, match
__call__(doclike, as_spans=False, return_groupdict=False)

Performs matching. Yields matches.

PARAMETER DESCRIPTION
doclike

spaCy Doc or Span object.

TYPE: Union[Doc, Span]

as_spans

Returns matches as spans.

DEFAULT: False

YIELDS DESCRIPTION
span

A match.

groupdict

Additional information coming from the named patterns in the regular expression.

Source code in edsnlp/matchers/regex.py
def __call__(
    self,
    doclike: Union[Doc, Span],
    as_spans=False,
    return_groupdict=False,
) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
    """
    Performs matching. Yields matches.

    Parameters
    ----------
    doclike:
        spaCy Doc or Span object.
    as_spans:
        Returns matches as spans.

    Yields
    ------
    span:
        A match.
    groupdict:
        Additional information coming from the named patterns
        in the regular expression.
    """
    for span, match in self.match(doclike):
        if not as_spans:
            offset = doclike[0].i
            span = (span.label, span.start - offset, span.end - offset)
        if return_groupdict:
            yield span, match.groupdict()
        else:
            yield span
get_first_included(doclike)
Source code in edsnlp/matchers/regex.py
@lru_cache(32)
def get_first_included(doclike: Union[Doc, Span]) -> Token:
    for token in doclike:
        if not token._.excluded:
            return token
    raise IndexError("The provided Span does not include any token")
create_span(doclike, start_char, end_char, key, attr, alignment_mode, ignore_excluded)

spaCy only allows the strict alignment mode for char_span on Span objects. This function circumvents that limitation.

PARAMETER DESCRIPTION
doclike

Doc or Span.

TYPE: Union[Doc, Span]

start_char

Character index within the Doc-like object.

TYPE: int

end_char

Character index of the end, within the Doc-like object.

TYPE: int

key

The key used to match.

TYPE: str

alignment_mode

The alignment mode.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

RETURNS DESCRIPTION
span

A span matched on the Doc-like object.

Source code in edsnlp/matchers/regex.py
def create_span(
    doclike: Union[Doc, Span],
    start_char: int,
    end_char: int,
    key: str,
    attr: str,
    alignment_mode: str,
    ignore_excluded: bool,
) -> Span:
    """
    spaCy only allows strict alignment mode for char_span on Spans.
    This method circumvents this.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        `Doc` or `Span`.
    start_char : int
        Character index within the Doc-like object.
    end_char : int
        Character index of the end, within the Doc-like object.
    key : str
        The key used to match.
    alignment_mode : str
        The alignment mode.
    ignore_excluded : bool
        Whether to skip excluded tokens.

    Returns
    -------
    span:
        A span matched on the Doc-like object.
    """

    doc = doclike if isinstance(doclike, Doc) else doclike.doc

    # Handle the simple case immediately
    if attr in {"TEXT", "LOWER"} and not ignore_excluded:
        off = doclike[0].idx
        return doc.char_span(
            start_char + off,
            end_char + off,
            label=key,
            alignment_mode=alignment_mode,
        )

    # If doclike is a Span, we need to get the clean
    # index of the first included token
    if ignore_excluded:
        original, clean = alignment(
            doc=doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        first_included = get_first_included(doclike)
        i = bisect_left(original, first_included.idx)
        first = clean[i]

    else:
        first = doclike[0].idx

    start_char = (
        first
        + start_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + start_char,
        )
    )

    end_char = (
        first
        + end_char
        + offset(
            doc,
            attr=attr,
            ignore_excluded=ignore_excluded,
            index=first + end_char,
        )
    )

    span = doc.char_span(
        start_char,
        end_char,
        label=key,
        alignment_mode=alignment_mode,
    )

    return span

utils

ListOrStr = Union[List[str], str] module-attribute
DictOrPattern = Union[Dict[str, ListOrStr], ListOrStr] module-attribute
Patterns = Dict[str, DictOrPattern] module-attribute
ATTRIBUTES = {'LOWER': 'lower_', 'TEXT': 'text', 'NORM': 'norm_', 'SHAPE': 'shape_'} module-attribute
offset
token_length(token, custom, attr)
Source code in edsnlp/matchers/utils/offset.py
def token_length(token: Token, custom: bool, attr: str):
    if custom:
        text = getattr(token._, attr)
    else:
        text = getattr(token, attr)
    return len(text)
alignment(doc, attr='TEXT', ignore_excluded=True)

Align different representations of a Doc or Span object.

PARAMETER DESCRIPTION
doc

spaCy Doc or Span object

TYPE: Doc

attr

Attribute to use, by default "TEXT"

TYPE: str, optional DEFAULT: 'TEXT'

ignore_excluded

Whether to remove excluded tokens, by default True

TYPE: bool, optional DEFAULT: True

RETURNS DESCRIPTION
Tuple[List[int], List[int]]

An alignment tuple: original and clean lists.

Source code in edsnlp/matchers/utils/offset.py
@lru_cache(maxsize=32)
def alignment(
    doc: Doc,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
) -> Tuple[List[int], List[int]]:
    """
    Align different representations of a `Doc` or `Span` object.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` or `Span` object
    attr : str, optional
        Attribute to use, by default `"TEXT"`
    ignore_excluded : bool, optional
        Whether to remove excluded tokens, by default True

    Returns
    -------
    Tuple[List[int], List[int]]
        An alignment tuple: original and clean lists.
    """
    assert isinstance(doc, Doc)

    attr = attr.upper()
    attr = ATTRIBUTES.get(attr, attr)

    custom = attr.startswith("_")

    if custom:
        attr = attr[1:].lower()

    # Define the length function
    length = partial(token_length, custom=custom, attr=attr)

    original = []
    clean = []

    cursor = 0

    for token in doc:

        if not ignore_excluded or not token._.excluded:

            # The token is not excluded, we add its extremities to the list
            original.append(token.idx)

            # We add the cursor
            clean.append(cursor)
            cursor += length(token)

            if token.whitespace_:
                cursor += 1

    return original, clean
offset(doc, attr, ignore_excluded, index)

Compute the offset between the original text and a given representation (defined by the pair attr, ignore_excluded).

The alignment itself is computed with the alignment helper.

PARAMETER DESCRIPTION
doc

The spaCy Doc object

TYPE: Doc

attr

The attribute used by the RegexMatcher (e.g. NORM)

TYPE: str

ignore_excluded

Whether the RegexMatcher ignores excluded tokens.

TYPE: bool

index

The index in the pre-processed text.

TYPE: int

RETURNS DESCRIPTION
int

The offset. To get the character index in the original document, just do: original = index + offset(doc, attr, ignore_excluded, index)

Source code in edsnlp/matchers/utils/offset.py
def offset(
    doc: Doc,
    attr: str,
    ignore_excluded: bool,
    index: int,
) -> int:
    """
    Compute offset between the original text and a given representation
    (defined by the couple `attr`, `ignore_excluded`).

    The alignment itself is computed with
    [`alignment`][edsnlp.matchers.utils.offset.alignment].

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object
    attr : str
        The attribute used by the [`RegexMatcher`][edsnlp.matchers.regex.RegexMatcher]
        (eg `NORM`)
    ignore_excluded : bool
        Whether the RegexMatcher ignores excluded tokens.
    index : int
        The index in the pre-processed text.

    Returns
    -------
    int
        The offset. To get the character index in the original document,
        just do: `#!python original = index + offset(doc, attr, ignore_excluded, index)`
    """
    original, clean = alignment(
        doc=doc,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    # We use bisect to efficiently find the correct rightmost-lower index
    i = bisect_left(clean, index)
    i = min(i, len(original) - 1)

    return original[i] - clean[i]
text
get_text(doclike, attr, ignore_excluded)

Get text using a custom attribute, possibly ignoring excluded tokens.

PARAMETER DESCRIPTION
doclike

Doc or Span to get text from.

TYPE: Union[Doc, Span]

attr

Attribute to use.

TYPE: str

ignore_excluded

Whether to skip excluded tokens, by default False

TYPE: bool

RETURNS DESCRIPTION
str

Extracted text.

Source code in edsnlp/matchers/utils/text.py
@lru_cache(32)
def get_text(
    doclike: Union[Doc, Span],
    attr: str,
    ignore_excluded: bool,
) -> str:
    """
    Get text using a custom attribute, possibly ignoring excluded tokens.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        Doc or Span to get text from.
    attr : str
        Attribute to use.
    ignore_excluded : bool
        Whether to skip excluded tokens, by default False

    Returns
    -------
    str
        Extracted text.
    """

    attr = attr.upper()

    if not ignore_excluded:
        if attr == "TEXT":
            return doclike.text
        elif attr == "LOWER":
            return doclike.text.lower()
        else:
            tokens = doclike
    else:
        tokens = [t for t in doclike if not t._.excluded]

    attr = ATTRIBUTES.get(attr, attr)

    if attr.startswith("_"):
        attr = attr[1:].lower()
        return "".join([getattr(t._, attr) + t.whitespace_ for t in tokens])
    else:
        return "".join([getattr(t, attr) + t.whitespace_ for t in tokens])

processing

helpers

DataFrames = None module-attribute
spec = importlib.util.find_spec(module.value) module-attribute
DataFrameModules

Bases: Enum

Source code in edsnlp/processing/helpers.py
class DataFrameModules(Enum):
    PANDAS = "pandas"
    PYSPARK = "pyspark.sql"
    KOALAS = "databricks.koalas"
PANDAS = 'pandas' class-attribute
PYSPARK = 'pyspark.sql' class-attribute
KOALAS = 'databricks.koalas' class-attribute
get_module(df)
Source code in edsnlp/processing/helpers.py
def get_module(df: DataFrames):
    for module in list(DataFrameModules):
        if df.__class__.__module__.startswith(module.value):
            return module
check_spacy_version_for_context()
Source code in edsnlp/processing/helpers.py
def check_spacy_version_for_context():  # pragma: no cover
    import spacy

    spacy_version = getattr(spacy, "__version__")
    if LooseVersion(spacy_version) < LooseVersion("3.2"):
        raise VersionConflict(
            "You provided a `context` argument, which only work with spacy>=3.2.\n"
            f"However, we found SpaCy version {spacy_version}.\n",
            "Please upgrade SpaCy ;)",
        )

simple

nlp = spacy.blank('fr') module-attribute
ExtensionSchema = Union[str, List[str], Dict[str, Any]] module-attribute
_df_to_spacy(note, nlp, context)

Takes a pandas DataFrame and returns a generator that can be used in nlp.pipe().

PARAMETER DESCRIPTION
note

A pandas DataFrame with at least note_text and note_id columns. A Doc object will be created for each line.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
generator

A generator whose items are of the form (text, context), with text being a string and context a dictionary.

Source code in edsnlp/processing/simple.py
def _df_to_spacy(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str],
):
    """
    Takes a pandas DataFrame and returns a generator that can be used in
    `nlp.pipe()`.

    Parameters
    ----------
    note: pd.DataFrame
        A pandas DataFrame with at least `note_text` and `note_id` columns.
        A `Doc` object will be created for each line.

    Returns
    -------
    generator:
        A generator whose items are of the form (text, context), with `text`
        being a string and `context` a dictionary
    """

    if context:
        check_spacy_version_for_context()

    kept_cols = ["note_text"] + context

    for col in kept_cols:
        if col not in note.columns:
            raise ValueError(f"No column named {repr(col)} found in df")

    def add_context(context_values):
        note_text = context_values.note_text
        doc = nlp.make_doc(note_text)
        for col in context:
            doc._.set(col, getattr(context_values, col))
        return doc

    yield from map(
        add_context,
        note[kept_cols].itertuples(),
    )
_flatten(list_of_lists)

Flatten a list of lists to a combined list.

Source code in edsnlp/processing/simple.py
def _flatten(list_of_lists: List[List[Any]]):
    """
    Flatten a list of lists to a combined list.
    """
    return [item for sublist in list_of_lists for item in sublist]
_pipe_generator(note, nlp, context=[], additional_spans='discarded', extensions=[], batch_size=50, progress_bar=True)
Source code in edsnlp/processing/simple.py
def _pipe_generator(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: ExtensionSchema = [],
    batch_size: int = 50,
    progress_bar: bool = True,
):

    if type(extensions) == str:
        extensions = [extensions]
    elif type(extensions) == dict:
        extensions = list(extensions.keys())

    if type(additional_spans) == str:
        additional_spans = [additional_spans]

    if "note_id" not in context:
        context.append("note_id")

    if not nlp.has_pipe("eds.context"):
        nlp.add_pipe("eds.context", first=True, config=dict(context=context))

    gen = _df_to_spacy(note, nlp, context)
    n_docs = len(note)
    pipeline = nlp.pipe(gen, batch_size=batch_size)

    for doc in tqdm(pipeline, total=n_docs, disable=not progress_bar):

        yield _full_schema(
            doc,
            additional_spans=additional_spans,
            extensions=extensions,
        )
_single_schema(ent, span_type='ents', extensions=[])
Source code in edsnlp/processing/simple.py
def _single_schema(
    ent: Span,
    span_type: str = "ents",
    extensions: List[str] = [],
):

    return {
        "note_id": ent.doc._.note_id,
        "lexical_variant": ent.text,
        "label": ent.label_,
        "span_type": span_type,
        "start": ent.start_char,
        "end": ent.end_char,
        **{extension: getattr(ent._, extension) for extension in extensions},
    }
_full_schema(doc, additional_spans=[], extensions=[])

Function used when parallelising tasks via joblib. Takes a Doc as input and returns a list of serializable objects.

Note

Parallelisation requires the output objects to be serializable: after splitting the task into separate jobs, intermediate results are stored in memory before being aggregated, hence the need for serializability. For instance, spaCy's spans aren't serializable since they are merely a view of the parent document.

Check the source code of this function for an example.

Source code in edsnlp/processing/simple.py
def _full_schema(
    doc: Doc,
    additional_spans: List[str] = [],
    extensions: List[str] = [],
):
    """
    Function used when Parallelising tasks via joblib.
    Takes a Doc as input, and returns a list of serializable objects

    !!! note

        The parallelisation needs for output objects to be **serializable**:
        after splitting the task into separate jobs, intermediate results
        are saved on memory before being aggregated, thus the need to be
        serializable. For instance, spaCy's spans aren't serializable since
        they are merely a *view* of the parent document.

        Check the source code of this function for an example.

    """

    results = []

    results.extend(
        [
            _single_schema(
                ent,
                extensions=extensions,
            )
            for ent in doc.ents
            if doc.ents
        ]
    )

    for span_type in additional_spans:
        results.extend(
            [
                _single_schema(
                    ent,
                    span_type=span_type,
                    extensions=extensions,
                )
                for ent in doc.spans[span_type]
                if doc.spans[span_type]
            ]
        )
    return results
pipe(note, nlp, context=[], additional_spans='discarded', extensions=[], batch_size=1000, progress_bar=True)

Function to apply a spaCy pipe to a pandas DataFrame of notes. For a large DataFrame, prefer the parallel version.

PARAMETER DESCRIPTION
note

A pandas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated SpaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

additional_spans

A name (or list of names) of SpanGroups to which the pipe should also be applied: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the date pipe populates doc.spans['dates'].

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Span extensions to add to the extracted results. For instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

batch_size

Batch size used by spaCy's pipe

TYPE: int, by default 1000 DEFAULT: 1000

progress_bar

Whether to display a progress bar or not

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
DataFrame

A pandas DataFrame with one line per extraction

Source code in edsnlp/processing/simple.py
def pipe(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: Union[List[str], str] = [],
    batch_size: int = 1000,
    progress_bar: bool = True,
):
    """
    Function to apply a spaCy pipe to a pandas DataFrame note
    For a large DataFrame, prefer the parallel version.

    Parameters
    ----------
    note : DataFrame
        A pandas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated SpaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of SpanGroups to which the pipe should also be applied:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `date` pipe populates doc.spans['dates']
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.
    batch_size : int, by default 1000
        Batch size used by spaCy's pipe
    progress_bar: bool, by default True
        Whether to display a progress bar or not

    Returns
    -------
    DataFrame
        A pandas DataFrame with one line per extraction
    """
    return pd.DataFrame(
        _flatten(
            _pipe_generator(
                note=note,
                nlp=nlp,
                context=context,
                additional_spans=additional_spans,
                extensions=extensions,
                batch_size=batch_size,
                progress_bar=progress_bar,
            )
        )
    )
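
Example: a minimal sketch on a toy DataFrame. The eds.matcher component, its terms config and the edsnlp.components import are assumptions not documented in this section.

import pandas as pd
import spacy

import edsnlp.components  # noqa: F401 -- assumed to register the eds.* factories
from edsnlp.processing.simple import pipe as simple_pipe

# Hypothetical toy corpus with the two required columns.
note = pd.DataFrame(
    {
        "note_id": [1, 2],
        "note_text": ["Le patient est diabétique.", "Pas d'antécédent notable."],
    }
)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.matcher", config=dict(terms={"diabetes": ["diabétique"]}))

# additional_spans=[] keeps the sketch independent of any extra span group.
df = simple_pipe(note, nlp, additional_spans=[], progress_bar=False)
# df has one row per extraction: note_id, lexical_variant, label, span_type, start, end.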

wrapper

pipe(note, nlp, n_jobs=-2, context=[], additional_spans='discarded', extensions=[], **kwargs)

Function to apply a spaCy pipe to a pandas, pyspark or koalas DataFrame.

PARAMETER DESCRIPTION
note

A pandas/pyspark/koalas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated SpaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

n_jobs

Only used when providing a Pandas DataFrame

  • n_jobs=1 corresponds to simple_pipe
  • n_jobs>1 corresponds to parallel_pipe with n_jobs parallel workers
  • n_jobs=-1 corresponds to parallel_pipe with the maximum number of workers
  • n_jobs=-2 corresponds to parallel_pipe with the maximum number of workers minus one

TYPE: int, by default -2 DEFAULT: -2

additional_spans

A name (or list of names) of SpanGroups to which the pipe should also be applied: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the date pipe populates doc.spans['dates'].

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Span extensions to add to the extracted results. For instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

kwargs

Additional parameters forwarded to the underlying implementation (simple, parallel or distributed pipe).

TYPE: Dict[str, Any]

RETURNS DESCRIPTION
DataFrame

A DataFrame with one line per extraction

Source code in edsnlp/processing/wrapper.py
def pipe(
    note: DataFrames,
    nlp: Language,
    n_jobs: int = -2,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: ExtensionSchema = [],
    **kwargs: Dict[str, Any],
) -> DataFrames:
    """
    Function to apply a spaCy pipe to a pandas or pyspark DataFrame


    Parameters
    ----------
    note : DataFrame
        A pandas/pyspark/koalas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated spaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    n_jobs : int, by default -2
        Only used when providing a Pandas DataFrame

        - `n_jobs=1` corresponds to `simple_pipe`
        - `n_jobs>1` corresponds to `parallel_pipe` with `n_jobs` parallel workers
        - `n_jobs=-1` corresponds to `parallel_pipe` with the maximum number of workers
        - `n_jobs=-2` corresponds to `parallel_pipe` with the maximum number of workers minus one
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of the SpanGroups on which to apply the pipe:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `dates` pipe populates `doc.spans['dates']`
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.
    kwargs : Dict[str, Any]
        Additional parameters depending on the `how` argument.

    Returns
    -------
    DataFrame
        A DataFrame with one line per extraction
    """

    module = get_module(note)

    if module == DataFrameModules.PANDAS:
        if n_jobs == 1:

            return simple_pipe(
                note=note,
                nlp=nlp,
                context=context,
                additional_spans=additional_spans,
                extensions=extensions,
                **kwargs,
            )

        else:

            return parallel_pipe(
                note=note,
                nlp=nlp,
                context=context,
                additional_spans=additional_spans,
                extensions=extensions,
                n_jobs=n_jobs,
                **kwargs,
            )

    if extensions and type(extensions) != dict:
        raise ValueError(
            """
            When using Spark or Koalas, you should provide extension names
            along with the extension type (as a dictionnary):
            `d[extension_name] = extension_type`
            """  # noqa W291
        )

    from .distributed import pipe as distributed_pipe

    return distributed_pipe(
        note=note,
        nlp=nlp,
        context=context,
        additional_spans=additional_spans,
        extensions=extensions,
        **kwargs,
    )

parallel

nlp = spacy.blank('fr') module-attribute
_define_nlp(new_nlp)

Set the global nlp variable. Doing it this way saves a non-negligible amount of time.

Source code in edsnlp/processing/parallel.py
def _define_nlp(new_nlp: Language):
    """
    Set the global nlp variable.
    Doing it this way saves a non-negligible amount of time.
    """
    global nlp
    nlp = new_nlp
_chunker(iterable, total_length, chunksize)

Takes an iterable and chunks it.

Source code in edsnlp/processing/parallel.py
def _chunker(
    iterable: Iterable,
    total_length: int,
    chunksize: int,
):
    """
    Takes an iterable and chunk it.
    """
    return (
        iterable[pos : pos + chunksize] for pos in range(0, total_length, chunksize)
    )
_process_chunk(note, **pipe_kwargs)
Source code in edsnlp/processing/parallel.py
def _process_chunk(note: pd.DataFrame, **pipe_kwargs):

    list_results = []

    for out in _pipe_generator(note, nlp, progress_bar=False, **pipe_kwargs):
        list_results += out

    return list_results
pipe(note, nlp, context=[], additional_spans='discarded', extensions=[], chunksize=100, n_jobs=-2, progress_bar=True, **pipe_kwargs)

Function to apply a spaCy pipe to a pandas DataFrame of notes, using multiprocessing

PARAMETER DESCRIPTION
note

A pandas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated spaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

additional_spans

A name (or list of names) of the SpanGroups on which to apply the pipe: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the dates pipe populates doc.spans['dates']

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Spans extensions to add to the extracted results: for instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

chunksize

Batch size used to split tasks

TYPE: int DEFAULT: 100

n_jobs

Maximum number of parallel jobs. The default value (-2) uses all available cores but one.

TYPE: int DEFAULT: -2

progress_bar

Whether to display a progress bar or not

TYPE: bool DEFAULT: True

**pipe_kwargs

Arguments exposed in processing.pipe_generator are also available here

DEFAULT: {}

RETURNS DESCRIPTION
DataFrame

A pandas DataFrame with one line per extraction
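Examples:

A hedged sketch of calling the multiprocessing variant directly (it is normally reached through the edsnlp.processing.pipe wrapper documented above); the import path is inferred from the source location, and the note and nlp objects are the same as in the wrapper example.

>>> from edsnlp.processing.parallel import pipe as parallel_pipe  # assumed import path
>>> note_nlp = parallel_pipe(
...     note,                       # pandas DataFrame with note_id and note_text columns
...     nlp,                        # spaCy pipeline
...     chunksize=100,              # 100 notes per task
...     n_jobs=-2,                  # all available cores but one
...     additional_spans="dates",
... )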

Source code in edsnlp/processing/parallel.py
def pipe(
    note: pd.DataFrame,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: ExtensionSchema = [],
    chunksize: int = 100,
    n_jobs: int = -2,
    progress_bar: bool = True,
    **pipe_kwargs,
):
    """
    Function to apply a spaCy pipe to a pandas DataFrame note by using multiprocessing

    Parameters
    ----------
    note : DataFrame
        A pandas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated spaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of the SpanGroups on which to apply the pipe:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `dates` pipe populates `doc.spans['dates']`
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.
    chunksize: int, by default 100
        Batch size used to split tasks
    n_jobs: int, by default -2
        Max number of parallel jobs.
        The default value uses the maximum number of available cores.
    progress_bar: bool, by default True
        Whether to display a progress bar or not
    **pipe_kwargs:
        Arguments exposed in `processing.pipe_generator` are also available here

    Returns
    -------
    DataFrame
        A pandas DataFrame with one line per extraction
    """

    if context:
        check_spacy_version_for_context()

    # Setting the nlp variable
    _define_nlp(nlp)

    verbose = 10 if progress_bar else 0

    executor = Parallel(
        n_jobs, backend="multiprocessing", prefer="processes", verbose=verbose
    )
    executor.warn(f"Used nlp components: {nlp.component_names}")

    pipe_kwargs["additional_spans"] = additional_spans
    pipe_kwargs["extensions"] = extensions
    pipe_kwargs["context"] = context

    if verbose:
        executor.warn(f"{int(len(note)/chunksize)} tasks to complete")

    do = delayed(_process_chunk)

    tasks = (
        do(chunk, **pipe_kwargs)
        for chunk in _chunker(note, len(note), chunksize=chunksize)
    )
    result = executor(tasks)

    out = _flatten(result)

    return pd.DataFrame(out)

distributed

pyspark_type_finder(obj)

Returns (when possible) the PySpark type of any python object
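Examples:

A short sketch of building an extension schema for the distributed pipe; the extension name score_name is purely illustrative.

>>> string_type = pyspark_type_finder("some text")  # prints and returns the inferred type
>>> extensions = {"score_name": string_type}        # usable as the extensions dict of the distributed pipe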

Source code in edsnlp/processing/distributed.py
def pyspark_type_finder(obj):
    """
    Returns (when possible) the PySpark type of any python object
    """
    try:
        inferred_type = T._infer_type(obj)
        print(f"Inferred type is {repr(inferred_type)}")
        return inferred_type
    except TypeError:
        raise TypeError("Cannot infer type for this object.")
module_checker(func, *args, **kwargs)
Source code in edsnlp/processing/distributed.py
@decorator
def module_checker(
    func: Callable,
    *args,
    **kwargs,
) -> Any:

    args = list(args)
    note = args.pop(0)
    module = get_module(note)

    if module == DataFrameModules.PYSPARK:
        return func(note, *args, **kwargs)
    elif module == DataFrameModules.KOALAS:
        import databricks.koalas  # noqa F401

        note_spark = note.to_spark()
        note_nlp_spark = func(note_spark, *args, **kwargs)
        return note_nlp_spark.to_koalas()
pipe(note, nlp, context=[], additional_spans='discarded', extensions=[])

Function to apply a spaCy pipe to a pyspark or koalas DataFrame note

PARAMETER DESCRIPTION
note

A Pyspark or Koalas DataFrame with a note_id and note_text column

TYPE: DataFrame

nlp

A spaCy pipe

TYPE: Language

context

A list of columns to add to the generated spaCy document as an extension. For instance, if context=["note_datetime"], the corresponding value found in the note_datetime column will be stored in doc._.note_datetime, which can be useful e.g. for the dates pipeline.

TYPE: List[str] DEFAULT: []

additional_spans

A name (or list of names) of the SpanGroups on which to apply the pipe: SpanGroups are available as doc.spans[spangroup_name] and can be generated by some pipes. For instance, the dates pipe populates doc.spans['dates']

TYPE: Union[List[str], str], by default "discarded" DEFAULT: 'discarded'

extensions

Spans extensions to add to the extracted results: for instance, if extensions=["score_name"], the extracted result will include, for each entity, ent._.score_name.

TYPE: List[Tuple[str, T.DataType]], by default [] DEFAULT: []

RETURNS DESCRIPTION
DataFrame

A pyspark DataFrame with one line per extraction
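Examples:

A hedged sketch assuming an active Spark session and a pyspark DataFrame note_spark with note_id and note_text columns; the import path is inferred from the source location (the edsnlp.processing.pipe wrapper dispatches here automatically for pyspark and koalas inputs).

>>> from edsnlp.processing.distributed import pipe as distributed_pipe  # assumed import path
>>> note_nlp = distributed_pipe(
...     note_spark,
...     nlp,
...     additional_spans="dates",
...     extensions={},  # with Spark or Koalas, extensions must map names to pyspark types
... )
>>> result = note_nlp.toPandas()  # collect the result if it fits in memory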

Source code in edsnlp/processing/distributed.py
@module_checker
def pipe(
    note: DataFrames,
    nlp: Language,
    context: List[str] = [],
    additional_spans: Union[List[str], str] = "discarded",
    extensions: List[Tuple[str, T.DataType]] = [],
) -> DataFrame:
    """
    Function to apply a spaCy pipe to a pyspark or koalas DataFrame note

    Parameters
    ----------
    note : DataFrame
        A Pyspark or Koalas DataFrame with a `note_id` and `note_text` column
    nlp : Language
        A spaCy pipe
    context : List[str]
        A list of columns to add to the generated spaCy document as an extension.
        For instance, if `context=["note_datetime"]`, the corresponding value found
        in the `note_datetime` column will be stored in `doc._.note_datetime`,
        which can be useful e.g. for the `dates` pipeline.
    additional_spans : Union[List[str], str], by default "discarded"
        A name (or list of names) of the SpanGroups on which to apply the pipe:
        SpanGroups are available as `doc.spans[spangroup_name]` and can be generated
        by some pipes. For instance, the `dates` pipe populates `doc.spans['dates']`
    extensions : List[Tuple[str, T.DataType]], by default []
        Spans extensions to add to the extracted results:
        For instance, if `extensions=["score_name"]`, the extracted result
        will include, for each entity, `ent._.score_name`.

    Returns
    -------
    DataFrame
        A pyspark DataFrame with one line per extraction
    """

    if context:
        check_spacy_version_for_context()

    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    sc = spark.sparkContext

    if not nlp.has_pipe("eds.context"):
        nlp.add_pipe("eds.context", first=True, config=dict(context=context))

    nlp_bc = sc.broadcast(nlp)

    def _udf_factory(
        additional_spans: Union[List[str], str] = "discarded",
        extensions: Dict[str, T.DataType] = dict(),
    ):

        schema = T.ArrayType(
            T.StructType(
                [
                    T.StructField("lexical_variant", T.StringType(), False),
                    T.StructField("label", T.StringType(), False),
                    T.StructField("span_type", T.StringType(), True),
                    T.StructField("start", T.IntegerType(), False),
                    T.StructField("end", T.IntegerType(), False),
                    *[
                        T.StructField(extension_name, extension_type, True)
                        for extension_name, extension_type in extensions.items()
                    ],
                ]
            )
        )

        def f(
            text,
            *context_values,
            additional_spans=additional_spans,
            extensions=extensions,
        ):

            if text is None:
                return []

            nlp = nlp_bc.value

            for _, pipe in nlp.pipeline:
                if isinstance(pipe, BaseComponent):
                    pipe.set_extensions()

            doc = nlp.make_doc(text)
            for context_name, context_value in zip(context, context_values):
                doc._.set(context_name, context_value)
            doc = nlp(doc)

            ents = []

            for ent in doc.ents:
                parsed_extensions = [
                    getattr(ent._, extension) for extension in extensions.keys()
                ]

                ents.append(
                    (
                        ent.text,
                        ent.label_,
                        "ents",
                        ent.start_char,
                        ent.end_char,
                        *parsed_extensions,
                    )
                )

            if additional_spans is None:
                return ents

            if type(additional_spans) == str:
                additional_spans = [additional_spans]

            for spans_name in additional_spans:

                for ent in doc.spans.get(spans_name, []):

                    parsed_extensions = [
                        getattr(ent._, extension) for extension in extensions.keys()
                    ]

                    ents.append(
                        (
                            ent.text,
                            ent.label_,
                            spans_name,
                            ent.start_char,
                            ent.end_char,
                            *parsed_extensions,
                        )
                    )

            return ents

        f_udf = F.udf(
            partial(
                f,
                additional_spans=additional_spans,
                extensions=extensions,
            ),
            schema,
        )

        return f_udf

    matcher = _udf_factory(
        additional_spans=additional_spans,
        extensions=extensions,
    )

    note_nlp = note.withColumn(
        "matches", matcher(F.col("note_text"), *[F.col(c) for c in context])
    )
    note_nlp = note_nlp.withColumn("matches", F.explode(note_nlp.matches))

    note_nlp = note_nlp.select("note_id", "matches.*")

    return note_nlp

utils

inclusion

check_inclusion(span, start, end)

Checks whether the span overlaps the boundaries.

PARAMETER DESCRIPTION
span

Span to check.

TYPE: Span

start

Start of the boundary

TYPE: int

end

End of the boundary

TYPE: int

RETURNS DESCRIPTION
bool

Whether the span overlaps the boundaries.
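Examples:

A small sketch using token indices on a blank French document.

>>> import spacy
>>> doc = spacy.blank("fr")("Le patient est admis pour une pneumopathie.")
>>> span = doc[2:4]
>>> check_inclusion(span, start=0, end=3)   # the span starts before the boundary ends
True
>>> check_inclusion(span, start=4, end=10)  # the span ends where the boundary starts
False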

Source code in edsnlp/utils/inclusion.py
def check_inclusion(span: Span, start: int, end: int) -> bool:
    """
    Checks whether the span overlaps the boundaries.

    Parameters
    ----------
    span : Span
        Span to check.
    start : int
        Start of the boundary
    end : int
        End of the boundary

    Returns
    -------
    bool
        Whether the span overlaps the boundaries.
    """

    if span.start >= end or span.end <= start:
        return False
    return True

filter

default_sort_key(span)

Returns the sort key for filtering spans.

PARAMETER DESCRIPTION
span

Span to sort.

TYPE: Span

RETURNS DESCRIPTION
key

Sort key.

TYPE: Tuple(int, int)

Source code in edsnlp/utils/filter.py
def default_sort_key(span: Span) -> Tuple[int, int]:
    """
    Returns the sort key for filtering spans.

    Parameters
    ----------
    span : Span
        Span to sort.

    Returns
    -------
    key : Tuple(int, int)
        Sort key.
    """
    return span.end - span.start, -span.start
filter_spans(spans, label_to_remove=None, return_discarded=False, sort_key=default_sort_key)

Re-definition of spaCy's filtering function, which returns discarded spans as well as filtered ones.

Can also accept a label_to_remove argument, useful for filtering out pseudo cues. If set, results can contain overlapping spans: only spans overlapping with excluded labels are removed. The main expected use case is for pseudo-cues.

The spaCy documentation states:

Filter a sequence of spans and remove duplicates or overlaps. Useful for creating named entities (where one token can only be part of one entity) or when merging spans with Retokenizer.merge. When spans overlap, the (first) longest span is preferred over shorter spans.

Filtering out spans

If the label_to_remove argument is supplied, it might be tempting to filter overlapping spans that are not part of a label to remove.

The reason we keep all other possibly overlapping labels is that in qualifier pipelines, the same cue can precede and follow a marked entity. Hence we need to keep every example.

PARAMETER DESCRIPTION
spans

Spans to filter.

TYPE: List[Span]

return_discarded

Whether to return discarded spans.

TYPE: bool DEFAULT: False

label_to_remove

Label to remove. If set, results can contain overlapping spans.

TYPE: str, optional DEFAULT: None

sort_key

Key to sorting spans before applying overlap conflict resolution. A span with a higher key will have precedence over another span. By default, the largest, leftmost spans are selected first.

TYPE: Callable[Span, Any], optional DEFAULT: default_sort_key

RETURNS DESCRIPTION
results

Filtered spans

TYPE: List[Span]

discarded

Discarded spans

TYPE: List[Span], optional
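Examples:

A small sketch with two overlapping spans: the longer one is kept, the shorter one is returned in the discarded list (the diabetes label is illustrative).

>>> import spacy
>>> from spacy.tokens import Span
>>> doc = spacy.blank("fr")("Patient diabétique de type 2")
>>> long_span = Span(doc, 1, 5, label="diabetes")
>>> short_span = Span(doc, 1, 2, label="diabetes")
>>> kept, discarded = filter_spans([short_span, long_span], return_discarded=True)
>>> kept
[diabétique de type 2]
>>> discarded
[diabétique]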

Source code in edsnlp/utils/filter.py
def filter_spans(
    spans: Iterable[Union["Span", Tuple["Span", Any]]],
    label_to_remove: Optional[str] = None,
    return_discarded: bool = False,
    sort_key: Callable[[Span], Any] = default_sort_key,
) -> Union[List["Span"], Tuple[List["Span"], List["Span"]]]:
    """
    Re-definition of spacy's filtering function, that returns discarded spans
    as well as filtered ones.

    Can also accept a `label_to_remove` argument, useful for filtering out
    pseudo cues. If set, `results` can contain overlapping spans: only
    spans overlapping with excluded labels are removed. The main expected
    use case is for pseudo-cues.

    !!! note ""

        The **spaCy documentation states**:

        > Filter a sequence of spans and remove duplicates or overlaps.
        > Useful for creating named entities (where one token can only
        > be part of one entity) or when merging spans with
        > `Retokenizer.merge`. When spans overlap, the (first)
        > longest span is preferred over shorter spans.

    !!! danger "Filtering out spans"

        If the `label_to_remove` argument is supplied, it might be tempting to
        filter overlapping spans that are not part of a label to remove.

        The reason we keep all other possibly overlapping labels is that in qualifier
        pipelines, the same cue can precede **and** follow a marked entity.
        Hence we need to keep every example.

    Parameters
    ----------
    spans : List[Span]
        Spans to filter.
    return_discarded : bool
        Whether to return discarded spans.
    label_to_remove : str, optional
        Label to remove. If set, results can contain overlapping spans.
    sort_key : Callable[Span, Any], optional
        Key to sorting spans before applying overlap conflict resolution.
        A span with a higher key will have precedence over another span.
        By default, the largest, leftmost spans are selected first.

    Returns
    -------
    results : List[Span]
        Filtered spans
    discarded : List[Span], optional
        Discarded spans
    """
    sorted_spans = sorted(spans, key=sort_key, reverse=True)
    result = []
    discarded = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            if label_to_remove is None or span.label_ != label_to_remove:
                result.append(span)
            if label_to_remove is None or span.label_ == label_to_remove:
                seen_tokens.update(range(span.start, span.end))
        elif label_to_remove is None or span.label_ != label_to_remove:
            discarded.append(span)

    result = sorted(result, key=lambda span: span.start)
    discarded = sorted(discarded, key=lambda span: span.start)

    if return_discarded:
        return result, discarded

    return result
consume_spans(spans, filter, second_chance=None)

Consume a list of spans according to a filter.

Warning

This method makes the hard hypothesis that:

  1. Spans are sorted.
  2. Spans are consumed in sequence and only once.

The second item is problematic for the way we treat long entities, hence the second_chance parameter, which lets entities be seen more than once.

PARAMETER DESCRIPTION
spans

List of spans to filter

TYPE: List of spans

filter

Filtering function. Should return True when the item is to be included.

TYPE: Callable

second_chance

Optional list of spans to include again (useful for long entities), by default None

TYPE: List of spans, optional DEFAULT: None

RETURNS DESCRIPTION
matches

List of spans consumed by the filter.

TYPE: List of spans

remainder

List of remaining spans in the original spans parameter.

TYPE: List of spans
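Examples:

A minimal sketch where spans starting before token 5 are consumed and the rest is left for a later pass.

>>> import spacy
>>> doc = spacy.blank("fr")("Le patient est admis pour une pneumopathie.")
>>> spans = [doc[1:2], doc[6:7]]
>>> matches, remainder = consume_spans(spans, filter=lambda s: s.start < 5)
>>> matches
[patient]
>>> remainder
[pneumopathie]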

Source code in edsnlp/utils/filter.py
def consume_spans(
    spans: List[Span],
    filter: Callable,
    second_chance: Optional[List[Span]] = None,
) -> Tuple[List[Span], List[Span]]:
    """
    Consume a list of span, according to a filter.

    !!! warning
        This method makes the hard hypothesis that:

        1. Spans are sorted.
        2. Spans are consumed in sequence and only once.

        The second item is problematic for the way we treat long entities,
        hence the `second_chance` parameter, which lets entities be seen
        more than once.

    Parameters
    ----------
    spans : List of spans
        List of spans to filter
    filter : Callable
        Filtering function. Should return True when the item is to be included.
    second_chance : List of spans, optional
        Optional list of spans to include again (useful for long entities),
        by default None

    Returns
    -------
    matches : List of spans
        List of spans consumed by the filter.
    remainder : List of spans
        List of remaining spans in the original `spans` parameter.
    """

    if not second_chance:
        second_chance = []
    else:
        second_chance = [m for m in second_chance if filter(m)]

    if not spans:
        return second_chance, []

    for i, span in enumerate(spans):
        if not filter(span):
            break
        else:
            i += 1

    matches = spans[:i]
    remainder = spans[i:]

    matches.extend(second_chance)

    return matches, remainder
get_spans(spans, label)

Extracts spans with a given label. Prefer using the label hash (an integer) for performance reasons.

PARAMETER DESCRIPTION
spans

List of spans to filter.

TYPE: List[Span]

label

Label to filter on.

TYPE: Union[int, str]

RETURNS DESCRIPTION
List[Span]

Filtered spans.
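Examples:

A small sketch filtering a list of spans on their string label (the labels are illustrative).

>>> import spacy
>>> from spacy.tokens import Span
>>> doc = spacy.blank("fr")("Le patient est admis pour une pneumopathie.")
>>> spans = [Span(doc, 1, 2, label="person"), Span(doc, 6, 7, label="disease")]
>>> get_spans(spans, "disease")
[pneumopathie]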

Source code in edsnlp/utils/filter.py
def get_spans(spans: List[Span], label: Union[int, str]) -> List[Span]:
    """
    Extracts spans with a given label.
    Prefer using hash label for performance reasons.

    Parameters
    ----------
    spans : List[Span]
        List of spans to filter.
    label : Union[int, str]
        Label to filter on.

    Returns
    -------
    List[Span]
        Filtered spans.
    """
    if isinstance(label, int):
        return [span for span in spans if span.label == label]
    else:
        return [span for span in spans if span.label_ == label]

resources

get_verbs(verbs=None, check_contains=True)

Extract verbs from the resources, as a pandas dataframe.

PARAMETER DESCRIPTION
verbs

List of verbs to keep. Returns all verbs by default.

TYPE: List[str], optional DEFAULT: None

check_contains

Whether to check that no verb is missing if a list of verbs was provided. By default True

TYPE: bool, optional DEFAULT: True

RETURNS DESCRIPTION
pd.DataFrame

DataFrame containing conjugated verbs.
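Examples:

A hedged sketch; whether a given infinitive is present depends on the bundled resources/verbs.csv file, hence check_contains=False.

>>> conjugated = get_verbs(["aimer"], check_contains=False)
>>> forms = set(conjugated.term)  # all conjugated forms found for "aimer"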

Source code in edsnlp/utils/resources.py
def get_verbs(
    verbs: Optional[List[str]] = None, check_contains: bool = True
) -> pd.DataFrame:
    """
    Extract verbs from the resources, as a pandas dataframe.

    Parameters
    ----------
    verbs : List[str], optional
        List of verbs to keep. Returns all verbs by default.
    check_contains : bool, optional
        Whether to check that no verb is missing if a list of verbs was provided.
        By default True

    Returns
    -------
    pd.DataFrame
        DataFrame containing conjugated verbs.
    """

    conjugated_verbs = pd.read_csv(BASE_DIR / "resources" / "verbs.csv")

    if not verbs:
        return conjugated_verbs

    verbs = set(verbs)

    selected_verbs = conjugated_verbs[conjugated_verbs.verb.isin(verbs)]

    if check_contains:
        assert len(verbs) == selected_verbs.verb.nunique(), "Some verbs are missing !"

    return selected_verbs

examples

entity_pattern = re.compile('(<ent[^<>]*>[^<>]+</ent>)') module-attribute
text_pattern = re.compile('<ent.*>(.+)</ent>') module-attribute
modifiers_pattern = re.compile('<ent\\s?(.*)>.+</ent>') module-attribute
Match

Bases: BaseModel

Source code in edsnlp/utils/examples.py
class Match(BaseModel):
    start_char: int
    end_char: int
    text: str
    modifiers: str
start_char: int = None class-attribute
end_char: int = None class-attribute
text: str = None class-attribute
modifiers: str = None class-attribute
Modifier

Bases: BaseModel

Source code in edsnlp/utils/examples.py
class Modifier(BaseModel):
    key: str
    value: Union[int, float, bool, str]
key: str = None class-attribute
value: Union[int, float, bool, str] = None class-attribute
Entity

Bases: BaseModel

Source code in edsnlp/utils/examples.py
class Entity(BaseModel):
    start_char: int
    end_char: int
    modifiers: List[Modifier]
start_char: int = None class-attribute
end_char: int = None class-attribute
modifiers: List[Modifier] = None class-attribute
find_matches(example)

Finds entities within the example.

PARAMETER DESCRIPTION
example

Example to process.

TYPE: str

RETURNS DESCRIPTION
List[re.Match]

List of matches for entities.

Source code in edsnlp/utils/examples.py
def find_matches(example: str) -> List[re.Match]:
    """
    Finds entities within the example.

    Parameters
    ----------
    example : str
        Example to process.

    Returns
    -------
    List[re.Match]
        List of matches for entities.
    """
    return list(entity_pattern.finditer(example))
parse_match(match)

Parse a regex match representing an entity.

PARAMETER DESCRIPTION
match

Match for an entity.

TYPE: re.Match

RETURNS DESCRIPTION
Match

Usable representation for the entity match.

Source code in edsnlp/utils/examples.py
def parse_match(match: re.Match) -> Match:
    """
    Parse a regex match representing an entity.

    Parameters
    ----------
    match : re.Match
        Match for an entity.

    Returns
    -------
    Match
        Usable representation for the entity match.
    """

    lexical_variant = match.group()
    start_char = match.start()
    end_char = match.end()

    text = text_pattern.findall(lexical_variant)[0]
    modifiers = modifiers_pattern.findall(lexical_variant)[0]

    m = Match(start_char=start_char, end_char=end_char, text=text, modifiers=modifiers)

    return m
parse_example(example)

Parses an example: finds entities and removes the tags.

PARAMETER DESCRIPTION
example

Example to process.

TYPE: str

RETURNS DESCRIPTION
Tuple[str, List[Entity]]

Cleaned text and extracted entities.
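Examples:

The negated=true attribute below is illustrative; any key=value pairs placed inside the <ent> tag are parsed into Modifier objects.

>>> text, entities = parse_example(
...     "Le patient est <ent negated=true>diabétique</ent>."
... )
>>> text
'Le patient est diabétique.'
>>> entities[0].start_char, entities[0].end_char
(15, 25)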

Source code in edsnlp/utils/examples.py
def parse_example(example: str) -> Tuple[str, List[Entity]]:
    """
    Parses an example : finds examples and removes the tags.

    Parameters
    ----------
    example : str
        Example to process.

    Returns
    -------
    Tuple[str, List[Entity]]
        Cleaned text and extracted entities.
    """

    matches = [parse_match(match) for match in find_matches(example=example)]
    text = ""
    entities = []

    cursor = 0

    for match in matches:

        text += example[cursor : match.start_char]
        start_char = len(text)
        text += match.text
        end_char = len(text)
        modifiers = [m.split("=") for m in match.modifiers.split()]

        cursor = match.end_char

        entity = Entity(
            start_char=start_char,
            end_char=end_char,
            modifiers=[Modifier(key=k, value=v) for k, v in modifiers],
        )

        entities.append(entity)

    text += example[cursor:]

    return text, entities

deprecation

deprecated_extension(name, new_name)
Source code in edsnlp/utils/deprecation.py
def deprecated_extension(name: str, new_name: str) -> None:
    msg = (
        f'The extension "{name}" is deprecated and will be '
        "removed in a future version. "
        f'Please use "{new_name}" instead.'
    )

    logger.warning(msg)
deprecated_getter_factory(name, new_name)
Source code in edsnlp/utils/deprecation.py
def deprecated_getter_factory(name: str, new_name: str) -> Callable:
    def getter(toklike: Union[Token, Span, Doc]) -> Any:

        n = f"{type(toklike).__name__}._.{name}"
        nn = f"{type(toklike).__name__}._.{new_name}"

        deprecated_extension(n, nn)

        return getattr(toklike._, new_name)

    return getter
deprecation(name, new_name=None)
Source code in edsnlp/utils/deprecation.py
def deprecation(name: str, new_name: Optional[str] = None):

    new_name = new_name or f"eds.{name}"

    msg = (
        f'Calling "{name}" directly is deprecated and '
        "will be removed in a future version. "
        f'Please use "{new_name}" instead.'
    )

    logger.warning(msg)
deprecated_factory(name, new_name=None, default_config=None, func=None)

Execute the Language.factory method on a modified factory function. The modification adds a deprecation warning.

PARAMETER DESCRIPTION
name

The deprecated name for the pipeline

TYPE: str

new_name

The new name for the pipeline, which should be used, by default None

TYPE: Optional[str], optional DEFAULT: None

default_config

The configuration that should be passed to Language.factory, by default None

TYPE: Optional[Dict[str, Any]], optional DEFAULT: None

func

The function to decorate, by default None

TYPE: Optional[Callable], optional DEFAULT: None

RETURNS DESCRIPTION
Callable
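Examples:

A hedged sketch of registering the same factory under a deprecated name and a new one; the component and both names are hypothetical.

>>> @deprecated_factory("my-component", "eds.my-component")  # hypothetical names
... def create_my_component(nlp, name):
...     return MyComponent()  # hypothetical component class
>>> # nlp.add_pipe("my-component") keeps working, but logs a deprecation warning
>>> # pointing users to "eds.my-component".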
Source code in edsnlp/utils/deprecation.py
def deprecated_factory(
    name: str,
    new_name: Optional[str] = None,
    default_config: Optional[Dict[str, Any]] = None,
    func: Optional[Callable] = None,
) -> Callable:
    """
    Execute the Language.factory method on a modified factory function.
    The modification adds a deprecation warning.

    Parameters
    ----------
    name : str
        The deprecated name for the pipeline
    new_name : Optional[str], optional
        The new name for the pipeline, which should be used, by default None
    default_config : Optional[Dict[str, Any]], optional
        The configuration that should be passed to Language.factory, by default None
    func : Optional[Callable], optional
        The function to decorate, by default None

    Returns
    -------
    Callable
    """

    if default_config is None:
        default_config = dict()

    wrapper = Language.factory(name, default_config=default_config)

    def wrap(factory):

        # Define decorator
        # We use micheles' decorator package to keep the same signature
        # See https://github.com/micheles/decorator/
        @decorator
        def decorate(
            f,
            *args,
            **kwargs,
        ):
            deprecation(name, new_name)
            return f(
                *args,
                **kwargs,
            )

        decorated = decorate(factory)

        wrapper(decorated)

        return factory

    if func is not None:
        return wrap(func)

    return wrap

regex

make_pattern(patterns, with_breaks=False, name=None)

Create OR pattern from a list of patterns.

PARAMETER DESCRIPTION
patterns

List of patterns to merge.

TYPE: List[str]

with_breaks

Whether to add word boundaries (\b) on each side, by default False

TYPE: bool, optional DEFAULT: False

name

Name of the group, using regex ?P<> directive.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
str

Merged pattern.
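Examples:

Patterns are sorted by decreasing length before being joined, so the longest alternative is tried first.

>>> make_pattern(["diabète", "diabétique"], with_breaks=True)
'\\b(diabétique|diabète)\\b'
>>> make_pattern(["lundi", "mardi"], name="day")
'(?P<day>lundi|mardi)'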

Source code in edsnlp/utils/regex.py
def make_pattern(
    patterns: List[str],
    with_breaks: bool = False,
    name: Optional[str] = None,
) -> str:
    r"""
    Create OR pattern from a list of patterns.

    Parameters
    ----------
    patterns : List[str]
        List of patterns to merge.
    with_breaks : bool, optional
        Whether to add breaks (`\b`) on each side, by default False
    name: str, optional
        Name of the group, using regex `?P<>` directive.

    Returns
    -------
    str
        Merged pattern.
    """

    if name:
        prefix = f"(?P<{name}>"
    else:
        prefix = "("

    # Sorting by length might be more efficient
    patterns.sort(key=len, reverse=True)

    pattern = prefix + "|".join(patterns) + ")"

    if with_breaks:
        pattern = r"\b" + pattern + r"\b"

    return pattern
compile_regex(reg)

This function tries to compile reg using the re module, and falls back to the more permissive regex module.

PARAMETER DESCRIPTION
reg

RETURNS DESCRIPTION
Union[re.Pattern, regex.Pattern]
Source code in edsnlp/utils/regex.py
def compile_regex(reg):
    """
    This function tries to compile `reg` using the `re` module, and
    falls back to the more permissive `regex` module.

    Parameters
    ----------
    reg: str

    Returns
    -------
    Union[re.Pattern, regex.Pattern]
    """
    try:
        return re.compile(reg)
    except re.error:
        try:
            return regex.compile(reg)
        except regex.error:
            raise Exception("Could not compile: {}".format(repr(reg)))

connectors

brat

BratConnector

Bases: object

Two-way connector with BRAT. Supports entities only.

PARAMETER DESCRIPTION
directory

Directory containing the BRAT files.

TYPE: str

n_jobs

Number of jobs for multiprocessing, by default 1

TYPE: int, optional
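Examples:

A hedged round-trip sketch; path/to/brat is a hypothetical directory of .txt/.ann pairs, and the Doc._.note_id extension is assumed to be registered (EDS-NLP components normally declare it).

>>> import spacy
>>> brat = BratConnector("path/to/brat")  # hypothetical BRAT directory
>>> nlp = spacy.blank("fr")
>>> docs = brat.brat2docs(nlp)            # .txt/.ann pairs -> spaCy docs
>>> brat.docs2brat(docs)                  # and back to BRAT files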

Source code in edsnlp/connectors/brat.py
class BratConnector(object):
    """
    Two-way connector with BRAT. Supports entities only.

    Parameters
    ----------
    directory : str
        Directory containing the BRAT files.
    n_jobs : int, optional
        Number of jobs for multiprocessing, by default 1
    """

    def __init__(self, directory: str, n_jobs: int = 1):
        self.directory = directory
        self.n_jobs = n_jobs

        os.makedirs(directory, exist_ok=True)

    def full_path(self, filename: str) -> str:
        return os.path.join(self.directory, filename)

    def read_file(self, filename: str) -> str:
        """
        Reads a file within the BRAT directory.

        Parameters
        ----------
        filename:
            The path to the file within the BRAT directory.

        Returns
        -------
        text:
            The text content of the file.
        """
        with open(self.full_path(filename), "r", encoding="utf-8") as f:
            return f.read()

    def read_texts(self) -> pd.DataFrame:
        """
        Reads all texts from the BRAT folder.

        Returns
        -------
        texts:
            DataFrame containing all texts in the BRAT directory.
        """
        files = os.listdir(self.directory)
        filenames = [f[:-4] for f in files if f.endswith(".txt")]

        assert filenames, f"BRAT directory {self.directory} is empty!"

        logger.info(
            f"The BRAT directory contains {len(filenames)} annotated documents."
        )

        texts = pd.DataFrame(dict(note_id=filenames))

        with tqdm(
            texts.note_id, ascii=True, ncols=100, desc="Text extraction"
        ) as iterator:
            texts["note_text"] = [
                self.read_file(note_id + ".txt") for note_id in iterator
            ]

        return texts

    def read_brat_annotation(self, note_id: Union[str, int]) -> pd.DataFrame:
        """
        Reads BRAT annotation inside the BRAT directory.

        Parameters
        ----------
        note_id:
            Note ID within the BRAT directory.

        Returns
        -------
        annotations:
            DataFrame containing the annotations for the given note.
        """
        filename = f"{note_id}.ann"
        annotations = read_brat_annotation(self.full_path(filename))
        return annotations

    def read_annotations(self, texts: pd.DataFrame) -> pd.DataFrame:
        dfs = []

        with tqdm(
            texts.note_id, ascii=True, ncols=100, desc="Annotation extraction"
        ) as iterator:
            dfs = Parallel(n_jobs=self.n_jobs)(
                delayed(self.read_brat_annotation)(note_id) for note_id in iterator
            )
            # for note_id in iterator:
            #     dfs.append(self.read_brat_annotation(note_id))

        annotations = pd.concat(dfs, keys=texts.note_id, names=["note_id"])

        annotations = annotations.droplevel(1).reset_index()

        return annotations

    def get_brat(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Reads texts and annotations, and returns two DataFrame objects.

        Returns
        -------
        texts:
            A DataFrame containing two fields, `note_id` and `note_text`
        annotations:
            A DataFrame containing the annotations.
        """

        texts = self.read_texts()
        annotations = self.read_annotations(texts)

        return texts, annotations

    def brat2docs(self, nlp: Language) -> List[Doc]:
        """
        Transforms a BRAT folder to a list of spaCy documents.

        Parameters
        ----------
        nlp:
            A spaCy pipeline.

        Returns
        -------
        docs:
            List of spaCy documents, with annotations in the `ents` attribute.
        """
        texts, annotations = self.get_brat()

        docs = []

        with tqdm(
            zip(
                texts.note_id,
                nlp.pipe(texts.note_text, batch_size=50, n_process=self.n_jobs),
            ),
            ascii=True,
            ncols=100,
            desc="spaCy conversion",
            total=len(texts),
        ) as iterator:
            for note_id, doc in iterator:

                doc._.note_id = note_id

                ann = annotations.query("note_id == @note_id")

                spans = []

                for _, row in ann.iterrows():
                    span = doc.char_span(
                        row.start,
                        row.end,
                        label=row.label,
                        alignment_mode="expand",
                    )
                    spans.append(span)

                doc.ents = filter_spans(spans)

                docs.append(doc)

        return docs

    def doc2brat(self, doc: Doc) -> None:
        """
        Writes a spaCy document to file in the BRAT directory.

        Parameters
        ----------
        doc:
            spaCy Doc object. The spans in `ents` will populate the `note_id.ann` file.
        """
        filename = str(doc._.note_id)

        with open(self.full_path(f"{filename}.txt"), "w", encoding="utf-8") as f:
            f.write(doc.text)

        annotations = pd.DataFrame.from_records(
            [
                dict(
                    label=ann.label_,
                    lexical_variant=ann.text,
                    start=ann.start_char,
                    end=ann.end_char,
                )
                for ann in doc.ents
            ]
        )

        if len(annotations) > 0:

            annotations["annot"] = (
                annotations.label
                + " "
                + annotations.start.astype(str)
                + " "
                + annotations.end.astype(str)
            )

            annotations["index"] = [f"T{i + 1}" for i in range(len(annotations))]

            annotations = annotations[["index", "annot", "lexical_variant"]]
            annotations.to_csv(
                self.full_path(f"{filename}.ann"),
                sep="\t",
                header=None,
                index=False,
                encoding="utf-8",
            )

        else:
            open(self.full_path(f"{filename}.ann"), "w", encoding="utf-8").close()

    def docs2brat(self, docs: List[Doc]) -> None:
        """
        Writes a list of spaCy documents to file.

        Parameters
        ----------
        docs:
            List of spaCy documents.
        """
        for doc in docs:
            self.doc2brat(doc)
directory = directory instance-attribute
n_jobs = n_jobs instance-attribute
__init__(directory, n_jobs=1)
Source code in edsnlp/connectors/brat.py
def __init__(self, directory: str, n_jobs: int = 1):
    self.directory = directory
    self.n_jobs = n_jobs

    os.makedirs(directory, exist_ok=True)
full_path(filename)
Source code in edsnlp/connectors/brat.py
def full_path(self, filename: str) -> str:
    return os.path.join(self.directory, filename)
read_file(filename)

Reads a file within the BRAT directory.

PARAMETER DESCRIPTION
filename

The path to the file within the BRAT directory.

TYPE: str

RETURNS DESCRIPTION
text

The text content of the file.

Source code in edsnlp/connectors/brat.py
def read_file(self, filename: str) -> str:
    """
    Reads a file within the BRAT directory.

    Parameters
    ----------
    filename:
        The path to the file within the BRAT directory.

    Returns
    -------
    text:
        The text content of the file.
    """
    with open(self.full_path(filename), "r", encoding="utf-8") as f:
        return f.read()
read_texts()

Reads all texts from the BRAT folder.

RETURNS DESCRIPTION
texts

DataFrame containing all texts in the BRAT directory.

Source code in edsnlp/connectors/brat.py
def read_texts(self) -> pd.DataFrame:
    """
    Reads all texts from the BRAT folder.

    Returns
    -------
    texts:
        DataFrame containing all texts in the BRAT directory.
    """
    files = os.listdir(self.directory)
    filenames = [f[:-4] for f in files if f.endswith(".txt")]

    assert filenames, f"BRAT directory {self.directory} is empty!"

    logger.info(
        f"The BRAT directory contains {len(filenames)} annotated documents."
    )

    texts = pd.DataFrame(dict(note_id=filenames))

    with tqdm(
        texts.note_id, ascii=True, ncols=100, desc="Text extraction"
    ) as iterator:
        texts["note_text"] = [
            self.read_file(note_id + ".txt") for note_id in iterator
        ]

    return texts
read_brat_annotation(note_id)

Reads BRAT annotation inside the BRAT directory.

PARAMETER DESCRIPTION
note_id

Note ID within the BRAT directory.

TYPE: Union[str, int]

RETURNS DESCRIPTION
annotations

DataFrame containing the annotations for the given note.

Source code in edsnlp/connectors/brat.py
def read_brat_annotation(self, note_id: Union[str, int]) -> pd.DataFrame:
    """
    Reads BRAT annotation inside the BRAT directory.

    Parameters
    ----------
    note_id:
        Note ID within the BRAT directory.

    Returns
    -------
    annotations:
        DataFrame containing the annotations for the given note.
    """
    filename = f"{note_id}.ann"
    annotations = read_brat_annotation(self.full_path(filename))
    return annotations
read_annotations(texts)
Source code in edsnlp/connectors/brat.py
def read_annotations(self, texts: pd.DataFrame) -> pd.DataFrame:
    dfs = []

    with tqdm(
        texts.note_id, ascii=True, ncols=100, desc="Annotation extraction"
    ) as iterator:
        dfs = Parallel(n_jobs=self.n_jobs)(
            delayed(self.read_brat_annotation)(note_id) for note_id in iterator
        )
        # for note_id in iterator:
        #     dfs.append(self.read_brat_annotation(note_id))

    annotations = pd.concat(dfs, keys=texts.note_id, names=["note_id"])

    annotations = annotations.droplevel(1).reset_index()

    return annotations
get_brat()

Reads texts and annotations, and returns two DataFrame objects.

RETURNS DESCRIPTION
texts

A DataFrame containing two fields, note_id and note_text

annotations

A DataFrame containing the annotations.

Source code in edsnlp/connectors/brat.py
def get_brat(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Reads texts and annotations, and returns two DataFrame objects.

    Returns
    -------
    texts:
        A DataFrame containing two fields, `note_id` and `note_text`
    annotations:
        A DataFrame containing the annotations.
    """

    texts = self.read_texts()
    annotations = self.read_annotations(texts)

    return texts, annotations
brat2docs(nlp)

Transforms a BRAT folder to a list of spaCy documents.

PARAMETER DESCRIPTION
nlp

A spaCy pipeline.

TYPE: Language

RETURNS DESCRIPTION
docs

List of spaCy documents, with annotations in the ents attribute.

Source code in edsnlp/connectors/brat.py
def brat2docs(self, nlp: Language) -> List[Doc]:
    """
    Transforms a BRAT folder to a list of spaCy documents.

    Parameters
    ----------
    nlp:
        A spaCy pipeline.

    Returns
    -------
    docs:
        List of spaCy documents, with annotations in the `ents` attribute.
    """
    texts, annotations = self.get_brat()

    docs = []

    with tqdm(
        zip(
            texts.note_id,
            nlp.pipe(texts.note_text, batch_size=50, n_process=self.n_jobs),
        ),
        ascii=True,
        ncols=100,
        desc="spaCy conversion",
        total=len(texts),
    ) as iterator:
        for note_id, doc in iterator:

            doc._.note_id = note_id

            ann = annotations.query("note_id == @note_id")

            spans = []

            for _, row in ann.iterrows():
                span = doc.char_span(
                    row.start,
                    row.end,
                    label=row.label,
                    alignment_mode="expand",
                )
                spans.append(span)

            doc.ents = filter_spans(spans)

            docs.append(doc)

    return docs
doc2brat(doc)

Writes a spaCy document to file in the BRAT directory.

PARAMETER DESCRIPTION
doc

spaCy Doc object. The spans in ents will populate the note_id.ann file.

TYPE: Doc

Source code in edsnlp/connectors/brat.py
def doc2brat(self, doc: Doc) -> None:
    """
    Writes a spaCy document to file in the BRAT directory.

    Parameters
    ----------
    doc:
        spaCy Doc object. The spans in `ents` will populate the `note_id.ann` file.
    """
    filename = str(doc._.note_id)

    with open(self.full_path(f"{filename}.txt"), "w", encoding="utf-8") as f:
        f.write(doc.text)

    annotations = pd.DataFrame.from_records(
        [
            dict(
                label=ann.label_,
                lexical_variant=ann.text,
                start=ann.start_char,
                end=ann.end_char,
            )
            for ann in doc.ents
        ]
    )

    if len(annotations) > 0:

        annotations["annot"] = (
            annotations.label
            + " "
            + annotations.start.astype(str)
            + " "
            + annotations.end.astype(str)
        )

        annotations["index"] = [f"T{i + 1}" for i in range(len(annotations))]

        annotations = annotations[["index", "annot", "lexical_variant"]]
        annotations.to_csv(
            self.full_path(f"{filename}.ann"),
            sep="\t",
            header=None,
            index=False,
            encoding="utf-8",
        )

    else:
        open(self.full_path(f"{filename}.ann"), "w", encoding="utf-8").close()
docs2brat(docs)

Writes a list of spaCy documents to file.

PARAMETER DESCRIPTION
docs

List of spaCy documents.

TYPE: List[Doc]

Source code in edsnlp/connectors/brat.py
def docs2brat(self, docs: List[Doc]) -> None:
    """
    Writes a list of spaCy documents to file.

    Parameters
    ----------
    docs:
        List of spaCy documents.
    """
    for doc in docs:
        self.doc2brat(doc)
read_brat_annotation(filename)

Reads a BRAT annotation file and returns a pandas DataFrame.

PARAMETER DESCRIPTION
filename

Path to the annotation file.

TYPE: str

RETURNS DESCRIPTION
annotations

DataFrame containing the annotations.

Source code in edsnlp/connectors/brat.py
def read_brat_annotation(filename: str) -> pd.DataFrame:
    """
    Read BRAT annotation file and returns a pandas DataFrame.

    Parameters
    ----------
    filename:
        Path to the annotation file.

    Returns
    -------
    annotations:
        DataFrame containing the annotations.
    """

    lines = []

    with open(filename, "r") as f:
        for line in f.readlines():
            lines.append(tuple(line.rstrip("\n").split("\t", 2)))

    if not lines or len(lines[0]) == 1:
        return pd.DataFrame(
            columns=["index", "start", "end", "label", "lexical_variant"]
        )

    annotations = pd.DataFrame(lines, columns=["index", "annot", "lexical_variant"])

    annotations["end"] = annotations.annot.str.split().str[-1]
    annotations["annot"] = annotations.annot.str.split(";").str[0]

    annotations["label"] = annotations.annot.str.split().str[:-2].str.join(" ")
    annotations["start"] = annotations.annot.str.split().str[-2]

    annotations[["start", "end"]] = annotations[["start", "end"]].astype(int)

    annotations = annotations.drop(columns=["annot"])

    return annotations

omop

OmopConnector

Bases: object

Two-way connector between OMOP-formatted pandas DataFrames (note and note_nlp tables) and spaCy documents.

PARAMETER DESCRIPTION
nlp

spaCy language object.

TYPE: Language

start_char

Name of the column containing the start character index of the entity, by default "start_char"

TYPE: str, optional

end_char

Name of the column containing the end character index of the entity, by default "end_char"

TYPE: str, optional
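Examples:

A hedged sketch; note and note_nlp are pandas DataFrames following the OMOP note and note_nlp schemas.

>>> import spacy
>>> connector = OmopConnector(spacy.blank("fr"))
>>> docs = connector.omop2docs(note, note_nlp)    # OMOP tables -> spaCy docs
>>> note, note_nlp = connector.docs2omop(docs)    # spaCy docs -> OMOP tables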

Source code in edsnlp/connectors/omop.py
class OmopConnector(object):
    """
    [summary]

    Parameters
    ----------
    nlp : Language
        spaCy language object.
    start_char : str, optional
        Name of the column containing the start character index of the entity,
        by default "start_char"
    end_char : str, optional
        Name of the column containing the end character index of the entity,
        by default "end_char"
    """

    def __init__(
        self,
        nlp: Language,
        start_char: str = "start_char",
        end_char: str = "end_char",
    ):

        self.start_char = start_char
        self.end_char = end_char

        self.nlp = nlp

    def preprocess(
        self, note: pd.DataFrame, note_nlp: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Preprocess the input OMOP tables: modification of the column names.

        Parameters
        ----------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.

        Returns
        -------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        """

        note_nlp = note_nlp.rename(
            columns={
                self.start_char: "start_char",
                self.end_char: "end_char",
            }
        )

        return note, note_nlp

    def postprocess(
        self, note: pd.DataFrame, note_nlp: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Postprocess the input OMOP tables: modification of the column names.

        Parameters
        ----------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.

        Returns
        -------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        """

        note_nlp = note_nlp.rename(
            columns={
                "start_char": self.start_char,
                "end_char": self.end_char,
            }
        )

        return note, note_nlp

    def omop2docs(
        self,
        note: pd.DataFrame,
        note_nlp: pd.DataFrame,
        extensions: Optional[List[str]] = None,
    ) -> List[Doc]:
        """
        Transforms OMOP tables to a list of spaCy documents.

        Parameters
        ----------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        extensions : Optional[List[str]], optional
            Extensions to keep, by default None

        Returns
        -------
        List[Doc]
            List of spaCy documents.
        """
        note, note_nlp = self.preprocess(note, note_nlp)
        return omop2docs(note, note_nlp, self.nlp, extensions)

    def docs2omop(
        self,
        docs: List[Doc],
        extensions: Optional[List[str]] = None,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Transforms a list of spaCy documents to a pair of OMOP tables.

        Parameters
        ----------
        docs : List[Doc]
            List of spaCy documents.
        extensions : Optional[List[str]], optional
            Extensions to keep, by default None

        Returns
        -------
        note : pd.DataFrame
            OMOP `note` table.
        note_nlp : pd.DataFrame
            OMOP `note_nlp` table.
        """
        note, note_nlp = docs2omop(docs, extensions=extensions)
        note, note_nlp = self.postprocess(note, note_nlp)
        return note, note_nlp
start_char = start_char instance-attribute
end_char = end_char instance-attribute
nlp = nlp instance-attribute
__init__(nlp, start_char='start_char', end_char='end_char')
Source code in edsnlp/connectors/omop.py
def __init__(
    self,
    nlp: Language,
    start_char: str = "start_char",
    end_char: str = "end_char",
):

    self.start_char = start_char
    self.end_char = end_char

    self.nlp = nlp
preprocess(note, note_nlp)

Preprocess the input OMOP tables: modification of the column names.

PARAMETER DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

Source code in edsnlp/connectors/omop.py
def preprocess(
    self, note: pd.DataFrame, note_nlp: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Preprocess the input OMOP tables: modification of the column names.

    Parameters
    ----------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.

    Returns
    -------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    """

    note_nlp = note_nlp.rename(
        columns={
            self.start_char: "start_char",
            self.end_char: "end_char",
        }
    )

    return note, note_nlp
postprocess(note, note_nlp)

Postprocess the input OMOP tables: modification of the column names.

PARAMETER DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

RETURNS DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

Source code in edsnlp/connectors/omop.py
def postprocess(
    self, note: pd.DataFrame, note_nlp: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Postprocess the input OMOP tables: modification of the column names.

    Parameters
    ----------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.

    Returns
    -------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    """

    note_nlp = note_nlp.rename(
        columns={
            "start_char": self.start_char,
            "end_char": self.end_char,
        }
    )

    return note, note_nlp
omop2docs(note, note_nlp, extensions=None)

Transforms OMOP tables to a list of spaCy documents.

PARAMETER DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
List[Doc]

List of spaCy documents.

Source code in edsnlp/connectors/omop.py
def omop2docs(
    self,
    note: pd.DataFrame,
    note_nlp: pd.DataFrame,
    extensions: Optional[List[str]] = None,
) -> List[Doc]:
    """
    Transforms OMOP tables to a list of spaCy documents.

    Parameters
    ----------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    List[Doc]
        List of spaCy documents.
    """
    note, note_nlp = self.preprocess(note, note_nlp)
    return omop2docs(note, note_nlp, self.nlp, extensions)
docs2omop(docs, extensions=None)

Transforms a list of spaCy documents to a pair of OMOP tables.

PARAMETER DESCRIPTION
docs

List of spaCy documents.

TYPE: List[Doc]

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
note

OMOP note table.

TYPE: pd.DataFrame

note_nlp

OMOP note_nlp table.

TYPE: pd.DataFrame

Source code in edsnlp/connectors/omop.py
def docs2omop(
    self,
    docs: List[Doc],
    extensions: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Transforms a list of spaCy documents to a pair of OMOP tables.

    Parameters
    ----------
    docs : List[Doc]
        List of spaCy documents.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    note : pd.DataFrame
        OMOP `note` table.
    note_nlp : pd.DataFrame
        OMOP `note_nlp` table.
    """
    note, note_nlp = docs2omop(docs, extensions=extensions)
    note, note_nlp = self.postprocess(note, note_nlp)
    return note, note_nlp
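A minimal usage sketch with made-up tables: the note table needs at least note_id and note_text, and note_nlp needs note_id, the offset columns and note_nlp_source_value. The extension registration below is defensive and may already be handled elsewhere in edsnlp.

import pandas as pd
import spacy
from spacy.tokens import Doc

from edsnlp.connectors.omop import OmopConnector  # assumed import path

# Defensive registration of the extensions the connector reads and writes
for ext in ("note_id", "note_datetime"):
    if not Doc.has_extension(ext):
        Doc.set_extension(ext, default=None)

note = pd.DataFrame(
    dict(note_id=[0], note_text=["Le patient se plaint de douleurs."], note_datetime=[None])
)
note_nlp = pd.DataFrame(
    dict(note_id=[0], start_char=[24], end_char=[32], note_nlp_source_value=["symptome"])
)

connector = OmopConnector(nlp=spacy.blank("fr"))
docs = connector.omop2docs(note, note_nlp)    # OMOP tables -> annotated spaCy docs
note2, note_nlp2 = connector.docs2omop(docs)  # and back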
omop2docs(note, note_nlp, nlp, extensions=None)

Transforms an OMOP-formatted pair of dataframes into a list of documents.

PARAMETER DESCRIPTION
note

The OMOP note table.

TYPE: pd.DataFrame

note_nlp

The OMOP note_nlp table

TYPE: pd.DataFrame

nlp

spaCy language object.

TYPE: Language

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
List[Doc]

List of spaCy documents

Source code in edsnlp/connectors/omop.py
def omop2docs(
    note: pd.DataFrame,
    note_nlp: pd.DataFrame,
    nlp: Language,
    extensions: Optional[List[str]] = None,
) -> List[Doc]:
    """
    Transforms an OMOP-formatted pair of dataframes into a list of documents.

    Parameters
    ----------
    note : pd.DataFrame
        The OMOP `note` table.
    note_nlp : pd.DataFrame
        The OMOP `note_nlp` table
    nlp : Language
        spaCy language object.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    List[Doc] :
        List of spaCy documents
    """

    note = note.copy()
    note_nlp = note_nlp.copy()

    extensions = extensions or []

    def row2ent(row):
        d = dict(
            start_char=row.start_char,
            end_char=row.end_char,
            label=row.get("note_nlp_source_value"),
            extensions={ext: row.get(ext) for ext in extensions},
        )

        return d

    # Create entities
    note_nlp["ents"] = note_nlp.apply(row2ent, axis=1)

    note_nlp = note_nlp.groupby("note_id", as_index=False)["ents"].agg(list)

    note = note.merge(note_nlp, on=["note_id"], how="left")

    # Generate documents
    note["doc"] = note.note_text.apply(nlp)

    # Process documents
    for _, row in note.iterrows():

        doc = row.doc
        doc._.note_id = row.note_id
        doc._.note_datetime = row.get("note_datetime")

        ents = []

        if not isinstance(row.ents, list):
            continue

        for ent in row.ents:

            span = doc.char_span(
                ent["start_char"],
                ent["end_char"],
                ent["label"],
                alignment_mode="expand",
            )

            for k, v in ent["extensions"].items():
                setattr(span._, k, v)

            ents.append(span)

            if span.label_ not in doc.spans:
                doc.spans[span.label_] = [span]
            else:
                doc.spans[span.label_].append(span)

        ents, discarded = filter_spans(ents, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

    return list(note.doc)
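To carry extra qualifiers, a note_nlp column with the same name as a registered Span extension can be forwarded through the extensions argument. A hedged sketch follows; the negation extension and column are made up.

import pandas as pd
import spacy
from spacy.tokens import Doc, Span

from edsnlp.connectors.omop import omop2docs  # assumed import path

# Hypothetical extension, mirrored by a note_nlp column of the same name
if not Span.has_extension("negation"):
    Span.set_extension("negation", default=None)
for ext in ("note_id", "note_datetime"):
    if not Doc.has_extension(ext):
        Doc.set_extension(ext, default=None)

note = pd.DataFrame(dict(note_id=[0], note_text=["Pas de douleur."]))
note_nlp = pd.DataFrame(
    dict(
        note_id=[0],
        start_char=[7],
        end_char=[14],
        note_nlp_source_value=["symptome"],
        negation=[True],
    )
)

docs = omop2docs(note, note_nlp, nlp=spacy.blank("fr"), extensions=["negation"])
ent = docs[0].ents[0]
print(ent.text, ent.label_, ent._.negation)  # douleur symptome True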
docs2omop(docs, extensions=None)

Transforms a list of spaCy docs to a pair of OMOP tables.

PARAMETER DESCRIPTION
docs

List of documents to transform.

TYPE: List[Doc]

extensions

Extensions to keep, by default None

TYPE: Optional[List[str]], optional DEFAULT: None

RETURNS DESCRIPTION
Tuple[pd.DataFrame, pd.DataFrame]

Pair of OMOP tables (note and note_nlp)

Source code in edsnlp/connectors/omop.py
def docs2omop(
    docs: List[Doc],
    extensions: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Transforms a list of spaCy docs to a pair of OMOP tables.

    Parameters
    ----------
    docs : List[Doc]
        List of documents to transform.
    extensions : Optional[List[str]], optional
        Extensions to keep, by default None

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Pair of OMOP tables (`note` and `note_nlp`)
    """

    df = pd.DataFrame(dict(doc=docs))

    df["note_text"] = df.doc.apply(lambda doc: doc.text)
    df["note_id"] = df.doc.apply(lambda doc: doc._.note_id)
    df["note_datetime"] = df.doc.apply(lambda doc: doc._.note_datetime)

    if df.note_id.isna().any():
        df["note_id"] = range(len(df))

    df["ents"] = df.doc.apply(lambda doc: list(doc.ents))
    df["ents"] += df.doc.apply(lambda doc: list(doc.spans["discarded"]))

    note = df[["note_id", "note_text", "note_datetime"]]

    df = df[["note_id", "ents"]].explode("ents")

    extensions = extensions or []

    def ent2dict(
        ent: Span,
    ) -> Dict[str, Any]:

        d = dict(
            start_char=ent.start_char,
            end_char=ent.end_char,
            note_nlp_source_value=ent.label_,
            lexical_variant=ent.text,
            # normalized_variant=ent._.normalized.text,
        )

        for ext in extensions:
            d[ext] = getattr(ent._, ext)

        return d

    df["ents"] = df.ents.apply(ent2dict)

    columns = [
        "start_char",
        "end_char",
        "note_nlp_source_value",
        "lexical_variant",
        # "normalized_variant",
    ]
    columns += extensions

    df[columns] = df.ents.apply(pd.Series)

    df["term_modifiers"] = ""

    for i, ext in enumerate(extensions):
        if i > 0:
            df.term_modifiers += ";"
        df.term_modifiers += ext + "=" + df[ext].astype(str)

    df["note_nlp_id"] = range(len(df))

    note_nlp = df[["note_nlp_id", "note_id"] + columns]

    return note, note_nlp
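Going the other way, any list of annotated documents can be flattened into OMOP tables, provided each document carries a note_id extension and a discarded span group (the eds.matcher component used below maintains the latter). A hedged sketch:

import spacy
from spacy.tokens import Doc

from edsnlp.connectors.omop import docs2omop  # assumed import path

# Defensive registration; missing note_id values are renumbered by docs2omop
for ext in ("note_id", "note_datetime"):
    if not Doc.has_extension(ext):
        Doc.set_extension(ext, default=None)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur", "douleurs"])))

docs = list(nlp.pipe(["Le patient a des douleurs.", "La douleur persiste."]))

note, note_nlp = docs2omop(docs)
# note:     one row per document (note_id, note_text, note_datetime)
# note_nlp: one row per entity   (note_nlp_id, note_id, start_char, end_char,
#                                 note_nlp_source_value, lexical_variant)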

labeltool

docs2labeltool(docs, extensions=None)

Returns a labeltool-ready dataframe from a list of annotated documents.

PARAMETER DESCRIPTION
docs

List of annotated spacy docs.

TYPE: List[Doc]

extensions

List of extensions to use by labeltool.

TYPE: Optional[List[str]] DEFAULT: None

RETURNS DESCRIPTION
df

DataFrame tailored for labeltool.

Source code in edsnlp/connectors/labeltool.py
def docs2labeltool(
    docs: List[Doc],
    extensions: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Returns a labeltool-ready dataframe from a list of annotated documents.

    Parameters
    ----------
    docs: list of spaCy Doc
        List of annotated spacy docs.
    extensions: list of extensions
        List of extensions to use by labeltool.

    Returns
    -------
    df: pd.DataFrame
        DataFrame tailored for labeltool.
    """

    if extensions is None:
        extensions = []

    entities = []

    for i, doc in enumerate(tqdm(docs, ascii=True, ncols=100)):
        for ent in doc.ents:
            d = dict(
                note_text=doc.text,
                offset_begin=ent.start_char,
                offset_end=ent.end_char,
                label_name=ent.label_,
                label_value=ent.text,
            )

            d["note_id"] = doc._.note_id or i

            for ext in extensions:
                d[ext] = getattr(ent._, ext)

            entities.append(d)

    df = pd.DataFrame.from_records(entities)

    columns = [
        "note_id",
        "note_text",
        "offset_begin",
        "offset_end",
        "label_name",
        "label_value",
    ]

    df = df[columns + extensions]

    return df
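As with the other connectors, a hedged usage sketch (the pipeline and texts are made up; note_id is registered defensively since the function falls back on the document index when it is empty):

import spacy
from spacy.tokens import Doc

from edsnlp.connectors.labeltool import docs2labeltool  # assumed import path

if not Doc.has_extension("note_id"):
    Doc.set_extension("note_id", default=None)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur", "douleurs"])))

docs = list(nlp.pipe(["Le patient a des douleurs.", "La douleur persiste."]))

df = docs2labeltool(docs)
# One row per entity: note_id, note_text, offset_begin, offset_end, label_name, label_value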

pipelines

terminations

termination: List[str] = ['et', 'bien que', 'même si', 'mais', 'or', 'alors que', 'sauf', 'cependant', 'pourtant', 'cause de', 'source de', 'hormis', 'car', 'parce que', 'pourtant', 'puisque', 'ni', 'en raison de', 'qui', 'que', 'ainsi que', 'avec', 'toutefois', 'en dehors', 'dans le cadre', 'du fait', '.', ',', ';', '...', '…', '(', ')', '"'] module-attribute

factories

base

BaseComponent

Bases: object

The BaseComponent adds a set_extensions method, called at the creation of the object.

It helps decouple the initialisation of the pipeline from the creation of extensions, and is particularly useful when distributing EDSNLP on a cluster, since the serialisation mechanism imposes that the extensions be reset.

Source code in edsnlp/pipelines/base.py
class BaseComponent(object):
    """
    The `BaseComponent` adds a `set_extensions` method,
    called at the creation of the object.

    It helps decouple the initialisation of the pipeline from
    the creation of extensions, and is particularly usefull when
    distributing EDSNLP on a cluster, since the serialisation mechanism
    imposes that the extensions be reset.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """
        Set `Doc`, `Span` and `Token` extensions.
        """
        pass

    def _boundaries(
        self, doc: Doc, terminations: Optional[List[Span]] = None
    ) -> List[Tuple[int, int]]:
        """
        Create sub-sentences based on sentences and terminations found in the text.

        Parameters
        ----------
        doc:
            spaCy Doc object
        terminations:
            List of termination spans found in the text.

        Returns
        -------
        boundaries:
            List of tuples with (start, end) of spans
        """

        if terminations is None:
            terminations = []

        sent_starts = [sent.start for sent in doc.sents]
        termination_starts = [t.start for t in terminations]

        starts = sent_starts + termination_starts + [len(doc)]

        # Remove duplicates
        starts = list(set(starts))

        # Sort starts
        starts.sort()

        boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]

        return boundaries
__init__(*args, **kwargs)
Source code in edsnlp/pipelines/base.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.set_extensions()
set_extensions()

Set Doc, Span and Token extensions.

Source code in edsnlp/pipelines/base.py
@staticmethod
def set_extensions() -> None:
    """
    Set `Doc`, `Span` and `Token` extensions.
    """
    pass
_boundaries(doc, terminations=None)

Create sub-sentences based on sentences and terminations found in the text.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

terminations

List of termination spans found in the text.

TYPE: Optional[List[Span]] DEFAULT: None

RETURNS DESCRIPTION
boundaries

List of tuples with (start, end) of spans

Source code in edsnlp/pipelines/base.py
def _boundaries(
    self, doc: Doc, terminations: Optional[List[Span]] = None
) -> List[Tuple[int, int]]:
    """
    Create sub-sentences based on sentences and terminations found in the text.

    Parameters
    ----------
    doc:
        spaCy Doc object
    terminations:
        List of termination spans found in the text.

    Returns
    -------
    boundaries:
        List of tuples with (start, end) of spans
    """

    if terminations is None:
        terminations = []

    sent_starts = [sent.start for sent in doc.sents]
    termination_starts = [t.start for t in terminations]

    starts = sent_starts + termination_starts + [len(doc)]

    # Remove duplicates
    starts = list(set(starts))

    # Sort starts
    starts.sort()

    boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]

    return boundaries
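To make the boundary construction concrete, here is the same logic on made-up token indices (no spaCy objects involved):

# Sentences start at tokens 0, 10 and 25, terminations at tokens 5 and 18,
# and the document counts 30 tokens.
sent_starts = [0, 10, 25]
termination_starts = [5, 18]
n_tokens = 30

starts = sorted(set(sent_starts + termination_starts + [n_tokens]))
boundaries = list(zip(starts[:-1], starts[1:]))
print(boundaries)  # [(0, 5), (5, 10), (10, 18), (18, 25), (25, 30)]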

core

sentences
terms
punctuation = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] module-attribute
sentences
SentenceSegmenter

Bases: object

Segments the Doc into sentences using a rule-based strategy, specific to AP-HP documents.

Applies the same rule-based pipeline as spaCy's sentencizer, and adds a simple rule on new lines: if a new line is followed by a capitalised word, then it is also an end of sentence.

DOCS: https://spacy.io/api/sentencizer

Arguments

punct_chars

Punctuation characters.

TYPE: Optional[List[str]]

use_endlines

Whether to use endlines prediction.

TYPE: bool

Source code in edsnlp/pipelines/core/sentences/sentences.py
class SentenceSegmenter(object):
    """
    Segments the Doc into sentences using a rule-based strategy,
    specific to AP-HP documents.

    Applies the same rule-based pipeline as spaCy's sentencizer,
    and adds a simple rule on new lines: if a new line is followed by a
    capitalised word, then it is also an end of sentence.

    DOCS: https://spacy.io/api/sentencizer

    Arguments
    ---------
    punct_chars : Optional[List[str]]
        Punctuation characters.
    use_endlines : bool
        Whether to use endlines prediction.
    """

    def __init__(
        self,
        punct_chars: Optional[List[str]],
        use_endlines: bool,
    ):

        if punct_chars is None:
            punct_chars = punctuation

        self.punct_chars = set(punct_chars)
        self.use_endlines = use_endlines

    def __call__(self, doc: Doc) -> Doc:
        """
        Segments the document in sentences.

        Arguments
        ---------
        doc:
            A spacy Doc object.

        Returns
        -------
        doc:
            A spaCy Doc object, annotated for sentences.
        """

        if not doc:
            return doc

        doc[0].sent_start = True

        seen_period = False
        seen_newline = False

        for i, token in enumerate(doc):
            is_in_punct_chars = token.text in self.punct_chars
            is_newline = token.is_space and "\n" in token.text

            if self.use_endlines:
                end_line = getattr(token._, "end_line", None)
                is_newline = is_newline and (end_line or end_line is None)

            token.sent_start = (
                i == 0
            )  # Sets the attribute to False by default for all other tokens
            if seen_period or seen_newline:
                if token.is_punct or is_in_punct_chars or is_newline:
                    continue
                if seen_period:
                    token.sent_start = True
                    seen_newline = False
                    seen_period = False
                else:
                    token.sent_start = token.shape_.startswith("Xx")
                    seen_newline = False
                    seen_period = False
            elif is_in_punct_chars:
                seen_period = True
            elif is_newline:
                seen_newline = True

        return doc
punct_chars = set(punct_chars) instance-attribute
use_endlines = use_endlines instance-attribute
__init__(punct_chars, use_endlines)
Source code in edsnlp/pipelines/core/sentences/sentences.py
def __init__(
    self,
    punct_chars: Optional[List[str]],
    use_endlines: bool,
):

    if punct_chars is None:
        punct_chars = punctuation

    self.punct_chars = set(punct_chars)
    self.use_endlines = use_endlines
__call__(doc)

Segments the document in sentences.

Arguments

doc

A spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

A spaCy Doc object, annotated for sentences.

Source code in edsnlp/pipelines/core/sentences/sentences.py
def __call__(self, doc: Doc) -> Doc:
    """
    Segments the document in sentences.

    Arguments
    ---------
    doc:
        A spacy Doc object.

    Returns
    -------
    doc:
        A spaCy Doc object, annotated for sentences.
    """

    if not doc:
        return doc

    doc[0].sent_start = True

    seen_period = False
    seen_newline = False

    for i, token in enumerate(doc):
        is_in_punct_chars = token.text in self.punct_chars
        is_newline = token.is_space and "\n" in token.text

        if self.use_endlines:
            end_line = getattr(token._, "end_line", None)
            is_newline = is_newline and (end_line or end_line is None)

        token.sent_start = (
            i == 0
        )  # Sets the attribute to False by default for all other tokens
        if seen_period or seen_newline:
            if token.is_punct or is_in_punct_chars or is_newline:
                continue
            if seen_period:
                token.sent_start = True
                seen_newline = False
                seen_period = False
            else:
                token.sent_start = token.shape_.startswith("Xx")
                seen_newline = False
                seen_period = False
        elif is_in_punct_chars:
            seen_period = True
        elif is_newline:
            seen_newline = True

    return doc
factory
DEFAULT_CONFIG = dict(punct_chars=None, use_endlines=True) module-attribute
create_component(nlp, name, punct_chars, use_endlines)
Source code in edsnlp/pipelines/core/sentences/factory.py
@deprecated_factory("sentences", "eds.sentences", default_config=DEFAULT_CONFIG)
@Language.factory("eds.sentences", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    punct_chars: Optional[List[str]],
    use_endlines: bool,
):
    return SentenceSegmenter(
        punct_chars=punct_chars,
        use_endlines=use_endlines,
    )
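A minimal sketch of adding the segmenter to a blank pipeline (the example text is made up):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")  # replaces the deprecated "sentences" factory name

doc = nlp("Examen clinique sans particularité\nLe patient est apyrétique.")
print([sent.text for sent in doc.sents])
# The newline followed by a capitalised word opens a new sentence.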
matcher
matcher
GenericMatcher

Bases: BaseComponent

Provides a generic matcher component.

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

terms

A dictionary of terms.

TYPE: Optional[Patterns]

regex

A dictionary of regular expressions.

TYPE: Optional[Patterns]

attr

The default attribute to use for matching. Can be overridden using the terms and regex configurations.

TYPE: str

filter_matches

Whether to filter out matches.

TYPE: bool

on_ents_only

Whether to look for matches around pre-extracted entities only.

TYPE: bool

ignore_excluded

Whether to skip excluded tokens (requires an upstream pipeline to mark excluded tokens).

TYPE: bool

Source code in edsnlp/pipelines/core/matcher/matcher.py
class GenericMatcher(BaseComponent):
    """
    Provides a generic matcher component.

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    terms : Optional[Patterns]
        A dictionary of terms.
    regex : Optional[Patterns]
        A dictionary of regular expressions.
    attr : str
        The default attribute to use for matching.
        Can be overridden using the `terms` and `regex` configurations.
    filter_matches : bool
        Whether to filter out matches.
    on_ents_only : bool
        Whether to look for matches around pre-extracted entities only.
    ignore_excluded : bool
        Whether to skip excluded tokens (requires an upstream
        pipeline to mark excluded tokens).
    """

    def __init__(
        self,
        nlp: Language,
        terms: Optional[Patterns],
        regex: Optional[Patterns],
        attr: str,
        ignore_excluded: bool,
    ):

        self.nlp = nlp

        self.attr = attr

        self.phrase_matcher = EDSPhraseMatcher(
            self.nlp.vocab,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )
        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
        self.regex_matcher.build_patterns(regex=regex)

        self.set_extensions()

    def process(self, doc: Doc) -> List[Span]:
        """
        Find matching spans in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object.

        Returns
        -------
        spans:
            List of Spans returned by the matchers.
        """

        matches = self.phrase_matcher(doc, as_spans=True)
        regex_matches = self.regex_matcher(doc, as_spans=True)

        spans = list(matches) + list(regex_matches)

        return spans

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """
        matches = self.process(doc)

        for span in matches:
            if span.label_ not in doc.spans:
                doc.spans[span.label_] = []
            doc.spans[span.label_].append(span)

        ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc
nlp = nlp instance-attribute
attr = attr instance-attribute
phrase_matcher = EDSPhraseMatcher(self.nlp.vocab, attr=attr, ignore_excluded=ignore_excluded) instance-attribute
regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute
__init__(nlp, terms, regex, attr, ignore_excluded)
Source code in edsnlp/pipelines/core/matcher/matcher.py
def __init__(
    self,
    nlp: Language,
    terms: Optional[Patterns],
    regex: Optional[Patterns],
    attr: str,
    ignore_excluded: bool,
):

    self.nlp = nlp

    self.attr = attr

    self.phrase_matcher = EDSPhraseMatcher(
        self.nlp.vocab,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
    self.regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)
    self.regex_matcher.build_patterns(regex=regex)

    self.set_extensions()
process(doc)

Find matching spans in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
spans

List of Spans returned by the matchers.

Source code in edsnlp/pipelines/core/matcher/matcher.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find matching spans in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object.

    Returns
    -------
    spans:
        List of Spans returned by the matchers.
    """

    matches = self.phrase_matcher(doc, as_spans=True)
    regex_matches = self.regex_matcher(doc, as_spans=True)

    spans = list(matches) + list(regex_matches)

    return spans
__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/core/matcher/matcher.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """
    matches = self.process(doc)

    for span in matches:
        if span.label_ not in doc.spans:
            doc.spans[span.label_] = []
        doc.spans[span.label_].append(span)

    ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
factory
DEFAULT_CONFIG = dict(terms=None, regex=None, attr='TEXT', ignore_excluded=False) module-attribute
create_component(nlp, name, terms, attr, regex, ignore_excluded)
Source code in edsnlp/pipelines/core/matcher/factory.py
@deprecated_factory("matcher", "eds.matcher", default_config=DEFAULT_CONFIG)
@Language.factory("eds.matcher", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    terms: Optional[Dict[str, Union[str, List[str]]]],
    attr: Union[str, Dict[str, str]],
    regex: Optional[Dict[str, Union[str, List[str]]]],
    ignore_excluded: bool,
):
    assert not (terms is None and regex is None)

    if terms is None:
        terms = dict()
    if regex is None:
        regex = dict()

    return GenericMatcher(
        nlp,
        terms=terms,
        attr=attr,
        regex=regex,
        ignore_excluded=ignore_excluded,
    )
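A hedged usage sketch combining a term and a regular expression (labels, patterns and the example text are made up):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe(
    "eds.matcher",
    config=dict(
        terms=dict(diabete=["diabète", "diabétique"]),
        regex=dict(date=r"\d{2}/\d{2}/\d{4}"),
        attr="LOWER",
    ),
)

doc = nlp("Patient diabétique, vu le 21/09/2021.")
print([(ent.text, ent.label_) for ent in doc.ents])
# -> 'diabétique' labelled 'diabete' and '21/09/2021' labelled 'date'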
endlines
functional
_get_label(prediction)

Returns the label for the prediction PREDICTED_END_LINE

PARAMETER DESCRIPTION
prediction

value of PREDICTED_END_LINE

TYPE: bool

RETURNS DESCRIPTION
str

Label for PREDICTED_END_LINE

Source code in edsnlp/pipelines/core/endlines/functional.py
def _get_label(prediction: bool) -> str:
    """Returns the label for the prediction `PREDICTED_END_LINE`

    Parameters
    ----------
    prediction : bool
        value of `PREDICTED_END_LINE`

    Returns
    -------
    str
        Label for `PREDICTED_END_LINE`
    """
    if prediction:
        return "end_line"
    else:
        return "space"
get_dir_path(file)
Source code in edsnlp/pipelines/core/endlines/functional.py
def get_dir_path(file):
    path_file = os.path.dirname(os.path.realpath(file))
    return path_file
build_path(file, relative_path)

Function to build an absolute path.

PARAMETER DESCRIPTION
file

Main file from which the function is called (typically __file__).

relative_path

Relative path from the main file to the desired output.

RETURNS DESCRIPTION
path

Absolute path.
Source code in edsnlp/pipelines/core/endlines/functional.py
def build_path(file, relative_path):
    """
    Function to build an absolute path.

    Parameters
    ----------
    file: main file from which the function is called. It could be __file__
    relative_path: str,
        relative path from the main file to the desired output

    Returns
    -------
    path: absolute path
    """
    dir_path = get_dir_path(file)
    path = os.path.abspath(os.path.join(dir_path, relative_path))
    return path
_convert_series_to_array(s)

Converts pandas series of n elements to an array of shape (n,1).

PARAMETER DESCRIPTION
s

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/functional.py
def _convert_series_to_array(s: pd.Series) -> np.ndarray:
    """Converts pandas series of n elements to an array of shape (n,1).

    Parameters
    ----------
    s : pd.Series

    Returns
    -------
    np.ndarray
    """
    X = s.to_numpy().reshape(-1, 1).astype("O")  # .astype(np.int64)
    return X
endlines
EndLines

Bases: GenericMatcher

spaCy pipeline to detect whether a newline character should be considered a space (i.e. introduced by the PDF).

The pipeline will add the extension end_line to spans and tokens. The end_line attribute is a boolean or None, set to True if the pipeline predicts that the new line is an end line character. Otherwise, it is set to False if the new line is classified as a space. If no classification has been done over that token, it will remain None.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

end_lines_model

Path to a trained model. If None, the default model is used.

TYPE: Optional[Union[str, EndLinesModel]] DEFAULT: None

Source code in edsnlp/pipelines/core/endlines/endlines.py
class EndLines(GenericMatcher):
    """
    spaCy Pipeline to detect whether a newline character should
    be considered a space (i.e. introduced by the PDF).

    The pipeline will add the extension `end_line` to spans
    and tokens. The `end_line` attribute is a boolean or `None`,
    set to `True` if the pipeline predicts that the new line
    is an end line character. Otherwise, it is  set to `False`
    if the new line is classified as a space. If no classification
    has been done over that token, it will remain `None`.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.

    end_lines_model : Optional[Union[str, EndLinesModel]], by default None
        path to trained model. If None, it will use a default model
    """

    def __init__(
        self,
        nlp: Language,
        end_lines_model: Optional[Union[str, EndLinesModel]],
        **kwargs,
    ):

        super().__init__(
            nlp,
            terms=None,
            attr="TEXT",
            regex=dict(
                new_line=r"\n+",
            ),
            ignore_excluded=False,
            **kwargs,
        )

        if not Token.has_extension("end_line"):
            Token.set_extension("end_line", default=None)

        if not Span.has_extension("end_line"):
            Span.set_extension("end_line", default=None)

        self._read_model(end_lines_model)

    def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]):
        """
        Parameters
        ----------
        end_lines_model : Optional[Union[str, EndLinesModel]]

        Raises
        ------
        TypeError
        """
        if end_lines_model is None:
            path = build_path(__file__, "base_model.pkl")

            with open(path, "rb") as inp:
                self.model = pickle.load(inp)
        elif type(end_lines_model) == str:
            with open(end_lines_model, "rb") as inp:
                self.model = pickle.load(inp)
        elif type(end_lines_model) == EndLinesModel:
            self.model = end_lines_model
        else:
            raise TypeError(
                "type(`end_lines_model`) should be one of {None, str, EndLinesModel}"
            )

    @staticmethod
    def _spacy_compute_a3a4(token: Token) -> str:
        """Function to compute A3 and A4

        Parameters
        ----------
        token : Token

        Returns
        -------
        str
        """

        if token.is_upper:
            return "UPPER"

        elif token.shape_.startswith("Xx"):
            return "S_UPPER"

        elif token.shape_.startswith("x"):
            return "LOWER"

        elif (token.is_digit) & (
            (token.doc[max(token.i - 1, 0)].is_punct)
            | (token.doc[min(token.i + 1, len(token.doc) - 1)].is_punct)
        ):
            return "ENUMERATION"

        elif token.is_digit:
            return "DIGIT"

        elif (token.is_punct) & (token.text in [".", ";", "..", "..."]):
            return "STRONG_PUNCT"

        elif (token.is_punct) & (token.text not in [".", ";", "..", "..."]):
            return "SOFT_PUNCT"

        else:
            return "OTHER"

    @staticmethod
    def _compute_length(doc: Doc, start: int, end: int) -> int:
        """Compute length without spaces

        Parameters
        ----------
        doc : Doc
        start : int
        end : int

        Returns
        -------
        int
        """
        length = 0
        for t in doc[start:end]:
            length += len(t.text)

        return length

    def _get_df(self, doc: Doc, new_lines: List[Span]) -> pd.DataFrame:
        """Get a pandas DataFrame to call the classifier

        Parameters
        ----------
        doc : Doc
        new_lines : List[Span]

        Returns
        -------
        pd.DataFrame
        """

        data = []
        for i, span in enumerate(new_lines):
            start = span.start
            end = span.end

            max_index = len(doc) - 1
            a1_token = doc[max(start - 1, 0)]
            a2_token = doc[min(start + 1, max_index)]
            a1 = a1_token.orth
            a2 = a2_token.orth
            a3 = self._spacy_compute_a3a4(a1_token)
            a4 = self._spacy_compute_a3a4(a2_token)
            blank_line = "\n\n" in span.text

            if i > 0:
                start_previous = new_lines[i - 1].start + 1
            else:
                start_previous = 0

            length = self._compute_length(
                doc, start=start_previous, end=start
            )  # Count the total length from the previous new line up to this one

            data_dict = dict(
                span_start=start,
                span_end=end,
                A1=a1,
                A2=a2,
                A3=a3,
                A4=a4,
                BLANK_LINE=blank_line,
                length=length,
            )
            data.append(data_dict)

        df = pd.DataFrame(data)

        mu = df["length"].mean()
        sigma = df["length"].std()
        if np.isnan(sigma):
            sigma = 1

        cv = sigma / mu
        df["B1"] = (df["length"] - mu) / sigma
        df["B2"] = cv

        return df

    def __call__(self, doc: Doc) -> Doc:
        """
        Predict for each new line if it's an end of line or a space.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, with each new line annotated
        """

        matches = self.process(doc)
        new_lines = get_spans(matches, "new_line")

        if len(new_lines) > 0:
            df = self._get_df(doc=doc, new_lines=new_lines)
            df = self.model.predict(df)

            spans = []
            for span, prediction in zip(new_lines, df.PREDICTED_END_LINE):

                span.label_ = _get_label(prediction)
                span._.end_line = prediction

                spans.append(span)
                for t in span:
                    t._.end_line = prediction
                    if not prediction:
                        t._.excluded = True

            doc.spans["new_lines"] = spans
        return doc
__init__(nlp, end_lines_model, **kwargs)
Source code in edsnlp/pipelines/core/endlines/endlines.py
def __init__(
    self,
    nlp: Language,
    end_lines_model: Optional[Union[str, EndLinesModel]],
    **kwargs,
):

    super().__init__(
        nlp,
        terms=None,
        attr="TEXT",
        regex=dict(
            new_line=r"\n+",
        ),
        ignore_excluded=False,
        **kwargs,
    )

    if not Token.has_extension("end_line"):
        Token.set_extension("end_line", default=None)

    if not Span.has_extension("end_line"):
        Span.set_extension("end_line", default=None)

    self._read_model(end_lines_model)
_read_model(end_lines_model)
PARAMETER DESCRIPTION
end_lines_model

TYPE: Optional[Union[str, EndLinesModel]]

RAISES DESCRIPTION
TypeError
Source code in edsnlp/pipelines/core/endlines/endlines.py
def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]):
    """
    Parameters
    ----------
    end_lines_model : Optional[Union[str, EndLinesModel]]

    Raises
    ------
    TypeError
    """
    if end_lines_model is None:
        path = build_path(__file__, "base_model.pkl")

        with open(path, "rb") as inp:
            self.model = pickle.load(inp)
    elif type(end_lines_model) == str:
        with open(end_lines_model, "rb") as inp:
            self.model = pickle.load(inp)
    elif type(end_lines_model) == EndLinesModel:
        self.model = end_lines_model
    else:
        raise TypeError(
            "type(`end_lines_model`) should be one of {None, str, EndLinesModel}"
        )
_spacy_compute_a3a4(token)

Function to compute A3 and A4

PARAMETER DESCRIPTION
token

TYPE: Token

RETURNS DESCRIPTION
str
Source code in edsnlp/pipelines/core/endlines/endlines.py
@staticmethod
def _spacy_compute_a3a4(token: Token) -> str:
    """Function to compute A3 and A4

    Parameters
    ----------
    token : Token

    Returns
    -------
    str
    """

    if token.is_upper:
        return "UPPER"

    elif token.shape_.startswith("Xx"):
        return "S_UPPER"

    elif token.shape_.startswith("x"):
        return "LOWER"

    elif (token.is_digit) & (
        (token.doc[max(token.i - 1, 0)].is_punct)
        | (token.doc[min(token.i + 1, len(token.doc) - 1)].is_punct)
    ):
        return "ENUMERATION"

    elif token.is_digit:
        return "DIGIT"

    elif (token.is_punct) & (token.text in [".", ";", "..", "..."]):
        return "STRONG_PUNCT"

    elif (token.is_punct) & (token.text not in [".", ";", "..", "..."]):
        return "SOFT_PUNCT"

    else:
        return "OTHER"
_compute_length(doc, start, end)

Compute length without spaces

PARAMETER DESCRIPTION
doc

TYPE: Doc

start

TYPE: int

end

TYPE: int

RETURNS DESCRIPTION
int
Source code in edsnlp/pipelines/core/endlines/endlines.py
@staticmethod
def _compute_length(doc: Doc, start: int, end: int) -> int:
    """Compute length without spaces

    Parameters
    ----------
    doc : Doc
    start : int
    end : int

    Returns
    -------
    int
    """
    length = 0
    for t in doc[start:end]:
        length += len(t.text)

    return length
_get_df(doc, new_lines)

Get a pandas DataFrame to call the classifier

PARAMETER DESCRIPTION
doc

TYPE: Doc

new_lines

TYPE: List[Span]

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlines.py
def _get_df(self, doc: Doc, new_lines: List[Span]) -> pd.DataFrame:
    """Get a pandas DataFrame to call the classifier

    Parameters
    ----------
    doc : Doc
    new_lines : List[Span]

    Returns
    -------
    pd.DataFrame
    """

    data = []
    for i, span in enumerate(new_lines):
        start = span.start
        end = span.end

        max_index = len(doc) - 1
        a1_token = doc[max(start - 1, 0)]
        a2_token = doc[min(start + 1, max_index)]
        a1 = a1_token.orth
        a2 = a2_token.orth
        a3 = self._spacy_compute_a3a4(a1_token)
        a4 = self._spacy_compute_a3a4(a2_token)
        blank_line = "\n\n" in span.text

        if i > 0:
            start_previous = new_lines[i - 1].start + 1
        else:
            start_previous = 0

        length = self._compute_length(
            doc, start=start_previous, end=start
        )  # Count the total length from the previous new line up to this one

        data_dict = dict(
            span_start=start,
            span_end=end,
            A1=a1,
            A2=a2,
            A3=a3,
            A4=a4,
            BLANK_LINE=blank_line,
            length=length,
        )
        data.append(data_dict)

    df = pd.DataFrame(data)

    mu = df["length"].mean()
    sigma = df["length"].std()
    if np.isnan(sigma):
        sigma = 1

    cv = sigma / mu
    df["B1"] = (df["length"] - mu) / sigma
    df["B2"] = cv

    return df
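In other words, B1 is the standardised length of the text preceding each new line and B2 the coefficient of variation of those lengths. A small numeric illustration with made-up lengths:

import numpy as np
import pandas as pd

# Hypothetical lengths (characters, spaces excluded) of the text before each new line
df = pd.DataFrame(dict(length=[35, 80, 12]))

mu = df["length"].mean()
sigma = df["length"].std()
if np.isnan(sigma):  # single observation: fall back to 1, as in _get_df
    sigma = 1

df["B1"] = (df["length"] - mu) / sigma  # standardised length
df["B2"] = sigma / mu                   # coefficient of variation, identical for every row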
__call__(doc)

Predict for each new line if it's an end of line or a space.

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc
Source code in edsnlp/pipelines/core/endlines/endlines.py
def __call__(self, doc: Doc) -> Doc:
    """
    Predict for each new line if it's an end of line or a space.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, with each new line annotated
    """

    matches = self.process(doc)
    new_lines = get_spans(matches, "new_line")

    if len(new_lines) > 0:
        df = self._get_df(doc=doc, new_lines=new_lines)
        df = self.model.predict(df)

        spans = []
        for span, prediction in zip(new_lines, df.PREDICTED_END_LINE):

            span.label_ = _get_label(prediction)
            span._.end_line = prediction

            spans.append(span)
            for t in span:
                t._.end_line = prediction
                if not prediction:
                    t._.excluded = True

        doc.spans["new_lines"] = spans
    return doc
factory
create_component(nlp, name, model_path)
Source code in edsnlp/pipelines/core/endlines/factory.py
@deprecated_factory("endlines", "eds.endlines")
@Language.factory("eds.endlines")
def create_component(
    nlp: Language,
    name: str,
    model_path: Optional[str],
):
    return EndLines(nlp, end_lines_model=model_path)
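A usage sketch: model_path is left to None so that the bundled default model is loaded, as _read_model above shows, and the excluded extension is registered defensively since it is normally provided by an upstream normalisation component (an assumption, not shown in this excerpt).

import spacy
from spacy.tokens import Token

# Normally registered upstream (assumption); EndLines sets it on tokens it excludes
if not Token.has_extension("excluded"):
    Token.set_extension("excluded", default=False)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.endlines", config=dict(model_path=None))
nlp.add_pipe("eds.sentences")  # use_endlines=True by default, so it honours the predictions

doc = nlp("Compte rendu\nd'hospitalisation")
# Each newline span is labelled "end_line" or "space" in doc.spans["new_lines"].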
endlinesmodel
EndLinesModel

Model to classify whether an end-of-line character is a true line break or should be treated as a space.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
class EndLinesModel:
    """Model to classify if an end line is a real one or it should be a space.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    """

    def __init__(self, nlp: Language):
        self.nlp = nlp

    def _preprocess_data(self, corpus: Iterable[Doc]) -> pd.DataFrame:
        """
        Parameters
        ----------
        corpus : Iterable[Doc]
            Corpus of documents

        Returns
        -------
        pd.DataFrame
            Preprocessed data
        """
        # Extract the vocabulary
        string_store = self.nlp.vocab.strings

        # Iterate in the corpus and construct a dataframe
        train_data_list = []
        for i, doc in enumerate(corpus):
            train_data_list.append(self._get_attributes(doc, i))

        df = pd.concat(train_data_list)
        df.reset_index(inplace=True, drop=False)
        df.rename(columns={"ORTH": "A1", "index": "original_token_index"}, inplace=True)

        # Retrieve string representation of token_id and shape
        df["TEXT"] = df.A1.apply(self._get_string, string_store=string_store)
        df["SHAPE_"] = df.SHAPE.apply(self._get_string, string_store=string_store)

        # Convert new lines as an attribute instead of a row
        df = self._convert_line_to_attribute(df, expr="\n", col="END_LINE")
        df = self._convert_line_to_attribute(df, expr="\n\n", col="BLANK_LINE")
        df = df.loc[~(df.END_LINE | df.BLANK_LINE)]
        df = df.drop(columns="END_LINE")
        df = df.drop(columns="BLANK_LINE")
        df.rename(
            columns={"TEMP_END_LINE": "END_LINE", "TEMP_BLANK_LINE": "BLANK_LINE"},
            inplace=True,
        )

        # Construct A2 by shifting
        df = self._shift_col(df, "A1", "A2", direction="backward")

        # Compute A3 and A4
        df = self._compute_a3(df)
        df = self._shift_col(df, "A3", "A4", direction="backward")

        # SPACE is the class to predict. Set 1 if not an END_LINE
        df["SPACE"] = np.logical_not(df["END_LINE"]).astype("int")

        df[["END_LINE", "BLANK_LINE"]] = df[["END_LINE", "BLANK_LINE"]].fillna(
            True, inplace=False
        )

        # Assign a sentence id to each token
        df = df.groupby("DOC_ID").apply(self._retrieve_lines)
        df["SENTENCE_ID"] = df["SENTENCE_ID"].astype("int")

        # Compute B1 and B2
        df = self._compute_B(df)

        # Drop Tokens without info (last token of doc)
        df.dropna(subset=["A1", "A2", "A3", "A4"], inplace=True)

        # Export the vocabularies to be able to use the model with another corpus
        voc_a3a4 = self._create_vocabulary(df.A3_.cat.categories)
        voc_B2 = self._create_vocabulary(df.cv_bin.cat.categories)
        voc_B1 = self._create_vocabulary(df.l_norm_bin.cat.categories)

        vocabulary = {"A3A4": voc_a3a4, "B1": voc_B1, "B2": voc_B2}

        self.vocabulary = vocabulary

        return df

    def fit_and_predict(self, corpus: Iterable[Doc]) -> pd.DataFrame:
        """Fit the model and predict for the training data

        Parameters
        ----------
        corpus : Iterable[Doc]
            An iterable of Documents

        Returns
        -------
        pd.DataFrame
            one line by end_line prediction
        """

        # Preprocess data to have a pd DF
        df = self._preprocess_data(corpus)

        # Train and predict M1
        self._fit_M1(df.A1, df.A2, df.A3, df.A4, df.SPACE)
        outputs_M1 = self._predict_M1(
            df.A1,
            df.A2,
            df.A3,
            df.A4,
        )
        df["M1"] = outputs_M1["predictions"]
        df["M1_proba"] = outputs_M1["predictions_proba"]

        # Force Blank lines to 0
        df.loc[df.BLANK_LINE, "M1"] = 0

        # Train and predict M2
        df_endlines = df.loc[df.END_LINE]
        self._fit_M2(B1=df_endlines.B1, B2=df_endlines.B2, label=df_endlines.M1)
        outputs_M2 = self._predict_M2(B1=df_endlines.B1, B2=df_endlines.B2)

        df.loc[df.END_LINE, "M2"] = outputs_M2["predictions"]
        df.loc[df.END_LINE, "M2_proba"] = outputs_M2["predictions_proba"]

        df["M2"] = df["M2"].astype(
            pd.Int64Dtype()
        )  # cast to pd.Int64Dtype cause there are None values

        # M1M2
        df = df.loc[df.END_LINE]
        df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
            df["M1_proba"] / (1 - df["M1_proba"])
        )
        df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

        # Force Blank lines to 0
        df.loc[df.BLANK_LINE, ["M2", "M1M2"]] = 0

        # Make binary col
        df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

        return df

    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """Use the model for inference

        The df should have the following columns:
        `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

        Parameters
        ----------
        df : pd.DataFrame
            The df should have the following columns:
            `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

        Returns
        -------
        pd.DataFrame
            The result is added to the column `PREDICTED_END_LINE`
        """

        df = self._convert_raw_data_to_codes(df)

        outputs_M1 = self._predict_M1(df.A1, df.A2, df._A3, df._A4)
        df["M1"] = outputs_M1["predictions"]
        df["M1_proba"] = outputs_M1["predictions_proba"]

        outputs_M2 = self._predict_M2(B1=df._B1, B2=df._B2)
        df["M2"] = outputs_M2["predictions"]
        df["M2_proba"] = outputs_M2["predictions_proba"]
        df["M2"] = df["M2"].astype(
            pd.Int64Dtype()
        )  # cast to pd.Int64Dtype cause there are None values

        # M1M2
        df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
            df["M1_proba"] / (1 - df["M1_proba"])
        )
        df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

        # Force Blank lines to 0
        df.loc[
            df.BLANK_LINE,
            [
                "M1M2",
            ],
        ] = 0

        # Make binary col
        df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

        return df

    def save(self, path="base_model.pkl"):
        """Save a pickle of the model. It could be read by the pipeline later.

        Parameters
        ----------
        path : str, optional
            path to file .pkl, by default `base_model.pkl`
        """
        with open(path, "wb") as outp:
            del self.nlp
            pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)

    def _convert_A(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
        """
        Parameters
        ----------
        df : pd.DataFrame
        col : str
            column to translate

        Returns
        -------
        pd.DataFrame
        """
        cat_type_A = CategoricalDtype(
            categories=self.vocabulary["A3A4"].keys(), ordered=True
        )
        new_col = "_" + col
        df[new_col] = df[col].astype(cat_type_A)
        df[new_col] = df[new_col].cat.codes
        # Ensure that not known values are coded as OTHER
        df.loc[
            ~df[col].isin(self.vocabulary["A3A4"].keys()), new_col
        ] = self.vocabulary["A3A4"]["OTHER"]
        return df

    def _convert_B(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
        """
        Parameters
        ----------
        df : pd.DataFrame
            [description]
        col : str
            column to translate

        Returns
        -------
        pd.DataFrame
            [description]
        """
        # Translate B1
        index_B = pd.IntervalIndex(list(self.vocabulary[col].keys()))
        new_col = "_" + col
        df[new_col] = pd.cut(df[col], index_B)
        df[new_col] = df[new_col].cat.codes
        df.loc[df[col] >= index_B.right.max(), new_col] = max(
            self.vocabulary[col].values()
        )
        df.loc[df[col] <= index_B.left.min(), new_col] = min(
            self.vocabulary[col].values()
        )

        return df

    def _convert_raw_data_to_codes(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Function to translate data as extracted from spacy to the model codes.
        `A1` and `A2` are not translated cause are supposed to be already
        in good encoding.

        Parameters
        ----------
        df : pd.DataFrame
            It should have columns `['A3','A4','B1','B2']`

        Returns
        -------
        pd.DataFrame
        """
        df = self._convert_A(df, "A3")
        df = self._convert_A(df, "A4")
        df = self._convert_B(df, "B1")
        df = self._convert_B(df, "B2")
        return df

    def _convert_line_to_attribute(
        self, df: pd.DataFrame, expr: str, col: str
    ) -> pd.DataFrame:
        """
        Function to convert a line into an attribute (column) of the
        previous row. Particularly we use it to identify "\\n" and "\\n\\n"
        that are considered tokens, express this information as an attribute
        of the previous token.

        Parameters
        ----------
        df : pd.DataFrame
        expr : str
            pattern to search in the text. Ex.: "\\n"
        col : str
            name of the new column

        Returns
        -------
        pd.DataFrame
        """
        idx = df.TEXT.str.contains(expr)
        df.loc[idx, col] = True
        df[col] = df[col].fillna(False)
        df = self._shift_col(df, col, "TEMP_" + col, direction="backward")

        return df

    def _compute_a3(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        A3 (A4 respectively): typographic form  of left word (or right) :

        - All in capital letter
        - It starts with a capital letter
        - Starts by lowercase
        - It's a number
        - Strong punctuation
        - Soft punctuation
        - A number followed or preced by a punctuation (it's the case of enumerations)

        Parameters
        ----------
        df: pd.DataFrame

        Returns
        -------
        df: pd.DataFrame with the columns `A3` and `A3_`

        """
        df = self._shift_col(
            df, "IS_PUNCT", "IS_PUNCT_+1", direction="backward", fill=False
        )
        df = self._shift_col(
            df, "IS_PUNCT", "IS_PUNCT_-1", direction="forward", fill=False
        )

        CONDITION1 = df.IS_UPPER
        CONDITION2 = df.SHAPE_.str.startswith("Xx", na=False)
        CONDITION3 = df.SHAPE_.str.startswith("x", na=False)
        CONDITION4 = df.IS_DIGIT
        STRONG_PUNCT = [".", ";", "..", "..."]
        CONDITION5 = (df.IS_PUNCT) & (df.TEXT.isin(STRONG_PUNCT))
        CONDITION6 = (df.IS_PUNCT) & (~df.TEXT.isin(STRONG_PUNCT))
        CONDITION7 = (df.IS_DIGIT) & (df["IS_PUNCT_+1"] | df["IS_PUNCT_-1"])  # discuss

        df["A3_"] = None
        df.loc[CONDITION1, "A3_"] = "UPPER"
        df.loc[CONDITION2, "A3_"] = "S_UPPER"
        df.loc[CONDITION3, "A3_"] = "LOWER"
        df.loc[CONDITION4, "A3_"] = "DIGIT"
        df.loc[CONDITION5, "A3_"] = "STRONG_PUNCT"
        df.loc[CONDITION6, "A3_"] = "SOFT_PUNCT"
        df.loc[CONDITION7, "A3_"] = "ENUMERATION"

        df = df.drop(columns=["IS_PUNCT_+1", "IS_PUNCT_-1"])
        df["A3_"] = df["A3_"].astype("category")

        df["A3_"] = df["A3_"].cat.add_categories("OTHER")
        df["A3_"].fillna("OTHER", inplace=True)

        df["A3"] = df["A3_"].cat.codes

        return df

    def _fit_M1(
        self,
        A1: pd.Series,
        A2: pd.Series,
        A3: pd.Series,
        A4: pd.Series,
        label: pd.Series,
    ):
        """Function to train M1 classifier (Naive Bayes)

        Parameters
        ----------
        A1 : pd.Series
            [description]
        A2 : pd.Series
            [description]
        A3 : pd.Series
            [description]
        A4 : pd.Series
            [description]
        label : pd.Series
            [description]

        """
        # Encode classes to OneHotEncoder representation
        encoder_A1_A2 = self._fit_encoder_2S(A1, A2)
        self.encoder_A1_A2 = encoder_A1_A2

        encoder_A3_A4 = self._fit_encoder_2S(A3, A4)
        self.encoder_A3_A4 = encoder_A3_A4

        # M1
        m1 = MultinomialNB(alpha=1)

        X = self._get_X_for_M1(A1, A2, A3, A4)
        m1.fit(X, label)
        self.m1 = m1

    def _fit_M2(self, B1: pd.Series, B2: pd.Series, label: pd.Series):
        """Function to train M2 classifier (Naive Bayes)

        Parameters
        ----------
        B1 : pd.Series
        B2 : pd.Series
        label : pd.Series
        """

        # Encode classes to OneHotEncoder representation
        encoder_B1 = self._fit_encoder_1S(B1)
        self.encoder_B1 = encoder_B1
        encoder_B2 = self._fit_encoder_1S(B2)
        self.encoder_B2 = encoder_B2

        # Multinomial Naive Bayes
        m2 = MultinomialNB(alpha=1)
        X = self._get_X_for_M2(B1, B2)
        m2.fit(X, label)
        self.m2 = m2

    def _get_X_for_M1(
        self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
    ) -> np.ndarray:
        """Get X matrix for classifier

        Parameters
        ----------
        A1 : pd.Series
        A2 : pd.Series
        A3 : pd.Series
        A4 : pd.Series

        Returns
        -------
        np.ndarray
        """
        A1_enc = self._encode_series(self.encoder_A1_A2, A1)
        A2_enc = self._encode_series(self.encoder_A1_A2, A2)
        A3_enc = self._encode_series(self.encoder_A3_A4, A3)
        A4_enc = self._encode_series(self.encoder_A3_A4, A4)
        X = hstack([A1_enc, A2_enc, A3_enc, A4_enc])
        return X

    def _get_X_for_M2(self, B1: pd.Series, B2: pd.Series) -> np.ndarray:
        """Get X matrix for classifier

        Parameters
        ----------
        B1 : pd.Series
        B2 : pd.Series

        Returns
        -------
        np.ndarray
        """
        B1_enc = self._encode_series(self.encoder_B1, B1)
        B2_enc = self._encode_series(self.encoder_B2, B2)
        X = hstack([B1_enc, B2_enc])
        return X

    def _predict_M1(
        self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
    ) -> Dict[str, Any]:
        """Use M1 for prediction

        Parameters
        ----------
        A1 : pd.Series
        A2 : pd.Series
        A3 : pd.Series
        A4 : pd.Series

        Returns
        -------
        Dict[str, Any]
        """
        X = self._get_X_for_M1(A1, A2, A3, A4)
        predictions = self.m1.predict(X)
        predictions_proba = self.m1.predict_proba(X)[:, 1]
        outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
        return outputs

    def _predict_M2(self, B1: pd.Series, B2: pd.Series) -> Dict[str, Any]:
        """Use M2 for prediction

        Parameters
        ----------
        B1 : pd.Series
        B2 : pd.Series

        Returns
        -------
        Dict[str, Any]
        """
        X = self._get_X_for_M2(B1, B2)
        predictions = self.m2.predict(X)
        predictions_proba = self.m2.predict_proba(X)[:, 1]
        outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
        return outputs

    def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series) -> OneHotEncoder:
        """Fit a one hot encoder with 2 Series. It concatenates the series and after it fits.

        Parameters
        ----------
        S1 : pd.Series
        S2 : pd.Series

        Returns
        -------
        OneHotEncoder
        """
        _S1 = _convert_series_to_array(S1)
        _S2 = _convert_series_to_array(S2)
        S = np.concatenate([_S1, _S2])
        encoder = self._fit_one_hot_encoder(S)
        return encoder

    def _fit_encoder_1S(self, S1: pd.Series) -> OneHotEncoder:
        """Fit a one hot encoder with 1 Series.

        Parameters
        ----------
        S1 : pd.Series

        Returns
        -------
        OneHotEncoder
        """
        _S1 = _convert_series_to_array(S1)
        encoder = self._fit_one_hot_encoder(_S1)
        return encoder

    def _encode_series(self, encoder: OneHotEncoder, S: pd.Series) -> np.ndarray:
        """Use the one hot encoder to transform a series.

        Parameters
        ----------
        encoder : OneHotEncoder
        S : pd.Series
            a series to encode (transform)

        Returns
        -------
        np.ndarray
        """
        _S = _convert_series_to_array(S)
        S_enc = encoder.transform(_S)
        return S_enc

    def set_spans(self, corpus: Iterable[Doc], df: pd.DataFrame):
        """
        Function to set the results of the algorithm (pd.DataFrame)
        as spans of the spaCy document.

        Parameters
        ----------
        corpus : Iterable[Doc]
            Iterable of spaCy Documents
        df : pd.DataFrame
            It should have the columns:
            ["DOC_ID","original_token_index","PREDICTED_END_LINE"]
        """

        for doc_id, doc in enumerate(corpus):
            spans = []
            for token_i, pred in df.loc[
                df.DOC_ID == doc_id, ["original_token_index", "PREDICTED_END_LINE"]
            ].values:
                s = Span(doc, start=token_i, end=token_i + 1, label=_get_label(pred))

                spans.append(s)

            doc.spans["new_lines"] = spans

    @staticmethod
    def _retrieve_lines(dfg: DataFrameGroupBy) -> DataFrameGroupBy:
        """Function to give a sentence_id to each token.

        Parameters
        ----------
        dfg : DataFrameGroupBy

        Returns
        -------
        DataFrameGroupBy
            Same DataFrameGroupBy with the column `SENTENCE_ID`
        """
        sentences_ids = np.arange(dfg.END_LINE.sum())
        dfg.loc[dfg.END_LINE, "SENTENCE_ID"] = sentences_ids
        dfg["SENTENCE_ID"] = dfg["SENTENCE_ID"].fillna(method="bfill")
        return dfg

    @staticmethod
    def _create_vocabulary(x: iterable) -> dict:
        """Function to create a vocabulary for attributes in the training set.

        Parameters
        ----------
        x : iterable

        Returns
        -------
        dict
        """
        v = {}

        for i, key in enumerate(x):
            v[key] = i

        return v

    @staticmethod
    def _compute_B(df: pd.DataFrame) -> pd.DataFrame:
        """Function to compute B1 and B2

        Parameters
        ----------
        df : pd.DataFrame

        Returns
        -------
        pd.DataFrame
        """

        data = df.groupby(["DOC_ID", "SENTENCE_ID"]).agg(l=("LENGTH", "sum"))
        df_t = df.loc[df.END_LINE, ["DOC_ID", "SENTENCE_ID"]].merge(
            data, left_on=["DOC_ID", "SENTENCE_ID"], right_index=True, how="left"
        )

        stats_doc = df_t.groupby("DOC_ID").agg(mu=("l", "mean"), sigma=("l", "std"))
        stats_doc["sigma"].replace(
            0.0, 1.0, inplace=True
        )  # Replace the 0 std by unit std, otherwise it breaks the code.
        stats_doc["cv"] = stats_doc["sigma"] / stats_doc["mu"]

        df_t = df_t.drop(columns=["DOC_ID", "SENTENCE_ID"])
        df2 = df.merge(df_t, left_index=True, right_index=True, how="left")

        df2 = df2.merge(stats_doc, on=["DOC_ID"], how="left")
        df2["l_norm"] = (df2["l"] - df2["mu"]) / df2["sigma"]

        df2["cv_bin"] = pd.cut(df2["cv"], bins=10)
        df2["B2"] = df2["cv_bin"].cat.codes

        df2["l_norm_bin"] = pd.cut(df2["l_norm"], bins=10)
        df2["B1"] = df2["l_norm_bin"].cat.codes

        return df2

    @staticmethod
    def _shift_col(
        df: pd.DataFrame, col: str, new_col: str, direction="backward", fill=None
    ) -> pd.DataFrame:
        """Shifts a column one position into backward / forward direction.

        Parameters
        ----------
        df : pd.DataFrame
        col : str
            column to shift
        new_col : str
            column name to save the results
        direction : str, optional
            one of {"backward", "forward"}, by default "backward"
        fill : [type], optional
            , by default None

        Returns
        -------
        pd.DataFrame
            same df with `new_col` added.
        """
        df[new_col] = fill

        if direction == "backward":
            df.loc[df.index[:-1], new_col] = df[col].values[1:]

            different_doc_id = df["DOC_ID"].values[:-1] != df["DOC_ID"].values[1:]
            different_doc_id = np.append(different_doc_id, True)

        if direction == "forward":
            df.loc[df.index[1:], new_col] = df[col].values[:-1]
            different_doc_id = df["DOC_ID"].values[1:] != df["DOC_ID"].values[:-1]
            different_doc_id = np.append(True, different_doc_id)

        df.loc[different_doc_id, new_col] = fill
        return df

    @staticmethod
    def _get_attributes(doc: Doc, i=0):
        """Function to get the attributes of tokens of a spacy doc in a pd.DataFrame format.

        Parameters
        ----------
        doc : Doc
            spacy Doc
        i : int, optional
            document id, by default 0

        Returns
        -------
        pd.DataFrame
            Returns a dataframe with one line per token. It has the following columns :
            `[
            "ORTH",
            "LOWER",
            "SHAPE",
            "IS_DIGIT",
            "IS_SPACE",
            "IS_UPPER",
            "IS_PUNCT",
            "LENGTH",
            ]`
        """
        attributes = [
            "ORTH",
            "LOWER",
            "SHAPE",
            "IS_DIGIT",
            "IS_SPACE",
            "IS_UPPER",
            "IS_PUNCT",
            "LENGTH",
        ]
        attributes_array = doc.to_array(attributes)
        attributes_df = pd.DataFrame(attributes_array, columns=attributes)
        attributes_df["DOC_ID"] = i
        boolean_attr = []
        for a in attributes:
            if a[:3] == "IS_":
                boolean_attr.append(a)
        attributes_df[boolean_attr] = attributes_df[boolean_attr].astype("boolean")
        return attributes_df

    @staticmethod
    def _get_string(_id: int, string_store: StringStore) -> str:
        """Returns the string corresponding to the token_id

        Parameters
        ----------
        _id : int
            token id
        string_store : StringStore
            spaCy Language String Store

        Returns
        -------
        str
            string representation of the token.
        """
        return string_store[_id]

    @staticmethod
    def _fit_one_hot_encoder(X: np.ndarray) -> OneHotEncoder:
        """Fit a one hot encoder.

        Parameters
        ----------
        X : np.ndarray
            of shape (n,1)

        Returns
        -------
        OneHotEncoder
        """
        encoder = OneHotEncoder(handle_unknown="ignore")
        encoder.fit(X)
        return encoder
nlp = nlp instance-attribute
__init__(nlp)
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def __init__(self, nlp: Language):
    self.nlp = nlp
_preprocess_data(corpus)
PARAMETER DESCRIPTION
corpus

Corpus of documents

TYPE: Iterable[Doc]

RETURNS DESCRIPTION
pd.DataFrame

Preprocessed data

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _preprocess_data(self, corpus: Iterable[Doc]) -> pd.DataFrame:
    """
    Parameters
    ----------
    corpus : Iterable[Doc]
        Corpus of documents

    Returns
    -------
    pd.DataFrame
        Preprocessed data
    """
    # Extract the vocabulary
    string_store = self.nlp.vocab.strings

    # Iterate in the corpus and construct a dataframe
    train_data_list = []
    for i, doc in enumerate(corpus):
        train_data_list.append(self._get_attributes(doc, i))

    df = pd.concat(train_data_list)
    df.reset_index(inplace=True, drop=False)
    df.rename(columns={"ORTH": "A1", "index": "original_token_index"}, inplace=True)

    # Retrieve string representation of token_id and shape
    df["TEXT"] = df.A1.apply(self._get_string, string_store=string_store)
    df["SHAPE_"] = df.SHAPE.apply(self._get_string, string_store=string_store)

    # Convert new lines as an attribute instead of a row
    df = self._convert_line_to_attribute(df, expr="\n", col="END_LINE")
    df = self._convert_line_to_attribute(df, expr="\n\n", col="BLANK_LINE")
    df = df.loc[~(df.END_LINE | df.BLANK_LINE)]
    df = df.drop(columns="END_LINE")
    df = df.drop(columns="BLANK_LINE")
    df.rename(
        columns={"TEMP_END_LINE": "END_LINE", "TEMP_BLANK_LINE": "BLANK_LINE"},
        inplace=True,
    )

    # Construct A2 by shifting
    df = self._shift_col(df, "A1", "A2", direction="backward")

    # Compute A3 and A4
    df = self._compute_a3(df)
    df = self._shift_col(df, "A3", "A4", direction="backward")

    # SPACE is the class to predict. Set 1 if not an END_LINE
    df["SPACE"] = np.logical_not(df["END_LINE"]).astype("int")

    df[["END_LINE", "BLANK_LINE"]] = df[["END_LINE", "BLANK_LINE"]].fillna(
        True, inplace=False
    )

    # Assign a sentence id to each token
    df = df.groupby("DOC_ID").apply(self._retrieve_lines)
    df["SENTENCE_ID"] = df["SENTENCE_ID"].astype("int")

    # Compute B1 and B2
    df = self._compute_B(df)

    # Drop Tokens without info (last token of doc)
    df.dropna(subset=["A1", "A2", "A3", "A4"], inplace=True)

    # Export the vocabularies to be able to use the model with another corpus
    voc_a3a4 = self._create_vocabulary(df.A3_.cat.categories)
    voc_B2 = self._create_vocabulary(df.cv_bin.cat.categories)
    voc_B1 = self._create_vocabulary(df.l_norm_bin.cat.categories)

    vocabulary = {"A3A4": voc_a3a4, "B1": voc_B1, "B2": voc_B2}

    self.vocabulary = vocabulary

    return df
fit_and_predict(corpus)

Fit the model and predict for the training data

PARAMETER DESCRIPTION
corpus

An iterable of Documents

TYPE: Iterable[Doc]

RETURNS DESCRIPTION
pd.DataFrame

One row per end-of-line prediction

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def fit_and_predict(self, corpus: Iterable[Doc]) -> pd.DataFrame:
    """Fit the model and predict for the training data

    Parameters
    ----------
    corpus : Iterable[Doc]
        An iterable of Documents

    Returns
    -------
    pd.DataFrame
        one line by end_line prediction
    """

    # Preprocess data to have a pd DF
    df = self._preprocess_data(corpus)

    # Train and predict M1
    self._fit_M1(df.A1, df.A2, df.A3, df.A4, df.SPACE)
    outputs_M1 = self._predict_M1(
        df.A1,
        df.A2,
        df.A3,
        df.A4,
    )
    df["M1"] = outputs_M1["predictions"]
    df["M1_proba"] = outputs_M1["predictions_proba"]

    # Force Blank lines to 0
    df.loc[df.BLANK_LINE, "M1"] = 0

    # Train and predict M2
    df_endlines = df.loc[df.END_LINE]
    self._fit_M2(B1=df_endlines.B1, B2=df_endlines.B2, label=df_endlines.M1)
    outputs_M2 = self._predict_M2(B1=df_endlines.B1, B2=df_endlines.B2)

    df.loc[df.END_LINE, "M2"] = outputs_M2["predictions"]
    df.loc[df.END_LINE, "M2_proba"] = outputs_M2["predictions_proba"]

    df["M2"] = df["M2"].astype(
        pd.Int64Dtype()
    )  # cast to pd.Int64Dtype cause there are None values

    # M1M2
    df = df.loc[df.END_LINE]
    df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
        df["M1_proba"] / (1 - df["M1_proba"])
    )
    df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

    # Force Blank lines to 0
    df.loc[df.BLANK_LINE, ["M2", "M1M2"]] = 0

    # Make binary col
    df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

    return df
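For orientation, here is a minimal, hypothetical training sketch (the texts and the file name are placeholders; the official training recipe may include additional steps):

import spacy

from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel

nlp = spacy.blank("fr")

# Hypothetical corpus: raw texts whose line breaks we want to classify
texts = [
    "Le patient est\nhospitalisé pour une\n\npneumopathie.",
    "Antécédents :\n- diabète\n- HTA",
]
corpus = list(nlp.pipe(texts))

model = EndLinesModel(nlp=nlp)
df = model.fit_and_predict(corpus)  # one row per end-of-line token
model.save("base_model.pkl")        # note: save() removes the `nlp` attribute before pickling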
predict(df)

Use the model for inference

The df should have the following columns: ["A1","A2","A3","A4","B1","B2","BLANK_LINE"]

PARAMETER DESCRIPTION
df

The df should have the following columns: ["A1","A2","A3","A4","B1","B2","BLANK_LINE"]

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame

The result is added to the column PREDICTED_END_LINE

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def predict(self, df: pd.DataFrame) -> pd.DataFrame:
    """Use the model for inference

    The df should have the following columns:
    `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

    Parameters
    ----------
    df : pd.DataFrame
        The df should have the following columns:
        `["A1","A2","A3","A4","B1","B2","BLANK_LINE"]`

    Returns
    -------
    pd.DataFrame
        The result is added to the column `PREDICTED_END_LINE`
    """

    df = self._convert_raw_data_to_codes(df)

    outputs_M1 = self._predict_M1(df.A1, df.A2, df._A3, df._A4)
    df["M1"] = outputs_M1["predictions"]
    df["M1_proba"] = outputs_M1["predictions_proba"]

    outputs_M2 = self._predict_M2(B1=df._B1, B2=df._B2)
    df["M2"] = outputs_M2["predictions"]
    df["M2_proba"] = outputs_M2["predictions_proba"]
    df["M2"] = df["M2"].astype(
        pd.Int64Dtype()
    )  # cast to pd.Int64Dtype cause there are None values

    # M1M2
    df["M1M2_lr"] = (df["M2_proba"] / (1 - df["M2_proba"])) * (
        df["M1_proba"] / (1 - df["M1_proba"])
    )
    df["M1M2"] = (df["M1M2_lr"] > 1).astype("int")

    # Force Blank lines to 0
    df.loc[
        df.BLANK_LINE,
        [
            "M1M2",
        ],
    ] = 0

    # Make binary col
    df["PREDICTED_END_LINE"] = np.logical_not(df["M1M2"].astype(bool))

    return df
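Both fit_and_predict and predict combine the two classifiers by multiplying their odds: writing p_M1 and p_M2 for the probabilities of the "space" class returned by M1 and M2, the end of line is kept as a real line break only when (p_M2 / (1 - p_M2)) * (p_M1 / (1 - p_M1)) is at most 1. A minimal restatement of that rule (hypothetical helper, not part of the API):

def predicted_end_line(p_m1: float, p_m2: float) -> bool:
    """Return True when the end of line should be kept as a real line break."""
    # Both probabilities refer to the "space" class, so large combined odds
    # mean the newline should be merged into a space instead.
    odds = (p_m2 / (1 - p_m2)) * (p_m1 / (1 - p_m1))
    return not (odds > 1)  # mirrors PREDICTED_END_LINE = not M1M2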
save(path='base_model.pkl')

Save a pickle of the model, so that it can be read by the pipeline later.

PARAMETER DESCRIPTION
path

path to file .pkl, by default base_model.pkl

TYPE: str, optional DEFAULT: 'base_model.pkl'

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def save(self, path="base_model.pkl"):
    """Save a pickle of the model. It could be read by the pipeline later.

    Parameters
    ----------
    path : str, optional
        path to file .pkl, by default `base_model.pkl`
    """
    with open(path, "wb") as outp:
        del self.nlp
        pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)
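A short sketch of reading the pickle back (the file name is a placeholder); since save deletes the nlp attribute before pickling, the loaded object carries no spaCy pipeline:

import pickle

with open("base_model.pkl", "rb") as f:
    endlines_model = pickle.load(f)

# The loaded model can score pre-computed features through `predict`,
# or be consumed by the endlines pipeline component.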
_convert_A(df, col)
PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

col

column to translate

TYPE: str

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_A(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Parameters
    ----------
    df : pd.DataFrame
    col : str
        column to translate

    Returns
    -------
    pd.DataFrame
    """
    cat_type_A = CategoricalDtype(
        categories=self.vocabulary["A3A4"].keys(), ordered=True
    )
    new_col = "_" + col
    df[new_col] = df[col].astype(cat_type_A)
    df[new_col] = df[new_col].cat.codes
    # Ensure that not known values are coded as OTHER
    df.loc[
        ~df[col].isin(self.vocabulary["A3A4"].keys()), new_col
    ] = self.vocabulary["A3A4"]["OTHER"]
    return df
_convert_B(df, col)
PARAMETER DESCRIPTION
df

DataFrame containing the column to translate.

TYPE: pd.DataFrame

col

column to translate

TYPE: str

RETURNS DESCRIPTION
pd.DataFrame

Same DataFrame with the translated column added.

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_B(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Parameters
    ----------
    df : pd.DataFrame
        [description]
    col : str
        column to translate

    Returns
    -------
    pd.DataFrame
        [description]
    """
    # Translate B1
    index_B = pd.IntervalIndex(list(self.vocabulary[col].keys()))
    new_col = "_" + col
    df[new_col] = pd.cut(df[col], index_B)
    df[new_col] = df[new_col].cat.codes
    df.loc[df[col] >= index_B.right.max(), new_col] = max(
        self.vocabulary[col].values()
    )
    df.loc[df[col] <= index_B.left.min(), new_col] = min(
        self.vocabulary[col].values()
    )

    return df
_convert_raw_data_to_codes(df)

Translates the data extracted from spaCy into the model codes. A1 and A2 are not translated because they are assumed to already be properly encoded.

PARAMETER DESCRIPTION
df

It should have columns ['A3','A4','B1','B2']

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_raw_data_to_codes(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to translate data as extracted from spacy to the model codes.
    `A1` and `A2` are not translated cause are supposed to be already
    in good encoding.

    Parameters
    ----------
    df : pd.DataFrame
        It should have columns `['A3','A4','B1','B2']`

    Returns
    -------
    pd.DataFrame
    """
    df = self._convert_A(df, "A3")
    df = self._convert_A(df, "A4")
    df = self._convert_B(df, "B1")
    df = self._convert_B(df, "B2")
    return df
_convert_line_to_attribute(df, expr, col)

Converts a new-line token into an attribute (column) of the previous row. In particular, this is used to identify "\n" and "\n\n", which are tokens in their own right, and to record that information as an attribute of the previous token.

PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

expr

pattern to search in the text. Ex.: "\n"

TYPE: str

col

name of the new column

TYPE: str

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _convert_line_to_attribute(
    self, df: pd.DataFrame, expr: str, col: str
) -> pd.DataFrame:
    """
    Function to convert a line into an attribute (column) of the
    previous row. Particularly we use it to identify "\\n" and "\\n\\n"
    that are considered tokens, express this information as an attribute
    of the previous token.

    Parameters
    ----------
    df : pd.DataFrame
    expr : str
        pattern to search in the text. Ex.: "\\n"
    col : str
        name of the new column

    Returns
    -------
    pd.DataFrame
    """
    idx = df.TEXT.str.contains(expr)
    df.loc[idx, col] = True
    df[col] = df[col].fillna(False)
    df = self._shift_col(df, col, "TEMP_" + col, direction="backward")

    return df
_compute_a3(df)

A3 (respectively A4): typographic form of the left (respectively right) word:

  • All in capital letters
  • Starts with a capital letter
  • Starts with a lowercase letter
  • A number
  • Strong punctuation
  • Soft punctuation
  • A number followed or preceded by punctuation (the case of enumerations)
PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

RETURNS DESCRIPTION
df
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _compute_a3(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    A3 (A4 respectively): typographic form  of left word (or right) :

    - All in capital letter
    - It starts with a capital letter
    - Starts by lowercase
    - It's a number
    - Strong punctuation
    - Soft punctuation
    - A number followed or preced by a punctuation (it's the case of enumerations)

    Parameters
    ----------
    df: pd.DataFrame

    Returns
    -------
    df: pd.DataFrame with the columns `A3` and `A3_`

    """
    df = self._shift_col(
        df, "IS_PUNCT", "IS_PUNCT_+1", direction="backward", fill=False
    )
    df = self._shift_col(
        df, "IS_PUNCT", "IS_PUNCT_-1", direction="forward", fill=False
    )

    CONDITION1 = df.IS_UPPER
    CONDITION2 = df.SHAPE_.str.startswith("Xx", na=False)
    CONDITION3 = df.SHAPE_.str.startswith("x", na=False)
    CONDITION4 = df.IS_DIGIT
    STRONG_PUNCT = [".", ";", "..", "..."]
    CONDITION5 = (df.IS_PUNCT) & (df.TEXT.isin(STRONG_PUNCT))
    CONDITION6 = (df.IS_PUNCT) & (~df.TEXT.isin(STRONG_PUNCT))
    CONDITION7 = (df.IS_DIGIT) & (df["IS_PUNCT_+1"] | df["IS_PUNCT_-1"])  # discuss

    df["A3_"] = None
    df.loc[CONDITION1, "A3_"] = "UPPER"
    df.loc[CONDITION2, "A3_"] = "S_UPPER"
    df.loc[CONDITION3, "A3_"] = "LOWER"
    df.loc[CONDITION4, "A3_"] = "DIGIT"
    df.loc[CONDITION5, "A3_"] = "STRONG_PUNCT"
    df.loc[CONDITION6, "A3_"] = "SOFT_PUNCT"
    df.loc[CONDITION7, "A3_"] = "ENUMERATION"

    df = df.drop(columns=["IS_PUNCT_+1", "IS_PUNCT_-1"])
    df["A3_"] = df["A3_"].astype("category")

    df["A3_"] = df["A3_"].cat.add_categories("OTHER")
    df["A3_"].fillna("OTHER", inplace=True)

    df["A3"] = df["A3_"].cat.codes

    return df
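As an illustration, the typographic categories roughly map to tokens as follows (hypothetical examples, not taken from the library):

# Hypothetical examples of the categories used for A3 / A4
examples = {
    "UPPER": "PATIENT",       # all capital letters
    "S_UPPER": "Monsieur",    # starts with a capital letter
    "LOWER": "hospitalisé",   # starts with a lowercase letter
    "DIGIT": "2022",          # a number
    "STRONG_PUNCT": ".",      # ".", ";", ".." or "..."
    "SOFT_PUNCT": ",",        # any other punctuation
    "ENUMERATION": "1",       # a digit next to punctuation, as in "1." or "1)"
    "OTHER": "3g",            # matches none of the conditions above
}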
_fit_M1(A1, A2, A3, A4, label)

Function to train M1 classifier (Naive Bayes)

PARAMETER DESCRIPTION
A1

Encoded orthographic form (ORTH) of the token.

TYPE: pd.Series

A2

Encoded orthographic form of the following token.

TYPE: pd.Series

A3

Encoded typographic form of the token (see _compute_a3).

TYPE: pd.Series

A4

Encoded typographic form of the following token.

TYPE: pd.Series

label

Class to predict (the SPACE column).

TYPE: pd.Series

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_M1(
    self,
    A1: pd.Series,
    A2: pd.Series,
    A3: pd.Series,
    A4: pd.Series,
    label: pd.Series,
):
    """Function to train M1 classifier (Naive Bayes)

    Parameters
    ----------
    A1 : pd.Series
        [description]
    A2 : pd.Series
        [description]
    A3 : pd.Series
        [description]
    A4 : pd.Series
        [description]
    label : pd.Series
        [description]

    """
    # Encode classes to OneHotEncoder representation
    encoder_A1_A2 = self._fit_encoder_2S(A1, A2)
    self.encoder_A1_A2 = encoder_A1_A2

    encoder_A3_A4 = self._fit_encoder_2S(A3, A4)
    self.encoder_A3_A4 = encoder_A3_A4

    # M1
    m1 = MultinomialNB(alpha=1)

    X = self._get_X_for_M1(A1, A2, A3, A4)
    m1.fit(X, label)
    self.m1 = m1
_fit_M2(B1, B2, label)

Function to train M2 classifier (Naive Bayes)

PARAMETER DESCRIPTION
B1

TYPE: pd.Series

B2

TYPE: pd.Series

label

TYPE: pd.Series

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_M2(self, B1: pd.Series, B2: pd.Series, label: pd.Series):
    """Function to train M2 classifier (Naive Bayes)

    Parameters
    ----------
    B1 : pd.Series
    B2 : pd.Series
    label : pd.Series
    """

    # Encode classes to OneHotEncoder representation
    encoder_B1 = self._fit_encoder_1S(B1)
    self.encoder_B1 = encoder_B1
    encoder_B2 = self._fit_encoder_1S(B2)
    self.encoder_B2 = encoder_B2

    # Multinomial Naive Bayes
    m2 = MultinomialNB(alpha=1)
    X = self._get_X_for_M2(B1, B2)
    m2.fit(X, label)
    self.m2 = m2
_get_X_for_M1(A1, A2, A3, A4)

Get X matrix for classifier

PARAMETER DESCRIPTION
A1

TYPE: pd.Series

A2

TYPE: pd.Series

A3

TYPE: pd.Series

A4

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _get_X_for_M1(
    self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
) -> np.ndarray:
    """Get X matrix for classifier

    Parameters
    ----------
    A1 : pd.Series
    A2 : pd.Series
    A3 : pd.Series
    A4 : pd.Series

    Returns
    -------
    np.ndarray
    """
    A1_enc = self._encode_series(self.encoder_A1_A2, A1)
    A2_enc = self._encode_series(self.encoder_A1_A2, A2)
    A3_enc = self._encode_series(self.encoder_A3_A4, A3)
    A4_enc = self._encode_series(self.encoder_A3_A4, A4)
    X = hstack([A1_enc, A2_enc, A3_enc, A4_enc])
    return X
_get_X_for_M2(B1, B2)

Get X matrix for classifier

PARAMETER DESCRIPTION
B1

TYPE: pd.Series

B2

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _get_X_for_M2(self, B1: pd.Series, B2: pd.Series) -> np.ndarray:
    """Get X matrix for classifier

    Parameters
    ----------
    B1 : pd.Series
    B2 : pd.Series

    Returns
    -------
    np.ndarray
    """
    B1_enc = self._encode_series(self.encoder_B1, B1)
    B2_enc = self._encode_series(self.encoder_B2, B2)
    X = hstack([B1_enc, B2_enc])
    return X
_predict_M1(A1, A2, A3, A4)

Use M1 for prediction

PARAMETER DESCRIPTION
A1

TYPE: pd.Series

A2

TYPE: pd.Series

A3

TYPE: pd.Series

A4

TYPE: pd.Series

RETURNS DESCRIPTION
Dict[str, Any]
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _predict_M1(
    self, A1: pd.Series, A2: pd.Series, A3: pd.Series, A4: pd.Series
) -> Dict[str, Any]:
    """Use M1 for prediction

    Parameters
    ----------
    A1 : pd.Series
    A2 : pd.Series
    A3 : pd.Series
    A4 : pd.Series

    Returns
    -------
    Dict[str, Any]
    """
    X = self._get_X_for_M1(A1, A2, A3, A4)
    predictions = self.m1.predict(X)
    predictions_proba = self.m1.predict_proba(X)[:, 1]
    outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
    return outputs
_predict_M2(B1, B2)

Use M2 for prediction

PARAMETER DESCRIPTION
B1

TYPE: pd.Series

B2

TYPE: pd.Series

RETURNS DESCRIPTION
Dict[str, Any]
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _predict_M2(self, B1: pd.Series, B2: pd.Series) -> Dict[str, Any]:
    """Use M2 for prediction

    Parameters
    ----------
    B1 : pd.Series
    B2 : pd.Series

    Returns
    -------
    Dict[str, Any]
    """
    X = self._get_X_for_M2(B1, B2)
    predictions = self.m2.predict(X)
    predictions_proba = self.m2.predict_proba(X)[:, 1]
    outputs = {"predictions": predictions, "predictions_proba": predictions_proba}
    return outputs
_fit_encoder_2S(S1, S2)

Fit a one-hot encoder on two Series: the two series are concatenated before fitting.

PARAMETER DESCRIPTION
S1

TYPE: pd.Series

S2

TYPE: pd.Series

RETURNS DESCRIPTION
OneHotEncoder
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series) -> OneHotEncoder:
    """Fit a one hot encoder with 2 Series. It concatenates the series and after it fits.

    Parameters
    ----------
    S1 : pd.Series
    S2 : pd.Series

    Returns
    -------
    OneHotEncoder
    """
    _S1 = _convert_series_to_array(S1)
    _S2 = _convert_series_to_array(S2)
    S = np.concatenate([_S1, _S2])
    encoder = self._fit_one_hot_encoder(S)
    return encoder
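For intuition, a standalone sketch of the same idea with toy codes: a single encoder is fitted on the union of two series, and handle_unknown="ignore" (used below by _fit_one_hot_encoder) maps codes unseen at fit time to an all-zero row:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

s1 = np.array([[1], [2], [3]])  # toy codes from the first series
s2 = np.array([[2], [4]])       # toy codes from the second series

encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(np.concatenate([s1, s2]))

# Code 5 was never seen at fit time: it is encoded as a row of zeros.
print(encoder.transform(np.array([[4], [5]])).toarray())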
_fit_encoder_1S(S1)

Fit a one-hot encoder on a single Series.

PARAMETER DESCRIPTION
S1

TYPE: pd.Series

RETURNS DESCRIPTION
OneHotEncoder
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _fit_encoder_1S(self, S1: pd.Series) -> OneHotEncoder:
    """Fit a one hot encoder with 1 Series.

    Parameters
    ----------
    S1 : pd.Series

    Returns
    -------
    OneHotEncoder
    """
    _S1 = _convert_series_to_array(S1)
    encoder = self._fit_one_hot_encoder(_S1)
    return encoder
_encode_series(encoder, S)

Use the one hot encoder to transform a series.

PARAMETER DESCRIPTION
encoder

TYPE: OneHotEncoder

S

a series to encode (transform)

TYPE: pd.Series

RETURNS DESCRIPTION
np.ndarray
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def _encode_series(self, encoder: OneHotEncoder, S: pd.Series) -> np.ndarray:
    """Use the one hot encoder to transform a series.

    Parameters
    ----------
    encoder : OneHotEncoder
    S : pd.Series
        a series to encode (transform)

    Returns
    -------
    np.ndarray
    """
    _S = _convert_series_to_array(S)
    S_enc = encoder.transform(_S)
    return S_enc
set_spans(corpus, df)

Function to set the results of the algorithm (pd.DataFrame) as spans of the spaCy document.

PARAMETER DESCRIPTION
corpus

Iterable of spaCy Documents

TYPE: Iterable[Doc]

df

It should have the columns: ["DOC_ID","original_token_index","PREDICTED_END_LINE"]

TYPE: pd.DataFrame

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
def set_spans(self, corpus: Iterable[Doc], df: pd.DataFrame):
    """
    Function to set the results of the algorithm (pd.DataFrame)
    as spans of the spaCy document.

    Parameters
    ----------
    corpus : Iterable[Doc]
        Iterable of spaCy Documents
    df : pd.DataFrame
        It should have the columns:
        ["DOC_ID","original_token_index","PREDICTED_END_LINE"]
    """

    for doc_id, doc in enumerate(corpus):
        spans = []
        for token_i, pred in df.loc[
            df.DOC_ID == doc_id, ["original_token_index", "PREDICTED_END_LINE"]
        ].values:
            s = Span(doc, start=token_i, end=token_i + 1, label=_get_label(pred))

            spans.append(s)

        doc.spans["new_lines"] = spans
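Continuing the hypothetical training sketch shown earlier, the predictions can then be written back to the documents and inspected:

# `model`, `corpus` and `df` come from the earlier training sketch (hypothetical names)
model.set_spans(corpus, df)

for span in corpus[0].spans["new_lines"]:
    print(repr(span.text), span.label_)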
_retrieve_lines(dfg)

Function to give a sentence_id to each token.

PARAMETER DESCRIPTION
dfg

TYPE: DataFrameGroupBy

RETURNS DESCRIPTION
DataFrameGroupBy

Same DataFrameGroupBy with the column SENTENCE_ID

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _retrieve_lines(dfg: DataFrameGroupBy) -> DataFrameGroupBy:
    """Function to give a sentence_id to each token.

    Parameters
    ----------
    dfg : DataFrameGroupBy

    Returns
    -------
    DataFrameGroupBy
        Same DataFrameGroupBy with the column `SENTENCE_ID`
    """
    sentences_ids = np.arange(dfg.END_LINE.sum())
    dfg.loc[dfg.END_LINE, "SENTENCE_ID"] = sentences_ids
    dfg["SENTENCE_ID"] = dfg["SENTENCE_ID"].fillna(method="bfill")
    return dfg
_create_vocabulary(x)

Function to create a vocabulary for attributes in the training set.

PARAMETER DESCRIPTION
x

TYPE: iterable

RETURNS DESCRIPTION
dict
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _create_vocabulary(x: iterable) -> dict:
    """Function to create a vocabulary for attributes in the training set.

    Parameters
    ----------
    x : iterable

    Returns
    -------
    dict
    """
    v = {}

    for i, key in enumerate(x):
        v[key] = i

    return v
_compute_B(df)

Function to compute B1 and B2

PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

RETURNS DESCRIPTION
pd.DataFrame
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _compute_B(df: pd.DataFrame) -> pd.DataFrame:
    """Function to compute B1 and B2

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame
    """

    data = df.groupby(["DOC_ID", "SENTENCE_ID"]).agg(l=("LENGTH", "sum"))
    df_t = df.loc[df.END_LINE, ["DOC_ID", "SENTENCE_ID"]].merge(
        data, left_on=["DOC_ID", "SENTENCE_ID"], right_index=True, how="left"
    )

    stats_doc = df_t.groupby("DOC_ID").agg(mu=("l", "mean"), sigma=("l", "std"))
    stats_doc["sigma"].replace(
        0.0, 1.0, inplace=True
    )  # Replace the 0 std by unit std, otherwise it breaks the code.
    stats_doc["cv"] = stats_doc["sigma"] / stats_doc["mu"]

    df_t = df_t.drop(columns=["DOC_ID", "SENTENCE_ID"])
    df2 = df.merge(df_t, left_index=True, right_index=True, how="left")

    df2 = df2.merge(stats_doc, on=["DOC_ID"], how="left")
    df2["l_norm"] = (df2["l"] - df2["mu"]) / df2["sigma"]

    df2["cv_bin"] = pd.cut(df2["cv"], bins=10)
    df2["B2"] = df2["cv_bin"].cat.codes

    df2["l_norm_bin"] = pd.cut(df2["l_norm"], bins=10)
    df2["B1"] = df2["l_norm_bin"].cat.codes

    return df2
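A toy recomputation of the quantities behind B1 and B2 (invented line lengths): B1 bins the line length normalised within its document, and B2 bins the document-level coefficient of variation of line lengths:

import pandas as pd

# One row per line of a single toy document, with the summed token lengths "l"
lines = pd.DataFrame({"DOC_ID": [0, 0, 0, 0], "l": [42.0, 45.0, 12.0, 40.0]})

stats = lines.groupby("DOC_ID").agg(mu=("l", "mean"), sigma=("l", "std"))
stats["cv"] = stats["sigma"] / stats["mu"]  # binned into B2

lines = lines.merge(stats, on="DOC_ID", how="left")
lines["l_norm"] = (lines["l"] - lines["mu"]) / lines["sigma"]  # binned into B1

print(lines[["l", "l_norm"]])
print(stats["cv"])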
_shift_col(df, col, new_col, direction='backward', fill=None)

Shifts a column by one position in the backward or forward direction.

PARAMETER DESCRIPTION
df

TYPE: pd.DataFrame

col

column to shift

TYPE: str

new_col

column name to save the results

TYPE: str

direction

one of {"backward", "forward"}, by default "backward"

TYPE: str, optional DEFAULT: 'backward'

fill

Value used to fill positions that have no counterpart (e.g. at document boundaries), by default None

TYPE: Any, optional DEFAULT: None

RETURNS DESCRIPTION
pd.DataFrame

same df with new_col added.

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _shift_col(
    df: pd.DataFrame, col: str, new_col: str, direction="backward", fill=None
) -> pd.DataFrame:
    """Shifts a column one position into backward / forward direction.

    Parameters
    ----------
    df : pd.DataFrame
    col : str
        column to shift
    new_col : str
        column name to save the results
    direction : str, optional
        one of {"backward", "forward"}, by default "backward"
    fill : [type], optional
        , by default None

    Returns
    -------
    pd.DataFrame
        same df with `new_col` added.
    """
    df[new_col] = fill

    if direction == "backward":
        df.loc[df.index[:-1], new_col] = df[col].values[1:]

        different_doc_id = df["DOC_ID"].values[:-1] != df["DOC_ID"].values[1:]
        different_doc_id = np.append(different_doc_id, True)

    if direction == "forward":
        df.loc[df.index[1:], new_col] = df[col].values[:-1]
        different_doc_id = df["DOC_ID"].values[1:] != df["DOC_ID"].values[:-1]
        different_doc_id = np.append(True, different_doc_id)

    df.loc[different_doc_id, new_col] = fill
    return df
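A small example of the shift semantics with toy values: the backward shift copies the next row's value onto the current row, and rows at a document boundary are reset to the fill value:

import pandas as pd

from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel

df = pd.DataFrame({"DOC_ID": [0, 0, 1, 1], "A1": [10, 11, 20, 21]})
df = EndLinesModel._shift_col(df, "A1", "A2", direction="backward")

# A2 is now [11, None, 21, None]: the last token of each document has no successor.
print(df)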
_get_attributes(doc, i=0)

Function to get the attributes of tokens of a spacy doc in a pd.DataFrame format.

PARAMETER DESCRIPTION
doc

spacy Doc

TYPE: Doc

i

document id, by default 0

TYPE: int, optional DEFAULT: 0

RETURNS DESCRIPTION
pd.DataFrame

Returns a dataframe with one line per token. It has the following columns : [ "ORTH", "LOWER", "SHAPE", "IS_DIGIT", "IS_SPACE", "IS_UPPER", "IS_PUNCT", "LENGTH", ]

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _get_attributes(doc: Doc, i=0):
    """Function to get the attributes of tokens of a spacy doc in a pd.DataFrame format.

    Parameters
    ----------
    doc : Doc
        spacy Doc
    i : int, optional
        document id, by default 0

    Returns
    -------
    pd.DataFrame
        Returns a dataframe with one line per token. It has the following columns :
        `[
        "ORTH",
        "LOWER",
        "SHAPE",
        "IS_DIGIT",
        "IS_SPACE",
        "IS_UPPER",
        "IS_PUNCT",
        "LENGTH",
        ]`
    """
    attributes = [
        "ORTH",
        "LOWER",
        "SHAPE",
        "IS_DIGIT",
        "IS_SPACE",
        "IS_UPPER",
        "IS_PUNCT",
        "LENGTH",
    ]
    attributes_array = doc.to_array(attributes)
    attributes_df = pd.DataFrame(attributes_array, columns=attributes)
    attributes_df["DOC_ID"] = i
    boolean_attr = []
    for a in attributes:
        if a[:3] == "IS_":
            boolean_attr.append(a)
    attributes_df[boolean_attr] = attributes_df[boolean_attr].astype("boolean")
    return attributes_df
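A quick look at the extracted attributes on a toy document (illustrative only); ORTH and SHAPE are hash IDs that _get_string maps back to text:

import spacy

from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel

nlp = spacy.blank("fr")
doc = nlp("Patient hospitalisé\nle 12.")

attributes = EndLinesModel._get_attributes(doc, i=0)
print(attributes[["ORTH", "SHAPE", "IS_PUNCT", "LENGTH", "DOC_ID"]])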
_get_string(_id, string_store)

Returns the string corresponding to the token_id

PARAMETER DESCRIPTION
_id

token id

TYPE: int

string_store

spaCy Language String Store

TYPE: StringStore

RETURNS DESCRIPTION
str

string representation of the token.

Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _get_string(_id: int, string_store: StringStore) -> str:
    """Returns the string corresponding to the token_id

    Parameters
    ----------
    _id : int
        token id
    string_store : StringStore
        spaCy Language String Store

    Returns
    -------
    str
        string representation of the token.
    """
    return string_store[_id]
_fit_one_hot_encoder(X)

Fit a one hot encoder.

PARAMETER DESCRIPTION
X

of shape (n,1)

TYPE: np.ndarray

RETURNS DESCRIPTION
OneHotEncoder
Source code in edsnlp/pipelines/core/endlines/endlinesmodel.py
@staticmethod
def _fit_one_hot_encoder(X: np.ndarray) -> OneHotEncoder:
    """Fit a one hot encoder.

    Parameters
    ----------
    X : np.ndarray
        of shape (n,1)

    Returns
    -------
    OneHotEncoder
    """
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoder.fit(X)
    return encoder
context
context
ContextAdder

Bases: BaseComponent

Provides a generic context adder component.

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

context

The list of extensions to add to the Doc

TYPE: List[str]

Source code in edsnlp/pipelines/core/context/context.py
class ContextAdder(BaseComponent):
    """
    Provides a generic context adder component.

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    context : List[str]
        The list of extensions to add to the `Doc`
    """

    def __init__(
        self,
        nlp: Language,
        context: List[str],
    ):

        self.nlp = nlp
        self.context = context
        self.set_extensions()

    def set_extensions(self):
        for col in self.context:
            if not Doc.has_extension(col):
                Doc.set_extension(col, default=None)

    def __call__(self, doc: Doc) -> Doc:
        return doc
nlp = nlp instance-attribute
context = context instance-attribute
__init__(nlp, context)
Source code in edsnlp/pipelines/core/context/context.py
def __init__(
    self,
    nlp: Language,
    context: List[str],
):

    self.nlp = nlp
    self.context = context
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/core/context/context.py
def set_extensions(self):
    for col in self.context:
        if not Doc.has_extension(col):
            Doc.set_extension(col, default=None)
__call__(doc)
Source code in edsnlp/pipelines/core/context/context.py
def __call__(self, doc: Doc) -> Doc:
    return doc
factory
DEFAULT_CONFIG = dict(context=['note_id']) module-attribute
create_component(nlp, name, context)
Source code in edsnlp/pipelines/core/context/factory.py
@Language.factory("eds.context", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    context: List[str],
):

    return ContextAdder(
        nlp,
        context=context,
    )
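A minimal usage sketch (assuming the eds.* factories are registered, for instance by importing edsnlp.components):

import spacy
import edsnlp.components  # noqa: F401 -- assumption: registers the eds.* factories

nlp = spacy.blank("fr")
nlp.add_pipe("eds.context", config=dict(context=["note_id"]))

doc = nlp("Patient admis pour une pneumopathie.")
doc._.note_id = "hypothetical-note-id"  # the extension is created with a None default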
normalizer
normalizer
Normalizer

Bases: object

Normalisation pipeline. Modifies the NORM attribute, acting on four dimensions :

  • lowercase: using the default NORM
  • accents: deterministic and fixed-length normalisation of accents.
  • quotes: deterministic and fixed-length normalisation of quotation marks.
  • pollution: removal of pollutions.
PARAMETER DESCRIPTION
lowercase

Whether to remove case.

TYPE: bool

accents

Optional Accents object.

TYPE: Optional[Accents]

quotes

Optional Quotes object.

TYPE: Optional[Quotes]

pollution

Optional Pollution object.

TYPE: Optional[Pollution]

Source code in edsnlp/pipelines/core/normalizer/normalizer.py
class Normalizer(object):
    """
    Normalisation pipeline. Modifies the `NORM` attribute,
    acting on four dimensions :

    - `lowercase`: using the default `NORM`
    - `accents`: deterministic and fixed-length normalisation of accents.
    - `quotes`: deterministic and fixed-length normalisation of quotation marks.
    - `pollution`: removal of pollutions.

    Parameters
    ----------
    lowercase : bool
        Whether to remove case.
    accents : Optional[Accents]
        Optional `Accents` object.
    quotes : Optional[Quotes]
        Optional `Quotes` object.
    pollution : Optional[Pollution]
        Optional `Pollution` object.
    """

    def __init__(
        self,
        lowercase: bool,
        accents: Optional[Accents],
        quotes: Optional[Quotes],
        pollution: Optional[Pollution],
    ):
        self.lowercase = lowercase
        self.accents = accents
        self.quotes = quotes
        self.pollution = pollution

    def __call__(self, doc: Doc) -> Doc:
        """
        Apply the normalisation pipeline, one component at a time.

        Parameters
        ----------
        doc : Doc
            spaCy `Doc` object

        Returns
        -------
        Doc
            Doc object with `NORM` attribute modified
        """
        if not self.lowercase:
            remove_lowercase(doc)
        if self.accents is not None:
            self.accents(doc)
        if self.quotes is not None:
            self.quotes(doc)
        if self.pollution is not None:
            self.pollution(doc)

        return doc
lowercase = lowercase instance-attribute
accents = accents instance-attribute
quotes = quotes instance-attribute
pollution = pollution instance-attribute
__init__(lowercase, accents, quotes, pollution)
Source code in edsnlp/pipelines/core/normalizer/normalizer.py
def __init__(
    self,
    lowercase: bool,
    accents: Optional[Accents],
    quotes: Optional[Quotes],
    pollution: Optional[Pollution],
):
    self.lowercase = lowercase
    self.accents = accents
    self.quotes = quotes
    self.pollution = pollution
__call__(doc)

Apply the normalisation pipeline, one component at a time.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
Doc

Doc object with NORM attribute modified

Source code in edsnlp/pipelines/core/normalizer/normalizer.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def __call__(self, doc: Doc) -> Doc:
    """
    Apply the normalisation pipeline, one component at a time.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object

    Returns
    -------
    Doc
        Doc object with `NORM` attribute modified
    """
    if not self.lowercase:
        remove_lowercase(doc)
    if self.accents is not None:
        self.accents(doc)
    if self.quotes is not None:
        self.quotes(doc)
    if self.pollution is not None:
        self.pollution(doc)

    return doc
factory
DEFAULT_CONFIG = dict(accents=True, lowercase=True, quotes=True, pollution=True) module-attribute
create_component(nlp, name, accents, lowercase, quotes, pollution)
Source code in edsnlp/pipelines/core/normalizer/factory.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@deprecated_factory("normalizer", "eds.normalizer", default_config=DEFAULT_CONFIG)
@Language.factory("eds.normalizer", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    accents: Union[bool, Dict[str, Any]],
    lowercase: Union[bool, Dict[str, Any]],
    quotes: Union[bool, Dict[str, Any]],
    pollution: Union[bool, Dict[str, Any]],
):

    if accents:
        config = dict(**accents_config)
        if isinstance(accents, dict):
            config.update(accents)
        accents = registry.get("factories", "eds.accents")(nlp, "eds.accents", **config)

    if quotes:
        config = dict(**quotes_config)
        if isinstance(quotes, dict):
            config.update(quotes)
        quotes = registry.get("factories", "eds.quotes")(nlp, "eds.quotes", **config)

    if pollution:
        config = dict(**pollution_config)
        if isinstance(pollution, dict):
            config.update(pollution)
        pollution = registry.get("factories", "eds.pollution")(
            nlp, "eds.pollution", **config
        )

    normalizer = Normalizer(
        lowercase=lowercase,
        accents=accents or None,
        quotes=quotes or None,
        pollution=pollution or None,
    )

    return normalizer
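
A minimal usage sketch of the factory above, assuming EDS-NLP is installed and its factories registered with spaCy:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # default config: lowercase, accents, quotes, pollution

doc = nlp("Le patient est sûr de lui.")

# With the default configuration, NORM carries the lowercased,
# accent-stripped form of each token (e.g. "sûr" becomes "sur").
print([token.norm_ for token in doc])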
pollution
patterns
information = "(?s)(=====+\\s*)?(L\\s*e\\s*s\\sdonnées\\s*administratives,\\s*sociales\\s*|I?nfo\\s*rmation\\s*aux?\\s*patients?|L[’']AP-HP\\s*collecte\\s*vos\\s*données\\s*administratives|L[’']Assistance\\s*Publique\\s*-\\s*Hôpitaux\\s*de\\s*Paris\\s*\\(?AP-HP\\)?\\s*a\\s*créé\\s*une\\s*base\\s*de\\s*données).{,2000}https?:\\/\\/recherche\\.aphp\\.fr\\/eds\\/droit-opposition[\\s\\.]*" module-attribute
bars = '(?i)([nbw]|_|-|=){5,}' module-attribute
pollution = dict(information=information, bars=bars) module-attribute
pollution
Pollution

Bases: BaseComponent

Tags pollution tokens.

Populates a number of spaCy extensions:

  • Token._.pollution : indicates whether the token is a pollution
  • Doc._.clean : lists non-pollution tokens
  • Doc._.clean_ : original text with pollutions removed.
  • Doc._.char_clean_span : method to create a Span using character indices extracted using the cleaned text.
PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

pollution

Dictionary containing regular expressions of pollution.

TYPE: Dict[str, Union[str, List[str]]]

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
class Pollution(BaseComponent):
    """
    Tags pollution tokens.

    Populates a number of spaCy extensions :

    - `Token._.pollution` : indicates whether the token is a pollution
    - `Doc._.clean` : lists non-pollution tokens
    - `Doc._.clean_` : original text with pollutions removed.
    - `Doc._.char_clean_span` : method to create a Span using character
      indices extracted using the cleaned text.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    pollution : Dict[str, Union[str, List[str]]]
        Dictionary containing regular expressions of pollution.
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        pollution: Optional[Dict[str, Union[str, List[str]]]],
    ):

        self.nlp = nlp

        if pollution is None:
            pollution = patterns.pollution

        self.pollution = pollution

        for k, v in self.pollution.items():
            if isinstance(v, str):
                self.pollution[k] = [v]

        self.regex_matcher = RegexMatcher()
        self.build_patterns()

    def build_patterns(self) -> None:
        """
        Builds the patterns for phrase matching.
        """

        # efficiently build spaCy matcher patterns
        for k, v in self.pollution.items():
            self.regex_matcher.add(k, v)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find pollutions in doc and clean candidate negations to remove pseudo negations

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        pollution:
            list of pollution spans
        """

        pollutions = self.regex_matcher(doc, as_spans=True)
        pollutions = filter_spans(pollutions)

        return pollutions

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags pollutions.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for pollutions.
        """
        pollutions = self.process(doc)

        for pollution in pollutions:

            for token in pollution:
                token._.excluded = True

        doc.spans["pollutions"] = pollutions

        return doc
nlp = nlp instance-attribute
pollution = pollution instance-attribute
regex_matcher = RegexMatcher() instance-attribute
__init__(nlp, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def __init__(
    self,
    nlp: Language,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):

    self.nlp = nlp

    if pollution is None:
        pollution = patterns.pollution

    self.pollution = pollution

    for k, v in self.pollution.items():
        if isinstance(v, str):
            self.pollution[k] = [v]

    self.regex_matcher = RegexMatcher()
    self.build_patterns()
build_patterns()

Builds the patterns for phrase matching.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
54
55
56
57
58
59
60
61
def build_patterns(self) -> None:
    """
    Builds the patterns for phrase matching.
    """

    # efficiently build spaCy matcher patterns
    for k, v in self.pollution.items():
        self.regex_matcher.add(k, v)
process(doc)

Finds pollution spans in the document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
pollution

list of pollution spans

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def process(self, doc: Doc) -> List[Span]:
    """
    Find pollutions in doc and clean candidate negations to remove pseudo negations

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    pollution:
        list of pollution spans
    """

    pollutions = self.regex_matcher(doc, as_spans=True)
    pollutions = filter_spans(pollutions)

    return pollutions
__call__(doc)

Tags pollutions.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for pollutions.

Source code in edsnlp/pipelines/core/normalizer/pollution/pollution.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def __call__(self, doc: Doc) -> Doc:
    """
    Tags pollutions.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for pollutions.
    """
    pollutions = self.process(doc)

    for pollution in pollutions:

        for token in pollution:
            token._.excluded = True

    doc.spans["pollutions"] = pollutions

    return doc
factory
DEFAULT_CONFIG = dict(pollution=None) module-attribute
create_component(nlp, name, pollution)
Source code in edsnlp/pipelines/core/normalizer/pollution/factory.py
14
15
16
17
18
19
20
21
22
23
24
@deprecated_factory("pollution", "eds.pollution", default_config=DEFAULT_CONFIG)
@Language.factory("eds.pollution", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    pollution: Optional[Dict[str, Union[str, List[str]]]],
):
    return Pollution(
        nlp,
        pollution=pollution,
    )
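
A short usage sketch, assuming EDS-NLP is installed; note that eds.pollution is already bundled in the eds.normalizer component documented above.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.pollution")

doc = nlp("Compte rendu.\nNBNBNBNBNBNBNBNB\nLe patient va bien.")

# Matched pollution spans are stored in a dedicated span group,
# and the corresponding tokens are flagged through the `excluded` extension.
print(doc.spans["pollutions"])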
accents
patterns
accents: List[Tuple[str, str]] = [('ç', 'c'), ('àáâä', 'a'), ('èéêë', 'e'), ('ìíîï', 'i'), ('òóôö', 'o'), ('ùúûü', 'u')] module-attribute
accents
Accents

Bases: object

Normalises accents, using a same-length strategy.

PARAMETER DESCRIPTION
accents

List of accentuated characters and their transcription.

TYPE: List[Tuple[str, str]]

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class Accents(object):
    """
    Normalises accents, using a same-length strategy.

    Parameters
    ----------
    accents : List[Tuple[str, str]]
        List of accentuated characters and their transcription.
    """

    def __init__(self, accents: Optional[List[Tuple[str, str]]]) -> None:
        if accents is None:
            accents = patterns.accents

        self.translation_table = str.maketrans(
            "".join(accent_group for accent_group, _ in accents),
            "".join(rep * len(accent_group) for accent_group, rep in accents),
        )

    def __call__(self, doc: Doc) -> Doc:
        """
        Remove accents from spacy `NORM` attribute.

        Parameters
        ----------
        doc : Doc
            The spaCy `Doc` object.

        Returns
        -------
        Doc
            The document, with accents removed in `Token.norm_`.
        """

        for token in doc:
            token.norm_ = token.norm_.translate(self.translation_table)

        return doc
translation_table = str.maketrans(''.join(accent_group for (accent_group, _) in accents), ''.join(rep * len(accent_group) for (accent_group, rep) in accents)) instance-attribute
__init__(accents)
Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
18
19
20
21
22
23
24
25
def __init__(self, accents: Optional[List[Tuple[str, str]]]) -> None:
    if accents is None:
        accents = patterns.accents

    self.translation_table = str.maketrans(
        "".join(accent_group for accent_group, _ in accents),
        "".join(rep * len(accent_group) for accent_group, rep in accents),
    )
__call__(doc)

Remove accents from spacy NORM attribute.

PARAMETER DESCRIPTION
doc

The spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
Doc

The document, with accents removed in Token.norm_.

Source code in edsnlp/pipelines/core/normalizer/accents/accents.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __call__(self, doc: Doc) -> Doc:
    """
    Remove accents from spacy `NORM` attribute.

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object.

    Returns
    -------
    Doc
        The document, with accents removed in `Token.norm_`.
    """

    for token in doc:
        token.norm_ = token.norm_.translate(self.translation_table)

    return doc
factory
DEFAULT_CONFIG = dict(accents=None) module-attribute
create_component(nlp, name, accents)
Source code in edsnlp/pipelines/core/normalizer/accents/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("accents", "eds.accents", default_config=DEFAULT_CONFIG)
@Language.factory("eds.accents", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    accents: Optional[List[Tuple[str, str]]],
):
    return Accents(
        accents=accents,
    )
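
To make the same-length strategy concrete, here is a small standalone sketch of the translation table built above, restricted to a subset of the default patterns:

accents = [("ç", "c"), ("àáâä", "a"), ("èéêë", "e")]

table = str.maketrans(
    "".join(group for group, _ in accents),
    "".join(rep * len(group) for group, rep in accents),
)

# Each accented character maps to exactly one replacement character,
# so the length of the string is preserved.
assert "éçà".translate(table) == "eca"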
lowercase
factory
remove_lowercase(doc)

Restores the case in the NORM attribute by copying each token's verbatim text. Should always be applied first.

PARAMETER DESCRIPTION
doc

The spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
Doc

The document, with case put back in NORM.

Source code in edsnlp/pipelines/core/normalizer/lowercase/factory.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
@Language.component("remove-lowercase")
@Language.component("eds.remove-lowercase")
def remove_lowercase(doc: Doc):
    """
    Add case on the `NORM` custom attribute. Should always be applied first.

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object.

    Returns
    -------
    Doc
        The document, with case put back in `NORM`.
    """

    for token in doc:
        token.norm_ = token.text

    return doc
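
A minimal sketch, assuming EDS-NLP is installed so that the component is registered with spaCy:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.remove-lowercase")

doc = nlp("Le Patient")

# NORM now matches the verbatim text rather than spaCy's lowercased default.
assert [token.norm_ for token in doc] == ["Le", "Patient"]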
quotes
quotes
Quotes

Bases: object

We normalise quotes, following this source: https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html.

PARAMETER DESCRIPTION
quotes

List of quotation characters and their transcription.

TYPE: List[Tuple[str, str]]

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class Quotes(object):
    """
    We normalise quotes, following this
    `source <https://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html>`_.

    Parameters
    ----------
    quotes : List[Tuple[str, str]]
        List of quotation characters and their transcription.
    """

    def __init__(self, quotes: Optional[List[Tuple[str, str]]]) -> None:
        if quotes is None:
            quotes = quotes_and_apostrophes

        self.translation_table = str.maketrans(
            "".join(quote_group for quote_group, _ in quotes),
            "".join(rep * len(quote_group) for quote_group, rep in quotes),
        )

    def __call__(self, doc: Doc) -> Doc:
        """
        Normalises quotes.

        Parameters
        ----------
        doc : Doc
            Document to process.

        Returns
        -------
        Doc
            Same document, with quotes normalised.
        """

        for token in doc:
            token.norm_ = token.norm_.translate(self.translation_table)

        return doc
translation_table = str.maketrans(''.join(quote_group for (quote_group, _) in quotes), ''.join(rep * len(quote_group) for (quote_group, rep) in quotes)) instance-attribute
__init__(quotes)
Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
19
20
21
22
23
24
25
26
def __init__(self, quotes: Optional[List[Tuple[str, str]]]) -> None:
    if quotes is None:
        quotes = quotes_and_apostrophes

    self.translation_table = str.maketrans(
        "".join(quote_group for quote_group, _ in quotes),
        "".join(rep * len(quote_group) for quote_group, rep in quotes),
    )
__call__(doc)

Normalises quotes.

PARAMETER DESCRIPTION
doc

Document to process.

TYPE: Doc

RETURNS DESCRIPTION
Doc

Same document, with quotes normalised.

Source code in edsnlp/pipelines/core/normalizer/quotes/quotes.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def __call__(self, doc: Doc) -> Doc:
    """
    Normalises quotes.

    Parameters
    ----------
    doc : Doc
        Document to process.

    Returns
    -------
    Doc
        Same document, with quotes normalised.
    """

    for token in doc:
        token.norm_ = token.norm_.translate(self.translation_table)

    return doc
patterns
quotes: List[str] = ['"', '〃', 'ײ', '᳓', '″', '״', '‶', '˶', 'ʺ', '“', '”', '˝', '‟'] module-attribute
apostrophes: List[str] = ['`', '΄', ''', 'ˈ', 'ˊ', 'ᑊ', 'ˋ', 'ꞌ', 'ᛌ', '𖽒', '𖽑', '‘', '’', 'י', '՚', '‛', '՝', '`', '`', '′', '׳', '´', 'ʹ', '˴', 'ߴ', '‵', 'ߵ', 'ʹ', 'ʻ', 'ʼ', '´', '᾽', 'ʽ', '῾', 'ʾ', '᾿'] module-attribute
quotes_and_apostrophes: List[Tuple[str, str]] = [(''.join(quotes), '"'), (''.join(apostrophes), "'")] module-attribute
factory
DEFAULT_CONFIG = dict(quotes=None) module-attribute
create_component(nlp, name, quotes)
Source code in edsnlp/pipelines/core/normalizer/quotes/factory.py
14
15
16
17
18
19
20
21
22
23
@deprecated_factory("quotes", "eds.quotes", default_config=DEFAULT_CONFIG)
@Language.factory("eds.quotes", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    quotes: Optional[List[Tuple[str, str]]],
):
    return Quotes(
        quotes=quotes,
    )
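
A minimal sketch, assuming EDS-NLP is installed; typographic quotes and apostrophes are replaced by their ASCII counterparts in the NORM attribute.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.quotes")

doc = nlp("Etat “stable” selon l’équipe.")

# Curly quotes are mapped to '"' and the curly apostrophe to "'".
print([token.norm_ for token in doc])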
advanced
factory
DEFAULT_CONFIG = dict(window=10, verbose=0, ignore_excluded=False, attr='NORM') module-attribute
create_component(nlp, name, regex_config, window, verbose, ignore_excluded, attr)
Source code in edsnlp/pipelines/core/advanced/factory.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
@deprecated_factory(
    "advanced-regex", "eds.advanced-regex", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.advanced-regex", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex_config: Dict[str, Any],
    window: int,
    verbose: int,
    ignore_excluded: bool,
    attr: str,
):

    return AdvancedRegex(
        nlp,
        regex_config=regex_config,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
        attr=attr,
    )
advanced
AdvancedRegex

Bases: GenericMatcher

Allows additional matching in the surrounding context of the main match group, for qualification/filtering.

PARAMETER DESCRIPTION
nlp

spaCy Language object.

TYPE: Language

regex_config

Configuration for the main expression.

TYPE: Dict[str, Any]

window

Number of tokens to consider before and after the main expression.

TYPE: int

attr

Attribute to match on, eg TEXT, NORM, etc.

TYPE: str

verbose

Verbosity level, useful for debugging.

TYPE: int

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/core/advanced/advanced.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class AdvancedRegex(GenericMatcher):
    """
    Allows additional matching in the surrounding context of the main match group,
    for qualification/filtering.

    Parameters
    ----------
    nlp : Language
        spaCy `Language` object.
    regex_config : Dict[str, Any]
        Configuration for the main expression.
    window : int
        Number of tokens to consider before and after the main expression.
    attr : str
        Attribute to match on, eg `TEXT`, `NORM`, etc.
    verbose : int
        Verbosity level, useful for debugging.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        regex_config: Dict[str, Any],
        window: int,
        attr: str,
        verbose: int,
        ignore_excluded: bool,
    ):
        self.regex_config = _check_regex_config(regex_config)
        self.window = window
        regex = regex_config

        self.verbose = verbose

        super().__init__(
            nlp=nlp,
            terms=dict(),
            regex=regex,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.ignore_excluded = ignore_excluded

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Doc.has_extension("my_ents"):
            Doc.set_extension("my_ents", default=[])

        if not Span.has_extension("matcher_name"):
            Span.set_extension("matcher_name", default=None)

        if not Span.has_extension("before_extract"):
            Span.set_extension("before_extract", default=None)
        if not Span.has_extension("after_extract"):
            Span.set_extension("after_extract", default=None)

        if not Span.has_extension("window"):
            Span.set_extension("window", default=None)

        if not Span.has_extension("before_snippet"):
            Span.set_extension("before_snippet", default=None)
        if not Span.has_extension("after_snippet"):
            Span.set_extension("after_snippet", default=None)

    def process(self, doc: Doc) -> List[Span]:
        """
        Process the document, looking for named entities.

        Parameters
        ----------
        doc : Doc
            spaCy Doc object

        Returns
        -------
        List[Span]
            List of detected spans.
        """

        ents = super().process(doc)
        ents = self._postprocessing_pipeline(ents)

        return ents

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        ents = self.process(doc)

        ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

    def _postprocessing_pipeline(self, ents: List[Span]):
        # add a window within the sentence around entities
        ents = [self._add_window(ent) for ent in ents]

        # Remove entities based on the snippet located just before and after the entity
        ents = filter(self._exclude_filter, ents)

        # Extract informations from the entity's context via regex
        ents = [self._snippet_extraction(ent) for ent in ents]

        return ents

    def _add_window(self, ent: Span) -> Span:
        ent._.window = ent.doc[
            max(ent.start - self.window, ent.sent.start) : min(
                ent.end + self.window, ent.sent.end
            )
        ]

        # include the entity in the snippets so that we can extract
        # the number when it is attached to the word, e.g. "3PA"
        ent._.before_snippet = ent.doc[
            max(ent.start - self.window, ent.sent.start) : ent.end
        ]
        ent._.after_snippet = ent.doc[
            ent.start : min(ent.end + self.window, ent.sent.end)
        ]
        return ent

    def get_text(self, span: Span, label) -> str:
        attr = self.regex_config[label].get("attr", self.attr)

        return get_text(
            doclike=span,
            attr=attr,
            ignore_excluded=self.ignore_excluded,
        )

    def _exclude_filter(self, ent: Span) -> Span:
        label = ent.label_

        before_exclude = self.regex_config[label].get("before_exclude", None)
        after_exclude = self.regex_config[label].get("after_exclude", None)

        if before_exclude is not None:
            t = ent._.before_snippet
            t = self.get_text(t, label)
            if re.compile(before_exclude).search(t) is not None:
                if self.verbose:
                    logger.info(
                        f"excluded (before) string: {t} - pattern {before_exclude}"
                    )
                return False

        if after_exclude is not None:
            t = ent._.after_snippet
            t = self.get_text(t, label)
            if re.compile(after_exclude).search(t) is not None:
                if self.verbose:
                    logger.info(
                        f"excluded (after) string: {t} - pattern {after_exclude}"
                    )
                return False

        return True

    def _snippet_extraction(self, ent: Span) -> Span:
        label = ent.label_

        before_extract = self.regex_config[label].get("before_extract", [])
        after_extract = self.regex_config[label].get("after_extract", [])

        if type(before_extract) == str:
            before_extract = [before_extract]
        if type(after_extract) == str:
            after_extract = [after_extract]

        t = ent._.before_snippet
        t = self.get_text(t, label)
        ent._.before_extract = []
        for pattern in before_extract:
            pattern = re.compile(pattern)
            match = pattern.search(t)
            ent._.before_extract.append(match.groups()[0] if match else None)

        t = ent._.after_snippet
        t = self.get_text(t, label)
        ent._.after_extract = []
        for pattern in after_extract:
            pattern = re.compile(pattern)
            match = pattern.search(t)
            ent._.after_extract.append(match.groups()[0] if match else None)

        return ent
regex_config = _check_regex_config(regex_config) instance-attribute
window = window instance-attribute
verbose = verbose instance-attribute
ignore_excluded = ignore_excluded instance-attribute
__init__(nlp, regex_config, window, attr, verbose, ignore_excluded)
Source code in edsnlp/pipelines/core/advanced/advanced.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    nlp: Language,
    regex_config: Dict[str, Any],
    window: int,
    attr: str,
    verbose: int,
    ignore_excluded: bool,
):
    self.regex_config = _check_regex_config(regex_config)
    self.window = window
    regex = regex_config

    self.verbose = verbose

    super().__init__(
        nlp=nlp,
        terms=dict(),
        regex=regex,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.ignore_excluded = ignore_excluded

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/core/advanced/advanced.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@staticmethod
def set_extensions() -> None:
    if not Doc.has_extension("my_ents"):
        Doc.set_extension("my_ents", default=[])

    if not Span.has_extension("matcher_name"):
        Span.set_extension("matcher_name", default=None)

    if not Span.has_extension("before_extract"):
        Span.set_extension("before_extract", default=None)
    if not Span.has_extension("after_extract"):
        Span.set_extension("after_extract", default=None)

    if not Span.has_extension("window"):
        Span.set_extension("window", default=None)

    if not Span.has_extension("before_snippet"):
        Span.set_extension("before_snippet", default=None)
    if not Span.has_extension("after_snippet"):
        Span.set_extension("after_snippet", default=None)
process(doc)

Process the document, looking for named entities.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
List[Span]

List of detected spans.

Source code in edsnlp/pipelines/core/advanced/advanced.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def process(self, doc: Doc) -> List[Span]:
    """
    Process the document, looking for named entities.

    Parameters
    ----------
    doc : Doc
        spaCy Doc object

    Returns
    -------
    List[Span]
        List of detected spans.
    """

    ents = super().process(doc)
    ents = self._postprocessing_pipeline(ents)

    return ents
__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/core/advanced/advanced.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    ents = self.process(doc)

    ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
_postprocessing_pipeline(ents)
Source code in edsnlp/pipelines/core/advanced/advanced.py
129
130
131
132
133
134
135
136
137
138
139
def _postprocessing_pipeline(self, ents: List[Span]):
    # add a window within the sentence around entities
    ents = [self._add_window(ent) for ent in ents]

    # Remove entities based on the snippet located just before and after the entity
    ents = filter(self._exclude_filter, ents)

    # Extract informations from the entity's context via regex
    ents = [self._snippet_extraction(ent) for ent in ents]

    return ents
_add_window(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def _add_window(self, ent: Span) -> Span:
    ent._.window = ent.doc[
        max(ent.start - self.window, ent.sent.start) : min(
            ent.end + self.window, ent.sent.end
        )
    ]

    # include the entity in the snippets so that we can extract
    # the number when it is attached to the word, e.g. "3PA"
    ent._.before_snippet = ent.doc[
        max(ent.start - self.window, ent.sent.start) : ent.end
    ]
    ent._.after_snippet = ent.doc[
        ent.start : min(ent.end + self.window, ent.sent.end)
    ]
    return ent
get_text(span, label)
Source code in edsnlp/pipelines/core/advanced/advanced.py
158
159
160
161
162
163
164
165
def get_text(self, span: Span, label) -> str:
    attr = self.regex_config[label].get("attr", self.attr)

    return get_text(
        doclike=span,
        attr=attr,
        ignore_excluded=self.ignore_excluded,
    )
_exclude_filter(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def _exclude_filter(self, ent: Span) -> Span:
    label = ent.label_

    before_exclude = self.regex_config[label].get("before_exclude", None)
    after_exclude = self.regex_config[label].get("after_exclude", None)

    if before_exclude is not None:
        t = ent._.before_snippet
        t = self.get_text(t, label)
        if re.compile(before_exclude).search(t) is not None:
            if self.verbose:
                logger.info(
                    f"excluded (before) string: {t} - pattern {before_exclude}"
                )
            return False

    if after_exclude is not None:
        t = ent._.after_snippet
        t = self.get_text(t, label)
        if re.compile(after_exclude).search(t) is not None:
            if self.verbose:
                logger.info(
                    f"excluded (after) string: {t} - pattern {after_exclude}"
                )
            return False

    return True
_snippet_extraction(ent)
Source code in edsnlp/pipelines/core/advanced/advanced.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def _snippet_extraction(self, ent: Span) -> Span:
    label = ent.label_

    before_extract = self.regex_config[label].get("before_extract", [])
    after_extract = self.regex_config[label].get("after_extract", [])

    if type(before_extract) == str:
        before_extract = [before_extract]
    if type(after_extract) == str:
        after_extract = [after_extract]

    t = ent._.before_snippet
    t = self.get_text(t, label)
    ent._.before_extract = []
    for pattern in before_extract:
        pattern = re.compile(pattern)
        match = pattern.search(t)
        ent._.before_extract.append(match.groups()[0] if match else None)

    t = ent._.after_snippet
    t = self.get_text(t, label)
    ent._.after_extract = []
    for pattern in after_extract:
        pattern = re.compile(pattern)
        match = pattern.search(t)
        ent._.after_extract.append(match.groups()[0] if match else None)

    return ent
_check_regex_config(regex_config)
Source code in edsnlp/pipelines/core/advanced/advanced.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
def _check_regex_config(regex_config):
    for k, v in regex_config.items():
        if type(v) is not dict:
            raise TypeError(
                f"The value of the key {k} is of type {type(v)}, but a dict is expected"
            )

        single_group_regex_keys = ["before_extract", "after_extract"]

        for single_group_regex_key in single_group_regex_keys:
            if single_group_regex_key in v:
                # ensure it is a list
                if type(v[single_group_regex_key]) is not list:
                    v[single_group_regex_key] = [v[single_group_regex_key]]

                for i, regex in enumerate(v[single_group_regex_key]):
                    n_groups = re.compile(regex).groups

                    if n_groups == 0:
                        # Adding grouping parenthesis
                        v[single_group_regex_key][i] = r"(" + regex + r")"
                    elif n_groups != 1:
                        # Accepting only 1 group per regex
                        raise ValueError(
                            f"The RegEx for {repr(k)} ({repr(regex)}) "
                            f"stored in {repr(single_group_regex_key)} "
                            f"contains {n_groups} capturing groups, 1 expected"
                        )

    return regex_config
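
A hedged usage sketch: the before_exclude, after_extract and attr keys are those read by the class above, while the regex key holding the main pattern is an assumption about the underlying GenericMatcher; the clinical example is illustrative only.

import spacy

regex_config = dict(
    fracture=dict(
        regex=[r"fracture"],
        attr="NORM",
        before_exclude="petite|faible",  # discard matches preceded by these cues
        after_extract=r"(\d+)",          # capture a number following the entity
    )
)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")  # the component relies on sentence boundaries
nlp.add_pipe("eds.advanced-regex", config=dict(regex_config=regex_config))

doc = nlp("Le scanner montre une fracture 3 du poignet.")

for ent in doc.ents:
    print(ent, ent._.after_extract)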

qualifiers

factories
base
Qualifier

Bases: BaseComponent

Base class for qualifier components such as negation and family context detection.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

**terms

Terms to look for.

TYPE: Dict[str, Optional[List[str]]]

Source code in edsnlp/pipelines/qualifiers/base.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class Qualifier(BaseComponent):
    """
    Implements the NegEx algorithm.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    explain : bool
        Whether to keep track of cues for each entity.
    **terms : Dict[str, Optional[List[str]]]
        Terms to look for.
    """

    defaults = dict()

    def __init__(
        self,
        nlp: Language,
        attr: str,
        on_ents_only: bool,
        explain: bool,
        **terms: Dict[str, Optional[List[str]]],
    ):

        if attr.upper() == "NORM":
            check_normalizer(nlp)

        self.phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
        self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

        self.on_ents_only = on_ents_only
        self.explain = explain

    def get_defaults(
        self, **kwargs: Dict[str, Optional[List[str]]]
    ) -> Dict[str, List[str]]:
        """
        Merge terms with their defaults. Null keys are replaced with defaults.

        Returns
        -------
        Dict[str, List[str]]
            Merged dictionary
        """
        # Filter out empty keys
        kwargs = {k: v for k, v in kwargs.items() if v is not None}

        # Update defaults
        terms = self.defaults.copy()
        terms.update(kwargs)

        return terms

    def get_matches(self, doc: Doc) -> List[Span]:
        """
        Extract matches.

        Parameters
        ----------
        doc : Doc
            spaCy `Doc` object.

        Returns
        -------
        List[Span]
            List of detected spans
        """
        if self.on_ents_only:

            sents = set([ent.sent for ent in doc.ents])
            match_iterator = map(
                lambda sent: self.phrase_matcher(sent, as_spans=True), sents
            )

            matches = chain.from_iterable(match_iterator)

        else:
            matches = self.phrase_matcher(doc, as_spans=True)

        return list(matches)

    def __call__(self, doc: Doc) -> Doc:
        return self.process(doc)
defaults = dict() class-attribute
phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr) instance-attribute
on_ents_only = on_ents_only instance-attribute
explain = explain instance-attribute
__init__(nlp, attr, on_ents_only, explain, **terms)
Source code in edsnlp/pipelines/qualifiers/base.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def __init__(
    self,
    nlp: Language,
    attr: str,
    on_ents_only: bool,
    explain: bool,
    **terms: Dict[str, Optional[List[str]]],
):

    if attr.upper() == "NORM":
        check_normalizer(nlp)

    self.phrase_matcher = EDSPhraseMatcher(vocab=nlp.vocab, attr=attr)
    self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

    self.on_ents_only = on_ents_only
    self.explain = explain
get_defaults(**kwargs)

Merge terms with their defaults. Null keys are replaced with defaults.

RETURNS DESCRIPTION
Dict[str, List[str]]

Merged dictionary

Source code in edsnlp/pipelines/qualifiers/base.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def get_defaults(
    self, **kwargs: Dict[str, Optional[List[str]]]
) -> Dict[str, List[str]]:
    """
    Merge terms with their defaults. Null keys are replaced with defaults.

    Returns
    -------
    Dict[str, List[str]]
        Merged dictionary
    """
    # Filter out empty keys
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    # Update defaults
    terms = self.defaults.copy()
    terms.update(kwargs)

    return terms
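
As a standalone illustration of the merge performed above (the cue lists are purely illustrative): keys passed as None fall back to the class defaults, while explicit lists override them.

defaults = dict(family=["mère", "père"], termination=["mais"])

overrides = dict(family=None, termination=["mais", "cependant"])

# Filter out None values, then update a copy of the defaults.
merged = {**defaults, **{k: v for k, v in overrides.items() if v is not None}}

assert merged == dict(family=["mère", "père"], termination=["mais", "cependant"])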
get_matches(doc)

Extract matches.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
List[Span]

List of detected spans

Source code in edsnlp/pipelines/qualifiers/base.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def get_matches(self, doc: Doc) -> List[Span]:
    """
    Extract matches.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object.

    Returns
    -------
    List[Span]
        List of detected spans
    """
    if self.on_ents_only:

        sents = set([ent.sent for ent in doc.ents])
        match_iterator = map(
            lambda sent: self.phrase_matcher(sent, as_spans=True), sents
        )

        matches = chain.from_iterable(match_iterator)

    else:
        matches = self.phrase_matcher(doc, as_spans=True)

    return list(matches)
__call__(doc)
Source code in edsnlp/pipelines/qualifiers/base.py
114
115
def __call__(self, doc: Doc) -> Doc:
    return self.process(doc)
check_normalizer(nlp)
Source code in edsnlp/pipelines/qualifiers/base.py
12
13
14
15
16
17
18
19
20
21
22
def check_normalizer(nlp: Language) -> None:
    components = {name: component for name, component in nlp.pipeline}
    normalizer = components.get("normalizer")

    if normalizer and not normalizer.lowercase:
        logger.warning(
            "You have chosen the NORM attribute, but disabled lowercasing "
            "in your normalisation pipeline. "
            "This WILL hurt performance : you might want to use the "
            "LOWER attribute instead."
        )
family
family
FamilyContext

Bases: Qualifier

Implements a family context detection algorithm.

The component looks for terms indicating family references in the text.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

family

List of terms indicating family reference.

TYPE: Optional[List[str]]

terminations

List of termination terms, to separate syntagmas.

TYPE: Optional[List[str]]

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]

explain

Whether to keep track of cues for each entity.

TYPE: bool

use_sections

Whether to use annotated sections (namely antécédents familiaux).

TYPE: bool DEFAULT: False

Source code in edsnlp/pipelines/qualifiers/family/family.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class FamilyContext(Qualifier):
    """
    Implements a family context detection algorithm.

    The components looks for terms indicating family references in the text.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    family : Optional[List[str]]
        List of terms indicating family reference.
    terminations : Optional[List[str]]
        List of termination terms, to separate syntagmas.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionnary of regex patterns.
    explain : bool
        Whether to keep track of cues for each entity.
    use_sections : bool, by default `False`
        Whether to use annotated sections (namely `antécédents familiaux`).
    """

    defaults = dict(
        family=family,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        family: Optional[List[str]],
        termination: Optional[List[str]],
        use_sections: bool,
        explain: bool,
        on_ents_only: bool,
    ):

        terms = self.get_defaults(
            family=family,
            termination=termination,
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.set_extensions()

        self.sections = use_sections and (
            "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
        )
        if use_sections and not self.sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `section` pipeline, but it was not set. "
                "Skipping that step."
            )

    @staticmethod
    def set_extensions() -> None:
        if not Token.has_extension("family"):
            Token.set_extension("family", default=False)

        if not Token.has_extension("family_"):
            Token.set_extension(
                "family_",
                getter=lambda token: "FAMILY" if token._.family else "PATIENT",
            )

        if not Span.has_extension("family"):
            Span.set_extension("family", default=False)

        if not Span.has_extension("family_"):
            Span.set_extension(
                "family_",
                getter=lambda span: "FAMILY" if span._.family else "PATIENT",
            )

        if not Span.has_extension("family_cues"):
            Span.set_extension("family_cues", default=[])

        if not Doc.has_extension("family"):
            Doc.set_extension("family", default=[])

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to family context.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for context
        """
        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        sections = []

        if self.sections:
            sections = [
                Span(doc, section.start, section.end, label="FAMILY")
                for section in doc.spans["sections"]
                if section.label_ == "antécédents familiaux"
            ]

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

            if self.on_ents_only and not ents:
                continue

            cues = get_spans(sub_matches, "family")
            cues += sub_sections

            if not cues:
                continue

            family = bool(cues)

            if not family:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.family = True

            for ent in ents:
                ent._.family = True
                if self.explain:
                    ent._.family_cues += cues
                if not self.on_ents_only:
                    for token in ent:
                        token._.family = True

        return doc
defaults = dict(family=family, termination=termination) class-attribute
sections = use_sections and ('eds.sections' in nlp.pipe_names or 'sections' in nlp.pipe_names) instance-attribute
__init__(nlp, attr, family, termination, use_sections, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/family/family.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def __init__(
    self,
    nlp: Language,
    attr: str,
    family: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    explain: bool,
    on_ents_only: bool,
):

    terms = self.get_defaults(
        family=family,
        termination=termination,
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.set_extensions()

    self.sections = use_sections and (
        "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
    )
    if use_sections and not self.sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `section` pipeline, but it was not set. "
            "Skipping that step."
        )
set_extensions()
Source code in edsnlp/pipelines/qualifiers/family/family.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
@staticmethod
def set_extensions() -> None:
    if not Token.has_extension("family"):
        Token.set_extension("family", default=False)

    if not Token.has_extension("family_"):
        Token.set_extension(
            "family_",
            getter=lambda token: "FAMILY" if token._.family else "PATIENT",
        )

    if not Span.has_extension("family"):
        Span.set_extension("family", default=False)

    if not Span.has_extension("family_"):
        Span.set_extension(
            "family_",
            getter=lambda span: "FAMILY" if span._.family else "PATIENT",
        )

    if not Span.has_extension("family_cues"):
        Span.set_extension("family_cues", default=[])

    if not Doc.has_extension("family"):
        Doc.set_extension("family", default=[])
process(doc)

Finds entities related to family context.

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc
Source code in edsnlp/pipelines/qualifiers/family/family.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to family context.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for context
    """
    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    sections = []

    if self.sections:
        sections = [
            Span(doc, section.start, section.end, label="FAMILY")
            for section in doc.spans["sections"]
            if section.label_ == "antécédents familiaux"
        ]

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

        if self.on_ents_only and not ents:
            continue

        cues = get_spans(sub_matches, "family")
        cues += sub_sections

        if not cues:
            continue

        family = bool(cues)

        if not family:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.family = True

        for ent in ents:
            ent._.family = True
            if self.explain:
                ent._.family_cues += cues
            if not self.on_ents_only:
                for token in ent:
                    token._.family = True

    return doc
patterns
family: List[str] = ['aïeul', 'aïeux', 'antécédent familial', 'antécédents familiaux', 'arrière-grand-mère', 'arrière-grand-père', 'arrière-grands-parents', 'cousin', 'cousine', 'cousines', 'cousins', 'enfant', 'enfants', 'épouse', 'époux', 'familial', 'familiale', 'familiales', 'familiaux', 'famille', 'fiancé', 'fiancée', 'fils', 'frère', 'frères', 'grand-mère', 'grand-père', 'grands-parents', 'maman', 'mari', 'mère', 'oncle', 'papa', 'parent', 'parents', 'père', 'soeur', 'sœur', 'sœurs', 'soeurs', 'tante'] module-attribute
factory
DEFAULT_CONFIG = dict(family=None, termination=None, attr='NORM', use_sections=False, explain=False, on_ents_only=True) module-attribute
create_component(nlp, name, family, termination, attr, explain, on_ents_only, use_sections)
Source code in edsnlp/pipelines/qualifiers/family/factory.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@deprecated_factory("family", "eds.family", default_config=DEFAULT_CONFIG)
@Language.factory("eds.family", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    family: Optional[List[str]],
    termination: Optional[List[str]],
    attr: str,
    explain: bool,
    on_ents_only: bool,
    use_sections: bool,
):
    return FamilyContext(
        nlp,
        family=family,
        termination=termination,
        attr=attr,
        explain=explain,
        on_ents_only=on_ents_only,
        use_sections=use_sections,
    )
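
A usage sketch assuming EDS-NLP is installed; the eds.matcher component is only there to produce an entity to qualify, and the expected outputs are indicated in comments rather than asserted.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(cancer=["cancer"])))
nlp.add_pipe("eds.family", config=dict(explain=True))

doc = nlp("Antécédents : cancer du sein chez la mère.")

ent = doc.ents[0]
print(ent._.family)       # expected to be True, "mère" being a family cue
print(ent._.family_cues)  # the matched cues, kept because explain=True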
negation
patterns
pseudo: List[str] = ['aucun changement', 'aucun doute', 'aucune hésitation', 'aucune diminution', 'ne permet pas d', 'ne permet pas de', "n'exclut pas", 'non négligeable', "pas d'amélioration", "pas d'augmentation", "pas d'autre", 'pas de changement', 'pas de diminution', 'pas de doute', 'pas exclu', 'pas exclue', 'pas exclues', 'pas exclus', 'pas immunisé', 'pas immunisée', 'pas immunisés', 'pas immunisées', 'sans amélioration', 'sans aucun doute', 'sans augmentation', 'sans certitude', 'sans changement', 'sans diminution', 'sans doute', 'sans être certain'] module-attribute
preceding: List[str] = ['à la place de', 'absence', 'absence de signe de', 'absence de', 'aucun signe de', 'aucun', 'aucune preuve', 'aucune', 'aucunes', 'aucuns', 'décline', 'décliné', 'dépourvu', 'dépourvue', 'dépourvues', 'dépourvus', 'disparition de', 'disparition des', 'excluent', 'exclut', 'impossibilité de', 'immunisé', 'immunisée', 'immunisés', 'immunisées', 'incompatible avec', 'incompatibles avec', 'jamais', 'ne manifestaient pas', 'ne manifestait pas', 'ne manifeste pas', 'ne manifestent pas', 'ne pas', 'ne présentaient pas', 'ne présentait pas', 'ne présente pas', 'ne présentent pas', 'ne ressemble pas', 'ne ressemblent pas', 'négatif pour', "n'est pas", "n'était pas", 'ni', 'niant', 'nie', 'nié', 'nullement', 'pas d', 'pas de cause de', 'pas de signe de', 'pas de signes de', 'pas de', 'pas nécessaire de', 'pas', "permet d'exclure", "plus d'aspect de", 'sans manifester de', 'sans présenter de', 'sans', 'symptôme atypique'] module-attribute
following: List[str] = [':0', ': 0', ':non', ': non', 'absent', 'absente', 'absentes', 'absents', 'dépourvu', 'dépourvue', 'dépourvues', 'dépourvus', 'disparaissent', 'disparait', 'est exclu', 'est exclue', 'immunisé', 'immunisée', 'immunisés', 'immunisées', 'impossible', 'improbable', 'négatif', 'négatifs', 'négative', 'négatives', 'négligeable', 'négligeables', 'nié', 'niée', 'non', 'pas nécessaire', 'peu probable', 'sont exclues', 'sont exclus'] module-attribute
verbs: List[str] = ['éliminer', 'exclure', 'interdire', 'nier', 'réfuter', 'rejeter'] module-attribute
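
These cue lists feed the Negation component documented just below. A hedged usage sketch, where the negation extension name is assumed by analogy with the family component above:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(diabete=["diabète"])))
nlp.add_pipe("eds.negation")

doc = nlp("Le patient ne présente pas de diabète.")

ent = doc.ents[0]
print(ent._.negation)  # expected True: "ne présente pas" is a preceding cue (extension name assumed)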
negation
Negation

Bases: Qualifier

Implements the NegEx algorithm.

The component looks for five kinds of expressions in the text:

  • preceding negations, i.e. cues that precede a negated expression

  • following negations, i.e. cues that follow a negated expression

  • pseudo negations: expressions that contain a negation cue but are not negations (e.g. "pas de doute"/"no doubt")

  • negation verbs, i.e. verbs that indicate a negation

  • terminations, i.e. words that delimit propositions. The negation spans from the preceding cue to the termination.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

attr

spaCy's attribute to use

TYPE: str

pseudo

List of pseudo negation terms.

TYPE: Optional[List[str]]

preceding

List of preceding negation terms

TYPE: Optional[List[str]]

following

List of following negation terms.

TYPE: Optional[List[str]]

termination

List of termination terms.

TYPE: Optional[List[str]]

verbs

List of negation verbs.

TYPE: Optional[List[str]]

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool
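A minimal usage sketch, assuming the eds.sentences and eds.matcher components documented elsewhere in this reference; the matcher configuration and the example sentence are illustrative:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
# Illustrative matcher, so that the document contains an entity to qualify.
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur"])))
nlp.add_pipe("eds.negation")

doc = nlp("Le patient ne présente pas de douleur.")
ent = doc.ents[0]

print(ent._.negation)   # True
print(ent._.negation_)  # "NEG"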

Source code in edsnlp/pipelines/qualifiers/negation/negation.py
class Negation(Qualifier):
    """
    Implements the NegEx algorithm.

    The component looks for five kinds of expressions in the text :

    - preceding negations, ie cues that precede a negated expression

    - following negations, ie cues that follow a negated expression

    - pseudo negations : contain a negation cue, but are not negations
      (eg "pas de doute"/"no doubt")

    - negation verbs, ie verbs that indicate a negation

    - terminations, ie words that delimit propositions.
      The negation spans from the preceding cue to the termination.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    attr : str
        spaCy's attribute to use
    pseudo : Optional[List[str]]
        List of pseudo negation terms.
    preceding : Optional[List[str]]
        List of preceding negation terms
    following : Optional[List[str]]
        List of following negation terms.
    termination : Optional[List[str]]
        List of termination terms.
    verbs : Optional[List[str]]
        List of negation verbs.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        pseudo=pseudo,
        verbs=verbs,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        termination: Optional[List[str]],
        verbs: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            termination=termination,
            verbs=verbs,
        )
        terms["verbs"] = self.load_verbs(terms["verbs"])

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.within_ents = within_ents
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("negation"):
            Token.set_extension("negation", default=False)

        if not Token.has_extension("negated"):
            Token.set_extension(
                "negated", getter=deprecated_getter_factory("negated", "negation")
            )

        if not Token.has_extension("negation_"):
            Token.set_extension(
                "negation_",
                getter=lambda token: "NEG" if token._.negation else "AFF",
            )

        if not Token.has_extension("polarity_"):
            Token.set_extension(
                "polarity_",
                getter=deprecated_getter_factory("polarity_", "negation_"),
            )

        if not Span.has_extension("negation"):
            Span.set_extension("negation", default=False)

        if not Span.has_extension("negated"):
            Span.set_extension(
                "negated", getter=deprecated_getter_factory("negated", "negation")
            )

        if not Span.has_extension("negation_cues"):
            Span.set_extension("negation_cues", default=[])

        if not Span.has_extension("negation_"):
            Span.set_extension(
                "negation_",
                getter=lambda span: "NEG" if span._.negation else "AFF",
            )

        if not Span.has_extension("polarity_"):
            Span.set_extension(
                "polarity_",
                getter=deprecated_getter_factory("polarity_", "negation_"),
            )

        if not Doc.has_extension("negations"):
            Doc.set_extension("negations", default=[])

    def load_verbs(self, verbs: List[str]) -> List[str]:
        """
        Conjugate negating verbs to specific tenses.

        Parameters
        ----------
        verbs: list of negating verbs to conjugate

        Returns
        -------
        list_neg_verbs: List of negating verbs conjugated to specific tenses.
        """

        neg_verbs = get_verbs(verbs)

        neg_verbs = neg_verbs.loc[
            ((neg_verbs["mode"] == "Indicatif") & (neg_verbs["tense"] == "Présent"))
            | (neg_verbs["tense"] == "Participe Présent")
            | (neg_verbs["tense"] == "Participe Passé")
        ]

        list_neg_verbs = list(neg_verbs["term"].unique())

        return list_neg_verbs

    def annotate_entity(
        self,
        ent: Span,
        sub_preceding: List[Span],
        sub_following: List[Span],
    ) -> None:
        """
        Annotate entities using preceding and following negations.

        Parameters
        ----------
        ent : Span
            Entity to annotate
        sub_preceding : List[Span]
            List of preceding negations cues
        sub_following : List[Span]
            List of following negations cues
        """
        if self.within_ents:
            cues = [m for m in sub_preceding if m.end <= ent.end]
            cues += [m for m in sub_following if m.start >= ent.start]
        else:
            cues = [m for m in sub_preceding if m.end <= ent.start]
            cues += [m for m in sub_following if m.start >= ent.end]

        negation = ent._.negation or bool(cues)

        ent._.negation = negation

        if self.explain and negation:
            ent._.negation_cues += cues

        if not self.on_ents_only and negation:
            for token in ent:
                token._.negation = True

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to negation.

        Parameters
        ----------
        doc: spaCy `Doc` object

        Returns
        -------
        doc: spaCy `Doc` object, annotated for negation
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            # Verbs precede negated content
            sub_preceding += get_spans(sub_matches, "verbs")

            if not sub_preceding + sub_following:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.negation = any(
                        m.end <= token.i for m in sub_preceding
                    ) or any(m.start > token.i for m in sub_following)

            for ent in ents:
                self.annotate_entity(
                    ent=ent,
                    sub_preceding=sub_preceding,
                    sub_following=sub_following,
                )

        return doc

    def __call__(self, doc: Doc) -> Doc:
        return self.process(doc)
defaults = dict(following=following, preceding=preceding, pseudo=pseudo, verbs=verbs, termination=termination) class-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, termination, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs=verbs,
    )
    terms["verbs"] = self.load_verbs(terms["verbs"])

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.within_ents = within_ents
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("negation"):
        Token.set_extension("negation", default=False)

    if not Token.has_extension("negated"):
        Token.set_extension(
            "negated", getter=deprecated_getter_factory("negated", "negation")
        )

    if not Token.has_extension("negation_"):
        Token.set_extension(
            "negation_",
            getter=lambda token: "NEG" if token._.negation else "AFF",
        )

    if not Token.has_extension("polarity_"):
        Token.set_extension(
            "polarity_",
            getter=deprecated_getter_factory("polarity_", "negation_"),
        )

    if not Span.has_extension("negation"):
        Span.set_extension("negation", default=False)

    if not Span.has_extension("negated"):
        Span.set_extension(
            "negated", getter=deprecated_getter_factory("negated", "negation")
        )

    if not Span.has_extension("negation_cues"):
        Span.set_extension("negation_cues", default=[])

    if not Span.has_extension("negation_"):
        Span.set_extension(
            "negation_",
            getter=lambda span: "NEG" if span._.negation else "AFF",
        )

    if not Span.has_extension("polarity_"):
        Span.set_extension(
            "polarity_",
            getter=deprecated_getter_factory("polarity_", "negation_"),
        )

    if not Doc.has_extension("negations"):
        Doc.set_extension("negations", default=[])
load_verbs(verbs)

Conjugate negating verbs to specific tenses.

PARAMETER DESCRIPTION
verbs

List of negating verbs to conjugate.

TYPE: List[str]

RETURNS DESCRIPTION
list_neg_verbs

List of negating verbs conjugated to specific tenses.
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def load_verbs(self, verbs: List[str]) -> List[str]:
    """
    Conjugate negating verbs to specific tenses.

    Parameters
    ----------
    verbs: list of negating verbs to conjugate

    Returns
    -------
    list_neg_verbs: List of negating verbs conjugated to specific tenses.
    """

    neg_verbs = get_verbs(verbs)

    neg_verbs = neg_verbs.loc[
        ((neg_verbs["mode"] == "Indicatif") & (neg_verbs["tense"] == "Présent"))
        | (neg_verbs["tense"] == "Participe Présent")
        | (neg_verbs["tense"] == "Participe Passé")
    ]

    list_neg_verbs = list(neg_verbs["term"].unique())

    return list_neg_verbs
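For illustration, the same mask applied to a small hand-written conjugation table; the rows below are ordinary French conjugations of « exclure », not captured output of get_verbs:

import pandas as pd

# Hypothetical miniature conjugation table with the columns used above.
conjugations = pd.DataFrame(
    [
        ("exclure", "Indicatif", "Présent", "3s", "exclut"),
        ("exclure", "Indicatif", "Imparfait", "3s", "excluait"),
        ("exclure", "Participe", "Participe Présent", None, "excluant"),
        ("exclure", "Participe", "Participe Passé", None, "exclu"),
    ],
    columns=["verb", "mode", "tense", "person", "term"],
)

# Same filter as load_verbs: keep the present indicative and both participles.
kept = conjugations.loc[
    ((conjugations["mode"] == "Indicatif") & (conjugations["tense"] == "Présent"))
    | (conjugations["tense"] == "Participe Présent")
    | (conjugations["tense"] == "Participe Passé")
]

print(list(kept["term"].unique()))  # ['exclut', 'excluant', 'exclu']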
annotate_entity(ent, sub_preceding, sub_following)

Annotate entities using preceding and following negations.

PARAMETER DESCRIPTION
ent

Entity to annotate

TYPE: Span

sub_preceding

List of preceding negations cues

TYPE: List[Span]

sub_following

List of following negations cues

TYPE: List[Span]

Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def annotate_entity(
    self,
    ent: Span,
    sub_preceding: List[Span],
    sub_following: List[Span],
) -> None:
    """
    Annotate entities using preceding and following negations.

    Parameters
    ----------
    ent : Span
        Entity to annotate
    sub_preceding : List[Span]
        List of preceding negations cues
    sub_following : List[Span]
        List of following negations cues
    """
    if self.within_ents:
        cues = [m for m in sub_preceding if m.end <= ent.end]
        cues += [m for m in sub_following if m.start >= ent.start]
    else:
        cues = [m for m in sub_preceding if m.end <= ent.start]
        cues += [m for m in sub_following if m.start >= ent.end]

    negation = ent._.negation or bool(cues)

    ent._.negation = negation

    if self.explain and negation:
        ent._.negation_cues += cues

    if not self.on_ents_only and negation:
        for token in ent:
            token._.negation = True
process(doc)

Finds entities related to negation.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for negation.
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to negation.

    Parameters
    ----------
    doc: spaCy `Doc` object

    Returns
    -------
    doc: spaCy `Doc` object, annotated for negation
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        # Verbs precede negated content
        sub_preceding += get_spans(sub_matches, "verbs")

        if not sub_preceding + sub_following:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.negation = any(
                    m.end <= token.i for m in sub_preceding
                ) or any(m.start > token.i for m in sub_following)

        for ent in ents:
            self.annotate_entity(
                ent=ent,
                sub_preceding=sub_preceding,
                sub_following=sub_following,
            )

    return doc
__call__(doc)
Source code in edsnlp/pipelines/qualifiers/negation/negation.py
def __call__(self, doc: Doc) -> Doc:
    return self.process(doc)
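Since __call__ simply delegates to process, the component can also be applied by hand to an already tokenised Doc, for instance after retrieving it from the pipeline built in the usage sketch above:

negation = nlp.get_pipe("eds.negation")
doc = negation(doc)  # equivalent to negation.process(doc)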
factory
DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, termination=None, verbs=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, termination, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/negation/factory.py
@deprecated_factory("negation", "eds.negation", default_config=DEFAULT_CONFIG)
@Language.factory("eds.negation", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    return Negation(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs=verbs,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
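The DEFAULT_CONFIG values above can be overridden through the config argument of nlp.add_pipe. A sketch enabling cue tracking, reusing the same illustrative matcher setup as before:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur"])))
nlp.add_pipe("eds.negation", config=dict(explain=True))

doc = nlp("Le patient ne présente pas de douleur.")
ent = doc.ents[0]

print(ent._.negation)       # True
print(ent._.negation_cues)  # the preceding cue span(s) that triggered the negation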
reported_speech
reported_speech
ReportedSpeech

Bases: Qualifier

Implements a reported speech detection algorithm.

The component looks for terms indicating patient statements and for quotations, in order to detect patient speech.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

quotation

String gathering all quotation cues.

TYPE: str

verbs

List of reported speech verbs.

TYPE: List[str]

following

List of terms following a reported speech.

TYPE: List[str]

preceding

List of terms preceding a reported speech.

TYPE: List[str]

filter_matches

Whether to filter out overlapping matches.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool
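A minimal usage sketch, with the same illustrative matcher setup as for the negation component; the example sentence is illustrative:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(douleur=["douleur"])))
nlp.add_pipe("eds.reported_speech")

doc = nlp("Le patient dit qu'il ressent une douleur thoracique.")
ent = doc.ents[0]

print(ent._.reported_speech)   # True
print(ent._.reported_speech_)  # "REPORTED"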

Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
class ReportedSpeech(Qualifier):
    """
    Implements a reported speech detection algorithm.

    The component looks for terms indicating patient statements,
    and quotations to detect patient speech.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    quotation : str
        String gathering all quotation cues.
    verbs : List[str]
        List of reported speech verbs.
    following : List[str]
        List of terms following a reported speech.
    preceding : List[str]
        List of terms preceding a reported speech.
    filter_matches : bool
        Whether to filter out overlapping matches.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM",
        or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        verbs=verbs,
        quotation=quotation,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        quotation: Optional[List[str]],
        verbs: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            quotation=quotation,
            verbs=verbs,
        )
        terms["verbs"] = self.load_verbs(terms["verbs"])

        quotation = terms.pop("quotation")

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.regex_matcher = RegexMatcher(attr=attr)
        self.regex_matcher.build_patterns(dict(quotation=quotation))

        self.within_ents = within_ents

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("reported_speech"):
            Token.set_extension("reported_speech", default=False)

        if not Token.has_extension("reported_speech_"):
            Token.set_extension(
                "reported_speech_",
                getter=lambda token: "REPORTED"
                if token._.reported_speech
                else "DIRECT",
            )

        if not Span.has_extension("reported_speech"):
            Span.set_extension("reported_speech", default=False)

        if not Span.has_extension("reported_speech_"):
            Span.set_extension(
                "reported_speech_",
                getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
            )

        if not Span.has_extension("reported_speech_cues"):
            Span.set_extension("reported_speech_cues", default=[])

        if not Doc.has_extension("rspeechs"):
            Doc.set_extension("rspeechs", default=[])

    def load_verbs(self, verbs: List[str]) -> List[str]:
        """
        Conjugate reporting verbs to specific tenses (third person)

        Parameters
        ----------
        verbs: list of reporting verbs to conjugate

        Returns
        -------
        list_rep_verbs: List of reporting verbs conjugated to specific tenses.
        """

        rep_verbs = get_verbs(verbs)

        rep_verbs = rep_verbs.loc[
            (
                (rep_verbs["mode"] == "Indicatif")
                & (rep_verbs["tense"] == "Présent")
                & (rep_verbs["person"].isin(["3s", "3p"]))
            )
            | (rep_verbs["tense"] == "Participe Présent")
            | (rep_verbs["tense"] == "Participe Passé")
        ]

        list_rep_verbs = list(rep_verbs["term"].unique())

        return list_rep_verbs

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to reported speech.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for reported speech
        """

        matches = self.get_matches(doc)
        matches += list(self.regex_matcher(doc, as_spans=True))

        boundaries = self._boundaries(doc)

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            sub_verbs = get_spans(sub_matches, "verbs")
            sub_quotation = get_spans(sub_matches, "quotation")

            if not sub_preceding + sub_following + sub_verbs + sub_quotation:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.reported_speech = (
                        any(m.end <= token.i for m in sub_preceding + sub_verbs)
                        or any(m.start > token.i for m in sub_following)
                        or any(
                            ((m.start < token.i) & (m.end > token.i + 1))
                            for m in sub_quotation
                        )
                    )
            for ent in ents:

                if self.within_ents:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                    cues += [m for m in sub_following if m.start >= ent.start]
                else:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                    cues += [m for m in sub_following if m.start >= ent.end]

                cues += [
                    m
                    for m in sub_quotation
                    if (m.start < ent.start) & (m.end > ent.end)
                ]

                reported_speech = ent._.reported_speech or bool(cues)
                ent._.reported_speech = reported_speech

                if self.explain:
                    ent._.reported_speech_cues += cues

                if not self.on_ents_only and reported_speech:
                    for token in ent:
                        token._.reported_speech = True
        return doc
defaults = dict(following=following, preceding=preceding, verbs=verbs, quotation=quotation) class-attribute
regex_matcher = RegexMatcher(attr=attr) instance-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
    )
    terms["verbs"] = self.load_verbs(terms["verbs"])

    quotation = terms.pop("quotation")

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.regex_matcher = RegexMatcher(attr=attr)
    self.regex_matcher.build_patterns(dict(quotation=quotation))

    self.within_ents = within_ents

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("reported_speech"):
        Token.set_extension("reported_speech", default=False)

    if not Token.has_extension("reported_speech_"):
        Token.set_extension(
            "reported_speech_",
            getter=lambda token: "REPORTED"
            if token._.reported_speech
            else "DIRECT",
        )

    if not Span.has_extension("reported_speech"):
        Span.set_extension("reported_speech", default=False)

    if not Span.has_extension("reported_speech_"):
        Span.set_extension(
            "reported_speech_",
            getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
        )

    if not Span.has_extension("reported_speech_cues"):
        Span.set_extension("reported_speech_cues", default=[])

    if not Doc.has_extension("rspeechs"):
        Doc.set_extension("rspeechs", default=[])
load_verbs(verbs)

Conjugate reporting verbs to specific tenses (third person)

PARAMETER DESCRIPTION
verbs

List of reporting verbs to conjugate.

TYPE: List[str]

RETURNS DESCRIPTION
list_rep_verbs

List of reporting verbs conjugated to specific tenses.
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def load_verbs(self, verbs: List[str]) -> List[str]:
    """
    Conjugate reporting verbs to specific tenses (third person)

    Parameters
    ----------
    verbs: list of reporting verbs to conjugate

    Returns
    -------
    list_rep_verbs: List of reporting verbs conjugated to specific tenses.
    """

    rep_verbs = get_verbs(verbs)

    rep_verbs = rep_verbs.loc[
        (
            (rep_verbs["mode"] == "Indicatif")
            & (rep_verbs["tense"] == "Présent")
            & (rep_verbs["person"].isin(["3s", "3p"]))
        )
        | (rep_verbs["tense"] == "Participe Présent")
        | (rep_verbs["tense"] == "Participe Passé")
    ]

    list_rep_verbs = list(rep_verbs["term"].unique())

    return list_rep_verbs
process(doc)

Finds entities related to reported speech.

PARAMETER DESCRIPTION
doc

spaCy Doc object.

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for reported speech.
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to reported speech.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for reported speech
    """

    matches = self.get_matches(doc)
    matches += list(self.regex_matcher(doc, as_spans=True))

    boundaries = self._boundaries(doc)

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        sub_verbs = get_spans(sub_matches, "verbs")
        sub_quotation = get_spans(sub_matches, "quotation")

        if not sub_preceding + sub_following + sub_verbs + sub_quotation:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.reported_speech = (
                    any(m.end <= token.i for m in sub_preceding + sub_verbs)
                    or any(m.start > token.i for m in sub_following)
                    or any(
                        ((m.start < token.i) & (m.end > token.i + 1))
                        for m in sub_quotation
                    )
                )
        for ent in ents:

            if self.within_ents:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                cues += [m for m in sub_following if m.start >= ent.start]
            else:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                cues += [m for m in sub_following if m.start >= ent.end]

            cues += [
                m
                for m in sub_quotation
                if (m.start < ent.start) & (m.end > ent.end)
            ]

            reported_speech = ent._.reported_speech or bool(cues)
            ent._.reported_speech = reported_speech

            if self.explain:
                ent._.reported_speech_cues += cues

            if not self.on_ents_only and reported_speech:
                for token in ent:
                    token._.reported_speech = True
    return doc
patterns
verbs: List[str] = ['affirmer', 'ajouter', 'assurer', 'confirmer', 'demander', 'dire', 'déclarer', 'décrire', 'décrire', 'démontrer', 'expliquer', 'faire remarquer', 'indiquer', 'informer', 'insinuer', 'insister', 'jurer', 'nier', 'nier', 'noter', 'objecter', 'observer', 'parler', 'promettre', 'préciser', 'prétendre', 'prévenir', 'raconter', 'rappeler', 'rapporter', 'reconnaître', 'réfuter', 'répliquer', 'répondre', 'répéter', 'révéler', 'se plaindre', 'souhaiter', 'souligner', 'supplier', 'verbaliser', 'vouloir', 'vouloir'] module-attribute
following: List[str] = ["d'après le patient", "d'après la patiente"] module-attribute
preceding: List[str] = ['pas de critique de', 'crainte de', 'menace de', 'insiste sur le fait que', "d'après le patient", "d'après la patiente", 'peur de'] module-attribute
quotation: str = '(\\".+\\")|(\\«.+\\»)' module-attribute
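The quotation pattern above simply matches a segment enclosed in straight or French quotes. A quick check with the standard re module, reusing the pattern verbatim:

import re

quotation = '(\\".+\\")|(\\«.+\\»)'  # pattern reproduced from above

text = "Le patient répète « je vais mieux » à plusieurs reprises."
match = re.search(quotation, text)
print(match.group(0))  # « je vais mieux »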
factory
DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, quotation=None, verbs=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/reported_speech/factory.py
@deprecated_factory("rspeech", "eds.reported_speech", default_config=DEFAULT_CONFIG)
@deprecated_factory(
    "reported_speech", "eds.reported_speech", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.reported_speech", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    return ReportedSpeech(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
history
patterns
history = ['antécédents', 'atcd', 'atcds', 'tacds', 'antécédent'] module-attribute
history
History

Bases: Qualifier

Implements a history detection algorithm.

The component looks for terms indicating history in the text.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

history

List of terms indicating medical history reference.

TYPE: Optional[List[str]]

termination

List of syntagme termination terms.

TYPE: Optional[List[str]]

use_sections

Whether to use section pipeline to detect medical history section.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]

explain

Whether to keep track of cues for each entity.

TYPE: bool
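A minimal usage sketch, with an illustrative matcher so that the document contains an entity to qualify:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(diabete=["diabète"])))
nlp.add_pipe("eds.history")

doc = nlp("Le patient a des antécédents de diabète.")
ent = doc.ents[0]

print(ent._.history)   # True
print(ent._.history_)  # "ATCD"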

Source code in edsnlp/pipelines/qualifiers/history/history.py
class History(Qualifier):
    """
    Implements a history detection algorithm.

    The component looks for terms indicating history in the text.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    history : Optional[List[str]]
        List of terms indicating medical history reference.
    termination : Optional[List[str]]
        List of syntagme termination terms.
    use_sections : bool
        Whether to use section pipeline to detect medical history section.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionary of regex patterns.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    defaults = dict(
        history=history,
        termination=termination,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        history: Optional[List[str]],
        termination: Optional[List[str]],
        use_sections: bool,
        explain: bool,
        on_ents_only: bool,
    ):

        terms = self.get_defaults(
            history=history,
            termination=termination,
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.set_extensions()

        self.sections = use_sections and (
            "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
        )
        if use_sections and not self.sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `section` pipeline, but it was not set. "
                "Skipping that step."
            )

    @staticmethod
    def set_extensions() -> None:

        if not Token.has_extension("history"):
            Token.set_extension("history", default=False)

        if not Token.has_extension("antecedents"):
            Token.set_extension(
                "antecedents",
                getter=deprecated_getter_factory("antecedents", "history"),
            )

        if not Token.has_extension("antecedent"):
            Token.set_extension(
                "antecedent",
                getter=deprecated_getter_factory("antecedent", "history"),
            )

        if not Token.has_extension("history_"):
            Token.set_extension(
                "history_",
                getter=lambda token: "ATCD" if token._.history else "CURRENT",
            )

        if not Token.has_extension("antecedents_"):
            Token.set_extension(
                "antecedents_",
                getter=deprecated_getter_factory("antecedents_", "history_"),
            )

        if not Token.has_extension("antecedent_"):
            Token.set_extension(
                "antecedent_",
                getter=deprecated_getter_factory("antecedent_", "history_"),
            )

        if not Span.has_extension("history"):
            Span.set_extension("history", default=False)

        if not Span.has_extension("antecedents"):
            Span.set_extension(
                "antecedents",
                getter=deprecated_getter_factory("antecedents", "history"),
            )

        if not Span.has_extension("antecedent"):
            Span.set_extension(
                "antecedent",
                getter=deprecated_getter_factory("antecedent", "history"),
            )

        if not Span.has_extension("history_"):
            Span.set_extension(
                "history_",
                getter=lambda span: "ATCD" if span._.history else "CURRENT",
            )

        if not Span.has_extension("antecedents_"):
            Span.set_extension(
                "antecedents_",
                getter=deprecated_getter_factory("antecedents_", "history_"),
            )

        if not Span.has_extension("antecedent_"):
            Span.set_extension(
                "antecedent_",
                getter=deprecated_getter_factory("antecedent_", "history_"),
            )

        if not Span.has_extension("history_cues"):
            Span.set_extension("history_cues", default=[])

        if not Span.has_extension("antecedents_cues"):
            Span.set_extension(
                "antecedents_cues",
                getter=deprecated_getter_factory("antecedents_cues", "history_cues"),
            )

        if not Span.has_extension("antecedent_cues"):
            Span.set_extension(
                "antecedent_cues",
                getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
            )

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to history.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for history
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        sections = []

        if self.sections:
            sections = [
                Span(doc, section.start, section.end, label="ATCD")
                for section in doc.spans["sections"]
                if section.label_ == "antécédents"
            ]

        for start, end in boundaries:
            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

            if self.on_ents_only and not ents:
                continue

            cues = get_spans(sub_matches, "history")
            cues += sub_sections

            history = bool(cues)

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.history = history

            for ent in ents:
                ent._.history = ent._.history or history

                if self.explain:
                    ent._.history_cues += cues

                if not self.on_ents_only and ent._.history:
                    for token in ent:
                        token._.history = True

        return doc
defaults = dict(history=history, termination=termination) class-attribute
sections = use_sections and ('eds.sections' in nlp.pipe_names or 'sections' in nlp.pipe_names) instance-attribute
__init__(nlp, attr, history, termination, use_sections, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/history/history.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    history: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    explain: bool,
    on_ents_only: bool,
):

    terms = self.get_defaults(
        history=history,
        termination=termination,
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.set_extensions()

    self.sections = use_sections and (
        "eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
    )
    if use_sections and not self.sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `section` pipeline, but it was not set. "
            "Skipping that step."
        )
set_extensions()
Source code in edsnlp/pipelines/qualifiers/history/history.py
@staticmethod
def set_extensions() -> None:

    if not Token.has_extension("history"):
        Token.set_extension("history", default=False)

    if not Token.has_extension("antecedents"):
        Token.set_extension(
            "antecedents",
            getter=deprecated_getter_factory("antecedents", "history"),
        )

    if not Token.has_extension("antecedent"):
        Token.set_extension(
            "antecedent",
            getter=deprecated_getter_factory("antecedent", "history"),
        )

    if not Token.has_extension("history_"):
        Token.set_extension(
            "history_",
            getter=lambda token: "ATCD" if token._.history else "CURRENT",
        )

    if not Token.has_extension("antecedents_"):
        Token.set_extension(
            "antecedents_",
            getter=deprecated_getter_factory("antecedents_", "history_"),
        )

    if not Token.has_extension("antecedent_"):
        Token.set_extension(
            "antecedent_",
            getter=deprecated_getter_factory("antecedent_", "history_"),
        )

    if not Span.has_extension("history"):
        Span.set_extension("history", default=False)

    if not Span.has_extension("antecedents"):
        Span.set_extension(
            "antecedents",
            getter=deprecated_getter_factory("antecedents", "history"),
        )

    if not Span.has_extension("antecedent"):
        Span.set_extension(
            "antecedent",
            getter=deprecated_getter_factory("antecedent", "history"),
        )

    if not Span.has_extension("history_"):
        Span.set_extension(
            "history_",
            getter=lambda span: "ATCD" if span._.history else "CURRENT",
        )

    if not Span.has_extension("antecedents_"):
        Span.set_extension(
            "antecedents_",
            getter=deprecated_getter_factory("antecedents_", "history_"),
        )

    if not Span.has_extension("antecedent_"):
        Span.set_extension(
            "antecedent_",
            getter=deprecated_getter_factory("antecedent_", "history_"),
        )

    if not Span.has_extension("history_cues"):
        Span.set_extension("history_cues", default=[])

    if not Span.has_extension("antecedents_cues"):
        Span.set_extension(
            "antecedents_cues",
            getter=deprecated_getter_factory("antecedents_cues", "history_cues"),
        )

    if not Span.has_extension("antecedent_cues"):
        Span.set_extension(
            "antecedent_cues",
            getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
        )
process(doc)

Finds entities related to history.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for history

Source code in edsnlp/pipelines/qualifiers/history/history.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to history.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for history
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    sections = []

    if self.sections:
        sections = [
            Span(doc, section.start, section.end, label="ATCD")
            for section in doc.spans["sections"]
            if section.label_ == "antécédents"
        ]

    for start, end in boundaries:
        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        sub_sections, sections = consume_spans(sections, lambda s: doc[start] in s)

        if self.on_ents_only and not ents:
            continue

        cues = get_spans(sub_matches, "history")
        cues += sub_sections

        history = bool(cues)

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.history = history

        for ent in ents:
            ent._.history = ent._.history or history

            if self.explain:
                ent._.history_cues += cues

            if not self.on_ents_only and ent._.history:
                for token in ent:
                    token._.history = True

    return doc
factory
DEFAULT_CONFIG = dict(attr='NORM', history=patterns.history, termination=termination, use_sections=False, explain=False, on_ents_only=True) module-attribute
create_component(nlp, name, history, termination, use_sections, attr, explain, on_ents_only)
Source code in edsnlp/pipelines/qualifiers/history/factory.py
@deprecated_factory("antecedents", "eds.history", default_config=DEFAULT_CONFIG)
@deprecated_factory("eds.antecedents", "eds.history", default_config=DEFAULT_CONFIG)
@deprecated_factory("history", "eds.history", default_config=DEFAULT_CONFIG)
@Language.factory("eds.history", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    history: Optional[List[str]],
    termination: Optional[List[str]],
    use_sections: bool,
    attr: str,
    explain: bool,
    on_ents_only: bool,
):
    return History(
        nlp,
        attr=attr,
        history=history,
        termination=termination,
        use_sections=use_sections,
        explain=explain,
        on_ents_only=on_ents_only,
    )
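When use_sections=True, the eds.sections component must run earlier in the pipeline, otherwise the warning shown in __init__ is emitted and section information is ignored. A hedged ordering sketch, assuming the eds.normalizer and eds.sections components documented elsewhere in this reference:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")  # section detection matches on the normalised attribute
nlp.add_pipe("eds.sections")    # must come before eds.history
nlp.add_pipe("eds.matcher", config=dict(terms=dict(diabete=["diabète"])))
nlp.add_pipe("eds.history", config=dict(use_sections=True))

Entities found inside a section labelled « antécédents » are then flagged as history, in addition to the term-based cues.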
hypothesis
patterns
pseudo: List[str] = ['aucun doute', 'même si', 'pas de condition', 'pas de doute', 'sans aucun doute', 'sans condition', 'sans risque'] module-attribute
confirmation: List[str] = ['certain', 'certaine', 'certainement', 'certaines', 'certains', 'confirmer', 'évidemment', 'évident', 'évidente', 'montrer que', 'visiblement'] module-attribute
preceding: List[str] = ['à condition', 'à la condition que', 'à moins que', 'au cas où', 'conditionnellement', 'doute', 'en admettant que', 'en cas', 'en considérant que', 'en supposant que', 'éventuellement', 'faudrait', 'hypothèse', 'hypothèses', 'idée de', 'pas confirmer', 'pas sûr', 'pas sûre', 'peut correspondre', 'peut-être', 'peuvent correspondre', 'possible', 'possiblement', 'potentiel', 'potentielle', 'potentiellement', 'potentielles', 'potentiels', 'prédisposant à', 'probable', 'probablement', 'probables', "recherche d'", 'recherche de', 'recherche des', 'risque', 'sauf si', 'selon', 'si', "s'il", 'soit', 'sous condition', 'sous réserve', 'suspicion'] module-attribute
following: List[str] = ['?', 'envisagé', 'envisageable', 'envisageables', 'envisagées', 'envisagés', 'hypothétique', 'hypothétiquement', 'hypothétiques', 'pas certain', 'pas certaine', 'pas clair', 'pas claire', 'pas confirmé', 'pas confirmée', 'pas confirmées', 'pas confirmés', 'pas évident', 'pas évidente', 'pas sûr', 'pas sûre', 'possible', 'potentiel', 'potentielle', 'potentiels', 'probable', 'probables', ': \n', ':\n'] module-attribute
verbs_hyp: List[str] = ['douter', 'envisager', "s'apparenter", 'sembler', 'soupçonner', 'suggérer', 'suspecter'] module-attribute
verbs_eds: List[str] = ['abandonner', 'abolir', 'aborder', 'accepter', 'accidenter', 'accompagnemer', 'accompagner', 'acoller', 'acquérir', 'activer', 'actualiser', 'adapter', 'adhérer', 'adjuver', 'admettre', 'administrer', 'adopter', 'adresser', 'aggraver', 'agir', 'agréer', 'aider', 'aimer', 'alcooliser', 'alerter', 'alimenter', 'aller', 'allonger', 'alléger', 'alterner', 'altérer', 'amender', 'amener', 'améliorer', 'amyotrophier', 'améliorer', 'analyser', 'anesthésier', 'animer', 'annexer', 'annuler', 'anonymiser', 'anticiper', 'anticoaguler', 'apercevoir', 'aplatir', 'apparaître', 'appareiller', 'appeler', 'appliquer', 'apporter', 'apprendre', 'apprécier', 'appuyer', 'argumenter', 'arquer', 'arrêter', 'arriver', 'arrêter', 'articuler', 'aspirer', 'asseoir', 'assister', 'associer', 'assurer', 'assécher', 'attacher', 'atteindre', 'attendre', 'attribuer', 'augmenter', 'autonomiser', 'autoriser', 'avaler', 'avancer', 'avertir', 'avoir', 'avérer', 'aérer', 'baisser', 'ballonner', 'blesser', 'bloquer', 'boire', 'border', 'brancher', 'brûler', 'bénéficier', 'cadrer', 'calcifier', 'calculer', 'calmer', 'canaliser', 'capter', 'carencer', 'casser', 'centrer', 'cerner', 'certifier', 'changer', 'charger', 'chevaucher', 'choisir', 'chronomoduler', 'chuter', 'cicatriser', 'circoncire', 'circuler', 'classer', 'codéiner', 'coincer', 'colorer', 'combler', 'commander', 'commencer', 'communiquer', 'comparer', 'compliquer', 'compléter', 'comporter', 'comprendre', 'comprimer', 'concerner', 'conclure', 'condamner', 'conditionner', 'conduire', 'confiner', 'confirmer', 'confronter', 'congeler', 'conjoindre', 'conjuguer', 'connaître', 'connecter', 'conseiller', 'conserver', 'considérer', 'consommer', 'constater', 'constituer', 'consulter', 'contacter', 'contaminer', 'contenir', 'contentionner', 'continuer', 'contracter', 'contrarier', 'contribuer', 'contrôler', 'convaincre', 'convenir', 'convier', 'convoquer', 'copier', 'correspondre', 'corriger', 'corréler', 'coucher', 'coupler', 'couvrir', 'crapotter', 'creuser', 'croire', 'croiser', 'créer', 'crémer', 'crépiter', 'cumuler', 'curariser', 'céder', 'dater', 'demander', 'demeurer', 'destiner', 'devenir', 'devoir', 'diagnostiquer', 'dialyser', 'dicter', 'diffuser', 'différencier', 'différer', 'digérer', 'dilater', 'diluer', 'diminuer', 'diner', 'dire', 'diriger', 'discuter', 'disparaître', 'disposer', 'dissocier', 'disséminer', 'disséquer', 'distendre', 'distinguer', 'divorcer', 'documenter', 'donner', 'dorer', 'doser', 'doubler', 'durer', 'dyaliser', 'dyspner', 'débuter', 'décaler', 'déceler', 'décider', 'déclarer', 'déclencher', 'découvrir', 'décrire', 'décroître', 'décurariser', 'décéder', 'dédier', 'définir', 'dégrader', 'délivrer', 'dépasser', 'dépendre', 'déplacer', 'dépolir', 'déposer', 'dériver', 'dérouler', 'désappareiller', 'désigner', 'désinfecter', 'désorienter', 'détecter', 'déterminer', 'détruire', 'développer', 'dévouer', 'dîner', 'écraser', 'effacer', 'effectuer', 'effondrer', 'emboliser', 'emmener', 'empêcher', 'encadrer', 'encourager', 'endormir', 'endurer', 'enlever', 'enregistrer', 'entamer', 'entendre', 'entourer', 'entraîner', 'entreprendre', 'entrer', 'envahir', 'envisager', 'envoyer', 'espérer', 'essayer', 'estimer', 'être', 'examiner', 'excentrer', 'exciser', 'exclure', 'expirer', 'expliquer', 'explorer', 'exposer', 'exprimer', 'extérioriser', 'exécuter', 'faciliter', 'faire', 'fatiguer', 'favoriser', 'faxer', 'fermer', 'figurer', 'fixer', 'focaliser', 'foncer', 'former', 'fournir', 'fractionner', 'fragmenter', 'fuiter', 'fusionner', 
'garder', 'graver', 'guider', 'gérer', 'gêner', 'honorer', 'hopsitaliser', 'hospitaliser', 'hydrater', 'hyperartérialiser', 'hyperfixer', 'hypertrophier', 'hésiter', 'identifier', 'illustrer', 'immuniser', 'impacter', 'implanter', 'impliquer', 'importer', 'imposer', 'impregner', 'imprimer', 'inclure', 'indifferencier', 'indiquer', 'infecter', 'infertiliser', 'infiltrer', 'informer', 'inhaler', 'initier', 'injecter', 'inscrire', 'insister', 'installer', 'interdire', 'interpréter', 'interrompre', 'intervenir', 'intituler', 'introduire', 'intéragir', 'inverser', 'inviter', 'ioder', 'ioniser', 'irradier', 'itérativer', 'joindre', 'juger', 'justifier', 'laisser', 'laminer', 'lancer', 'latéraliser', 'laver', 'lever', 'lier', 'ligaturer', 'limiter', 'lire', 'localiser', 'loger', 'louper', 'luire', 'lutter', 'lyricer', 'lyser', 'maculer', 'macérer', 'maintenir', 'majorer', 'malaiser', 'manger', 'manifester', 'manipuler', 'manquer', 'marcher', 'marier', 'marmoner', 'marquer', 'masquer', 'masser', 'mater', 'mener', 'mesurer', 'meteoriser', 'mettre', 'mitiger', 'modifier', 'moduler', 'modérer', 'monter', 'montrer', 'motiver', 'moucheter', 'mouler', 'mourir', 'multiopéréer', 'munir', 'muter', 'médicaliser', 'météoriser', 'naître', 'normaliser', 'noter', 'nuire', 'numériser', 'nécessiter', 'négativer', 'objectiver', 'observer', 'obstruer', 'obtenir', 'occasionner', 'occuper', 'opposer', 'opérer', 'organiser', 'orienter', 'ouvrir', 'palper', 'parasiter', 'paraître', 'parcourir', 'parer', 'paresthésier', 'parfaire', 'partager', 'partir', 'parvenir', 'passer', 'penser', 'percevoir', 'perdre', 'perforer', 'permettre', 'persister', 'personnaliser', 'peser', 'pigmenter', 'piloter', 'placer', 'plaindre', 'planifier', 'plier', 'plonger', 'porter', 'poser', 'positionner', 'posséder', 'poursuivre', 'pousser', 'pouvoir', 'pratiquer', 'preciser', 'prendre', 'prescrire', 'prier', 'produire', 'programmer', 'prolonger', 'prononcer', 'proposer', 'prouver', 'provoquer', 'préciser', 'précéder', 'prédominer', 'préexister', 'préférer', 'prélever', 'préparer', 'présenter', 'préserver', 'prévenir', 'prévoir', 'puruler', 'pénétrer', 'radiofréquencer', 'ralentir', 'ramener', 'rappeler', 'rapporter', 'rapprocher', 'rassurer', 'rattacher', 'rattraper', 'realiser', 'recenser', 'recevoir', 'rechercher', 'recommander', 'reconnaître', 'reconsulter', 'recontacter', 'recontrôler', 'reconvoquer', 'recouvrir', 'recueillir', 'recuperer', 'redescendre', 'rediscuter', 'refaire', 'refouler', 'refuser', 'regarder', 'rehausser', 'relancer', 'relayer', 'relever', 'relire', 'relâcher', 'remanier', 'remarquer', 'remercier', 'remettre', 'remonter', 'remplacer', 'remplir', 'rencontrer', 'rendormir', 'rendre', 'renfermer', 'renforcer', 'renouveler', 'renseigner', 'rentrer', 'reparler', 'repasser', 'reporter', 'reprendre', 'represcrire', 'reproduire', 'reprogrammer', 'représenter', 'repérer', 'requérir', 'respecter', 'ressembler', 'ressentir', 'rester', 'restreindre', 'retarder', 'retenir', 'retirer', 'retrouver', 'revasculariser', 'revenir', 'reverticaliser', 'revoir', 'rompre', 'rouler', 'réadapter', 'réadmettre', 'réadresser', 'réaliser', 'récidiver', 'récupérer', 'rédiger', 'réduire', 'réessayer', 'réexpliquer', 'référer', 'régler', 'régresser', 'réhausser', 'réopérer', 'répartir', 'répondre', 'répéter', 'réserver', 'résorber', 'résoudre', 'réséquer', 'réveiller', 'révéler', 'réévaluer', 'rêver', 'sacrer', 'saisir', 'satisfaire', 'savoir', 'scanner', 'scolariser', 'sembler', 'sensibiliser', 'sentir', 'serrer', 'servir', 'sevrer', 'signaler', 
'signer', 'situer', 'siéger', 'soigner', 'sommeiller', 'sonder', 'sortir', 'souffler', 'souhaiter', 'soulager', 'soussigner', 'souvenir', 'spécialiser', 'stabiliser', 'statuer', 'stenter', 'stopper', 'stratifier', 'subir', 'substituer', 'sucrer', 'suggérer', 'suivre', 'supporter', 'supprimer', 'surajouter', 'surmonter', 'surveiller', 'survenir', 'suspecter', 'suspendre', 'suturer', 'synchroniser', 'systématiser', 'sécréter', 'sécuriser', 'sédater', 'séjourner', 'séparer', 'taire', 'taper', 'teinter', 'tendre', 'tenir', 'tenter', 'terminer', 'tester', 'thromboser', 'tirer', 'tiroir', 'tissulaire', 'titulariser', 'tolérer', 'tourner', 'tracer', 'trachéotomiser', 'traduire', 'traiter', 'transcrire', 'transférer', 'transmettre', 'transporter', 'trasnfixer', 'travailler', 'tronquer', 'trouver', 'téléphoner', 'ulcérer', 'uriner', 'utiliser', 'vacciner', 'valider', 'valoir', 'varier', 'vasculariser', 'venir', 'verifier', 'vieillir', 'viser', 'visualiser', 'vivre', 'voir', 'vouloir', 'vérifier', 'ébaucher', 'écarter', 'échographier', 'échoguider', 'échoir', 'échouer', 'éclairer', 'écraser', 'élargir', 'éliminer', 'émousser', 'épaissir', 'épargner', 'épuiser', 'épurer', 'équilibrer', 'établir', 'étager', 'étendre', 'étiqueter', 'étrangler', 'évaluer', 'éviter', 'évoluer', 'évoquer', 'être'] module-attribute
hypothesis
Hypothesis

Bases: Qualifier

Hypothesis detection with spaCy.

The component looks for five kinds of expressions in the text:

  • preceding hypothesis, i.e. cues that precede a hypothetical expression
  • following hypothesis, i.e. cues that follow a hypothetical expression
  • pseudo hypothesis: expressions that contain a hypothesis cue but are not hypotheses (e.g. "pas de doute"/"no doubt")
  • hypothetical verbs: verbs indicating hypothesis (e.g. "douter")
  • classic verbs conjugated to the conditional, thus indicating hypothesis
PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

pseudo

List of pseudo hypothesis cues.

TYPE: Optional[List[str]]

preceding

List of preceding hypothesis cues.

TYPE: Optional[List[str]]

following

List of following hypothesis cues.

TYPE: Optional[List[str]]

verbs_hyp

List of hypothetical verbs.

TYPE: Optional[List[str]]

verbs_eds

List of mainstream verbs.

TYPE: Optional[List[str]]

filter_matches

Whether to filter out overlapping matches.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'; a key can also be added for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

regex

A dictionary of regex patterns.

TYPE: Optional[Dict[str, Union[List[str], str]]]
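For reference, a minimal usage sketch. Only eds.hypothesis is documented on this page; the eds.sentences and eds.matcher factories and the terms config key are assumptions borrowed from the rest of the library:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe(
    "eds.matcher",
    config=dict(terms=dict(pneumopathie=["pneumopathie"])),
)
nlp.add_pipe("eds.hypothesis")

doc = nlp("Le patient pourrait présenter une pneumopathie.")

for ent in doc.ents:
    # "pourrait" is the conditional of "pouvoir", a verb from verbs_eds,
    # so the entity should be qualified as hypothetical
    print(ent.text, ent._.hypothesis, ent._.hypothesis_)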

Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
class Hypothesis(Qualifier):
    """
    Hypothesis detection with spaCy.

    The component looks for five kinds of expressions in the text :

    - preceding hypothesis, ie cues that precede a hypothetic expression
    - following hypothesis, ie cues that follow a hypothetic expression
    - pseudo hypothesis : contain a hypothesis cue, but are not hypothesis
      (eg "pas de doute"/"no doubt")
    - hypothetic verbs : verbs indicating hypothesis (eg "douter")
    - classic verbs conjugated to the conditional, thus indicating hypothesis

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    pseudo : Optional[List[str]]
        List of pseudo hypothesis cues.
    preceding : Optional[List[str]]
        List of preceding hypothesis cues
    following : Optional[List[str]]
        List of following hypothesis cues.
    verbs_hyp : Optional[List[str]]
        List of hypothetic verbs.
    verbs_eds : Optional[List[str]]
        List of mainstream verbs.
    filter_matches : bool
        Whether to filter out overlapping matches.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    regex : Optional[Dict[str, Union[List[str], str]]]
        A dictionnary of regex patterns.
    """

    defaults = dict(
        following=following,
        preceding=preceding,
        pseudo=pseudo,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        termination: Optional[List[str]],
        verbs_eds: Optional[List[str]],
        verbs_hyp: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            termination=termination,
            verbs_eds=verbs_eds,
            verbs_hyp=verbs_hyp,
        )
        terms["verbs"] = self.load_verbs(
            verbs_hyp=terms.pop("verbs_hyp"),
            verbs_eds=terms.pop("verbs_eds"),
        )

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.within_ents = within_ents
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Token.has_extension("hypothesis"):
            Token.set_extension("hypothesis", default=False)

        if not Token.has_extension("hypothesis_"):
            Token.set_extension(
                "hypothesis_",
                getter=lambda token: "HYP" if token._.hypothesis else "CERT",
            )

        if not Span.has_extension("hypothesis"):
            Span.set_extension("hypothesis", default=False)

        if not Span.has_extension("hypothesis_"):
            Span.set_extension(
                "hypothesis_",
                getter=lambda span: "HYP" if span._.hypothesis else "CERT",
            )

        if not Span.has_extension("hypothesis_cues"):
            Span.set_extension("hypothesis_cues", default=[])

        if not Doc.has_extension("hypothesis"):
            Doc.set_extension("hypothesis", default=[])

    def load_verbs(
        self,
        verbs_hyp: List[str],
        verbs_eds: List[str],
    ) -> List[str]:
        """
        Conjugate "classic" verbs to conditional, and add hypothesis
        verbs conjugated to all tenses.

        Parameters
        ----------
        verbs_hyp: List of verbs that specifically imply an hypothesis.
        verbs_eds: List of general verbs.

        Returns
        -------
        list of hypothesis verbs conjugated at all tenses and classic
        verbs conjugated to conditional.
        """

        classic_verbs = get_verbs(verbs_eds)
        classic_verbs = classic_verbs.loc[classic_verbs["mode"] == "Conditionnel"]
        list_classic_verbs = list(classic_verbs["term"].unique())

        hypo_verbs = get_verbs(verbs_hyp)
        list_hypo_verbs = list(hypo_verbs["term"].unique())

        return list_hypo_verbs + list_classic_verbs

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to hypothesis.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for hypothesis
        """

        matches = self.get_matches(doc)

        terminations = get_spans(matches, "termination")
        boundaries = self._boundaries(doc, terminations)

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        for start, end in boundaries:

            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            sub_verbs = get_spans(sub_matches, "verbs")

            if not sub_preceding + sub_following + sub_verbs:
                continue

            if not self.on_ents_only:
                for token in doc[start:end]:
                    token._.hypothesis = any(
                        m.end <= token.i for m in sub_preceding + sub_verbs
                    ) or any(m.start > token.i for m in sub_following)

            for ent in ents:

                if self.within_ents:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                    cues += [m for m in sub_following if m.start >= ent.start]
                else:
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                    cues += [m for m in sub_following if m.start >= ent.end]

                hypothesis = ent._.hypothesis or bool(cues)

                ent._.hypothesis = hypothesis

                if self.explain and hypothesis:
                    ent._.hypothesis_cues += cues

                if not self.on_ents_only and hypothesis:
                    for token in ent:
                        token._.hypothesis = True

        return doc
defaults = dict(following=following, preceding=preceding, pseudo=pseudo, termination=termination, verbs_eds=verbs_eds, verbs_hyp=verbs_hyp) class-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, termination, verbs_eds, verbs_hyp, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs_eds: Optional[List[str]],
    verbs_hyp: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):

    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
    )
    terms["verbs"] = self.load_verbs(
        verbs_hyp=terms.pop("verbs_hyp"),
        verbs_eds=terms.pop("verbs_eds"),
    )

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.within_ents = within_ents
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
@staticmethod
def set_extensions() -> None:
    if not Token.has_extension("hypothesis"):
        Token.set_extension("hypothesis", default=False)

    if not Token.has_extension("hypothesis_"):
        Token.set_extension(
            "hypothesis_",
            getter=lambda token: "HYP" if token._.hypothesis else "CERT",
        )

    if not Span.has_extension("hypothesis"):
        Span.set_extension("hypothesis", default=False)

    if not Span.has_extension("hypothesis_"):
        Span.set_extension(
            "hypothesis_",
            getter=lambda span: "HYP" if span._.hypothesis else "CERT",
        )

    if not Span.has_extension("hypothesis_cues"):
        Span.set_extension("hypothesis_cues", default=[])

    if not Doc.has_extension("hypothesis"):
        Doc.set_extension("hypothesis", default=[])
load_verbs(verbs_hyp, verbs_eds)

Conjugate "classic" verbs to the conditional, and add hypothesis verbs conjugated in all tenses.

PARAMETER DESCRIPTION
verbs_hyp

List of verbs that specifically imply a hypothesis.

TYPE: List[str]

verbs_eds

List of general verbs.

TYPE: List[str]

RETURNS DESCRIPTION
List of hypothesis verbs conjugated in all tenses and classic verbs conjugated to the conditional.
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def load_verbs(
    self,
    verbs_hyp: List[str],
    verbs_eds: List[str],
) -> List[str]:
    """
    Conjugate "classic" verbs to conditional, and add hypothesis
    verbs conjugated to all tenses.

    Parameters
    ----------
    verbs_hyp: List of verbs that specifically imply an hypothesis.
    verbs_eds: List of general verbs.

    Returns
    -------
    list of hypothesis verbs conjugated at all tenses and classic
    verbs conjugated to conditional.
    """

    classic_verbs = get_verbs(verbs_eds)
    classic_verbs = classic_verbs.loc[classic_verbs["mode"] == "Conditionnel"]
    list_classic_verbs = list(classic_verbs["term"].unique())

    hypo_verbs = get_verbs(verbs_hyp)
    list_hypo_verbs = list(hypo_verbs["term"].unique())

    return list_hypo_verbs + list_classic_verbs
process(doc)

Finds entities related to hypothesis.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for hypothesis
Source code in edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to hypothesis.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for hypothesis
    """

    matches = self.get_matches(doc)

    terminations = get_spans(matches, "termination")
    boundaries = self._boundaries(doc, terminations)

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    for start, end in boundaries:

        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        sub_verbs = get_spans(sub_matches, "verbs")

        if not sub_preceding + sub_following + sub_verbs:
            continue

        if not self.on_ents_only:
            for token in doc[start:end]:
                token._.hypothesis = any(
                    m.end <= token.i for m in sub_preceding + sub_verbs
                ) or any(m.start > token.i for m in sub_following)

        for ent in ents:

            if self.within_ents:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                cues += [m for m in sub_following if m.start >= ent.start]
            else:
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                cues += [m for m in sub_following if m.start >= ent.end]

            hypothesis = ent._.hypothesis or bool(cues)

            ent._.hypothesis = hypothesis

            if self.explain and hypothesis:
                ent._.hypothesis_cues += cues

            if not self.on_ents_only and hypothesis:
                for token in ent:
                    token._.hypothesis = True

    return doc
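With explain=True (see the factory defaults below), the spans of the triggering cues are kept on each qualified entity. A sketch under the same assumptions as the previous example:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.matcher", config=dict(terms=dict(pneumopathie=["pneumopathie"])))
nlp.add_pipe("eds.hypothesis", config=dict(explain=True))

doc = nlp("Une pneumopathie pourrait être en cause.")

for ent in doc.ents:
    if ent._.hypothesis:
        # The cues that led to the qualification are stored on the span
        print(ent.text, [cue.text for cue in ent._.hypothesis_cues])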
factory
DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, termination=None, verbs_hyp=None, verbs_eds=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute
create_component(nlp, name, attr, pseudo, preceding, following, termination, verbs_eds, verbs_hyp, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/hypothesis/factory.py
@deprecated_factory("hypothesis", "eds.hypothesis", default_config=DEFAULT_CONFIG)
@Language.factory("eds.hypothesis", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    termination: Optional[List[str]],
    verbs_eds: Optional[List[str]],
    verbs_hyp: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    return Hypothesis(
        nlp=nlp,
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        termination=termination,
        verbs_eds=verbs_eds,
        verbs_hyp=verbs_hyp,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )

ner

covid
patterns
covid = ['covid([-\\s]?19)?', 'sars[-\\s]?cov[-\\s]?2', 'corona[-\\s]?virus'] module-attribute
diseases = ['pneumopathies?', 'infections?'] module-attribute
pattern = '(' + make_pattern(diseases) + '\\s[àa]u?\\s)?' + make_pattern(covid) module-attribute
factory
DEFAULT_CONFIG = dict(attr='LOWER', ignore_excluded=False) module-attribute
create_component(nlp, name, attr, ignore_excluded)
Source code in edsnlp/pipelines/ner/covid/factory.py
@Language.factory("eds.covid", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: Union[str, Dict[str, str]],
    ignore_excluded: bool,
):

    return GenericMatcher(
        nlp,
        terms=None,
        regex=dict(covid=patterns.pattern),
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
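A minimal usage sketch for this component; the covid entity label is assumed to follow the regex key used above:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.covid")

doc = nlp("Patient hospitalisé pour une pneumopathie à SARS-CoV-2.")

for ent in doc.ents:
    print(ent.text, ent.label_)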
scores
base_score
Score

Bases: AdvancedRegex

Matcher component to extract a numeric score

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

score_name

The name of the extracted score

TYPE: str

regex

A list of regexes to identify the score

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('NORM')

TYPE: str

after_extract

Regex with capturing group to get the score value

TYPE: str

score_normalization

Function that takes the "raw" value extracted from the after_extract regex, and returns None if no score could be extracted, or the desired score value otherwise.

TYPE: Callable[[Union[str,None]], Any]

window

Number of tokens to include after the score's mention to find the score's value

TYPE: int

Source code in edsnlp/pipelines/ner/scores/base_score.py
class Score(AdvancedRegex):
    """
    Matcher component to extract a numeric score

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    score_name : str
        The name of the extracted score
    regex : List[str]
        A list of regexes to identify the score
    attr : str
        Wether to match on the text ('TEXT') or on the normalized text ('NORM')
    after_extract : str
        Regex with capturing group to get the score value
    score_normalization : Callable[[Union[str,None]], Any]
        Function that takes the "raw" value extracted from the `after_extract` regex,
        and should return
        - None if no score could be extracted
        - The desired score value else
    window : int
        Number of token to include after the score's mention to find the
        score's value
    """

    def __init__(
        self,
        nlp: Language,
        score_name: str,
        regex: List[str],
        attr: str,
        after_extract: str,
        score_normalization: Union[str, Callable[[Union[str, None]], Any]],
        window: int,
        verbose: int,
        ignore_excluded: bool,
    ):

        regex_config = {
            score_name: dict(regex=regex, attr=attr, after_extract=after_extract)
        }

        super().__init__(
            nlp=nlp,
            regex_config=regex_config,
            window=window,
            verbose=verbose,
            ignore_excluded=ignore_excluded,
            attr=attr,
        )

        self.score_name = score_name

        if isinstance(score_normalization, str):
            self.score_normalization = registry.get("misc", score_normalization)
        else:
            self.score_normalization = score_normalization

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Score, Score).set_extensions()
        if not Span.has_extension("score_name"):
            Span.set_extension("score_name", default=None)
        if not Span.has_extension("score_value"):
            Span.set_extension("score_value", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds spans to document.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        ents = super(Score, Score).process(self, doc)
        ents = self.score_filtering(ents)

        ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

        doc.ents = ents

        if "discarded" not in doc.spans:
            doc.spans["discarded"] = []
        doc.spans["discarded"].extend(discarded)

        return doc

    def score_filtering(self, ents: List[Span]) -> List[Span]:
        """
        Extracts, if available, the value of the score.
        Normalizes the score via the provided `self.score_normalization` method.

        Parameters
        ----------
        ents: List[Span]
            List of spaCy's spans extracted by the score matcher

        Returns
        -------
        ents: List[Span]
            List of spaCy's spans, with, if found, an added `score_value` extension
        """
        to_keep_ents = []
        for ent in ents:
            value = ent._.after_extract[0]
            normalized_value = self.score_normalization(value)
            if normalized_value is not None:
                ent._.score_name = self.score_name
                ent._.score_value = int(value)
                to_keep_ents.append(ent)

        return to_keep_ents
score_name = score_name instance-attribute
score_normalization = registry.get('misc', score_normalization) instance-attribute
__init__(nlp, score_name, regex, attr, after_extract, score_normalization, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/base_score.py
def __init__(
    self,
    nlp: Language,
    score_name: str,
    regex: List[str],
    attr: str,
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    window: int,
    verbose: int,
    ignore_excluded: bool,
):

    regex_config = {
        score_name: dict(regex=regex, attr=attr, after_extract=after_extract)
    }

    super().__init__(
        nlp=nlp,
        regex_config=regex_config,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
        attr=attr,
    )

    self.score_name = score_name

    if isinstance(score_normalization, str):
        self.score_normalization = registry.get("misc", score_normalization)
    else:
        self.score_normalization = score_normalization

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/ner/scores/base_score.py
@staticmethod
def set_extensions() -> None:
    super(Score, Score).set_extensions()
    if not Span.has_extension("score_name"):
        Span.set_extension("score_name", default=None)
    if not Span.has_extension("score_value"):
        Span.set_extension("score_value", default=None)
__call__(doc)

Adds spans to document.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/ner/scores/base_score.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds spans to document.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    ents = super(Score, Score).process(self, doc)
    ents = self.score_filtering(ents)

    ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)

    doc.ents = ents

    if "discarded" not in doc.spans:
        doc.spans["discarded"] = []
    doc.spans["discarded"].extend(discarded)

    return doc
score_filtering(ents)

Extracts, if available, the value of the score. Normalizes the score via the provided self.score_normalization method.

PARAMETER DESCRIPTION
ents

List of spaCy's spans extracted by the score matcher

TYPE: List[Span]

RETURNS DESCRIPTION
ents

List of spaCy's spans, with, if found, an added score_value extension

Source code in edsnlp/pipelines/ner/scores/base_score.py
def score_filtering(self, ents: List[Span]) -> List[Span]:
    """
    Extracts, if available, the value of the score.
    Normalizes the score via the provided `self.score_normalization` method.

    Parameters
    ----------
    ents: List[Span]
        List of spaCy's spans extracted by the score matcher

    Returns
    -------
    ents: List[Span]
        List of spaCy's spans, with, if found, an added `score_value` extension
    """
    to_keep_ents = []
    for ent in ents:
        value = ent._.after_extract[0]
        normalized_value = self.score_normalization(value)
        if normalized_value is not None:
            ent._.score_name = self.score_name
            ent._.score_value = int(value)
            to_keep_ents.append(ent)

    return to_keep_ents
factory
DEFAULT_CONFIG = dict(attr='NORM', window=7, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, score_name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/factory.py
@deprecated_factory("score", "eds.score", default_config=DEFAULT_CONFIG)
@Language.factory("eds.score", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    score_name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=score_name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
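As a sketch of how a custom score could be declared with this factory (the score name, regexes and normalization key below are purely illustrative, not part of the library):

import spacy
from typing import Union

# Register a normalization function under a custom key, mirroring the
# built-in scores documented below (the key name is illustrative)
@spacy.registry.misc("score_normalization.custom_example")
def score_normalization(extracted_score: Union[str, None]):
    if extracted_score is not None and extracted_score.isdigit():
        return int(extracted_score)

nlp = spacy.blank("fr")
nlp.add_pipe(
    "eds.score",
    config=dict(
        score_name="custom_example",
        regex=[r"\bexemple\b"],
        after_extract=r"exemple.*?[\n\W]*?(\d+)",
        score_normalization="score_normalization.custom_example",
    ),
)

doc = nlp("Score exemple : 3")
for ent in doc.ents:
    # score_name and the normalized value are stored as span extensions
    print(ent._.score_name, ent._.score_value)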
charlson
patterns
regex = ['charlson'] module-attribute
after_extract = 'charlson.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.charlson' module-attribute
score_normalization(extracted_score)

Charlson score normalization. If available, returns the integer value of the Charlson score.

Source code in edsnlp/pipelines/ner/scores/charlson/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    Charlson score normalization.
    If available, returns the integer value of the Charlson score.
    """
    score_range = list(range(0, 30))
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=7, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/charlson/factory.py
@deprecated_factory("charlson", "eds.charlson", default_config=DEFAULT_CONFIG)
@Language.factory("eds.charlson", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
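A short usage sketch of the preset Charlson component; the printed values depend on the default window and normalization shown above:

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.charlson")

doc = nlp("Charlson à l'admission : 5.")

for ent in doc.ents:
    # score_value holds the normalized integer, score_name the component name
    print(ent._.score_name, ent._.score_value)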
emergency
priority
patterns
regex = ['\\bpriorite\\b'] module-attribute
after_extract = 'priorite.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.priority' module-attribute
score_normalization(extracted_score)

Priority score normalization. If available, returns the integer value of the priority score.

Source code in edsnlp/pipelines/ner/scores/emergency/priority/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    Priority score normalization.
    If available, returns the integer value of the priority score.
    """
    score_range = list(range(0, 6))
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=7, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/emergency/priority/factory.py
@deprecated_factory(
    "emergency.priority", "eds.emergency.priority", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.emergency.priority", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
ccmu
patterns
regex = ['\\bccmu\\b'] module-attribute
after_extract = 'ccmu.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.ccmu' module-attribute
score_normalization(extracted_score)

CCMU score normalization. If available, returns the integer value of the CCMU score.

Source code in edsnlp/pipelines/ner/scores/emergency/ccmu/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    CCMU score normalization.
    If available, returns the integer value of the CCMU score.
    """
    score_range = [1, 2, 3, 4, 5]
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=20, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
@deprecated_factory(
    "emergency.ccmu", "eds.emergency.ccmu", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.emergency.ccmu", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
gemsa
patterns
regex = ['\\bgemsa\\b'] module-attribute
after_extract = 'gemsa.*?[\\n\\W]*?(\\d+)' module-attribute
score_normalization_str = 'score_normalization.gemsa' module-attribute
score_normalization(extracted_score)

GEMSA score normalization. If available, returns the integer value of the GEMSA score.

Source code in edsnlp/pipelines/ner/scores/emergency/gemsa/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    GEMSA score normalization.
    If available, returns the integer value of the GEMSA score.
    """
    score_range = [1, 2, 3, 4, 5, 6]
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, after_extract=patterns.after_extract, score_normalization=patterns.score_normalization_str, attr='NORM', window=20, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, after_extract, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
@deprecated_factory(
    "emergency.gemsa", "eds.emergency.gemsa", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.emergency.gemsa", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    after_extract: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Score(
        nlp,
        score_name=name,
        regex=regex,
        after_extract=after_extract,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
sofa
sofa
Sofa

Bases: Score

Matcher component to extract the SOFA score

PARAMETER DESCRIPTION
nlp

The spaCy object.

TYPE: Language

score_name

The name of the extracted score

TYPE: str

regex

A list of regexes to identify the SOFA score

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM')

TYPE: str

method_regex

Regex with capturing group to get the score extraction method (e.g. "à l'admission", "à 24H", "Maximum")

TYPE: str

value_regex

Regex to extract the score value

TYPE: str

score_normalization

Function that takes the "raw" value extracted from the after_extract regex, and returns None if no score could be extracted, or the desired score value otherwise.

TYPE: Callable[[Union[str,None]], Any]

window

Number of tokens to include after the score's mention to find the score's value

TYPE: int

Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
class Sofa(Score):
    """
    Matcher component to extract the SOFA score

    Parameters
    ----------
    nlp : Language
        The spaCy object.
    score_name : str
        The name of the extracted score
    regex : List[str]
        A list of regexes to identify the SOFA score
    attr : str
        Wether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM')
    method_regex : str
        Regex with capturing group to get the score extraction method
        (e.g. "à l'admission", "à 24H", "Maximum")
    value_regex : str
        Regex to extract the score value
    score_normalization : Callable[[Union[str,None]], Any]
        Function that takes the "raw" value extracted from the `after_extract` regex,
        and should return
        - None if no score could be extracted
        - The desired score value else
    window : int
        Number of token to include after the score's mention to find the
        score's value
    """

    def __init__(
        self,
        nlp: Language,
        score_name: str,
        regex: List[str],
        attr: str,
        method_regex: str,
        value_regex: str,
        score_normalization: Union[str, Callable[[Union[str, None]], Any]],
        window: int,
        verbose: int,
        ignore_excluded: bool,
    ):

        super().__init__(
            nlp,
            score_name=score_name,
            regex=regex,
            after_extract=[],
            score_normalization=score_normalization,
            attr=attr,
            window=window,
            verbose=verbose,
            ignore_excluded=ignore_excluded,
        )

        self.method_regex = method_regex
        self.value_regex = value_regex

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Sofa, Sofa).set_extensions()
        if not Span.has_extension("score_method"):
            Span.set_extension("score_method", default=None)

    def score_filtering(self, ents: List[Span]) -> List[Span]:
        """
        Extracts, if available, the value of the score.
        Normalizes the score via the provided `self.score_normalization` method.

        Parameters
        ----------
        ents: List[Span]
            List of spaCy's spans extracted by the score matcher

        Returns
        -------
        ents: List[Span]
            List of spaCy's spans, with, if found, an added `score_value` extension
        """

        to_keep_ents = []

        for ent in ents:
            after_snippet = get_text(
                ent._.after_snippet,
                attr=self.attr,
                ignore_excluded=self.ignore_excluded,
            )
            matches = re.search(self.method_regex, after_snippet)

            if matches is None:
                method = "Non précisée"
                value = after_snippet

            else:
                groups = matches.groupdict()
                value = groups["after_value"]
                if groups["max"] is not None:
                    method = "Maximum"
                elif groups["vqheures"] is not None:
                    method = "24H"
                elif groups["admission"] is not None:
                    method = "A l'admission"

            digit_value = re.match(
                self.value_regex, value
            )  # Use match instead of search to only look at the beginning
            digit_value = None if digit_value is None else digit_value.groups()[0]

            normalized_value = self.score_normalization(digit_value)
            if normalized_value is not None:
                ent._.score_name = self.score_name
                ent._.score_value = int(normalized_value)
                ent._.score_method = method
                to_keep_ents.append(ent)

        return to_keep_ents
method_regex = method_regex instance-attribute
value_regex = value_regex instance-attribute
__init__(nlp, score_name, regex, attr, method_regex, value_regex, score_normalization, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
def __init__(
    self,
    nlp: Language,
    score_name: str,
    regex: List[str],
    attr: str,
    method_regex: str,
    value_regex: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    window: int,
    verbose: int,
    ignore_excluded: bool,
):

    super().__init__(
        nlp,
        score_name=score_name,
        regex=regex,
        after_extract=[],
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )

    self.method_regex = method_regex
    self.value_regex = value_regex

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
@staticmethod
def set_extensions() -> None:
    super(Sofa, Sofa).set_extensions()
    if not Span.has_extension("score_method"):
        Span.set_extension("score_method", default=None)
score_filtering(ents)

Extracts, if available, the value of the score. Normalizes the score via the provided self.score_normalization method.

PARAMETER DESCRIPTION
ents

List of spaCy's spans extracted by the score matcher

TYPE: List[Span]

RETURNS DESCRIPTION
ents

List of spaCy's spans, with, if found, an added score_value extension

Source code in edsnlp/pipelines/ner/scores/sofa/sofa.py
def score_filtering(self, ents: List[Span]) -> List[Span]:
    """
    Extracts, if available, the value of the score.
    Normalizes the score via the provided `self.score_normalization` method.

    Parameters
    ----------
    ents: List[Span]
        List of spaCy's spans extracted by the score matcher

    Returns
    -------
    ents: List[Span]
        List of spaCy's spans, with, if found, an added `score_value` extension
    """

    to_keep_ents = []

    for ent in ents:
        after_snippet = get_text(
            ent._.after_snippet,
            attr=self.attr,
            ignore_excluded=self.ignore_excluded,
        )
        matches = re.search(self.method_regex, after_snippet)

        if matches is None:
            method = "Non précisée"
            value = after_snippet

        else:
            groups = matches.groupdict()
            value = groups["after_value"]
            if groups["max"] is not None:
                method = "Maximum"
            elif groups["vqheures"] is not None:
                method = "24H"
            elif groups["admission"] is not None:
                method = "A l'admission"

        digit_value = re.match(
            self.value_regex, value
        )  # Use match instead of search to only look at the beginning
        digit_value = None if digit_value is None else digit_value.groups()[0]

        normalized_value = self.score_normalization(digit_value)
        if normalized_value is not None:
            ent._.score_name = self.score_name
            ent._.score_value = int(normalized_value)
            ent._.score_method = method
            to_keep_ents.append(ent)

    return to_keep_ents
patterns
regex = ['\\bsofa\\b'] module-attribute
method_regex = 'sofa.*?((?P<max>max\\w*)|(?P<vqheures>24h\\w*)|(?P<admission>admission\\w*))(?P<after_value>(.|\\n)*)' module-attribute
value_regex = '.*?.[\\n\\W]*?(\\d+)[^h\\d]' module-attribute
score_normalization_str = 'score_normalization.sofa' module-attribute
score_normalization(extracted_score)

Sofa score normalization. If available, returns the integer value of the SOFA score.

Source code in edsnlp/pipelines/ner/scores/sofa/patterns.py
@spacy.registry.misc(score_normalization_str)
def score_normalization(extracted_score: Union[str, None]):
    """
    Sofa score normalization.
    If available, returns the integer value of the SOFA score.
    """
    score_range = list(range(0, 30))
    if (extracted_score is not None) and (int(extracted_score) in score_range):
        return int(extracted_score)
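To illustrate how method_regex and value_regex interact, a standalone re demo on an already-normalized, illustrative snippet:

import re

# Patterns copied from the module attributes above
method_regex = (
    r"sofa.*?((?P<max>max\w*)|(?P<vqheures>24h\w*)"
    r"|(?P<admission>admission\w*))(?P<after_value>(.|\n)*)"
)
value_regex = r".*?.[\n\W]*?(\d+)[^h\d]"

snippet = "sofa maximum a 8 ce jour"

match = re.search(method_regex, snippet)
groups = match.groupdict()
# groups["max"] == "maximum", so the extraction method resolves to "Maximum"
value = groups["after_value"]  # " a 8 ce jour"

# re.match anchors at the beginning of the remaining text
digit = re.match(value_regex, value).group(1)
print(digit)  # "8", which score_normalization then accepts as the integer 8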
factory
DEFAULT_CONFIG = dict(regex=patterns.regex, method_regex=patterns.method_regex, value_regex=patterns.value_regex, score_normalization=patterns.score_normalization_str, attr='NORM', window=20, verbose=0, ignore_excluded=False) module-attribute
create_component(nlp, name, regex, method_regex, value_regex, score_normalization, attr, window, verbose, ignore_excluded)
Source code in edsnlp/pipelines/ner/scores/sofa/factory.py
@deprecated_factory("SOFA", "eds.SOFA", default_config=DEFAULT_CONFIG)
@Language.factory("eds.SOFA", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    regex: List[str],
    method_regex: str,
    value_regex: str,
    score_normalization: Union[str, Callable[[Union[str, None]], Any]],
    attr: str,
    window: int,
    verbose: int,
    ignore_excluded: bool,
):
    return Sofa(
        nlp,
        score_name=name,
        regex=regex,
        method_regex=method_regex,
        value_regex=value_regex,
        score_normalization=score_normalization,
        attr=attr,
        window=window,
        verbose=verbose,
        ignore_excluded=ignore_excluded,
    )
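A short usage sketch for the eds.SOFA factory, reading the additional score_method extension (the example text is illustrative; the resolved method depends on the snippet captured around the mention):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.SOFA")

doc = nlp("SOFA maximum : 8 ce jour.")

for ent in doc.ents:
    # score_method is "Maximum", "24H", "A l'admission" or "Non précisée"
    print(ent._.score_value, ent._.score_method)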

misc

dates
dates
parsers = [parser for parser in default_parsers if parser != 'relative-time'] module-attribute
parser1 = DateDataParser(languages=['fr'], settings={'PREFER_DAY_OF_MONTH': 'first', 'PREFER_DATES_FROM': 'past', 'PARSERS': parsers, 'RETURN_AS_TIMEZONE_AWARE': False}) module-attribute
parser2 = DateDataParser(languages=['fr'], settings={'PREFER_DAY_OF_MONTH': 'first', 'PREFER_DATES_FROM': 'past', 'PARSERS': ['relative-time'], 'RETURN_AS_TIMEZONE_AWARE': False}) module-attribute
Dates

Bases: BaseComponent

Tags and normalizes dates, using the open-source dateparser library.

The pipeline uses spaCy's filter_spans function. It filters out false positives, and introduces a hierarchy between patterns. For instance, in case of ambiguity, the pipeline will decide that a date is a date without a year rather than a date without a day.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: spacy.language.Language

absolute

List of regular expressions for absolute dates.

TYPE: Union[List[str], str]

full

List of regular expressions for full dates in YYYY-MM-DD format.

TYPE: Union[List[str], str]

relative

List of regular expressions for relative dates (e.g. hier, la semaine prochaine).

TYPE: Union[List[str], str]

no_year

List of regular expressions for dates that do not display a year.

TYPE: Union[List[str], str]

no_day

List of regular expressions for dates that do not display a day.

TYPE: Union[List[str], str]

year_only

List of regular expressions for dates that only display a year.

TYPE: Union[List[str], str]

current

List of regular expressions for dates that relate to the current month, week, year, etc.

TYPE: Union[List[str], str]

false_positive

List of regular expressions for false positives (e.g. phone numbers).

TYPE: Union[List[str], str]

on_ents_only

Whether to look for dates in the whole document or only in specific sentences:

  • If True: only look in the sentences of each entity in doc.ents
  • If False: look in the whole document
  • If given a string key or a list of strings: only look in the sentences of each entity in doc.spans[key]

TYPE: Union[bool, str, List[str]]
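A usage sketch, under two assumptions that are not documented on this page: the factory is registered as eds.dates (by analogy with the other components in this reference), and matched spans are stored in doc.spans["dates"]:

import spacy

nlp = spacy.blank("fr")
# Factory name assumed by analogy with the other components
nlp.add_pipe("eds.dates")

doc = nlp("Consultation du 12 janvier 2021, à revoir dans trois semaines.")

# The span group name is an assumption; parsed_date and parsed_delta are the
# extensions registered by set_extensions below
for date in doc.spans.get("dates", []):
    print(date.text, date._.parsed_date)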

Source code in edsnlp/pipelines/misc/dates/dates.py
class Dates(BaseComponent):
    """
    Tags and normalizes dates, using the open-source `dateparser` library.

    The pipeline uses spaCy's `filter_spans` function.
    It filters out false positives, and introduce a hierarchy between patterns.
    For instance, in case of ambiguity, the pipeline will decide that a date is a
    date without a year rather than a date without a day.

    Parameters
    ----------
    nlp : spacy.language.Language
        Language pipeline object
    absolute : Union[List[str], str]
        List of regular expressions for absolute dates.
    full : Union[List[str], str]
        List of regular expressions for full dates in YYYY-MM-DD format.
    relative : Union[List[str], str]
        List of regular expressions for relative dates
        (eg `hier`, `la semaine prochaine`).
    no_year : Union[List[str], str]
        List of regular expressions for dates that do not display a year.
    no_day : Union[List[str], str]
        List of regular expressions for dates that do not display a day.
    year_only : Union[List[str], str]
        List of regular expressions for dates that only display a year.
    current : Union[List[str], str]
        List of regular expressions for dates that relate to
        the current month, week, year, etc.
    false_positive : Union[List[str], str]
        List of regular expressions for false positive (eg phone numbers, etc).
    on_ents_only : Union[bool, str, List[str]]
        Wether to look on dates in the whole document or in specific sentences:

        - If `True`: Only look in the sentences of each entity in doc.ents
        - If False: Look in the whole document
        - If given a string `key` or list of string: Only look in the sentences of
          each entity in `#!python doc.spans[key]`
    """

    # noinspection PyProtectedMember
    def __init__(
        self,
        nlp: Language,
        absolute: Optional[List[str]],
        full: Optional[List[str]],
        relative: Optional[List[str]],
        no_year: Optional[List[str]],
        no_day: Optional[List[str]],
        year_only: Optional[List[str]],
        current: Optional[List[str]],
        false_positive: Optional[List[str]],
        on_ents_only: bool,
        attr: str,
    ):

        self.nlp = nlp

        if no_year is None:
            no_year = patterns.no_year_pattern
        if year_only is None:
            year_only = patterns.full_year_pattern
        if no_day is None:
            no_day = patterns.no_day_pattern
        if absolute is None:
            absolute = patterns.absolute_date_pattern
        if relative is None:
            relative = patterns.relative_date_pattern
        if full is None:
            full = patterns.full_date_pattern
        if current is None:
            current = patterns.current_pattern
        if false_positive is None:
            false_positive = patterns.false_positive_pattern

        if isinstance(absolute, str):
            absolute = [absolute]
        if isinstance(relative, str):
            relative = [relative]
        if isinstance(no_year, str):
            no_year = [no_year]
        if isinstance(no_day, str):
            no_day = [no_day]
        if isinstance(year_only, str):
            year_only = [year_only]
        if isinstance(full, str):
            full = [full]
        if isinstance(current, str):
            current = [current]
        if isinstance(false_positive, str):
            false_positive = [false_positive]

        self.on_ents_only = on_ents_only
        self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

        self.regex_matcher.add("false_positive", false_positive)
        self.regex_matcher.add("full_date", full)
        self.regex_matcher.add("absolute", absolute)
        self.regex_matcher.add("relative", relative)
        self.regex_matcher.add("no_year", no_year)
        self.regex_matcher.add("no_day", no_day)
        self.regex_matcher.add("year_only", year_only)
        self.regex_matcher.add("current", current)

        self.parser = date_parser
        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Doc.has_extension("note_datetime"):
            Doc.set_extension("note_datetime", default=None)

        if not Span.has_extension("parsed_date"):
            Span.set_extension("parsed_date", default=None)

        if not Span.has_extension("parsed_delta"):
            Span.set_extension("parsed_delta", default=None)

        if not Span.has_extension("date"):
            Span.set_extension("date", getter=date_getter)

    def process(self, doc: Doc) -> List[Span]:
        """
        Find dates in doc.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        dates:
            list of date spans
        """

        if self.on_ents_only:

            if type(self.on_ents_only) == bool:
                ents = doc.ents
            else:
                if type(self.on_ents_only) == str:
                    self.on_ents_only = [self.on_ents_only]
                ents = []
                for key in self.on_ents_only:
                    ents.extend(list(doc.spans[key]))

            dates = []
            for sent in set([ent.sent for ent in ents]):
                dates = chain(
                    dates,
                    self.regex_matcher(
                        sent,
                        as_spans=True,
                        # return_groupdict=True,
                    ),
                )

        else:
            dates = self.regex_matcher(
                doc,
                as_spans=True,
                # return_groupdict=True,
            )

        # dates = apply_groupdict(dates)

        dates = filter_spans(dates)
        dates = [date for date in dates if date.label_ != "false_positive"]

        return dates

    def get_date(self, date: Span) -> Optional[datetime]:
        """
        Get normalised date using `dateparser`.

        Parameters
        ----------
        date : Span
            Date span.

        Returns
        -------
        Optional[datetime]
            If a date is recognised, returns a Python `datetime` object.
            Returns `None` otherwise.
        """

        text_date = date.text

        if date.label_ == "no_day":
            text_date = "01/" + re.sub(r"[\.\/\s]", "/", text_date)

        elif date.label_ == "full_date":
            text_date = re.sub(r"[\.\/\s]", "-", text_date)

            try:
                return datetime.strptime(text_date, "%Y-%m-%d")
            except ValueError:
                try:
                    return datetime.strptime(text_date, "%Y-%d-%m")
                except ValueError:
                    return None

        # text_date = re.sub(r"\.", "-", text_date)

        return self.parser(text_date)

    def __call__(self, doc: Doc) -> Doc:
        """
        Tags dates.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for dates
        """
        dates = self.process(doc)

        for date in dates:
            d = self.get_date(date)

            if d is None:
                date._.parsed_date = None
            else:
                date._.parsed_date = d
                date._.parsed_delta = d - datetime.now() + timedelta(seconds=10)

        doc.spans["dates"] = dates

        return doc
nlp = nlp instance-attribute
on_ents_only = on_ents_only instance-attribute
regex_matcher = RegexMatcher(attr=attr, alignment_mode='strict') instance-attribute
parser = date_parser instance-attribute
__init__(nlp, absolute, full, relative, no_year, no_day, year_only, current, false_positive, on_ents_only, attr)
Source code in edsnlp/pipelines/misc/dates/dates.py
def __init__(
    self,
    nlp: Language,
    absolute: Optional[List[str]],
    full: Optional[List[str]],
    relative: Optional[List[str]],
    no_year: Optional[List[str]],
    no_day: Optional[List[str]],
    year_only: Optional[List[str]],
    current: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: bool,
    attr: str,
):

    self.nlp = nlp

    if no_year is None:
        no_year = patterns.no_year_pattern
    if year_only is None:
        year_only = patterns.full_year_pattern
    if no_day is None:
        no_day = patterns.no_day_pattern
    if absolute is None:
        absolute = patterns.absolute_date_pattern
    if relative is None:
        relative = patterns.relative_date_pattern
    if full is None:
        full = patterns.full_date_pattern
    if current is None:
        current = patterns.current_pattern
    if false_positive is None:
        false_positive = patterns.false_positive_pattern

    if isinstance(absolute, str):
        absolute = [absolute]
    if isinstance(relative, str):
        relative = [relative]
    if isinstance(no_year, str):
        no_year = [no_year]
    if isinstance(no_day, str):
        no_day = [no_day]
    if isinstance(year_only, str):
        year_only = [year_only]
    if isinstance(full, str):
        full = [full]
    if isinstance(current, str):
        current = [current]
    if isinstance(false_positive, str):
        false_positive = [false_positive]

    self.on_ents_only = on_ents_only
    self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")

    self.regex_matcher.add("false_positive", false_positive)
    self.regex_matcher.add("full_date", full)
    self.regex_matcher.add("absolute", absolute)
    self.regex_matcher.add("relative", relative)
    self.regex_matcher.add("no_year", no_year)
    self.regex_matcher.add("no_day", no_day)
    self.regex_matcher.add("year_only", year_only)
    self.regex_matcher.add("current", current)

    self.parser = date_parser
    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/dates/dates.py
@staticmethod
def set_extensions() -> None:

    if not Doc.has_extension("note_datetime"):
        Doc.set_extension("note_datetime", default=None)

    if not Span.has_extension("parsed_date"):
        Span.set_extension("parsed_date", default=None)

    if not Span.has_extension("parsed_delta"):
        Span.set_extension("parsed_delta", default=None)

    if not Span.has_extension("date"):
        Span.set_extension("date", getter=date_getter)
process(doc)

Find dates in doc.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
dates

list of date spans

Source code in edsnlp/pipelines/misc/dates/dates.py
def process(self, doc: Doc) -> List[Span]:
    """
    Find dates in doc.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    dates:
        list of date spans
    """

    if self.on_ents_only:

        if type(self.on_ents_only) == bool:
            ents = doc.ents
        else:
            if type(self.on_ents_only) == str:
                self.on_ents_only = [self.on_ents_only]
            ents = []
            for key in self.on_ents_only:
                ents.extend(list(doc.spans[key]))

        dates = []
        for sent in set([ent.sent for ent in ents]):
            dates = chain(
                dates,
                self.regex_matcher(
                    sent,
                    as_spans=True,
                    # return_groupdict=True,
                ),
            )

    else:
        dates = self.regex_matcher(
            doc,
            as_spans=True,
            # return_groupdict=True,
        )

    # dates = apply_groupdict(dates)

    dates = filter_spans(dates)
    dates = [date for date in dates if date.label_ != "false_positive"]

    return dates
get_date(date)

Get normalised date using dateparser.

PARAMETER DESCRIPTION
date

Date span.

TYPE: Span

RETURNS DESCRIPTION
Optional[datetime]

If a date is recognised, returns a Python datetime object. Returns None otherwise.

Source code in edsnlp/pipelines/misc/dates/dates.py
def get_date(self, date: Span) -> Optional[datetime]:
    """
    Get normalised date using `dateparser`.

    Parameters
    ----------
    date : Span
        Date span.

    Returns
    -------
    Optional[datetime]
        If a date is recognised, returns a Python `datetime` object.
        Returns `None` otherwise.
    """

    text_date = date.text

    if date.label_ == "no_day":
        text_date = "01/" + re.sub(r"[\.\/\s]", "/", text_date)

    elif date.label_ == "full_date":
        text_date = re.sub(r"[\.\/\s]", "-", text_date)

        try:
            return datetime.strptime(text_date, "%Y-%m-%d")
        except ValueError:
            try:
                return datetime.strptime(text_date, "%Y-%d-%m")
            except ValueError:
                return None

    # text_date = re.sub(r"\.", "-", text_date)

    return self.parser(text_date)
__call__(doc)

Tags dates.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for dates

Source code in edsnlp/pipelines/misc/dates/dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Tags dates.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for dates
    """
    dates = self.process(doc)

    for date in dates:
        d = self.get_date(date)

        if d is None:
            date._.parsed_date = None
        else:
            date._.parsed_date = d
            date._.parsed_delta = d - datetime.now() + timedelta(seconds=10)

    doc.spans["dates"] = dates

    return doc
td2str(td)

Transforms a timedelta object to a string representation.

PARAMETER DESCRIPTION
td

The timedelta object to represent.

TYPE: timedelta

RETURNS DESCRIPTION
str

Usable representation for the timedelta object.

Source code in edsnlp/pipelines/misc/dates/dates.py
def td2str(td: timedelta):
    """
    Transforms a timedelta object to a string representation.

    Parameters
    ----------
    td : timedelta
        The timedelta object to represent.

    Returns
    -------
    str
        Usable representation for the timedelta object.
    """
    seconds = td.total_seconds()
    days = int(seconds / 3600 / 24)
    return f"TD{days:+d}"
date_getter(date)

Getter for dates. Uses the information from note_datetime.

PARAMETER DESCRIPTION
date

Date detected by the pipeline.

TYPE: Span

RETURNS DESCRIPTION
str

Normalized date.

Source code in edsnlp/pipelines/misc/dates/dates.py
def date_getter(date: Span) -> str:
    """
    Getter for dates. Uses the information from `note_datetime`.

    Parameters
    ----------
    date : Span
        Date detected by the pipeline.

    Returns
    -------
    str
        Normalized date.
    """

    d = date._.parsed_date

    if d is None:
        # dateparser could not interpret the date.
        return "????-??-??"

    delta = date._.parsed_delta
    note_datetime = date.doc._.note_datetime

    if date.label_ in {"absolute", "full_date", "no_day"}:
        normalized = d.strftime("%Y-%m-%d")
    elif date.label_ == "no_year":
        if note_datetime:
            year = note_datetime.strftime("%Y")
        else:
            year = "????"
        normalized = d.strftime(f"{year}-%m-%d")
    else:
        if note_datetime:
            # We need to adjust the timedelta, since most dates are set at 00h00.
            # The slightest difference leads to a day difference.
            d = note_datetime + delta
            normalized = d.strftime("%Y-%m-%d")
        else:
            normalized = td2str(d - datetime.now())

    return normalized
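
To illustrate the branches above with the doc from the earlier sketch (a hedged example; exact outputs depend on the parsed spans): absolute dates are formatted directly, no_year dates borrow the year from note_datetime, and relative dates are resolved against note_datetime when it is set, or rendered as a TD±n offset otherwise.

from datetime import datetime

doc._.note_datetime = datetime(2021, 3, 15)  # hypothetical document date

for span in doc.spans["dates"]:
    print(span.text, span._.date)
# "12/03/2021"      -> "2021-03-12"
# "dans 3 semaines" -> roughly "2021-04-05" (note_datetime + parsed_delta)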
date_parser(text_date)

Function to parse dates. It first tries all available parsers ('timestamp', 'custom-formats', 'absolute-time') except 'relative-time'. If no date is found, it retries with 'relative-time'.

When only the year is identified, it returns a datetime object with month and day set to 1.

PARAMETER DESCRIPTION
text_date

TYPE: str

RETURNS DESCRIPTION
datetime
Source code in edsnlp/pipelines/misc/dates/dates.py
def date_parser(text_date: str) -> datetime:
    """
    Function to parse dates. It first tries all available parsers
    ('timestamp', 'custom-formats', 'absolute-time') except 'relative-time'.
    If no date is found, it retries with 'relative-time'.

    When only the year is identified, it returns a datetime object with
    month and day set to 1.


    Parameters
    ----------
    text_date : str

    Returns
    -------
    datetime
    """

    parsed_date = parser1.get_date_data(text_date)
    if parsed_date.date_obj:
        if parsed_date.period == "year":
            return datetime(year=parsed_date.date_obj.year, month=1, day=1)
        else:
            return parsed_date.date_obj
    else:
        parsed_date2 = parser2.get_date_data(text_date)
        return parsed_date2.date_obj
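
The parser1 and parser2 objects used above are module-level dateparser instances that are not shown on this page. A plausible sketch of the two-pass setup, assuming dateparser's DateDataParser API and its PARSERS setting (the exact configuration in edsnlp may differ):

from dateparser.date import DateDataParser

# First pass: every parser except relative expressions (assumed split)
parser1 = DateDataParser(
    languages=["fr"],
    settings={"PARSERS": ["timestamp", "custom-formats", "absolute-time"]},
)
# Second pass: relative expressions only
parser2 = DateDataParser(
    languages=["fr"],
    settings={"PARSERS": ["relative-time"]},
)

date_parser("3 janvier 2020")  # datetime(2020, 1, 3, 0, 0)
date_parser("il y a 3 jours")  # handled by the second pass, relative to today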
apply_groupdict(dates)
Source code in edsnlp/pipelines/misc/dates/dates.py
def apply_groupdict(
    dates: Iterable[Tuple[Span, Dict[str, str]]]
) -> Generator[Span, None, None]:
    for span, groupdict in dates:
        span._.groupdict = groupdict
        yield span
parse_groupdict(day=None, month=None, year=None, hour=None, minute=None, second=None, **kwargs)

Parse date groupdict.

PARAMETER DESCRIPTION
day

String representation of the day, by default None

TYPE: str, optional DEFAULT: None

month

String representation of the month, by default None

TYPE: str, optional DEFAULT: None

year

String representation of the year, by default None

TYPE: str, optional DEFAULT: None

hour

String representation of the hour, by default None

TYPE: str, optional DEFAULT: None

minute

String representation of the minute, by default None

TYPE: str, optional DEFAULT: None

second

String representation of the minute, by default None

TYPE: str, optional DEFAULT: None

RETURNS DESCRIPTION
Dict[str, int]

Parsed groupdict.

Source code in edsnlp/pipelines/misc/dates/dates.py
def parse_groupdict(
    day: str = None,
    month: str = None,
    year: str = None,
    hour: str = None,
    minute: str = None,
    second: str = None,
    **kwargs: Dict[str, str],
) -> Dict[str, int]:
    """
    Parse date groupdict.

    Parameters
    ----------
    day : str, optional
        String representation of the day, by default None
    month : str, optional
        String representation of the month, by default None
    year : str, optional
        String representation of the year, by default None
    hour : str, optional
        String representation of the hour, by default None
    minute : str, optional
        String representation of the minute, by default None
    second : str, optional
        String representation of the minute, by default None

    Returns
    -------
    Dict[str, int]
        Parsed groupdict.
    """

    result = dict()

    if day is not None:
        result["day"] = day2int(day)

    if month is not None:
        result["month"] = month2int(month)

    if year is not None:
        result["year"] = str2int(year)

    if hour is not None:
        result["hour"] = str2int(hour)

    if minute is not None:
        result["minute"] = str2int(minute)

    if second is not None:
        result["second"] = str2int(second)

    result.update(**kwargs)

    return result
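
A quick illustration of the expected output, assuming the day2int/month2int helpers documented in the parsing module below:

parse_groupdict(day="12", month="janvier", year="2021", hour="8")
# {'day': 12, 'month': 1, 'year': 2021, 'hour': 8}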
parsing
month2int = time2int_factory(months.letter_months_dict) module-attribute
day2int = time2int_factory(days.letter_days_dict) module-attribute
str2int(time)

Converts a string to an integer. Returns None if the string cannot be converted.

PARAMETER DESCRIPTION
time

String representation

TYPE: str

RETURNS DESCRIPTION
int

Integer conversion.

Source code in edsnlp/pipelines/misc/dates/parsing.py
def str2int(time: str) -> int:
    """
    Converts a string to an integer. Returns `None` if the string cannot be converted.

    Parameters
    ----------
    time : str
        String representation

    Returns
    -------
    int
        Integer conversion.
    """
    try:
        return int(time)
    except ValueError:
        return None
time2int_factory(patterns)

Factory for a time2int conversion function.

PARAMETER DESCRIPTION
patterns

Dictionary of conversion/pattern.

TYPE: Dict[str, int]

RETURNS DESCRIPTION
Callable[[str], int]

String to integer function.

Source code in edsnlp/pipelines/misc/dates/parsing.py
def time2int_factory(patterns: Dict[str, int]) -> Callable[[str], int]:
    """
    Factory for a `time2int` conversion function.

    Parameters
    ----------
    patterns : Dict[str, int]
        Dictionary of conversion/pattern.

    Returns
    -------
    Callable[[str], int]
        String to integer function.
    """

    def time2int(time: str) -> int:
        """
        Converts a string representation to the proper integer,
        iterating over a dictionary of pattern/conversion pairs.

        Parameters
        ----------
        time : str
            String representation

        Returns
        -------
        int
            Integer conversion
        """
        m = str2int(time)

        if m is not None:
            return m

        for pattern, key in patterns.items():
            if re.match(f"^{pattern}$", time):
                m = key
                break

        return m

    return time2int
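
With the letter_days_dict and letter_months_dict dictionaries listed further down, both numeric strings and spelled-out French forms are converted (a quick sanity check, not from the original docs):

day2int("12")         # 12
day2int("douze")      # 12
month2int("février")  # 2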
factory
DEFAULT_CONFIG = dict(no_year=None, year_only=None, no_day=None, absolute=None, relative=None, full=None, current=None, false_positive=None, on_ents_only=False, attr='LOWER') module-attribute
create_component(nlp, name, no_year, year_only, no_day, absolute, full, relative, current, false_positive, on_ents_only, attr)
Source code in edsnlp/pipelines/misc/dates/factory.py
@deprecated_factory("dates", "eds.dates", default_config=DEFAULT_CONFIG)
@Language.factory("eds.dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    no_year: Optional[List[str]],
    year_only: Optional[List[str]],
    no_day: Optional[List[str]],
    absolute: Optional[List[str]],
    full: Optional[List[str]],
    relative: Optional[List[str]],
    current: Optional[List[str]],
    false_positive: Optional[List[str]],
    on_ents_only: bool,
    attr: str,
):
    return Dates(
        nlp,
        no_year=no_year,
        absolute=absolute,
        relative=relative,
        year_only=year_only,
        no_day=no_day,
        full=full,
        current=current,
        false_positive=false_positive,
        on_ents_only=on_ents_only,
        attr=attr,
    )
patterns
raw_delimiters = ['\\/', '\\-'] module-attribute
delimiters = raw_delimiters + ['\\.', '[^\\S\\r\\n]+'] module-attribute
raw_delimiter_pattern = make_pattern(raw_delimiters) module-attribute
raw_delimiter_with_spaces_pattern = make_pattern(raw_delimiters + ['[^\\S\\r\\n]+']) module-attribute
delimiter_pattern = make_pattern(delimiters) module-attribute
ante_num_pattern = '(?<!{raw_delimiter_pattern})' module-attribute
post_num_pattern = '(?!{raw_delimiter_pattern})' module-attribute
full_year_pattern = ante_num_pattern + fy_pattern + post_num_pattern module-attribute
absolute_date_pattern: List[str] = [ante_num_pattern + day_pattern + d + month_pattern + d + year_pattern + post_num_pattern for d in delimiters] + [ante_num_pattern + year_pattern + d + numeric_month_pattern + d + numeric_day_pattern + post_num_pattern for d in delimiters] module-attribute
full_date_pattern = [ante_num_pattern + fy_pattern + d + lz_numeric_month_pattern + d + lz_numeric_day_pattern + post_num_pattern for d in ['-', '\\.']] module-attribute
no_year_pattern = [day + raw_delimiter_with_spaces_pattern + month for day in [ante_num_pattern + numeric_day_pattern, letter_day_pattern] for month in [numeric_month_pattern + post_num_pattern, letter_month_pattern]] module-attribute
no_day_pattern = [letter_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern, ante_num_pattern + lz_numeric_month_pattern + raw_delimiter_with_spaces_pattern + year_pattern + post_num_pattern] module-attribute
relative_date_pattern = relative_pattern module-attribute
since_pattern = ['(?<=depuis)' + '.{,5}' + pattern for pattern in absolute_date_pattern + no_year_pattern + full_date_pattern + [relative_pattern]] module-attribute
false_positive_pattern = make_pattern(['(\\d+' + delimiter_pattern + '){3,}\\d+', '\\d\\/\\d']) module-attribute
current
current_patterns: List[str] = ['cette\\sann[ée]e(?![-\\s]l[àa])', 'ce\\sjour', 'ces\\sjours[-\\s]ci', "aujourd'?hui", 'ce\\smois([-\\s]ci)?', 'cette\\ssemaine', 'cet?\\s([ée]t[ée]|automne|hiver|printemps)'] module-attribute
current_pattern = make_pattern(current_patterns, with_breaks=True) module-attribute
relative
ago_pattern = 'il\\s+y\\s+a\\s+.{,10}?\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)' module-attribute
in_pattern = 'dans\\s+.{,10}?\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)' module-attribute
last_pattern = "l['ae]\\s*(semaine|année|an|mois)\\s+derni[èe]re?" module-attribute
next_pattern = "l['ae]\\s*(semaine|année|an|mois)\\s+prochaine?" module-attribute
since_pattern = '(?<=depuis\\s)\\s*.{,10}\\s+(heures?|jours?|semaines?|mois|ann[ée]es?|ans?)(\\s+derni[èe]re?)?' module-attribute
during_pattern = '(pendant|pdt|pour)\\s+.{,10}?\\s+(heures?|jours?|mois|ann[ée]es?|ans?)' module-attribute
week_patterns = ['(avant\\-?\\s*)?hier', '(apr[èe]s\\-?\\s*)?demain'] module-attribute
week_pattern = make_pattern(week_patterns, with_breaks=True) module-attribute
relative_pattern = make_pattern(patterns=[ago_pattern, in_pattern, last_pattern, next_pattern, since_pattern, week_pattern], with_breaks=True) module-attribute
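These patterns are plain regular expressions and can be inspected directly. A small sketch, assuming they are importable from edsnlp.pipelines.misc.dates.patterns.relative (the module path is inferred from this page's layout):

import re

from edsnlp.pipelines.misc.dates.patterns.relative import relative_pattern

for phrase in ["il y a 3 jours", "la semaine dernière", "avant-hier", "dans 2 mois"]:
    print(phrase, bool(re.search(relative_pattern, phrase)))  # expected to print True for each phrase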
atomic
time
hour_pattern = '(?<!\\d)(?P<hour>0?[1-9]|1\\d|2[0-3])(?!\\d)' module-attribute
lz_hour_pattern = '(?<!\\d)(?P<hour>0[1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
minute_pattern = '(?<!\\d)(?P<minute>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_minute_pattern = '(?<!\\d)(?P<minute>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
second_pattern = '(?<!\\d)(?P<second>0?[1-9]|[1-5]\\d)(?!\\d)' module-attribute
lz_second_pattern = '(?<!\\d)(?P<second>0[1-9]|[1-5]\\d)(?!\\d)' module-attribute
time_pattern = '(\\s.{,3}' + '{hour_pattern}[h:]({lz_minute_pattern})?' + '((:|m|min){lz_second_pattern})?' + ')?' module-attribute
years
year_patterns: List[str] = ['19\\d\\d'] + [str(year) for year in range(2000, date.today().year + 2)] module-attribute
full_year_pattern = '(?<!\\d)' + full_year_pattern + '(?!\\d)' module-attribute
year_pattern = '(?<!\\d)' + year_pattern + '(?!\\d)' module-attribute
months
letter_months_dict: Dict[str, int] = {'(janvier|janv\\.?)': 1, '(f[ée]vrier|f[ée]v\\.?)': 2, '(mars|mar\\.?)': 3, '(avril|avr\\.?)': 4, 'mai': 5, 'juin': 6, '(juillet|juill?\\.?)': 7, 'ao[uû]t': 8, '(septembre|sept?\\.?)': 9, '(octobre|oct\\.?)': 10, '(novembre|nov\\.)': 11, '(d[ée]cembre|d[ée]c\\.?)': 12} module-attribute
letter_months: List[str] = list(letter_months_dict.keys()) module-attribute
month_pattern = '(?P<month>{letter_month_pattern}|{numeric_month_pattern})' module-attribute
letter_month_pattern = '(?P<month>{letter_month_pattern})' module-attribute
numeric_month_pattern = '(?P<month>{numeric_month_pattern})' module-attribute
lz_numeric_month_pattern = '(?P<month>{lz_numeric_month_pattern})' module-attribute
days
letter_days_dict: Dict[str, int] = {'(premier|1\\s*er)': 1, 'deux': 2, 'trois': 3, 'quatre': 4, 'cinq': 5, 'six': 6, 'sept': 7, 'huit': 8, 'neuf': 9, 'dix': 10, 'onze': 11, 'douze': 12, 'treize': 13, 'quatorze': 14, 'quinze': 15, 'seize': 16, 'dix\\-?\\s*sept': 17, 'dix\\-?\\s*huit': 18, 'dix\\-?\\s*neuf': 19, 'vingt': 20, 'vingt\\-?\\s*et\\-?\\s*un': 21, 'vingt\\-?\\s*deux': 22, 'vingt\\-?\\s*trois': 23, 'vingt\\-?\\s*quatre': 24, 'vingt\\-?\\s*cinq': 25, 'vingt\\-?\\s*six': 26, 'vingt\\-?\\s*sept': 27, 'vingt\\-?\\s*huit': 28, 'vingt\\-?\\s*neuf': 29, 'trente': 30, 'trente\\-?\\s*et\\-?\\s*un': 31} module-attribute
letter_days: List[str] = list(letter_days_dict.keys()) module-attribute
nlz_numeric_day_pattern = '(?<!\\d)([1-9]|[12]\\d|3[01])(?!\\d)' module-attribute
day_pattern = '(?P<day>{letter_day_pattern}|{numeric_day_pattern})' module-attribute
letter_day_pattern = '(?P<day>{letter_day_pattern})' module-attribute
numeric_day_pattern = '(?P<day>{numeric_day_pattern})' module-attribute
lz_numeric_day_pattern = '(?P<day>{lz_numeric_day_pattern})' module-attribute
measures
measures
Measure

Bases: abc.ABC

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measure(abc.ABC):
    INTEGER = r"(?:[0-9]+)"
    CONJUNCTIONS = "et|ou"
    COMPOSERS = r"[x*]|par"

    UNITS = {}
    COMPOSITE = None

    @abc.abstractmethod
    def __iter__(self) -> Iterable["SimpleMeasure"]:
        """
        Iterate over the items of the measure (only one for SimpleMeasure)

        Returns
        -------
        iterable : Iterable["SimpleMeasure"]
        """

    @abc.abstractmethod
    def __getitem__(self, item) -> "SimpleMeasure":
        """
        Access items of the measure (only one for SimpleMeasure)

        Parameters
        ----------
        item : int

        Returns
        -------
        measure : SimpleMeasure
        """
INTEGER = '(?:[0-9]+)' class-attribute
CONJUNCTIONS = 'et|ou' class-attribute
COMPOSERS = '[x*]|par' class-attribute
UNITS = {} class-attribute
COMPOSITE = None class-attribute
__iter__()

Iterate over the items of the measure (only one for SimpleMeasure)

RETURNS DESCRIPTION
iterable

TYPE: Iterable["SimpleMeasure"]

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __iter__(self) -> Iterable["SimpleMeasure"]:
    """
    Iterate over the items of the measure (only one for SimpleMeasure)

    Returns
    -------
    iterable : Iterable["SimpleMeasure"]
    """
__getitem__(item)

Access items of the measure (only one for SimpleMeasure)

PARAMETER DESCRIPTION
item

TYPE: int

RETURNS DESCRIPTION
measure

TYPE: SimpleMeasure

Source code in edsnlp/pipelines/misc/measures/measures.py
@abc.abstractmethod
def __getitem__(self, item) -> "SimpleMeasure":
    """
    Access items of the measure (only one for SimpleMeasure)

    Parameters
    ----------
    item : int

    Returns
    -------
    measure : SimpleMeasure
    """
SimpleMeasure

Bases: Measure

Source code in edsnlp/pipelines/misc/measures/measures.py
class SimpleMeasure(Measure):
    def __init__(self, value, unit):
        """
        The SimpleMeasure class contains the value and unit
        for a single non-composite measure

        Parameters
        ----------
        value : float
        unit : str
        """
        super().__init__()
        self.value = value
        self.unit = unit

    @classmethod
    @abc.abstractmethod
    def parse(
        self, int_part: str, dec_part: str, unit: str, infix: bool
    ) -> "SimpleMeasure":
        """
        Class method to create an instance from the match groups

        int_part : str
            The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
        dec_part : str
            The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
        unit : str
            The normalized variant of the unit (eg "m" for 12 metre 50)
        infix : bool
            Whether the unit was before (True) or after (False) the decimal part
        """

    def _get_scale_to(self, unit: str):
        return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]

    def __iter__(self):
        return iter((self,))

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        return [self][item]

    def __str__(self):
        return f"{self.value}{self.unit}"

    def __repr__(self):
        return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"

    def __eq__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) == other.value

    def __lt__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) < other.value

    def __le__(self, other: "SimpleMeasure"):
        return getattr(self, other.unit) <= other.value
value = value instance-attribute
unit = unit instance-attribute
__init__(value, unit)

The SimpleMeasure class contains the value and unit for a single non-composite measure

PARAMETER DESCRIPTION
value

TYPE: float

unit

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, value, unit):
    """
    The SimpleMeasure class contains the value and unit
    for a single non-composite measure

    Parameters
    ----------
    value : float
    unit : str
    """
    super().__init__()
    self.value = value
    self.unit = unit
parse(int_part, dec_part, unit, infix)

Class method to create an instance from the match groups

int_part : str
    The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
dec_part : str
    The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
unit : str
    The normalized variant of the unit (eg "m" for 12 metre 50)
infix : bool
    Whether the unit was before (True) or after (False) the decimal part

Source code in edsnlp/pipelines/misc/measures/measures.py
@classmethod
@abc.abstractmethod
def parse(
    self, int_part: str, dec_part: str, unit: str, infix: bool
) -> "SimpleMeasure":
    """
    Class method to create an instance from the match groups

    int_part : str
        The integer part of the match (eg 12 in 12 metres 50 or 12.50metres)
    dec_part : str
        The decimal part of the match (eg 50 in 12 metres 50 or 12.50metres)
    unit : str
        The normalized variant of the unit (eg "m" for 12 metre 50)
    infix : bool
        Whether the unit was before (True) or after (False) the decimal part
    """
_get_scale_to(unit)
Source code in edsnlp/pipelines/misc/measures/measures.py
def _get_scale_to(self, unit: str):
    return self.UNITS[self.unit]["value"] / self.UNITS[unit]["value"]
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter((self,))
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    return [self][item]
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return f"{self.value}{self.unit}"
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({self.value}, {repr(self.unit)})"
__eq__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __eq__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) == other.value
__lt__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __lt__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) < other.value
__le__(other)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __le__(self, other: "SimpleMeasure"):
    return getattr(self, other.unit) <= other.value
CompositeMeasure

Bases: Measure

The CompositeMeasure class contains a sequence of multiple SimpleMeasure instances

PARAMETER DESCRIPTION
measures

TYPE: List[SimpleMeasure]

Source code in edsnlp/pipelines/misc/measures/measures.py
class CompositeMeasure(Measure):
    """
    The CompositeMeasure class contains a sequence
    of multiple SimpleMeasure instances

    Parameters
    ----------
    measures : List[SimpleMeasure]
    """

    def __init__(self, measures: Iterable["SimpleMeasure"]):
        super().__init__()
        self.measures = list(measures)

    def __iter__(self):
        return iter(self.measures)

    def __getitem__(self, item: int):
        assert isinstance(item, int)
        res = self.measures[item]
        return res

    def __str__(self):
        return " x ".join(map(str, self.measures))

    def __repr__(self):
        return f"{self.__class__.__name__}({repr(self.measures)})"
measures = list(measures) instance-attribute
__init__(measures)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(self, measures: Iterable["SimpleMeasure"]):
    super().__init__()
    self.measures = list(measures)
__iter__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __iter__(self):
    return iter(self.measures)
__getitem__(item)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __getitem__(self, item: int):
    assert isinstance(item, int)
    res = self.measures[item]
    return res
__str__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __str__(self):
    return " x ".join(map(str, self.measures))
__repr__()
Source code in edsnlp/pipelines/misc/measures/measures.py
def __repr__(self):
    return f"{self.__class__.__name__}({repr(self.measures)})"
Measures

Bases: BaseComponent

Matcher component to extract measures. A measure is most often composed of a number and a unit, like

> 1,26 cm

The unit can also be positioned in place of the decimal dot/comma:

> 1 cm 26

Some measures can be composite:

> 1,26 cm x 2,34 mm

And sometimes they are factorized:

> Les trois kystes mesurent 1, 2 et 3cm.

The recognized measures are stored in the "measures" SpanGroup. Each span has a Measure object stored in the "value" extension attribute.

PARAMETER DESCRIPTION
nlp

The SpaCy object.

TYPE: Language

measures

The registry names of the measures to extract

TYPE: List[str]

attr

Whether to match on the text ('TEXT') or on the normalized text ('NORM')

TYPE: str

ignore_excluded

Whether to exclude pollution patterns when matching in the text

TYPE: bool
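
For reference, a minimal usage sketch (the eds.measures factory name, the "measures" span group and the Span._.value extension come from the source below; eds.normalizer is an assumption here, used only because the default attr is 'NORM'):

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # assumed: provides the normalized text matched by attr='NORM'
nlp.add_pipe("eds.measures")

doc = nlp("Le nodule mesure 1,26 cm x 2,34 mm.")

for span in doc.spans["measures"]:
    # span._.value is a Measure; its unit properties (e.g. .cm, .mm) convert the value
    print(span.text, span._.value)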

Source code in edsnlp/pipelines/misc/measures/measures.py
class Measures(BaseComponent):
    """
    Matcher component to extract measures.
    A measure is most often composed of a number and a unit, like
    > 1,26 cm
    The unit can also be positioned in place of the decimal dot/comma
    > 1 cm 26
    Some measures can be composite
    > 1,26 cm x 2,34 mm
    And sometimes they are factorized
    > Les trois kystes mesurent 1, 2 et 3cm.

    The recognized measures are stored in the "measures" SpanGroup.
    Each span has a `Measure` object stored in the "value" extension attribute.

    Parameters
    ----------
    nlp : Language
        The SpaCy object.
    measures : List[str]
        The registry names of the measures to extract
    attr : str
        Whether to match on the text ('TEXT') or on the normalized text ('NORM')
    ignore_excluded : bool
        Whether to exclude pollution patterns when matching in the text
    """

    def __init__(
        self,
        nlp: Language,
        measures: List[str],
        attr: str,
        ignore_excluded: bool,
    ):

        self.regex_matcher = RegexMatcher(
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.extraction_regexes = {}
        self.measures: Dict[str, Measure] = {}
        for name in measures:
            cls: Measure = spacy.registry.misc.get(name)
            self.measures[name] = cls
            regexes = make_patterns(cls)
            self.regex_matcher.add(name, regexes["trigger"])
            self.extraction_regexes[name] = regexes["extraction"]

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        super(Measures, Measures).set_extensions()
        if not Span.has_extension("value"):
            Span.set_extension("value", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Adds measures to document's "measures" SpanGroup.

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for extracted terms.
        """

        matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

        # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
        # while keeping the corresponding groupdicts
        matches = {
            match: matches[match]
            for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
        }

        measures = []
        for match, groupdict in matches.items():
            measure_name = match.label_
            extraction_regex = self.extraction_regexes[measure_name]

            parsed_values = []

            shared_unit_part = next(
                (key for key, val in groupdict.items() if val is not None), None
            )
            for sub_match in regex.finditer(extraction_regex, match.text):
                sub_groupdict = dict(sub_match.groupdict())

                # Integer part of the match
                int_part = sub_groupdict.pop("int_part", 0)

                # Decimal part of the match, if any
                dec_part = sub_groupdict.pop("dec_part", 0) or 0

                # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
                # the unit must be infix: we extract it now using non empty groupdict
                # entries
                infix_unit_part = next(
                    (key for key, val in sub_groupdict.items() if val is not None),
                    None,
                )
                unit_part = infix_unit_part or shared_unit_part

                # Create one SimpleMeasure per submatch inside each match...
                parsed_values.append(
                    self.measures[measure_name].parse(
                        int_part=int_part,
                        dec_part=dec_part,
                        unit=unit_part,
                        infix=infix_unit_part is not None,
                    )
                )

            # ... and compose these measures together if there are more than one
            measure = Span(doc, start=match.start, end=match.end, label=measure_name)
            measure._.value = (
                parsed_values[0]
                if len(parsed_values) == 1
                else self.measures[measure_name].COMPOSITE(parsed_values)
                if self.measures[measure_name].COMPOSITE is not None
                else parsed_values[-1]
            )
            measures.append(match)

        doc.spans["measures"] = sorted(measures)

        return doc
regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded) instance-attribute
extraction_regexes = {} instance-attribute
measures: Dict[str, Measure] = {} instance-attribute
__init__(nlp, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/measures.py
def __init__(
    self,
    nlp: Language,
    measures: List[str],
    attr: str,
    ignore_excluded: bool,
):

    self.regex_matcher = RegexMatcher(
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.extraction_regexes = {}
    self.measures: Dict[str, Measure] = {}
    for name in measures:
        cls: Measure = spacy.registry.misc.get(name)
        self.measures[name] = cls
        regexes = make_patterns(cls)
        self.regex_matcher.add(name, regexes["trigger"])
        self.extraction_regexes[name] = regexes["extraction"]

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/measures/measures.py
@staticmethod
def set_extensions() -> None:
    super(Measures, Measures).set_extensions()
    if not Span.has_extension("value"):
        Span.set_extension("value", default=None)
__call__(doc)

Adds measures to document's "measures" SpanGroup.

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for extracted terms.

Source code in edsnlp/pipelines/misc/measures/measures.py
def __call__(self, doc: Doc) -> Doc:
    """
    Adds measures to document's "measures" SpanGroup.

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for extracted terms.
    """

    matches = dict(self.regex_matcher(doc, as_spans=True, return_groupdict=True))

    # Filter spans by rightmost, largest spans first to handle cases like 1 m 50 kg
    # while keeping the corresponding groupdicts
    matches = {
        match: matches[match]
        for match in filter_spans(matches, sort_key=rightmost_largest_sort_key)
    }

    measures = []
    for match, groupdict in matches.items():
        measure_name = match.label_
        extraction_regex = self.extraction_regexes[measure_name]

        parsed_values = []

        shared_unit_part = next(
            (key for key, val in groupdict.items() if val is not None), None
        )
        for sub_match in regex.finditer(extraction_regex, match.text):
            sub_groupdict = dict(sub_match.groupdict())

            # Integer part of the match
            int_part = sub_groupdict.pop("int_part", 0)

            # Decimal part of the match, if any
            dec_part = sub_groupdict.pop("dec_part", 0) or 0

            # If the unit was not postfix (in cases like 1cm, or 1 et 2cm)
            # the unit must be infix: we extract it now using non empty groupdict
            # entries
            infix_unit_part = next(
                (key for key, val in sub_groupdict.items() if val is not None),
                None,
            )
            unit_part = infix_unit_part or shared_unit_part

            # Create one SimpleMeasure per submatch inside each match...
            parsed_values.append(
                self.measures[measure_name].parse(
                    int_part=int_part,
                    dec_part=dec_part,
                    unit=unit_part,
                    infix=infix_unit_part is not None,
                )
            )

        # ... and compose these measures together if there are more than one
        measure = Span(doc, start=match.start, end=match.end, label=measure_name)
        measure._.value = (
            parsed_values[0]
            if len(parsed_values) == 1
            else self.measures[measure_name].COMPOSITE(parsed_values)
            if self.measures[measure_name].COMPOSITE is not None
            else parsed_values[-1]
        )
        measures.append(match)

    doc.spans["measures"] = sorted(measures)

    return doc
disj_capture(regexes, capture=True)
Source code in edsnlp/pipelines/misc/measures/measures.py
def disj_capture(regexes, capture=True):
    return "|".join(
        ("(?P<{key}>{forms})" if capture else "{forms}").format(
            key=key, forms="|".join(forms)
        )
        for key, forms in regexes.items()
    )
rightmost_largest_sort_key(span)
Source code in edsnlp/pipelines/misc/measures/measures.py
def rightmost_largest_sort_key(span):
    return span.end, (len(span))
make_patterns(measure)

Build recognition and extraction patterns for a given Measure class

PARAMETER DESCRIPTION
measure

The measure to build recognition and extraction patterns for

TYPE: 'Measure'

RETURNS DESCRIPTION
trigger

TYPE: List[str]

extraction

TYPE: str

Source code in edsnlp/pipelines/misc/measures/measures.py
def make_patterns(measure: "Measure") -> Dict[str, Union[List[str], str]]:
    """
    Build recognition and extraction patterns for a given Measure class

    Parameters
    ----------
    measure: Measure class
        The measure to build recognition and extraction patterns for

    Returns
    -------
    trigger : List[str]
    extraction : str
    """
    unit_prefix_reg = disj_capture(
        {key: [entry["prefix"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_abbreviation_reg = disj_capture(
        {key: [entry["abbr"]] for key, entry in measure.UNITS.items()},
        capture=True,
    )
    unit_reg = rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"

    number_reg = rf"(?:{measure.INTEGER}(?:[,.]{measure.INTEGER})?)"
    infix_measure_reg = rf"(?:{measure.INTEGER}{unit_reg}{measure.INTEGER})"

    # Simple measure
    simple_measure_reg = rf"{number_reg}\s*{unit_reg}"
    trigger = [
        simple_measure_reg,
        infix_measure_reg,
        # Factorized measures separated by a conjunction
        rf"{number_reg}(?=(?:\s*[,]\s*{number_reg})*\s*"
        rf"(?:{measure.CONJUNCTIONS})\s*{number_reg}\s*{unit_reg})",
    ]
    if measure.COMPOSITE:
        # Factorized composite measures (3 x 2cm)
        trigger.append(
            rf"(?<![a-z]){number_reg}"
            rf"(?:\s*(?:{measure.COMPOSERS})\s*{number_reg})*\s*{unit_reg}"
        )
        # Expanded composite measures (3cm x 2cm)
        trigger.append(
            rf"(?<![a-z])(?:{infix_measure_reg}|{simple_measure_reg})"
            rf"(\s*(?:{measure.COMPOSERS})\s*"
            rf"(?:{infix_measure_reg}|{simple_measure_reg}))*"
        )

    unit_reg_capture = (
        rf"(?:(?:{unit_prefix_reg})[a-z]*|(?:{unit_abbreviation_reg})(?![a-z]))"
    )

    return {
        "trigger": trigger,
        "extraction": rf"(?P<int_part>{measure.INTEGER})\s*(?:[,.]|"
        rf"{unit_reg_capture})?\s*(?P<dec_part>{measure.INTEGER})?",
    }
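
To make the returned structure concrete, calling it on the Size measure defined in the patterns module below yields a list of trigger regexes and a single extraction regex (illustrative sketch):

patterns = make_patterns(Size)

patterns["trigger"]     # regexes for simple ("1,26 cm"), infix ("1 cm 26"),
                        # factorized ("1, 2 et 3cm") and composite ("1 x 2 cm") forms
patterns["extraction"]  # one regex capturing int_part, dec_part and the unit of each sub-match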
make_simple_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_simple_getter(name):
    def getter(self):
        """
        Get a scaled numerical value of a measure

        Parameters
        ----------
        self

        Returns
        -------
        float
        """
        return self.value * self._get_scale_to(name)

    return getter
make_multi_getter(name)
Source code in edsnlp/pipelines/misc/measures/measures.py
def make_multi_getter(name: str) -> Callable[["CompositeMeasure"], Tuple[float]]:
    def getter(self) -> Tuple[float]:
        """
        Get the scaled numerical values of a multi-measure

        Parameters
        ----------
        self

        Returns
        -------
        float
        """
        return tuple(getattr(measure, name) for measure in self.measures)

    return getter
patterns
CompositeSize

Bases: CompositeMeasure

Composite size measure. Supports the following units: mm, cm, dm, m.

Source code in edsnlp/pipelines/misc/measures/patterns.py
class CompositeSize(CompositeMeasure):
    """
    Composite size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    mm = property(make_multi_getter("mm"))
    cm = property(make_multi_getter("cm"))
    dm = property(make_multi_getter("dm"))
    m = property(make_multi_getter("m"))
mm = property(make_multi_getter('mm')) class-attribute
cm = property(make_multi_getter('cm')) class-attribute
dm = property(make_multi_getter('dm')) class-attribute
m = property(make_multi_getter('m')) class-attribute
Size

Bases: SimpleMeasure

Size measure. Supports the following units: mm, cm, dm, m.

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.size")
class Size(SimpleMeasure):
    """
    Size measure. Supports the following units:
    - mm
    - cm
    - dm
    - m
    """

    COMPOSITE = CompositeSize
    UNITS = {
        "mm": {"prefix": "mill?im", "abbr": "mm", "value": 1},
        "cm": {"prefix": "centim", "abbr": "cm", "value": 10},
        "dm": {"prefix": "decim", "abbr": "dm", "value": 100},
        "m": {"prefix": "metre", "abbr": "m", "value": 1000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mm = property(make_simple_getter("mm"))
    cm = property(make_simple_getter("cm"))
    dm = property(make_simple_getter("dm"))
    m = property(make_simple_getter("m"))
COMPOSITE = CompositeSize class-attribute
UNITS = {'mm': {'prefix': 'mill?im', 'abbr': 'mm', 'value': 1}, 'cm': {'prefix': 'centim', 'abbr': 'cm', 'value': 10}, 'dm': {'prefix': 'decim', 'abbr': 'dm', 'value': 100}, 'm': {'prefix': 'metre', 'abbr': 'm', 'value': 1000}} class-attribute
mm = property(make_simple_getter('mm')) class-attribute
cm = property(make_simple_getter('cm')) class-attribute
dm = property(make_simple_getter('dm')) class-attribute
m = property(make_simple_getter('m')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
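
A quick check of the parse and conversion behaviour (the conversion factors follow the UNITS table above, where everything is scaled to millimetres):

size = Size.parse(int_part="1", dec_part="26", unit="cm", infix=False)
str(size)  # "1.26cm"
size.mm    # 12.6
size.m     # 0.0126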
Weight

Bases: SimpleMeasure

Weight measure. Supports the following units: mg, cg, dg, g, kg.

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.weight")
class Weight(SimpleMeasure):
    """
    Weight measure. Supports the following units:
    - mg
    - cg
    - dg
    - g
    - kg
    """

    COMPOSITE = None
    UNITS = {
        "mg": {"prefix": "mill?ig", "abbr": "mg", "value": 1},
        "cg": {"prefix": "centig", "abbr": "cg", "value": 10},
        "dg": {"prefix": "decig", "abbr": "dg", "value": 100},
        "g": {"prefix": "gram", "abbr": "g", "value": 1000},
        "kg": {"prefix": "kilo", "abbr": "kg", "value": 1000000},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    mg = property(make_simple_getter("mg"))
    cg = property(make_simple_getter("cg"))
    dg = property(make_simple_getter("dg"))
    g = property(make_simple_getter("g"))
    kg = property(make_simple_getter("kg"))
COMPOSITE = None class-attribute
UNITS = {'mg': {'prefix': 'mill?ig', 'abbr': 'mg', 'value': 1}, 'cg': {'prefix': 'centig', 'abbr': 'cg', 'value': 10}, 'dg': {'prefix': 'decig', 'abbr': 'dg', 'value': 100}, 'g': {'prefix': 'gram', 'abbr': 'g', 'value': 1000}, 'kg': {'prefix': 'kilo', 'abbr': 'kg', 'value': 1000000}} class-attribute
mg = property(make_simple_getter('mg')) class-attribute
cg = property(make_simple_getter('cg')) class-attribute
dg = property(make_simple_getter('dg')) class-attribute
g = property(make_simple_getter('g')) class-attribute
kg = property(make_simple_getter('kg')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
Angle

Bases: SimpleMeasure

Angle measure. Supports the following unit: h.

Source code in edsnlp/pipelines/misc/measures/patterns.py
@spacy.registry.misc("eds.measures.angle")
class Angle(SimpleMeasure):
    """
    Angle measure. Supports the following units:
    - h
    """

    COMPOSITE = None
    UNITS = {
        "h": {"prefix": "heur", "abbr": "h", "value": 1},
    }

    @classmethod
    def parse(cls, int_part, dec_part, unit, infix=False):
        if infix:
            result = float(int_part) + int(dec_part) / 60.0
            return cls(result, unit)
        result = float("{}.{}".format(int_part, dec_part))
        return cls(result, unit)

    h = property(make_simple_getter("h"))
COMPOSITE = None class-attribute
UNITS = {'h': {'prefix': 'heur', 'abbr': 'h', 'value': 1}} class-attribute
h = property(make_simple_getter('h')) class-attribute
parse(int_part, dec_part, unit, infix=False)
Source code in edsnlp/pipelines/misc/measures/patterns.py
@classmethod
def parse(cls, int_part, dec_part, unit, infix=False):
    if infix:
        result = float(int_part) + int(dec_part) / 60.0
        return cls(result, unit)
    result = float("{}.{}".format(int_part, dec_part))
    return cls(result, unit)
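The infix branch above handles hour notations such as 3h15, where the decimal part is interpreted as minutes. A minimal illustrative sketch, based only on the parse method shown above:

from edsnlp.pipelines.misc.measures.patterns import Angle

# Decimal notation, e.g. "3.5h": the two parts are joined into a float
a1 = Angle.parse(int_part="3", dec_part="5", unit="h")               # 3.5 h

# Infix notation, e.g. "3h15": the decimal part is read as minutes
a2 = Angle.parse(int_part="3", dec_part="15", unit="h", infix=True)  # 3 + 15 / 60 = 3.25 h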
factory
DEFAULT_CONFIG = dict(attr='NORM', ignore_excluded=False, measures=['eds.measures.size', 'eds.measures.weight', 'eds.measures.angle']) module-attribute
create_component(nlp, name, measures, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/measures/factory.py
@Language.factory("eds.measures", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    measures: Union[str, List[str], Dict[str, Dict]],
    attr: str,
    ignore_excluded: bool,
):
    return Measures(
        nlp,
        measures=measures,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
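A hedged usage sketch for the factory above; the config keys mirror DEFAULT_CONFIG. The output containers are not described in this section, so reading doc.spans["measures"] and span._.value below is an assumption rather than a documented guarantee.

import spacy

nlp = spacy.blank("fr")
# Restrict the component to size and weight measures (see DEFAULT_CONFIG above)
nlp.add_pipe(
    "eds.measures",
    config=dict(measures=["eds.measures.size", "eds.measures.weight"]),
)

doc = nlp("Le patient mesure 1m80 et pèse 75 kg.")

# Assumption: matches are exposed in doc.spans["measures"], with the parsed
# measure available under span._.value.
for span in doc.spans["measures"]:
    print(span, span._.value)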
consultation_dates
patterns
consultation_mention = ['rendez-vous pris', 'consultation', 'consultation.{1,8}examen', 'examen clinique', 'de compte rendu', "date de l'examen", 'examen realise le', 'date de la visite'] module-attribute
town_mention = ['paris', 'kremlin.bicetre', 'creteil', 'boulogne.billancourt', 'villejuif', 'clamart', 'bobigny', 'clichy', 'ivry.sur.seine', 'issy.les.moulineaux', 'draveil', 'limeil', 'champcueil', 'roche.guyon', 'bondy', 'colombes', 'hendaye', 'herck.sur.mer', 'labruyere', 'garches', 'sevran', 'hyeres'] module-attribute
document_date_mention = ['imprime le', 'signe electroniquement', 'signe le', 'saisi le', 'dicte le', 'tape le', 'date de reference', 'date\\s*:', 'dactylographie le', 'date du rapport'] module-attribute
consultation_dates
ConsultationDates

Bases: GenericMatcher

Class to extract consultation dates from "CR-CONS" documents.

The pipeline populates the doc.spans['consultation_dates'] list.

For each extraction s in this list, the corresponding date is available as s._.consultation_date.

PARAMETER DESCRIPTION
nlp

Language pipeline object

TYPE: Language

consultation_mention

List of RegEx for consultation mentions.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the extraction if False

TYPE: Union[List[str], bool]

town_mention

List of RegEx for the towns of all AP-HP hospitals.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the extraction if False

TYPE: Union[List[str], bool]

document_date_mention

List of RegEx for document dates.

  • If type==list: Overrides the default list
  • If type==bool: Uses the default list if True, disables the extraction if False

TYPE: Union[List[str], bool]
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
class ConsultationDates(GenericMatcher):
    """
    Class to extract consultation dates from "CR-CONS" documents.

    The pipeline populates the `#!python doc.spans['consultation_dates']` list.

    For each extraction `s` in this list, the corresponding date is available
    as `s._.consultation_date`.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    consultation_mention : Union[List[str], bool]
        List of RegEx for consultation mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False

    town_mention : Union[List[str], bool]
        List of RegEx for all AP-HP hospitals' towns mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False
    document_date_mention : Union[List[str], bool]
        List of RegEx for document date.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list of True, disable if False
    """

    def __init__(
        self,
        nlp: Language,
        consultation_mention: Union[List[str], bool],
        town_mention: Union[List[str], bool],
        document_date_mention: Union[List[str], bool],
        attr: str,
        **kwargs,
    ):

        logger.warning("This pipeline is still in beta")
        logger.warning(
            "This pipeline should ONLY be used on notes "
            "where `note_class_source_value == 'CR-CONS'`"
        )
        logger.warning(
            """This pipeline requires to use the normalizer pipeline with:
        lowercase=True,
        accents=True,
        quotes=True"""
        )

        if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

            config = dict(**DEFAULT_CONFIG)
            config["on_ents_only"] = "consultation_mentions"

            self.date_matcher = Dates(nlp, **config)

        else:
            self.date_matcher = None

        if not consultation_mention:
            consultation_mention = []
        elif consultation_mention is True:
            consultation_mention = consult_regex.consultation_mention

        if not document_date_mention:
            document_date_mention = []
        elif document_date_mention is True:
            document_date_mention = consult_regex.document_date_mention

        if not town_mention:
            town_mention = []
        elif town_mention is True:
            town_mention = consult_regex.town_mention

        regex = dict(
            consultation_mention=consultation_mention,
            town_mention=town_mention,
            document_date_mention=document_date_mention,
        )

        super().__init__(
            nlp,
            regex=regex,
            terms=dict(),
            attr=attr,
            ignore_excluded=False,
            **kwargs,
        )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        if not Span.has_extension("consultation_date"):
            Span.set_extension("consultation_date", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Finds entities

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object with additionnal doc.spans['consultation_dates] SpanGroup
        """

        ents = self.process(doc)

        doc.spans["consultation_mentions"] = ents
        doc.spans["consultation_dates"] = []

        if self.date_matcher is not None:
            doc = self.date_matcher(doc)

        for mention in ents:
            # Looking for a date
            # - In the same sentence
            # - Not less than 10 tokens AFTER the consultation mention
            matching_dates = [
                date
                for date in doc.spans["dates"]
                if (
                    (mention.sent == date.sent)
                    and (date.start > mention.start)
                    and (date.start - mention.end <= 10)
                )
            ]

            if matching_dates:
                # We keep the first mention of a date
                kept_date = min(matching_dates, key=lambda d: d.start)
                span = doc[mention.start : kept_date.end]
                span.label_ = mention.label_
                span._.consultation_date = kept_date._.parsed_date

                doc.spans["consultation_dates"].append(span)

        del doc.spans["consultation_mentions"]

        return doc
date_matcher = Dates(nlp, **config) instance-attribute
__init__(nlp, consultation_mention, town_mention, document_date_mention, attr, **kwargs)
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __init__(
    self,
    nlp: Language,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
    attr: str,
    **kwargs,
):

    logger.warning("This pipeline is still in beta")
    logger.warning(
        "This pipeline should ONLY be used on notes "
        "where `note_class_source_value == 'CR-CONS'`"
    )
    logger.warning(
        """This pipeline requires to use the normalizer pipeline with:
    lowercase=True,
    accents=True,
    quotes=True"""
    )

    if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):

        config = dict(**DEFAULT_CONFIG)
        config["on_ents_only"] = "consultation_mentions"

        self.date_matcher = Dates(nlp, **config)

    else:
        self.date_matcher = None

    if not consultation_mention:
        consultation_mention = []
    elif consultation_mention is True:
        consultation_mention = consult_regex.consultation_mention

    if not document_date_mention:
        document_date_mention = []
    elif document_date_mention is True:
        document_date_mention = consult_regex.document_date_mention

    if not town_mention:
        town_mention = []
    elif town_mention is True:
        town_mention = consult_regex.town_mention

    regex = dict(
        consultation_mention=consultation_mention,
        town_mention=town_mention,
        document_date_mention=document_date_mention,
    )

    super().__init__(
        nlp,
        regex=regex,
        terms=dict(),
        attr=attr,
        ignore_excluded=False,
        **kwargs,
    )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
@staticmethod
def set_extensions() -> None:
    if not Span.has_extension("consultation_date"):
        Span.set_extension("consultation_date", default=None)
__call__(doc)

Finds entities

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, with an additional doc.spans['consultation_dates'] SpanGroup
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
def __call__(self, doc: Doc) -> Doc:
    """
    Finds entities

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object with additionnal doc.spans['consultation_dates] SpanGroup
    """

    ents = self.process(doc)

    doc.spans["consultation_mentions"] = ents
    doc.spans["consultation_dates"] = []

    if self.date_matcher is not None:
        doc = self.date_matcher(doc)

    for mention in ents:
        # Looking for a date
        # - In the same sentence
        # - Not less than 10 tokens AFTER the consultation mention
        matching_dates = [
            date
            for date in doc.spans["dates"]
            if (
                (mention.sent == date.sent)
                and (date.start > mention.start)
                and (date.start - mention.end <= 10)
            )
        ]

        if matching_dates:
            # We keep the first mention of a date
            kept_date = min(matching_dates, key=lambda d: d.start)
            span = doc[mention.start : kept_date.end]
            span.label_ = mention.label_
            span._.consultation_date = kept_date._.parsed_date

            doc.spans["consultation_dates"].append(span)

    del doc.spans["consultation_mentions"]

    return doc
factory
DEFAULT_CONFIG = dict(consultation_mention=True, town_mention=False, document_date_mention=False, attr='NORM') module-attribute
create_component(nlp, name, attr, consultation_mention, town_mention, document_date_mention)
Source code in edsnlp/pipelines/misc/consultation_dates/factory.py
@deprecated_factory(
    "consultation_dates",
    "eds.consultation_dates",
    default_config=DEFAULT_CONFIG,
)
@Language.factory("eds.consultation_dates", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
):
    return ConsultationDates(
        nlp,
        attr=attr,
        consultation_mention=consultation_mention,
        document_date_mention=document_date_mention,
        town_mention=town_mention,
    )
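A usage sketch for the factory above. Per the warnings emitted by the constructor, the component expects the normalizer (lowercase, accents, quotes) and should only be run on CR-CONS notes; the output containers follow the class description (doc.spans['consultation_dates'] and span._.consultation_date). The sample text and the town_mention=True override are illustrative assumptions.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # lowercase, accents and quotes are required
nlp.add_pipe("eds.consultation_dates", config=dict(town_mention=True))

# Illustrative CR-CONS snippet
doc = nlp("Consultation du 03/09/2021 à Clamart.")

for span in doc.spans["consultation_dates"]:
    print(span, span._.consultation_date)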
reason
patterns
reasons = dict(reasons=['(?i)motif de l.?hospitalisation : .+', '(?i)hospitalis[ée].?.*(pour|. cause|suite [àa]).+', '(?i)(consulte|prise en charge(?!\\set\\svous\\sassurer\\sun\\straitement\\sadapté)).*pour.+', '(?i)motif\\sd.hospitalisation\\s:.+', '(?i)au total\\s?\\:?\\s?\\n?.+', '(?i)motif\\sde\\sla\\sconsultation', '(?i)motif\\sd.admission', '(?i)conclusion\\smedicale']) module-attribute
sections_reason = ['motif', 'conclusion'] module-attribute
section_exclude = ['antécédents', 'antécédents familiaux', 'histoire de la maladie'] module-attribute
reason
Reason

Bases: GenericMatcher

Pipeline to identify the reason of the hospitalisation.

It declares a Span extension called ents_reason and adds the key reasons to doc.spans.

It also declares the boolean extension is_reason. This extension is set to True for the Reason Spans but also for the entities that overlap the reason span.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

reasons

The terminology of reasons.

TYPE: Optional[Dict[str, Union[List[str], str]]]

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'. We can also add a key for each regex.

TYPE: str

use_sections

Whether to use the sections pipeline to improve results.

TYPE: bool

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/misc/reason/reason.py
class Reason(GenericMatcher):
    """Pipeline to identify the reason of the hospitalisation.

    It declares a Span extension called `ents_reason` and adds
    the key `reasons` to doc.spans.

    It also declares the boolean extension `is_reason`.
    This extension is set to True for the Reason Spans but also
    for the entities that overlap the reason span.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    reasons : Optional[Dict[str, Union[List[str], str]]]
        The terminology of reasons.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM", or a dict with
        the key 'term_attr'. We can also add a key for each regex.
    use_sections : bool,
        whether or not use the `sections` pipeline to improve results.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        reasons: Optional[Dict[str, Union[List[str], str]]],
        attr: Union[Dict[str, str], str],
        use_sections: bool,
        ignore_excluded: bool,
    ):

        if reasons is None:
            reasons = patterns.reasons

        super().__init__(
            nlp,
            terms=None,
            regex=reasons,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.use_sections = use_sections and (
            "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
        )
        if use_sections and not self.use_sections:
            logger.warning(
                "You have requested that the pipeline use annotations "
                "provided by the `eds.section` pipeline, but it was not set. "
                "Skipping that step."
            )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:

        if not Span.has_extension("ents_reason"):
            Span.set_extension("ents_reason", default=None)

        if not Span.has_extension("is_reason"):
            Span.set_extension("is_reason", default=False)

    def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
        """Enhance the list of reasons with the section information.
        If the reason overlaps with history, so it will be removed from the list

        Parameters
        ----------
        sections : Iterable
            Spans of sections identified with the `sections` pipeline
        reasons : Iterable
            Reasons list identified by the regex

        Returns
        -------
        List
            Updated list of spans reasons
        """

        for section in sections:
            if section.label_ in patterns.sections_reason:
                reasons.append(section)

            if section.label_ in patterns.section_exclude:
                for reason in reasons:
                    if check_inclusion(reason, section.start, section.end):
                        reasons.remove(reason)

        return reasons

    def __call__(self, doc: Doc) -> Doc:
        """Find spans related to the reasons of the hospitalisation

        Parameters
        ----------
        doc : Doc

        Returns
        -------
        Doc
        """
        matches = self.process(doc)
        reasons = get_spans(matches, "reasons")

        if self.use_sections:
            sections = doc.spans["sections"]
            reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

        doc.spans["reasons"] = reasons

        # Entities
        if len(doc.ents) > 0:
            for reason in reasons:  # TODO optimize this iteration
                ent_list = []
                for ent in doc.ents:
                    if check_inclusion(ent, reason.start, reason.end):
                        ent_list.append(ent)
                        ent._.is_reason = True

                reason._.ents_reason = ent_list
                reason._.is_reason = True

        return doc
use_sections = use_sections and ('eds.sections' in self.nlp.pipe_names or 'sections' in self.nlp.pipe_names) instance-attribute
__init__(nlp, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/reason.py
def __init__(
    self,
    nlp: Language,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: Union[Dict[str, str], str],
    use_sections: bool,
    ignore_excluded: bool,
):

    if reasons is None:
        reasons = patterns.reasons

    super().__init__(
        nlp,
        terms=None,
        regex=reasons,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.use_sections = use_sections and (
        "eds.sections" in self.nlp.pipe_names or "sections" in self.nlp.pipe_names
    )
    if use_sections and not self.use_sections:
        logger.warning(
            "You have requested that the pipeline use annotations "
            "provided by the `eds.section` pipeline, but it was not set. "
            "Skipping that step."
        )

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/misc/reason/reason.py
@staticmethod
def set_extensions() -> None:

    if not Span.has_extension("ents_reason"):
        Span.set_extension("ents_reason", default=None)

    if not Span.has_extension("is_reason"):
        Span.set_extension("is_reason", default=False)
_enhance_with_sections(sections, reasons)

Enhance the list of reasons with the section information. If a reason overlaps with the history section, it is removed from the list.

PARAMETER DESCRIPTION
sections

Spans of sections identified with the sections pipeline

TYPE: Iterable

reasons

Reasons list identified by the regex

TYPE: Iterable

RETURNS DESCRIPTION
List

Updated list of reason spans

Source code in edsnlp/pipelines/misc/reason/reason.py
def _enhance_with_sections(self, sections: Iterable, reasons: Iterable) -> List:
    """Enhance the list of reasons with the section information.
    If the reason overlaps with history, so it will be removed from the list

    Parameters
    ----------
    sections : Iterable
        Spans of sections identified with the `sections` pipeline
    reasons : Iterable
        Reasons list identified by the regex

    Returns
    -------
    List
        Updated list of spans reasons
    """

    for section in sections:
        if section.label_ in patterns.sections_reason:
            reasons.append(section)

        if section.label_ in patterns.section_exclude:
            for reason in reasons:
                if check_inclusion(reason, section.start, section.end):
                    reasons.remove(reason)

    return reasons
__call__(doc)

Find spans related to the reasons of the hospitalisation

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
Doc
Source code in edsnlp/pipelines/misc/reason/reason.py
def __call__(self, doc: Doc) -> Doc:
    """Find spans related to the reasons of the hospitalisation

    Parameters
    ----------
    doc : Doc

    Returns
    -------
    Doc
    """
    matches = self.process(doc)
    reasons = get_spans(matches, "reasons")

    if self.use_sections:
        sections = doc.spans["sections"]
        reasons = self._enhance_with_sections(sections=sections, reasons=reasons)

    doc.spans["reasons"] = reasons

    # Entities
    if len(doc.ents) > 0:
        for reason in reasons:  # TODO optimize this iteration
            ent_list = []
            for ent in doc.ents:
                if check_inclusion(ent, reason.start, reason.end):
                    ent_list.append(ent)
                    ent._.is_reason = True

            reason._.ents_reason = ent_list
            reason._.is_reason = True

    return doc
factory
DEFAULT_CONFIG = dict(reasons=None, attr='TEXT', use_sections=False, ignore_excluded=False) module-attribute
create_component(nlp, name, reasons, attr, use_sections, ignore_excluded)
Source code in edsnlp/pipelines/misc/reason/factory.py
@deprecated_factory("reason", "eds.reason", default_config=DEFAULT_CONFIG)
@Language.factory("eds.reason", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    reasons: Optional[Dict[str, Union[List[str], str]]],
    attr: str,
    use_sections: bool,
    ignore_excluded: bool,
):
    return Reason(
        nlp,
        reasons=reasons,
        attr=attr,
        use_sections=use_sections,
        ignore_excluded=ignore_excluded,
    )
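A usage sketch for the factory above, with the optional use_sections flag. The printed extensions follow the class description (doc.spans['reasons'] and reason._.is_reason); reason._.ents_reason lists overlapping entities only when an NER component has run beforehand. The sample note itself is an illustrative assumption.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.sections")  # optional: lets eds.reason exploit section titles
nlp.add_pipe("eds.reason", config=dict(use_sections=True))

doc = nlp(
    "Compte rendu d'hospitalisation.\n"
    "Motif :\n"
    "Patient admis pour décompensation cardiaque.\n"
)

for reason in doc.spans["reasons"]:
    print(reason, reason._.is_reason)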
sections
patterns

These section titles were extracted from a work performed by Ivan Lerner at AP-HP, which supplied a number of documents annotated for section titles.

The section titles were reviewed by Gilles Chatellier, who gave meaningful insights.

See the sections/section-dataset notebook for details.

allergies = ['allergies'] module-attribute
antecedents = ['antecedents', 'antecedents medicaux et chirurgicaux', 'antecedents personnels', 'antecedents medicaux', 'antecedents chirurgicaux', 'atcd'] module-attribute
antecedents_familiaux = ['antecedents familiaux'] module-attribute
traitements_entree = ['attitude therapeutique initiale', "traitement a l'entree", 'traitement actuel', 'traitement en cours', "traitements a l'entree"] module-attribute
conclusion = ['au total', 'conclusion', 'conclusion de sortie', 'syntese medicale / conclusion', 'synthese', 'synthese medicale', 'synthese medicale/conclusion', 'conclusion medicale'] module-attribute
conclusion_entree = ["conclusion a l'entree"] module-attribute
habitus = ['contexte familial et social', 'habitus', 'mode de vie', 'mode de vie - scolarite', 'situation sociale, mode de vie'] module-attribute
correspondants = ['correspondants'] module-attribute
diagnostic = ['diagnostic retenu'] module-attribute
donnees_biometriques_entree = ["donnees biometriques et parametres vitaux a l'entree", "parametres vitaux et donnees biometriques a l'entree"] module-attribute
examens = ['examen clinique', "examen clinique a l'entree"] module-attribute
examens_complementaires = ['examen(s) complementaire(s)', 'examens complementaires', "examens complementaires a l'entree", 'examens complementaires realises pendant le sejour', 'examens para-cliniques'] module-attribute
facteurs_de_risques = ['facteurs de risque', 'facteurs de risques'] module-attribute
histoire_de_la_maladie = ['histoire de la maladie', 'histoire de la maladie - explorations', 'histoire de la maladie actuelle', 'histoire du poids', 'histoire recente', 'histoire recente de la maladie', 'rappel clinique', 'resume', 'resume clinique'] module-attribute
actes = ['intervention'] module-attribute
motif = ['motif', "motif d'hospitalisation", "motif de l'hospitalisation", 'motif medical'] module-attribute
prescriptions = ['prescriptions de sortie', 'prescriptions medicales de sortie'] module-attribute
traitements_sortie = ['traitement de sortie'] module-attribute
sections = {'allergies': allergies, 'antécédents': antecedents, 'antécédents familiaux': antecedents_familiaux, 'traitements entrée': traitements_entree, 'conclusion': conclusion, 'conclusion entrée': conclusion_entree, 'habitus': habitus, 'correspondants': correspondants, 'diagnostic': diagnostic, 'données biométriques entrée': donnees_biometriques_entree, 'examens': examens, 'examens complémentaires': examens_complementaires, 'facteurs de risques': facteurs_de_risques, 'histoire de la maladie': histoire_de_la_maladie, 'actes': actes, 'motif': motif, 'prescriptions': prescriptions, 'traitements sortie': traitements_sortie} module-attribute
sections
Sections

Bases: GenericMatcher

Divides the document into sections.

By default, we are using a dataset of documents annotated for section titles, using the work done by Ivan Lerner, reviewed by Gilles Chatellier.

Detected sections are :

  • allergies ;
  • antécédents ;
  • antécédents familiaux ;
  • traitements entrée ;
  • conclusion ;
  • conclusion entrée ;
  • habitus ;
  • correspondants ;
  • diagnostic ;
  • données biométriques entrée ;
  • examens ;
  • examens complémentaires ;
  • facteurs de risques ;
  • histoire de la maladie ;
  • actes ;
  • motif ;
  • prescriptions ;
  • traitements sortie.

The component looks for section titles within the document, and stores them in the section_title extension.

For ease-of-use, the component also populates a section extension, which contains a list of spans corresponding to the "sections" of the document. These span from the start of one section title to the next, which can introduce obvious bias should an intermediate section title go undetected.

PARAMETER DESCRIPTION
nlp

spaCy pipeline object.

TYPE: Language

sections

Dictionary of terms to look for.

TYPE: Dict[str, List[str]]

attr

Default attribute to match on.

TYPE: str

ignore_excluded

Whether to skip excluded tokens.

TYPE: bool

Source code in edsnlp/pipelines/misc/sections/sections.py
class Sections(GenericMatcher):
    """
    Divides the document into sections.

    By default, we are using a dataset of documents annotated for section titles,
    using the work done by Ivan Lerner, reviewed by Gilles Chatellier.

    Detected sections are :

    - allergies ;
    - antécédents ;
    - antécédents familiaux ;
    - traitements entrée ;
    - conclusion ;
    - conclusion entrée ;
    - habitus ;
    - correspondants ;
    - diagnostic ;
    - données biométriques entrée ;
    - examens ;
    - examens complémentaires ;
    - facteurs de risques ;
    - histoire de la maladie ;
    - actes ;
    - motif ;
    - prescriptions ;
    - traitements sortie.

    The component looks for section titles within the document,
    and stores them in the `section_title` extension.

    For ease-of-use, the component also populates a `section` extension,
    which contains a list of spans corresponding to the "sections" of the
    document. These span from the start of one section title to the next,
    which can introduce obvious bias should an intermediate section title
    goes undetected.

    Parameters
    ----------
    nlp : Language
        spaCy pipeline object.
    sections : Dict[str, List[str]]
        Dictionary of terms to look for.
    attr : str
        Default attribute to match on.
    ignore_excluded : bool
        Whether to skip excluded tokens.
    """

    def __init__(
        self,
        nlp: Language,
        sections: Dict[str, List[str]],
        add_patterns: bool,
        attr: str,
        ignore_excluded: bool,
    ):

        logger.warning(
            "The component Sections is still in Beta. Use at your own risks."
        )

        if sections is None:
            sections = patterns.sections

        self.add_patterns = add_patterns
        if add_patterns:
            for k, v in sections.items():
                sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

        super().__init__(
            nlp,
            terms=None,
            regex=sections,
            attr=attr,
            ignore_excluded=ignore_excluded,
        )

        self.set_extensions()

        if not nlp.has_pipe("normalizer") and not not nlp.has_pipe("eds.normalizer"):
            logger.warning("You should add pipe `eds.normalizer`")

    @staticmethod
    def set_extensions():

        if not Span.has_extension("section_title"):
            Span.set_extension("section_title", default=None)

        if not Span.has_extension("section"):
            Span.set_extension("section", default=None)

    # noinspection PyProtectedMember
    def __call__(self, doc: Doc) -> Doc:
        """
        Divides the doc into sections

        Parameters
        ----------
        doc:
            spaCy Doc object

        Returns
        -------
        doc:
            spaCy Doc object, annotated for sections
        """
        titles = filter_spans(self.process(doc))

        if self.add_patterns:
            # Remove preceding newline
            titles = [
                Span(doc, title.start + 1, title.end - 1, label=title.label_)
                for title in titles
            ]

        sections = []

        for t1, t2 in zip(titles[:-1], titles[1:]):
            section = Span(doc, t1.start, t2.start, label=t1.label)
            section._.section_title = t1
            sections.append(section)

        if titles:
            t = titles[-1]
            section = Span(doc, t.start, len(doc), label=t.label)
            section._.section_title = t
            sections.append(section)

        doc.spans["sections"] = sections
        doc.spans["section_titles"] = titles

        return doc
add_patterns = add_patterns instance-attribute
__init__(nlp, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/sections.py
def __init__(
    self,
    nlp: Language,
    sections: Dict[str, List[str]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):

    logger.warning(
        "The component Sections is still in Beta. Use at your own risks."
    )

    if sections is None:
        sections = patterns.sections

    self.add_patterns = add_patterns
    if add_patterns:
        for k, v in sections.items():
            sections[k] = [r"\n[^\n]{0,5}" + ent + r"[^\n]{0,5}\n" for ent in v]

    super().__init__(
        nlp,
        terms=None,
        regex=sections,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    self.set_extensions()

    if not nlp.has_pipe("normalizer") and not not nlp.has_pipe("eds.normalizer"):
        logger.warning("You should add pipe `eds.normalizer`")
set_extensions()
Source code in edsnlp/pipelines/misc/sections/sections.py
@staticmethod
def set_extensions():

    if not Span.has_extension("section_title"):
        Span.set_extension("section_title", default=None)

    if not Span.has_extension("section"):
        Span.set_extension("section", default=None)
__call__(doc)

Divides the doc into sections

PARAMETER DESCRIPTION
doc

spaCy Doc object

TYPE: Doc

RETURNS DESCRIPTION
doc

spaCy Doc object, annotated for sections

Source code in edsnlp/pipelines/misc/sections/sections.py
def __call__(self, doc: Doc) -> Doc:
    """
    Divides the doc into sections

    Parameters
    ----------
    doc:
        spaCy Doc object

    Returns
    -------
    doc:
        spaCy Doc object, annotated for sections
    """
    titles = filter_spans(self.process(doc))

    if self.add_patterns:
        # Remove preceding newline
        titles = [
            Span(doc, title.start + 1, title.end - 1, label=title.label_)
            for title in titles
        ]

    sections = []

    for t1, t2 in zip(titles[:-1], titles[1:]):
        section = Span(doc, t1.start, t2.start, label=t1.label)
        section._.section_title = t1
        sections.append(section)

    if titles:
        t = titles[-1]
        section = Span(doc, t.start, len(doc), label=t.label)
        section._.section_title = t
        sections.append(section)

    doc.spans["sections"] = sections
    doc.spans["section_titles"] = titles

    return doc
factory
DEFAULT_CONFIG = dict(sections=None, add_patterns=True, attr='NORM', ignore_excluded=True) module-attribute
create_component(nlp, name, sections, add_patterns, attr, ignore_excluded)
Source code in edsnlp/pipelines/misc/sections/factory.py
@deprecated_factory("sections", "eds.sections", default_config=DEFAULT_CONFIG)
@Language.factory("eds.sections", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    sections: Optional[Dict[str, List[str]]],
    add_patterns: bool,
    attr: str,
    ignore_excluded: bool,
):
    return Sections(
        nlp,
        sections=sections,
        add_patterns=add_patterns,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )
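A usage sketch for the factory above. The component recommends adding eds.normalizer first (its default attr is NORM); the printed containers follow the class description (doc.spans['sections'], doc.spans['section_titles'] and the section_title extension). The sample note is an illustrative assumption.

import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.normalizer")  # recommended: the default attr is "NORM"
nlp.add_pipe("eds.sections")

text = (
    "Compte rendu d'hospitalisation.\n"
    "Motif :\n"
    "Patient admis pour une pneumopathie.\n"
    "Antécédents :\n"
    "Diabète de type 2.\n"
)

doc = nlp(text)

# Each section spans from its title to the start of the next one
for section in doc.spans["sections"]:
    print(section.label_, "->", section._.section_title)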