Skip to content

edsnlp.matchers.utils.offset

token_length(token, custom, attr)

Source code in edsnlp/matchers/utils/offset.py
10
11
12
13
14
15
def token_length(token: Token, custom: bool, attr: str):
    """
    Return the length of one attribute of a token.

    Parameters
    ----------
    token : Token
        Token whose attribute is measured.
    custom : bool
        When true, `attr` is read from the extension namespace `token._`;
        otherwise it is read directly from `token`.
    attr : str
        Name of the attribute to read.

    Returns
    -------
    int
        `len()` of the attribute value.
    """
    # Only touch `token._` when a custom attribute is actually requested.
    holder = token._ if custom else token
    return len(getattr(holder, attr))

alignment(doc, attr='TEXT', ignore_excluded=True)

Align different representations of a Doc or Span object.

PARAMETER DESCRIPTION
doc

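spaCy Doc object (the implementation asserts that doc is a Doc instance, so passing a Span is expected to fail that assertion)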

TYPE: Doc

attr

Attribute to use, by default "TEXT"

TYPE: str, optional DEFAULT: 'TEXT'

ignore_excluded

Whether to remove excluded tokens, by default True

TYPE: bool, optional DEFAULT: True

RETURNS DESCRIPTION
Tuple[List[int], List[int]]

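An alignment tuple (original, clean): two lists with one entry per kept token, giving the token's start character offset in the original text and in the pre-processed representation, respectively.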

Source code in edsnlp/matchers/utils/offset.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
@lru_cache(maxsize=32)
def alignment(
    doc: Doc,
    attr: str = "TEXT",
    ignore_excluded: bool = True,
) -> Tuple[List[int], List[int]]:
    """
    Align the original text of a `Doc` with one of its representations.

    For every kept token, record where it starts in the original text and
    where it starts in the pre-processed text built from `attr`.

    Results are memoised with `lru_cache`, so repeated calls with the same
    arguments return the very same list objects: treat them as read-only.

    Parameters
    ----------
    doc : Doc
        spaCy `Doc` object (enforced by an assertion).
    attr : str, optional
        Attribute to use, by default `"TEXT"`. The name is upper-cased and
        translated through `ATTRIBUTES`; a resolved name starting with `_`
        designates a custom extension read from `token._`.
    ignore_excluded : bool, optional
        Whether to skip tokens flagged by `token._.excluded`, by default True

    Returns
    -------
    Tuple[List[int], List[int]]
        An alignment tuple `(original, clean)`, one entry per kept token:
        `original` holds `token.idx`, `clean` holds the start position of
        the token in the pre-processed text.
    """
    assert isinstance(doc, Doc)

    # Resolve the attribute name; unknown names are kept as-is (upper-cased).
    key = attr.upper()
    key = ATTRIBUTES.get(key, key)

    # A leading underscore marks a custom extension, stored lower-cased
    # under `token._`.
    is_extension = key.startswith("_")
    if is_extension:
        key = key[1:].lower()

    original_starts = []
    clean_starts = []

    # Running position in the pre-processed text.
    position = 0

    for token in doc:
        # Excluded tokens contribute nothing to the pre-processed text.
        if ignore_excluded and token._.excluded:
            continue

        original_starts.append(token.idx)
        clean_starts.append(position)

        position += token_length(token, custom=is_extension, attr=key)

        # Trailing whitespace always counts as exactly one character.
        if token.whitespace_:
            position += 1

    return original_starts, clean_starts

offset(doc, attr, ignore_excluded, index)

Compute offset between the original text and a given representation (defined by the couple attr, ignore_excluded).

The alignment itself is computed with alignment.

PARAMETER DESCRIPTION
doc

The spaCy Doc object

TYPE: Doc

attr

The attribute used by the RegexMatcher (eg NORM)

TYPE: str

ignore_excluded

Whether the RegexMatcher ignores excluded tokens.

TYPE: bool

index

The index in the pre-processed text.

TYPE: int

RETURNS DESCRIPTION
int

The offset. To get the character index in the original document, just do: original = index + offset(doc, attr, ignore_excluded, index)

Source code in edsnlp/matchers/utils/offset.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def offset(
    doc: Doc,
    attr: str,
    ignore_excluded: bool,
    index: int,
) -> int:
    """
    Compute offset between the original text and a given representation
    (defined by the couple `attr`, `ignore_excluded`).

    The alignment itself is computed with
    [`alignment`][edsnlp.matchers.utils.offset.alignment].

    Parameters
    ----------
    doc : Doc
        The spaCy `Doc` object
    attr : str
        The attribute used by the [`RegexMatcher`][edsnlp.matchers.regex.RegexMatcher]
        (eg `NORM`)
    ignore_excluded : bool
        Whether the RegexMatcher ignores excluded tokens.
    index : int
        The index in the pre-processed text.

    Returns
    -------
    int
        The offset. To get the character index in the original document,
        just do: `#!python original = index + offset(doc, attr, ignore_excluded, index)`
        When the alignment is empty (no token in the document, or every
        token excluded), the offset is `0`.
    """
    original, clean = alignment(
        doc=doc,
        attr=attr,
        ignore_excluded=ignore_excluded,
    )

    # Without any kept token there is nothing to align against. Return a
    # neutral offset instead of failing on `original[-1]` below.
    if not original:
        return 0

    # `bisect_left` gives the position of the first token whose start in the
    # pre-processed text is greater than or equal to `index`: an index equal
    # to a token start selects that token, any other index selects the next
    # token.
    i = bisect_left(clean, index)

    # Indices past the last token start fall back to the last token.
    i = min(i, len(original) - 1)

    return original[i] - clean[i]