Skip to content

edsnlp.matchers.utils.text

get_text(doclike, attr, ignore_excluded)

Get text using a custom attribute, possibly ignoring excluded tokens.

PARAMETER DESCRIPTION
doclike

Doc or Span to get text from.

TYPE: Union[Doc, Span]

attr

Attribute to use.

TYPE: str

ignore_excluded

Whether to skip excluded tokens, i.e. tokens whose tag is "EXCLUDED". This parameter has no default value and must be provided.

TYPE: bool

RETURNS DESCRIPTION
str

Extracted text.

Source code in edsnlp/matchers/utils/text.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
@lru_cache(32)
def get_text(
    doclike: Union[Doc, Span],
    attr: str,
    ignore_excluded: bool,
) -> str:
    """
    Get text using a custom attribute, possibly ignoring excluded tokens.

    Results are memoised with ``lru_cache``, so every argument must be
    hashable.

    Parameters
    ----------
    doclike : Union[Doc, Span]
        Doc or Span to get text from.
    attr : str
        Attribute to use. The name is case-insensitive: it is upper-cased
        and then looked up in ``ATTRIBUTES``; names missing from that mapping
        are used as-is. A resolved name starting with ``_`` is read from the
        token's ``._`` extension namespace (lower-cased, without the leading
        underscore); any other name is read from the token itself.
    ignore_excluded : bool
        Whether to skip excluded tokens, i.e. tokens whose ``tag_`` is
        ``"EXCLUDED"``.

    Returns
    -------
    str
        Extracted text. Consecutive tokens are joined with each token's own
        trailing whitespace; the whitespace following the last token is not
        appended. An empty string is returned when no token is left.
    """

    attr = attr.upper()

    if not ignore_excluded:
        # Fast paths: nothing to filter, so the raw text can be used directly.
        if attr == "TEXT":
            return doclike.text
        elif attr == "LOWER":
            return doclike.text.lower()
        else:
            tokens = doclike
    else:
        tokens = [t for t in doclike if t.tag_ != "EXCLUDED"]

    if not tokens:
        return ""

    attr = ATTRIBUTES.get(attr, attr)

    if attr.startswith("_"):
        extension_name = attr[1:].lower()

        def read(token) -> str:
            # Custom attributes live in the `._` extension namespace.
            return getattr(token._, extension_name)

    else:

        def read(token) -> str:
            return getattr(token, attr)

    # The same accessor is used for the last token as for all the others:
    # previously the last token of a custom attribute was read from the token
    # itself rather than from `token._`.
    leading = [read(t) + t.whitespace_ for t in tokens[:-1]]
    return "".join(leading) + read(tokens[-1])