Skip to content

edsnlp.viz.quick_examples

QuickExample

Source code in edsnlp/viz/quick_examples.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class QuickExample:
    """
    Callable helper that shows the entities found in a text.

    Calling the instance on a string (or on an already processed document)
    prints the text with entity tokens in bold, followed by a table listing
    each span, the group it comes from, its qualifiers and any extra
    attribute requested through `extensions`.

    Parameters
    ----------
    nlp : Language
        Pipeline applied to raw strings.
    extensions : List[str], optional
        Extra span attributes to report (resolved with `rgetattr`),
        by default no extra attribute
    """

    def __init__(self, nlp: Language, extensions: Optional[List[str]] = None):
        self.nlp = nlp
        # Used below as a mapping: column name -> attribute path on the span
        self.qualifiers = get_qualifier_extensions(nlp)
        # A fresh list per instance: a `[]` default would be shared (and
        # mutated) across every QuickExample created without `extensions`.
        self.extensions = list(extensions) if extensions is not None else []

    def __call__(
        self, object: Union[str, Doc], as_dataframe: bool = False
    ) -> Optional[pd.DataFrame]:
        """
        Displays the text and a table of entities

        Parameters
        ----------
        object : Union[str, Doc]
            Either a raw text, which is processed with `self.nlp`, or a
            document that is used as is
        as_dataframe : bool, optional
            If true, returns the table as a DataFrame instead of displaying it,
            by default False

        Returns
        -------
        Optional[pd.DataFrame]
            The DataFrame describing the document

        Raises
        ------
        TypeError
            If `object` is neither a string nor a Doc
        """
        if isinstance(object, str):
            self.txt = object
            self.doc = self.nlp(object)
        elif isinstance(object, Doc):
            self.txt = object.text
            self.doc = object
        else:
            # Falling through would silently reuse the document of a previous
            # call (or fail later with an unrelated AttributeError).
            raise TypeError(
                f"Expected a str or a Doc, got {type(object).__name__}"
            )
        self.get_ents()
        self.get_ents_interval()
        self.get_text()
        return self.display(as_dataframe=as_dataframe)

    def get_ents(self):
        """
        Collects every span of the document into `self.ents`

        Spans of each non-empty span group are listed first, under the name
        of their group. Spans of `doc.ents` follow under the `"ents"` key,
        except those already listed through a span group.
        """

        # The "ents" key is reserved for `doc.ents`, so a span group with
        # that name is not read (the original code overwrote it as well).
        span_groups = {
            key: list(group)
            for key, group in self.doc.spans.items()
            if group and key != "ents"
        }

        keyed_spans = [
            (key, span) for key, group in span_groups.items() for span in group
        ]

        # De-duplicate without mutating a list that is being iterated:
        # removing from `doc.ents` while looping over it skipped entities.
        already_listed = [span for _, span in keyed_spans]
        keyed_spans.extend(
            ("ents", span) for span in self.doc.ents if span not in already_listed
        )

        ents = []

        for key, span in keyed_spans:
            ent = dict(
                key=key,
                start=span.start,
                end=span.end,
                text=get_text(span, attr="TEXT", ignore_excluded=False),
            )
            for name, extension in self.qualifiers.items():
                ent[name] = rgetattr(span, extension)
            for extension in self.extensions:
                ent[extension] = rgetattr(span, extension)
            ents.append(ent)

        self.ents = ents

    def get_ents_interval(self):
        """
        From the list of all entities, computes the sorted, non-overlapping
        token intervals they cover and stores them in `self.intervals`

        Overlapping spans are merged into their union. Spans that merely
        touch (one ends where the next starts) stay separate.
        """

        intervals = []
        # Sorted by start, so a span can only overlap the last merged interval
        for start, end in sorted((ent["start"], ent["end"]) for ent in self.ents):
            if intervals and start < intervals[-1][1]:
                last_start, last_end = intervals[-1]
                intervals[-1] = (last_start, max(last_end, end))
            else:
                intervals.append((start, end))

        self.intervals = intervals

    def is_ent(self, tok: Token) -> bool:
        """
        Check if the provided Token is part of an entity

        Parameters
        ----------
        tok : Token
            A spaCy Token

        Returns
        -------
        bool
            True if `tok` is part of an entity
        """
        return any(start <= tok.i < end for start, end in self.intervals)

    def get_text(self) -> None:
        """
        Builds `self.text`: the document text where every token that is part
        of an entity (trailing whitespace included) is wrapped in bold tags
        """
        text = []
        for tok in self.doc:
            raw_tok_text = tok.text + tok.whitespace_
            tok_text = (
                f"[bold]{raw_tok_text}[not bold]" if self.is_ent(tok) else raw_tok_text
            )
            text.append(tok_text)
        self.text = "".join(text)

    def display(self, as_dataframe: bool = False) -> Optional[pd.DataFrame]:
        """
        Displays the text and a table of entities

        Parameters
        ----------
        as_dataframe : bool, optional
            If true, returns the table as a DataFrame instead of displaying it,
            by default False

        Returns
        -------
        Optional[pd.DataFrame]
            The DataFrame describing the document
        """

        console = Console()

        qualifier_names = list(self.qualifiers.keys())
        headers = ["Entity", "Source"] + qualifier_names + self.extensions

        if as_dataframe:
            df = pd.DataFrame(
                [
                    [
                        ent["text"],
                        ent["key"],
                        *(ent[q] for q in qualifier_names),
                        *(ent[e] for e in self.extensions),
                    ]
                    for ent in self.ents
                ],
                columns=headers,
            )

            # Only the highlighted text is printed; the table is returned
            console.print(self.text)
            return df

        table = Table(title=self.text + "\n")

        for header in headers:
            table.add_column(header)

        for ent in self.ents:
            table.add_row(
                ent["text"],
                ent["key"],
                # Truthy qualifiers are shown in green, falsy ones in red
                *(
                    ("[green]" if ent[q] else "[red]") + str(ent[q])
                    for q in qualifier_names
                ),
                *(str(ent[extension]) for extension in self.extensions),
            )

        console.print(table)
        return None

__call__(object, as_dataframe=False)

Displays the text and a table of entities

PARAMETER DESCRIPTION
as_dataframe

If true, returns the table as a DataFrame instead of displaying it, by default False

TYPE: bool, optional DEFAULT: False

RETURNS DESCRIPTION
Optional[pd.DataFrame]

The DataFrame describing the document

Source code in edsnlp/viz/quick_examples.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def __call__(
    self, object: Union[str, Doc], as_dataframe: bool = False
) -> Optional[pd.DataFrame]:
    """
    Displays the text and a table of entities

    Parameters
    ----------
    object : Union[str, Doc]
        Either a raw text, which is processed with `self.nlp`, or a
        document that is used as is
    as_dataframe : bool, optional
        If true, returns the table as a DataFrame instead of displaying it,
        by default False

    Returns
    -------
    Optional[pd.DataFrame]
        The DataFrame describing the document

    Raises
    ------
    TypeError
        If `object` is neither a string nor a Doc
    """
    if isinstance(object, str):
        self.txt = object
        self.doc = self.nlp(object)
    elif isinstance(object, Doc):
        self.txt = object.text
        self.doc = object
    else:
        # Falling through would silently reuse the document of a previous
        # call (or fail later with an unrelated AttributeError).
        raise TypeError(f"Expected a str or a Doc, got {type(object).__name__}")
    # Order matters: intervals are derived from the entities, and the
    # highlighted text is derived from the intervals.
    self.get_ents()
    self.get_ents_interval()
    self.get_text()
    return self.display(as_dataframe=as_dataframe)

get_ents_interval()

From the list of all entities, computes the token intervals they cover, merging overlapping spans into a single interval

Source code in edsnlp/viz/quick_examples.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def get_ents_interval(self):
    """
    From the list of all entities, computes the sorted, non-overlapping
    token intervals they cover and stores them in `self.intervals`

    Overlapping spans are merged into their union. Spans that merely touch
    (one ends where the next starts) stay separate.
    """

    intervals = []
    # Sorted by start, so a span can only overlap the last merged interval.
    # The previous bisect-based version rebuilt the merged interval as
    # (existing start, new end), which could shrink the covered range.
    for start, end in sorted((ent["start"], ent["end"]) for ent in self.ents):
        if intervals and start < intervals[-1][1]:
            last_start, last_end = intervals[-1]
            intervals[-1] = (last_start, max(last_end, end))
        else:
            intervals.append((start, end))

    self.intervals = intervals

is_ent(tok)

Check if the provided Token is part of an entity

PARAMETER DESCRIPTION
tok

A spaCy Token

TYPE: Token

RETURNS DESCRIPTION
bool

True if tok is part of an entity

Source code in edsnlp/viz/quick_examples.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def is_ent(self, tok: Token) -> bool:
    """
    Tell whether a token falls inside one of the stored entity intervals

    Parameters
    ----------
    tok : Token
        A spaCy Token

    Returns
    -------
    bool
        True if the index of `tok` lies in a half-open interval
        `[start, end)` of `self.intervals`, False otherwise
    """
    return any(bounds[0] <= tok.i < bounds[1] for bounds in self.intervals)

get_text()

Builds self.text: the document text with bold tags around every token that is part of an entity

Source code in edsnlp/viz/quick_examples.py
117
118
119
120
121
122
123
124
125
126
127
128
def get_text(self) -> None:
    """
    Builds `self.text`: the document text where every token that is part of
    an entity (trailing whitespace included) is wrapped in bold tags
    """

    def render(token):
        # A token is rendered together with the whitespace that follows it
        chunk = token.text + token.whitespace_
        if self.is_ent(token):
            return f"[bold]{chunk}[not bold]"
        return chunk

    self.text = "".join(render(token) for token in self.doc)

display(as_dataframe=False)

Displays the text and a table of entities

PARAMETER DESCRIPTION
as_dataframe

If true, returns the table as a DataFrame instead of displaying it, by default False

TYPE: bool, optional DEFAULT: False

RETURNS DESCRIPTION
Optional[pd.DataFrame]

The DataFrame describing the document

Source code in edsnlp/viz/quick_examples.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def display(self, as_dataframe: bool = False) -> Optional[pd.DataFrame]:
    """
    Prints the highlighted text, with the entities either rendered as a
    table or handed back as a DataFrame

    Parameters
    ----------
    as_dataframe : bool, optional
        If true, returns the table as a DataFrame instead of displaying it,
        by default False

    Returns
    -------
    Optional[pd.DataFrame]
        The DataFrame describing the document
    """

    console = Console()
    table = Table(title=self.text + "\n")

    qualifier_names = list(self.qualifiers.keys())
    headers = ["Entity", "Source"] + qualifier_names + self.extensions

    if as_dataframe:
        # One row per entity, raw (non-stringified) values
        rows = []
        for ent in self.ents:
            row = [ent["text"], ent["key"]]
            row.extend(ent[name] for name in qualifier_names)
            row.extend(ent[name] for name in self.extensions)
            rows.append(row)

        df = pd.DataFrame(rows, columns=headers)

        console.print(self.text)
        return df

    for header in headers:
        table.add_column(header)

    for ent in self.ents:
        # Truthy qualifiers are shown in green, falsy ones in red
        qualifier_cells = []
        for name in self.qualifiers:
            value = ent[name]
            colour = "[green]" if value else "[red]"
            qualifier_cells.append(colour + str(value))

        extension_cells = [str(ent[name]) for name in self.extensions]

        table.add_row(ent["text"], ent["key"], *qualifier_cells, *extension_cells)

    console.print(table)