Skip to content

edsnlp.connectors.labeltool

docs2labeltool(docs, extensions=None)

Returns a labeltool-ready dataframe from a list of annotated documents.

PARAMETER DESCRIPTION
docs

List of annotated spaCy docs.

TYPE: List[Doc]

extensions

List of entity extension names to export as additional columns for labeltool.

TYPE: Optional[List[str]] DEFAULT: None

RETURNS DESCRIPTION
df

DataFrame tailored for labeltool.

Source code in edsnlp/connectors/labeltool.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def docs2labeltool(
    docs: List[Doc],
    extensions: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Return a labeltool-ready dataframe from a list of annotated documents.

    One row is produced for every entity in ``doc.ents``. Each row carries
    the full text of the document, the character offsets of the entity, its
    label and its text, plus one column per requested extension.

    Parameters
    ----------
    docs: list of spaCy Doc
        List of annotated spaCy docs.
    extensions: list of str, optional
        Names of entity extensions (read from ``ent._``) to export as
        additional columns. Defaults to no extra column.

    Returns
    -------
    df: pd.DataFrame
        DataFrame tailored for labeltool, with the columns ``note_id``,
        ``note_text``, ``offset_begin``, ``offset_end``, ``label_name`` and
        ``label_value``, followed by the requested extensions. When no
        entity is found, an empty DataFrame with the same columns is
        returned.
    """

    # Materialise as a list so that it can be concatenated with `columns`
    # below, whatever sequence type the caller passed.
    extensions = [] if extensions is None else list(extensions)

    columns = [
        "note_id",
        "note_text",
        "offset_begin",
        "offset_end",
        "label_name",
        "label_value",
    ]

    entities = []

    for i, doc in enumerate(tqdm(docs, ascii=True, ncols=100)):
        for ent in doc.ents:
            # Fall back to the position of the document only when no
            # identifier is set. A plain truthiness test would also discard
            # a legitimate identifier equal to 0.
            note_id = doc._.note_id
            if note_id is None or note_id == "":
                note_id = i

            d = dict(
                note_id=note_id,
                note_text=doc.text,
                offset_begin=ent.start_char,
                offset_end=ent.end_char,
                label_name=ent.label_,
                label_value=ent.text,
            )

            for ext in extensions:
                d[ext] = getattr(ent._, ext)

            entities.append(d)

    if not entities:
        # `from_records([])` yields a frame without any column, so the
        # column selection below would raise a KeyError. Return an empty
        # frame with the expected schema instead.
        return pd.DataFrame(columns=columns + extensions)

    df = pd.DataFrame.from_records(entities)

    # Enforce a stable column order, extensions last.
    df = df[columns + extensions]

    return df
Back to top