Skip to content

edsnlp.connectors.brat

BratConnector

Bases: object

Two-way connector with BRAT. Supports entities only.

PARAMETER DESCRIPTION
directory

Directory containing the BRAT files.

TYPE: str

n_jobs

Number of jobs for multiprocessing, by default 1

TYPE: int, optional

Source code in edsnlp/connectors/brat.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
class BratConnector(object):
    """
    Two-way connector with BRAT. Supports entities only.

    Parameters
    ----------
    directory : str
        Directory containing the BRAT files.
    n_jobs : int, optional
        Number of jobs for multiprocessing, by default 1
    """

    def __init__(self, directory: str, n_jobs: int = 1):
        self.directory = directory
        self.n_jobs = n_jobs

        # Create the directory up front so that writing documents never fails
        # on a missing folder.
        os.makedirs(directory, exist_ok=True)

    def full_path(self, filename: str) -> str:
        """
        Returns the path of a file located in the BRAT directory.

        Parameters
        ----------
        filename:
            Name of the file, relative to the BRAT directory.

        Returns
        -------
        path:
            `filename` joined to the BRAT directory.
        """
        return os.path.join(self.directory, filename)

    def read_file(self, filename: str) -> str:
        """
        Reads a file within the BRAT directory.

        Parameters
        ----------
        filename:
            The path to the file within the BRAT directory.

        Returns
        -------
        text:
            The text content of the file.
        """
        with open(self.full_path(filename), "r", encoding="utf-8") as f:
            return f.read()

    def read_texts(self) -> pd.DataFrame:
        """
        Reads all texts from the BRAT folder.

        Every file of the directory whose name ends with `.txt` is read; the
        file name without its extension is used as `note_id`. Notes are sorted
        by `note_id` so that the result does not depend on the order in which
        the file system lists the directory.

        Returns
        -------
        texts:
            DataFrame containing all texts in the BRAT directory, with two
            columns: `note_id` and `note_text`.

        Raises
        ------
        AssertionError
            If the directory contains no `.txt` file.
        """
        suffix = ".txt"
        filenames = sorted(
            f[: -len(suffix)] for f in os.listdir(self.directory) if f.endswith(suffix)
        )

        # Raised explicitly rather than through `assert`, so that the check is
        # not stripped when Python runs with -O. The exception type is kept.
        if not filenames:
            raise AssertionError(f"BRAT directory {self.directory} is empty!")

        logger.info(
            f"The BRAT directory contains {len(filenames)} annotated documents."
        )

        texts = pd.DataFrame(dict(note_id=filenames))

        with tqdm(
            texts.note_id, ascii=True, ncols=100, desc="Text extraction"
        ) as iterator:
            texts["note_text"] = [
                self.read_file(note_id + ".txt") for note_id in iterator
            ]

        return texts

    def read_brat_annotation(self, note_id: Union[str, int]) -> pd.DataFrame:
        """
        Reads BRAT annotation inside the BRAT directory.

        Parameters
        ----------
        note_id:
            Note ID within the BRAT directory.

        Returns
        -------
        annotations:
            DataFrame containing the annotations for the given note.
        """
        filename = f"{note_id}.ann"
        # Resolves to the module-level `read_brat_annotation` function, not to
        # this method.
        annotations = read_brat_annotation(self.full_path(filename))
        return annotations

    def read_annotations(self, texts: pd.DataFrame) -> pd.DataFrame:
        """
        Reads the annotations of every note listed in `texts`.

        The `.ann` files are read with `n_jobs` parallel jobs.

        Parameters
        ----------
        texts:
            DataFrame with a `note_id` column, as returned by `read_texts`.

        Returns
        -------
        annotations:
            DataFrame containing the annotations of all notes, with a
            `note_id` column identifying the note of each annotation.
        """
        with tqdm(
            texts.note_id, ascii=True, ncols=100, desc="Annotation extraction"
        ) as iterator:
            dfs = Parallel(n_jobs=self.n_jobs)(
                delayed(self.read_brat_annotation)(note_id) for note_id in iterator
            )

        # The note ids become the outer index level; the inner level (the
        # per-file row number) is dropped before turning the ids into a column.
        annotations = pd.concat(dfs, keys=texts.note_id, names=["note_id"])

        annotations = annotations.droplevel(1).reset_index()

        return annotations

    def get_brat(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Reads texts and annotations, and returns two DataFrame objects.

        Returns
        -------
        texts:
            A DataFrame containing two fields, `note_id` and `note_text`
        annotations:
            A DataFrame containing the annotations.
        """

        texts = self.read_texts()
        annotations = self.read_annotations(texts)

        return texts, annotations

    def brat2docs(self, nlp: Language) -> List[Doc]:
        """
        Transforms a BRAT folder to a list of spaCy documents.

        Annotations whose character offsets cannot be turned into a span of the
        document are skipped with a warning.

        Parameters
        ----------
        nlp:
            A spaCy pipeline.

        Returns
        -------
        docs:
            List of spaCy documents, with annotations in the `ents` attribute.
        """
        texts, annotations = self.get_brat()

        # Split the annotations per note once, instead of scanning the whole
        # DataFrame again for every document.
        annotations_by_note = {
            note_id: group
            for note_id, group in annotations.groupby("note_id", sort=False)
        }

        docs = []

        with tqdm(
            zip(
                texts.note_id,
                nlp.pipe(texts.note_text, batch_size=50, n_process=self.n_jobs),
            ),
            ascii=True,
            ncols=100,
            desc="spaCy conversion",
            total=len(texts),
        ) as iterator:
            for note_id, doc in iterator:

                # NOTE(review): relies on a `note_id` extension being
                # registered on `Doc` elsewhere - confirm.
                doc._.note_id = note_id

                spans = []

                # Notes without any annotation have no group.
                ann = annotations_by_note.get(note_id)

                if ann is not None:
                    for _, row in ann.iterrows():
                        span = doc.char_span(
                            row.start,
                            row.end,
                            label=row.label,
                            alignment_mode="expand",
                        )

                        # `char_span` returns None when the offsets do not map
                        # to a valid span; `filter_spans` cannot handle None.
                        if span is None:
                            logger.warning(
                                f"Note {note_id}: skipping annotation "
                                f"{row.label} [{row.start}, {row.end}], "
                                "which does not match the text."
                            )
                            continue

                        spans.append(span)

                doc.ents = filter_spans(spans)

                docs.append(doc)

        return docs

    def doc2brat(self, doc: Doc) -> None:
        """
        Writes a spaCy document to file in the BRAT directory.

        Two files are written: `note_id.txt` with the text of the document, and
        `note_id.ann` with one line per entity (empty if there is no entity).

        Parameters
        ----------
        doc:
            spaCy Doc object. The spans in `ents` will populate the `note_id.ann` file.
        """
        filename = str(doc._.note_id)

        with open(self.full_path(f"{filename}.txt"), "w", encoding="utf-8") as f:
            f.write(doc.text)

        # The lines are formatted by hand rather than through a CSV writer: CSV
        # quoting would wrap (and double) any quote character found in the
        # entity text, which corrupts the annotation line.
        # Line breaks inside the entity text are replaced by spaces so that
        # each annotation stays on a single line.
        line_breaks = str.maketrans("\r\n", "  ")

        lines = [
            f"T{i}\t{ent.label_} {ent.start_char} {ent.end_char}"
            f"\t{ent.text.translate(line_breaks)}\n"
            for i, ent in enumerate(doc.ents, start=1)
        ]

        with open(self.full_path(f"{filename}.ann"), "w", encoding="utf-8") as f:
            f.writelines(lines)

    def docs2brat(self, docs: List[Doc]) -> None:
        """
        Writes a list of spaCy documents to file.

        Parameters
        ----------
        docs:
            List of spaCy documents.
        """
        for doc in docs:
            self.doc2brat(doc)

directory = directory instance-attribute

n_jobs = n_jobs instance-attribute

__init__(directory, n_jobs=1)

Source code in edsnlp/connectors/brat.py
66
67
68
69
70
def __init__(self, directory: str, n_jobs: int = 1):
    """
    Stores the connector settings and makes sure the BRAT directory exists.

    Parameters
    ----------
    directory:
        Directory containing the BRAT files. It is created if missing.
    n_jobs:
        Number of jobs for multiprocessing, by default 1
    """
    self.directory, self.n_jobs = directory, n_jobs

    # `exist_ok=True`: an already existing directory is not an error.
    os.makedirs(self.directory, exist_ok=True)

full_path(filename)

Source code in edsnlp/connectors/brat.py
72
73
def full_path(self, filename: str) -> str:
    """
    Returns the path of a file located in the BRAT directory.

    Parameters
    ----------
    filename:
        Name of the file, relative to the BRAT directory.

    Returns
    -------
    path:
        `filename` joined to the BRAT directory.
    """
    brat_directory = self.directory
    return os.path.join(brat_directory, filename)

read_file(filename)

Reads a file within the BRAT directory.

PARAMETER DESCRIPTION
filename

The path to the file within the BRAT directory.

TYPE: str

RETURNS DESCRIPTION
text

The text content of the file.

Source code in edsnlp/connectors/brat.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def read_file(self, filename: str) -> str:
    """
    Loads the content of a file stored in the BRAT directory.

    The file is decoded as UTF-8.

    Parameters
    ----------
    filename:
        Path of the file, relative to the BRAT directory.

    Returns
    -------
    text:
        The whole content of the file, as a string.
    """
    path = self.full_path(filename)

    with open(path, mode="r", encoding="utf-8") as handle:
        content = handle.read()

    return content

read_texts()

Reads all texts from the BRAT folder.

RETURNS DESCRIPTION
texts

DataFrame containing all texts in the BRAT directory.

Source code in edsnlp/connectors/brat.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def read_texts(self) -> pd.DataFrame:
    """
    Reads all texts from the BRAT folder.

    Every file of the directory whose name ends with `.txt` is read; the file
    name without its extension is used as `note_id`. Notes are sorted by
    `note_id` so that the result does not depend on the order in which the
    file system lists the directory.

    Returns
    -------
    texts:
        DataFrame containing all texts in the BRAT directory, with two
        columns: `note_id` and `note_text`.

    Raises
    ------
    AssertionError
        If the directory contains no `.txt` file.
    """
    suffix = ".txt"
    filenames = sorted(
        f[: -len(suffix)] for f in os.listdir(self.directory) if f.endswith(suffix)
    )

    # Raised explicitly rather than through `assert`, so that the check is not
    # stripped when Python runs with -O. The exception type is kept.
    if not filenames:
        raise AssertionError(f"BRAT directory {self.directory} is empty!")

    logger.info(
        f"The BRAT directory contains {len(filenames)} annotated documents."
    )

    texts = pd.DataFrame(dict(note_id=filenames))

    with tqdm(
        texts.note_id, ascii=True, ncols=100, desc="Text extraction"
    ) as iterator:
        texts["note_text"] = [
            self.read_file(note_id + ".txt") for note_id in iterator
        ]

    return texts

read_brat_annotation(note_id)

Reads BRAT annotation inside the BRAT directory.

PARAMETER DESCRIPTION
note_id

Note ID within the BRAT directory.

TYPE: Union[str, int]

RETURNS DESCRIPTION
annotations

DataFrame containing the annotations for the given note.

Source code in edsnlp/connectors/brat.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def read_brat_annotation(self, note_id: Union[str, int]) -> pd.DataFrame:
    """
    Loads the annotations of a single note of the BRAT directory.

    The annotations are read from the `<note_id>.ann` file.

    Parameters
    ----------
    note_id:
        Identifier of the note within the BRAT directory.

    Returns
    -------
    annotations:
        DataFrame containing the annotations for the given note.
    """
    annotation_path = self.full_path(f"{note_id}.ann")

    # Resolves to the module-level `read_brat_annotation` function.
    return read_brat_annotation(annotation_path)

read_annotations(texts)

Source code in edsnlp/connectors/brat.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def read_annotations(self, texts: pd.DataFrame) -> pd.DataFrame:
    """
    Reads the annotations of every note listed in `texts`.

    The `.ann` files are read with `n_jobs` parallel jobs.

    Parameters
    ----------
    texts:
        DataFrame with a `note_id` column, as returned by `read_texts`.

    Returns
    -------
    annotations:
        DataFrame containing the annotations of all notes, with a `note_id`
        column identifying the note of each annotation.
    """
    with tqdm(
        texts.note_id, ascii=True, ncols=100, desc="Annotation extraction"
    ) as iterator:
        dfs = Parallel(n_jobs=self.n_jobs)(
            delayed(self.read_brat_annotation)(note_id) for note_id in iterator
        )

    # The note ids become the outer index level; the inner level (the per-file
    # row number) is dropped before turning the ids into a column.
    annotations = pd.concat(dfs, keys=texts.note_id, names=["note_id"])

    annotations = annotations.droplevel(1).reset_index()

    return annotations

get_brat()

Reads texts and annotations, and returns two DataFrame objects.

RETURNS DESCRIPTION
texts

A DataFrame containing two fields, note_id and note_text

annotations

A DataFrame containing the annotations.

Source code in edsnlp/connectors/brat.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def get_brat(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Loads the content of the BRAT directory as two DataFrame objects.

    The texts are read first; the annotations are then read for the notes
    found.

    Returns
    -------
    texts:
        A DataFrame containing two fields, `note_id` and `note_text`
    annotations:
        A DataFrame containing the annotations.
    """
    notes = self.read_texts()
    return notes, self.read_annotations(notes)

brat2docs(nlp)

Transforms a BRAT folder to a list of spaCy documents.

PARAMETER DESCRIPTION
nlp

A spaCy pipeline.

TYPE: Language

RETURNS DESCRIPTION
docs

List of spaCy documents, with annotations in the ents attribute.

Source code in edsnlp/connectors/brat.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def brat2docs(self, nlp: Language) -> List[Doc]:
    """
    Transforms a BRAT folder to a list of spaCy documents.

    Annotations whose character offsets cannot be turned into a span of the
    document are skipped with a warning.

    Parameters
    ----------
    nlp:
        A spaCy pipeline.

    Returns
    -------
    docs:
        List of spaCy documents, with annotations in the `ents` attribute.
    """
    texts, annotations = self.get_brat()

    # Split the annotations per note once, instead of scanning the whole
    # DataFrame again for every document.
    annotations_by_note = {
        note_id: group
        for note_id, group in annotations.groupby("note_id", sort=False)
    }

    docs = []

    with tqdm(
        zip(
            texts.note_id,
            nlp.pipe(texts.note_text, batch_size=50, n_process=self.n_jobs),
        ),
        ascii=True,
        ncols=100,
        desc="spaCy conversion",
        total=len(texts),
    ) as iterator:
        for note_id, doc in iterator:

            # NOTE(review): relies on a `note_id` extension being registered
            # on `Doc` elsewhere - confirm.
            doc._.note_id = note_id

            spans = []

            # Notes without any annotation have no group.
            ann = annotations_by_note.get(note_id)

            if ann is not None:
                for _, row in ann.iterrows():
                    span = doc.char_span(
                        row.start,
                        row.end,
                        label=row.label,
                        alignment_mode="expand",
                    )

                    # `char_span` returns None when the offsets do not map to
                    # a valid span; `filter_spans` cannot handle None.
                    if span is None:
                        logger.warning(
                            f"Note {note_id}: skipping annotation "
                            f"{row.label} [{row.start}, {row.end}], "
                            "which does not match the text."
                        )
                        continue

                    spans.append(span)

            doc.ents = filter_spans(spans)

            docs.append(doc)

    return docs

doc2brat(doc)

Writes a spaCy document to file in the BRAT directory.

PARAMETER DESCRIPTION
doc

spaCy Doc object. The spans in ents will populate the note_id.ann file.

TYPE: Doc

Source code in edsnlp/connectors/brat.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
def doc2brat(self, doc: Doc) -> None:
    """
    Writes a spaCy document to file in the BRAT directory.

    Two files are written: `note_id.txt` with the text of the document, and
    `note_id.ann` with one line per entity (empty if there is no entity).

    Parameters
    ----------
    doc:
        spaCy Doc object. The spans in `ents` will populate the `note_id.ann` file.
    """
    filename = str(doc._.note_id)

    with open(self.full_path(f"{filename}.txt"), "w", encoding="utf-8") as f:
        f.write(doc.text)

    # The lines are formatted by hand rather than through a CSV writer: CSV
    # quoting would wrap (and double) any quote character found in the entity
    # text, which corrupts the annotation line.
    # Line breaks inside the entity text are replaced by spaces so that each
    # annotation stays on a single line.
    line_breaks = str.maketrans("\r\n", "  ")

    lines = [
        f"T{i}\t{ent.label_} {ent.start_char} {ent.end_char}"
        f"\t{ent.text.translate(line_breaks)}\n"
        for i, ent in enumerate(doc.ents, start=1)
    ]

    with open(self.full_path(f"{filename}.ann"), "w", encoding="utf-8") as f:
        f.writelines(lines)

docs2brat(docs)

Writes a list of spaCy documents to file.

PARAMETER DESCRIPTION
docs

List of spaCy documents.

TYPE: List[Doc]

Source code in edsnlp/connectors/brat.py
275
276
277
278
279
280
281
282
283
284
285
def docs2brat(self, docs: List[Doc]) -> None:
    """
    Exports several spaCy documents to the BRAT directory.

    Each document is written in turn with `doc2brat`, in the order given.

    Parameters
    ----------
    docs:
        List of spaCy documents.
    """
    for document in docs:
        self.doc2brat(document)

read_brat_annotation(filename)

Reads a BRAT annotation file and returns a pandas DataFrame.

PARAMETER DESCRIPTION
filename

Path to the annotation file.

TYPE: str

RETURNS DESCRIPTION
annotations

DataFrame containing the annotations.

Source code in edsnlp/connectors/brat.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def read_brat_annotation(filename: str) -> pd.DataFrame:
    """
    Reads a BRAT annotation file and returns a pandas DataFrame.

    Only text-bound annotations (lines whose identifier starts with `T`) are
    kept. Blank lines and other kinds of annotation lines are skipped, since
    their second field does not end with character offsets.

    Parameters
    ----------
    filename:
        Path to the annotation file.

    Returns
    -------
    annotations:
        DataFrame containing the annotations, with columns `index` (the BRAT
        identifier), `label`, `start`, `end` and `lexical_variant`. For an
        annotation made of several fragments, `start` is the start of the
        first fragment and `end` the end of the last one.
    """

    lines = []

    # The encoding is explicit: annotation files are written as UTF-8 by
    # `BratConnector.doc2brat`, whatever the default encoding of the platform.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t", 2)

            if len(fields) < 2 or not fields[0].startswith("T"):
                continue

            # Pad a missing text column so that every row has three fields.
            fields.extend([None] * (3 - len(fields)))
            lines.append(tuple(fields))

    if not lines:
        return pd.DataFrame(
            columns=["index", "start", "end", "label", "lexical_variant"]
        )

    annotations = pd.DataFrame(lines, columns=["index", "annot", "lexical_variant"])

    # `annot` is "LABEL START END" or, with several fragments,
    # "LABEL START END;START END": the last token is always the final end.
    annotations["end"] = annotations.annot.str.split().str[-1]

    # Keep the first fragment only to extract the label and the first start.
    annotations["annot"] = annotations.annot.str.split(";").str[0]

    annotations["label"] = annotations.annot.str.split().str[:-2].str.join(" ")
    annotations["start"] = annotations.annot.str.split().str[-2]

    annotations["start"] = annotations["start"].astype(int)
    annotations["end"] = annotations["end"].astype(int)

    annotations = annotations.drop(columns=["annot"])

    return annotations
Back to top