Skip to content

edsnlp.pipelines.trainable.nested_ner

msg = Printer() module-attribute

NUM_INITIALIZATION_EXAMPLES = 1000 module-attribute

nested_ner_default_config = '\n[model]\n @architectures = "eds.stack_crf_ner_model.v1"\n mode = "joint"\n\n [model.tok2vec]\n @architectures = "spacy.Tok2Vec.v1"\n\n [model.tok2vec.embed]\n @architectures = "spacy.MultiHashEmbed.v1"\n width = 96\n rows = [5000, 2000, 1000, 1000]\n attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]\n include_static_vectors = false\n\n [model.tok2vec.encode]\n @architectures = "spacy.MaxoutWindowEncoder.v1"\n width = ${model.tok2vec.embed.width}\n window_size = 1\n maxout_pieces = 3\n depth = 4\n\n[scorer]\n @scorers = "eds.nested_ner_scorer.v1"\n' module-attribute

NESTED_NER_DEFAULTS = Config().from_str(nested_ner_default_config) module-attribute

np_ops = NumpyOps() module-attribute

TrainableNer

Bases: TrainablePipe

Source code in edsnlp/pipelines/trainable/nested_ner.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
class TrainableNer(TrainablePipe):
    """
    Trainable named entity recognizer supporting nested or overlapping entities.

    Predicted spans are written to `doc.ents` (filtered by `ent_labels`) and to
    `doc.spans[name]` (filtered by `spans_labels[name]`).
    """

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "nested_ner",
        ent_labels: Iterable[str] = (),
        spans_labels: Mapping[str, Iterable[str]] = None,
        scorer: Optional[Callable] = None,
    ) -> None:
        """
        Initialize a general named entity recognizer (with or without nested or
        overlapping entities).

        Parameters
        ----------
        vocab: Vocab
            Spacy vocabulary
        model: Model
            The model to extract the spans
        name: str
            Name of the component
        ent_labels: Iterable[str]
            list of labels to filter entities for in `doc.ents`
        spans_labels: Mapping[str, Iterable[str]]
            Mapping from span group names to list of labels to look for entities
            and assign the predicted entities
        scorer: Optional[Callable]
            Method to call to score predictions
        """

        super().__init__(vocab, model, name)

        # Materialize the label iterables exactly once: they are typed as
        # `Iterable`, so they may be one-shot iterators that would come out
        # empty if consumed a second time when building the label union.
        ent_labels_cfg: Optional[Tuple[str]] = (
            tuple(ent_labels) if ent_labels is not None else None
        )
        spans_labels_cfg: Optional[Dict[str, Tuple[str]]] = (
            {k: tuple(labels) for k, labels in spans_labels.items()}
            if spans_labels is not None
            else None
        )

        self.cfg["ent_labels"] = ent_labels_cfg
        self.cfg["spans_labels"] = spans_labels_cfg
        # Sorted union of every label used for `doc.ents` or any span group
        self.cfg["labels"] = tuple(
            sorted(
                {
                    *(ent_labels_cfg or ()),
                    *(
                        label
                        for group in (spans_labels_cfg or {}).values()
                        for label in group
                    ),
                }
            )
        )

        self.scorer = scorer

    @property
    def labels(self) -> Tuple[str]:
        """Return the labels currently added to the component."""
        return self.cfg["labels"]

    @property
    def spans_labels(self) -> Dict[str, Tuple[str]]:
        """Return the span group to labels filters mapping"""
        return self.cfg["spans_labels"]

    @property
    def ent_labels(self):
        """Return the doc.ents labels filters"""
        return self.cfg["ent_labels"]

    def add_label(self, label: str) -> int:
        """Add a new label to the pipe (unsupported: always raises)."""
        raise Exception("Cannot add a new label to the pipe")

    def predict(self, docs: List[Doc]) -> Dict[str, Ints2d]:
        """
        Apply the pipeline's model to a batch of docs, without modifying them.

        Parameters
        ----------
        docs: List[Doc]

        Returns
        -------
        Dict[str, Ints2d]
            The second element of the model output. `set_annotations` reads its
            `"spans"` entry as rows of (doc_idx, label_idx, begin, end).
        """
        return self.model.predict((docs, None, True))[1]

    def set_annotations(
        self, docs: List[Doc], predictions: Dict[str, Ints2d], **kwargs
    ) -> None:
        """
        Modify a batch of `Doc` objects, using predicted spans.

        Parameters
        ----------
        docs: List[Doc]
            The documents to update
        predictions:
            Spans predictions, as returned by the model's predict method
        """
        docs = list(docs)
        new_doc_spans: List[List[Span]] = [[] for _ in docs]
        for doc_idx, label_idx, begin, end in np_ops.asarray(predictions.get("spans")):
            label = self.labels[label_idx]
            new_doc_spans[doc_idx].append(Span(docs[doc_idx], begin, end, label))

        for doc, new_spans in zip(docs, new_doc_spans):
            # Only add a span to `doc.ents` if its label is in `self.ent_labels`
            doc.ents = filter_spans(
                [s for s in new_spans if s.label_ in self.ent_labels]
            )

            # Only add a span to `doc.spans[name]` if its label is in the matching
            # `self.spans_labels[name]` list
            for name, group_labels in self.spans_labels.items():
                doc.spans[name] = [s for s in new_spans if s.label_ in group_labels]

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        set_annotations: bool = False,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """
        Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to begin_update and get_loss.

        Unlike standard TrainablePipe components, the discrete ops (best selection
        of tags) is performed by the model directly (`begin_update` returns the loss
        and the predictions)

        Parameters
        ----------
        examples: Iterable[Example]
            Batch of examples to learn from
        drop: float = 0.0
            Dropout rate set on the model before the forward pass
        set_annotations: bool
            Whether to update the document with predicted spans
        sgd: Optional[Optimizer]
            Optimizer
        losses: Optional[Dict[str, float]]
            Dict of loss, updated in place

        Returns
        -------
        Dict[str, float]
            Updated losses dict
        """

        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        set_dropout_rate(self.model, drop)
        examples = list(examples)

        # run the model
        docs = [eg.predicted for eg in examples]
        gold = self.examples_to_truth(examples)
        (loss, predictions), backprop = self.model.begin_update(
            (docs, gold, set_annotations)
        )
        loss, gradient = self.get_loss(examples, loss)
        backprop(gradient)
        if sgd is not None:
            self.model.finish_update(sgd)
        if set_annotations:
            self.set_annotations(docs, predictions)

        # Accumulate instead of overwriting, so that a losses dict shared
        # across several batches reports their summed loss.
        losses[self.name] += loss

        # Return the dict (not the scalar), as the signature promises
        return losses

    def get_loss(self, examples: Iterable[Example], loss) -> Tuple[float, float]:
        """
        Convert the loss computed by the model into a Python float, paired with
        a constant gradient `[1]` to feed to the backprop callback.
        """
        return float(loss.item()), self.model.ops.xp.array([1])

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Language = None,
        labels: Optional[List[str]] = None,
    ):
        """
        Initialize the pipe for training, using a representative set
        of data examples.

        1. If no ent_labels are provided, we scrape them from the ents
           of the set of examples.
        2. If no span labels are provided, we scrape them from the spans of the set
           of examples, and filter these labels with the ent_labels that were
           configured before initialization (if any).

        Parameters
        ----------
        get_examples: Callable[[], Iterable[Example]]
            Method to sample some examples
        nlp: spacy.Language
            Unused spacy model
        labels
            Unused list of labels
        """
        sub_batch = list(islice(get_examples(), NUM_INITIALIZATION_EXAMPLES))
        if self.ent_labels is None or self.spans_labels is None:
            # Remember the user-provided filter before it is possibly inferred
            ent_labels_before = self.ent_labels
            if self.ent_labels is None:
                self.cfg["ent_labels"] = tuple(
                    sorted(
                        {
                            span.label_
                            for doc in sub_batch
                            for span in doc.reference.ents
                        }
                    )
                )

            if self.spans_labels is None:
                spans_labels = defaultdict(lambda: set())
                for doc in sub_batch:
                    for name, group in doc.reference.spans.items():
                        for span in group:
                            if (
                                ent_labels_before is None
                                or span.label_ in ent_labels_before
                            ):
                                spans_labels[name].add(span.label_)

                self.cfg["spans_labels"] = {
                    name: tuple(sorted(group)) for name, group in spans_labels.items()
                }

            # Recompute the label union now that the filters are known
            self.cfg["labels"] = tuple(
                sorted(
                    set(
                        list(self.ent_labels)
                        + [
                            label
                            for group in self.spans_labels.values()
                            for label in group
                        ]
                    )
                )
            )

        doc_sample = [eg.reference for eg in sub_batch]
        spans_sample = self.examples_to_truth(sub_batch)
        # NOTE(review): `examples_to_truth` returns the result of `ops.asarray`,
        # so this guard looks unreachable -- confirm before relying on it.
        if spans_sample is None:
            raise ValueError(
                "Call begin_training with relevant entities "
                "and relations annotated in "
                "at least a few reference examples!"
            )
        self.model.attrs["set_n_labels"](len(self.labels))
        self.model.initialize(X=doc_sample, Y=spans_sample)

    def examples_to_truth(self, examples: List[Example]) -> Ints2d:
        """
        Converts the spans of the examples into a list
        of (doc_idx, label_idx, begin, end) tuple as a tensor,
        that will be fed to the model with the `begin_update` method.

        Spans are read from the reference `doc.ents` and from the reference span
        groups named in `self.spans_labels` (or every group when it is None).
        Spans whose label is not in `self.labels` are skipped and duplicates are
        merged.

        Parameters
        ----------
        examples: List[Example]

        Returns
        -------
        Ints2d
        """
        # Map the string-store key of each label to its index in `self.labels`
        label_vocab = {
            self.vocab.strings[label]: i for i, label in enumerate(self.labels)
        }
        spans = set()
        for eg_idx, eg in enumerate(examples):
            for span in (
                *eg.reference.ents,
                *(
                    span
                    for name in (
                        self.spans_labels
                        if self.spans_labels is not None
                        else eg.reference.spans
                    )
                    for span in eg.reference.spans.get(name, ())
                ),
            ):
                label_idx = label_vocab.get(span.label)
                if label_idx is None:
                    continue
                spans.add((eg_idx, label_idx, span.start, span.end))
        truths = self.model.ops.asarray(list(spans))
        return truths

scorer = scorer instance-attribute

__init__(vocab, model, name='nested_ner', ent_labels=(), spans_labels=None, scorer=None)

Initialize a general named entity recognizer (with or without nested or overlapping entities).

PARAMETER DESCRIPTION
vocab

Spacy vocabulary

TYPE: Vocab

model

The model to extract the spans

TYPE: Model

name

Name of the component

TYPE: str DEFAULT: 'nested_ner'

ent_labels

list of labels to filter entities for in doc.ents

TYPE: Iterable[str] DEFAULT: ()

spans_labels

Mapping from span group names to list of labels to look for entities and assign the predicted entities

TYPE: Mapping[str, Iterable[str]] DEFAULT: None

scorer

Method to call to score predictions

TYPE: Optional[Callable] DEFAULT: None

Source code in edsnlp/pipelines/trainable/nested_ner.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def __init__(
    self,
    vocab: Vocab,
    model: Model,
    name: str = "nested_ner",
    ent_labels: Iterable[str] = (),
    spans_labels: Mapping[str, Iterable[str]] = None,
    scorer: Optional[Callable] = None,
) -> None:
    """
    Initialize a general named entity recognizer (with or without nested or
    overlapping entities).

    Parameters
    ----------
    vocab: Vocab
        Spacy vocabulary
    model: Model
        The model to extract the spans
    name: str
        Name of the component
    ent_labels: Iterable[str]
        list of labels to filter entities for in `doc.ents`
    spans_labels: Mapping[str, Iterable[str]]
        Mapping from span group names to list of labels to look for entities
        and assign the predicted entities
    scorer: Optional[Callable]
        Method to call to score predictions
    """

    super().__init__(vocab, model, name)

    # Materialize the label iterables exactly once: they are typed as
    # `Iterable`, so they may be one-shot iterators that would come out
    # empty if consumed a second time when building the label union.
    ent_labels_cfg: Optional[Tuple[str]] = (
        tuple(ent_labels) if ent_labels is not None else None
    )
    spans_labels_cfg: Optional[Dict[str, Tuple[str]]] = (
        {k: tuple(labels) for k, labels in spans_labels.items()}
        if spans_labels is not None
        else None
    )

    self.cfg["ent_labels"] = ent_labels_cfg
    self.cfg["spans_labels"] = spans_labels_cfg
    # Sorted union of every label used for `doc.ents` or any span group
    self.cfg["labels"] = tuple(
        sorted(
            {
                *(ent_labels_cfg or ()),
                *(
                    label
                    for group in (spans_labels_cfg or {}).values()
                    for label in group
                ),
            }
        )
    )

    self.scorer = scorer

labels()

Return the labels currently added to the component.

Source code in edsnlp/pipelines/trainable/nested_ner.py
207
208
209
210
@property
def labels(self) -> Tuple[str]:
    """Labels of the component, as stored under the `"labels"` config key."""
    component_cfg = self.cfg
    return component_cfg["labels"]

spans_labels()

Return the span group to labels filters mapping

Source code in edsnlp/pipelines/trainable/nested_ner.py
212
213
214
215
@property
def spans_labels(self) -> Dict[str, Tuple[str]]:
    """Span group name -> labels filter, as stored under `"spans_labels"`."""
    component_cfg = self.cfg
    return component_cfg["spans_labels"]

ent_labels()

Return the doc.ents labels filters

Source code in edsnlp/pipelines/trainable/nested_ner.py
217
218
219
220
@property
def ent_labels(self):
    """Labels filter applied to `doc.ents`, as stored under `"ent_labels"`."""
    component_cfg = self.cfg
    return component_cfg["ent_labels"]

add_label(label)

Add a new label to the pipe.

Source code in edsnlp/pipelines/trainable/nested_ner.py
222
223
224
def add_label(self, label: str) -> int:
    """Reject any attempt to add a label: this pipe always raises."""
    message = "Cannot add a new label to the pipe"
    raise Exception(message)

predict(docs)

Apply the pipeline's model to a batch of docs, without modifying them.

PARAMETER DESCRIPTION
docs

TYPE: List[Doc]

RETURNS DESCRIPTION
Ints2d

The predicted list of (doc_idx, label_idx, begin, end) tuples as a tensor that contain the spans' prediction for all the batch

Source code in edsnlp/pipelines/trainable/nested_ner.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def predict(self, docs: List[Doc]) -> Dict[str, Ints2d]:
    """
    Run the model on a batch of docs and return its predictions, leaving the
    docs untouched.

    Parameters
    ----------
    docs: List[Doc]
        Documents to run the model on

    Returns
    -------
    Dict[str, Ints2d]
        The second element of the model output. `set_annotations` reads its
        `"spans"` entry as rows of (doc_idx, label_idx, begin, end).
    """
    # No gold spans (None), and predictions requested (True)
    model_inputs = (docs, None, True)
    model_outputs = self.model.predict(model_inputs)
    return model_outputs[1]

set_annotations(docs, predictions, **kwargs)

Modify a batch of Doc objects, using predicted spans.

PARAMETER DESCRIPTION
docs

The documents to update

TYPE: List[Doc]

predictions

Spans predictions, as returned by the model's predict method

TYPE: Dict[str, Ints2d]

Source code in edsnlp/pipelines/trainable/nested_ner.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def set_annotations(
    self, docs: List[Doc], predictions: Dict[str, Ints2d], **kwargs
) -> None:
    """
    Write the predicted spans onto a batch of `Doc` objects.

    `doc.ents` is replaced by the (non-overlapping, via `filter_spans`)
    predicted spans whose label is in `self.ent_labels`, and each span group
    named in `self.spans_labels` is replaced by the predicted spans whose label
    is in that group's label list.

    Parameters
    ----------
    docs: List[Doc]
        The documents to update
    predictions:
        Spans predictions, as returned by the model's predict method
    """
    batch = list(docs)
    spans_by_doc: List[List[Span]] = [[] for _ in batch]

    # Each row is (doc_idx, label_idx, begin, end)
    rows = np_ops.asarray(predictions.get("spans"))
    for row in rows:
        doc_idx, label_idx, begin, end = row
        label = self.labels[label_idx]
        predicted_span = Span(batch[doc_idx], begin, end, label)
        spans_by_doc[doc_idx].append(predicted_span)

    for position, doc in enumerate(batch):
        predicted = spans_by_doc[position]

        # `doc.ents` only receives spans whose label passes the ents filter
        ent_candidates = []
        for span in predicted:
            if span.label_ in self.ent_labels:
                ent_candidates.append(span)
        doc.ents = filter_spans(ent_candidates)

        # Each span group only receives spans whose label is in its own list
        for group_name, allowed_labels in self.spans_labels.items():
            doc.spans[group_name] = [
                span for span in predicted if span.label_ in allowed_labels
            ]

update(examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None)

Learn from a batch of documents and gold-standard information, updating the pipe's model. Delegates to begin_update and get_loss.

Unlike standard TrainablePipe components, the discrete ops (best selection of tags) is performed by the model directly (begin_update returns the loss and the predictions)

PARAMETER DESCRIPTION
examples

TYPE: Iterable[Example]

drop

TYPE: float DEFAULT: 0.0

set_annotations

Whether to update the document with predicted spans

TYPE: bool DEFAULT: False

sgd

Optimizer

TYPE: Optional[Optimizer] DEFAULT: None

losses

Dict of loss, updated in place

TYPE: Optional[Dict[str, float]] DEFAULT: None

RETURNS DESCRIPTION
Dict[str, float]

Updated losses dict

Source code in edsnlp/pipelines/trainable/nested_ner.py
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    set_annotations: bool = False,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """
    Learn from a batch of documents and gold-standard information,
    updating the pipe's model. Delegates to begin_update and get_loss.

    Unlike standard TrainablePipe components, the discrete ops (best selection
    of tags) is performed by the model directly (`begin_update` returns the loss
    and the predictions)

    Parameters
    ----------
    examples: Iterable[Example]
        Batch of examples to learn from
    drop: float = 0.0
        Dropout rate set on the model before the forward pass
    set_annotations: bool
        Whether to update the document with predicted spans
    sgd: Optional[Optimizer]
        Optimizer
    losses: Optional[Dict[str, float]]
        Dict of loss, updated in place

    Returns
    -------
    Dict[str, float]
        Updated losses dict
    """

    if losses is None:
        losses = {}
    losses.setdefault(self.name, 0.0)
    set_dropout_rate(self.model, drop)
    examples = list(examples)

    # run the model
    docs = [eg.predicted for eg in examples]
    gold = self.examples_to_truth(examples)
    (loss, predictions), backprop = self.model.begin_update(
        (docs, gold, set_annotations)
    )
    loss, gradient = self.get_loss(examples, loss)
    backprop(gradient)
    if sgd is not None:
        self.model.finish_update(sgd)
    if set_annotations:
        self.set_annotations(docs, predictions)

    # Accumulate instead of overwriting, so that a losses dict shared
    # across several batches reports their summed loss.
    losses[self.name] += loss

    # Return the dict (not the scalar), as the signature promises
    return losses

get_loss(examples, loss)

Find the loss and gradient of loss for the batch of documents and their predicted scores.

Source code in edsnlp/pipelines/trainable/nested_ner.py
330
331
332
333
def get_loss(self, examples: Iterable[Example], loss) -> Tuple[float, float]:
    """
    Convert the loss computed by the model into a Python float, paired with a
    constant gradient `[1]` to feed to the backprop callback.
    """
    scalar_loss = float(loss.item())
    unit_gradient = self.model.ops.xp.array([1])
    return scalar_loss, unit_gradient

initialize(get_examples, *, nlp=None, labels=None)

Initialize the pipe for training, using a representative set of data examples.

  1. If no ent_labels are provided, we scrape them from the ents of the set of examples.
  2. If no span labels are provided, we scrape them from the spans of the set of examples, and filter these labels with the ent_labels.
PARAMETER DESCRIPTION
get_examples

Method to sample some examples

TYPE: Callable[[], Iterable[Example]]

nlp

Unused spacy model

TYPE: Language DEFAULT: None

labels

Unused list of labels

TYPE: Optional[List[str]] DEFAULT: None

Source code in edsnlp/pipelines/trainable/nested_ner.py
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
def initialize(
    self,
    get_examples: Callable[[], Iterable[Example]],
    *,
    nlp: Language = None,
    labels: Optional[List[str]] = None,
):
    """
    Prepare the pipe for training from a sample of examples.

    Labels that were not configured are inferred from the reference docs:

    1. Missing `ent_labels` are collected from the `doc.ents` of the sample.
    2. Missing `spans_labels` are collected from the span groups of the sample,
       keeping only labels allowed by the `ent_labels` that were configured
       before initialization (all labels when none were configured).

    The model is then told the number of labels and initialized on the sample.

    Parameters
    ----------
    get_examples: Callable[[], Iterable[Example]]
        Method to sample some examples
    nlp: spacy.Language
        Unused spacy model
    labels
        Unused list of labels
    """
    sample = list(islice(get_examples(), NUM_INITIALIZATION_EXAMPLES))

    configured_ent_labels = self.ent_labels
    infer_ents = configured_ent_labels is None
    infer_spans = self.spans_labels is None

    if infer_ents:
        seen_ent_labels = set()
        for eg in sample:
            seen_ent_labels.update(ent.label_ for ent in eg.reference.ents)
        self.cfg["ent_labels"] = tuple(sorted(seen_ent_labels))

    if infer_spans:
        # A group only gets an entry once one of its spans passes the filter
        labels_by_group = {}
        for eg in sample:
            for group_name, group in eg.reference.spans.items():
                for span in group:
                    if infer_ents or span.label_ in configured_ent_labels:
                        labels_by_group.setdefault(group_name, set()).add(
                            span.label_
                        )
        self.cfg["spans_labels"] = {
            group_name: tuple(sorted(group_labels))
            for group_name, group_labels in labels_by_group.items()
        }

    if infer_ents or infer_spans:
        # Refresh the label union now that both filters are known
        merged = set(self.ent_labels)
        for group_labels in self.spans_labels.values():
            merged.update(group_labels)
        self.cfg["labels"] = tuple(sorted(merged))

    reference_docs = [eg.reference for eg in sample]
    gold_spans = self.examples_to_truth(sample)
    if gold_spans is None:
        raise ValueError(
            "Call begin_training with relevant entities "
            "and relations annotated in "
            "at least a few reference examples!"
        )

    set_n_labels = self.model.attrs["set_n_labels"]
    set_n_labels(len(self.labels))
    self.model.initialize(X=reference_docs, Y=gold_spans)

examples_to_truth(examples)

Converts the spans of the examples into a list of (doc_idx, label_idx, begin, end) tuple as a tensor, that will be fed to the model with the begin_update method.

PARAMETER DESCRIPTION
examples

TYPE: List[Example]

RETURNS DESCRIPTION
Ints2d
Source code in edsnlp/pipelines/trainable/nested_ner.py
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
def examples_to_truth(self, examples: List[Example]) -> Ints2d:
    """
    Encode the gold spans of the examples as (doc_idx, label_idx, begin, end)
    tuples, converted to a tensor with the model's ops, ready to be fed to the
    model's `begin_update` method.

    Spans are read from the reference `doc.ents` and from the reference span
    groups named in `self.spans_labels` (or every group when it is None).
    Spans whose label is not in `self.labels` are skipped and duplicates are
    merged.

    Parameters
    ----------
    examples: List[Example]

    Returns
    -------
    Ints2d
    """
    # String-store key of each label -> position of the label in `self.labels`
    index_by_key = {}
    for position, label in enumerate(self.labels):
        index_by_key[self.vocab.strings[label]] = position

    unique_spans = set()
    for doc_idx, example in enumerate(examples):
        reference = example.reference

        if self.spans_labels is not None:
            group_names = self.spans_labels
        else:
            group_names = reference.spans

        candidates = list(reference.ents)
        for group_name in group_names:
            candidates.extend(reference.spans.get(group_name, ()))

        for candidate in candidates:
            label_position = index_by_key.get(candidate.label)
            if label_position is not None:
                unique_spans.add(
                    (doc_idx, label_position, candidate.start, candidate.end)
                )

    return self.model.ops.asarray(list(unique_spans))

create_component(nlp, name, model, ent_labels=None, spans_labels=None, scorer=None)

Construct a TrainableNer component.

Source code in edsnlp/pipelines/trainable/nested_ner.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Register the component under the "nested_ner" factory name, with
# NESTED_NER_DEFAULTS as its default config.
@Language.factory(
    "nested_ner",
    default_config=NESTED_NER_DEFAULTS,
    requires=["doc.ents", "doc.spans"],
    assigns=["doc.ents", "doc.spans"],
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
    },
)
def create_component(
    nlp: Language,
    name: str,
    model: Model,
    ent_labels=None,
    spans_labels=None,
    scorer=None,
):
    """
    Construct a `TrainableNer` component.

    Parameters
    ----------
    nlp: Language
        The pipeline; only its vocabulary is passed to the component
    name: str
        Name of the component
    model: Model
        The model to extract the spans
    ent_labels
        Labels to filter entities for in `doc.ents` (forwarded as is)
    spans_labels
        Mapping from span group names to labels (forwarded as is)
    scorer
        Method to call to score predictions (forwarded as is)

    Returns
    -------
    TrainableNer
    """
    return TrainableNer(
        vocab=nlp.vocab,
        model=model,
        name=name,
        ent_labels=ent_labels,
        spans_labels=spans_labels,
        scorer=scorer,
    )

nested_ner_scorer(examples, **cfg)

Scores the extracted entities that may be overlapping or nested by looking in doc.ents, and doc.spans.

PARAMETER DESCRIPTION
examples

TYPE: Iterable[Example]

cfg
  • labels: Iterable[str] labels to take into account
  • spans_labels: Iterable[str] span group names to look into for entities

RETURNS DESCRIPTION
Dict[str, float]
Source code in edsnlp/pipelines/trainable/nested_ner.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def nested_ner_scorer(examples: Iterable[Example], **cfg):
    """
    Scores the extracted entities that may be overlapping or nested
    by looking in `doc.ents`, and `doc.spans`.

    Spans are compared as exact (example index, start, end, label) matches.
    A metric whose denominator is empty is reported as 1.0.

    Parameters
    ----------
    examples: Iterable[Example]
    cfg: Dict[str]
        - labels: Optional[Iterable[str]] labels to take into account
          (missing or None: no filtering)
        - spans_labels: Optional[Iterable[str]] span group names to look into
          for entities (a mapping keyed by group name works too). When missing
          or None, the group names of each reference doc are used.

    Returns
    -------
    Dict[str, float]
        The `ents_p`, `ents_r` and `ents_f` scores
    """
    # `.get` tolerates both a missing key and an explicit None value
    labels_cfg = cfg.get("labels")
    labels = set(labels_cfg) if labels_cfg is not None else None
    spans_labels = cfg.get("spans_labels")
    # Materialize the group names once, so they can be iterated for every doc
    configured_groups = tuple(spans_labels) if spans_labels is not None else None

    pred_spans = set()
    gold_spans = set()
    for eg_idx, eg in enumerate(examples):
        # Without configured groups, the reference doc's group names are used
        # for both the predicted and the gold side.
        group_names = (
            configured_groups
            if configured_groups is not None
            else tuple(eg.reference.spans)
        )
        for doc, collected in ((eg.predicted, pred_spans), (eg.reference, gold_spans)):
            candidates = [*doc.ents]
            for name in group_names:
                candidates.extend(doc.spans.get(name, ()))
            for span in candidates:
                if labels is None or span.label_ in labels:
                    collected.add((eg_idx, span.start, span.end, span.label_))

    tp = len(pred_spans & gold_spans)
    n_pred = len(pred_spans)
    n_gold = len(gold_spans)

    return {
        "ents_p": tp / n_pred if n_pred else 1.0,
        "ents_r": tp / n_gold if n_gold else 1.0,
        "ents_f": 2 * tp / (n_pred + n_gold) if (n_pred or n_gold) else 1.0,
    }

make_nested_ner_scorer()

Source code in edsnlp/pipelines/trainable/nested_ner.py
145
146
147
@spacy.registry.scorers("eds.nested_ner_scorer.v1")
def make_nested_ner_scorer():
    """Return the `nested_ner_scorer` function (registered scorer factory)."""
    return nested_ner_scorer