Skip to content

edsnlp.pipelines.base

BaseComponent

Bases: object

The BaseComponent adds a set_extensions method, called at the creation of the object.

It helps decouple the initialisation of the pipeline from the creation of extensions, and is particularly useful when distributing EDSNLP on a cluster, since the serialisation mechanism imposes that the extensions be reset.

Source code in edsnlp/pipelines/base.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class BaseComponent(object):
    """
    The `BaseComponent` adds a `set_extensions` method,
    called at the creation of the object.

    It helps decouple the initialisation of the pipeline from
    the creation of extensions, and is particularly usefull when
    distributing EDSNLP on a cluster, since the serialisation mechanism
    imposes that the extensions be reset.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.set_extensions()

    @classmethod
    def set_extensions(cls) -> None:
        """
        Set `Doc`, `Span` and `Token` extensions.
        """
        pass

    def get_spans(self, doc: Doc):
        """
        Returns sorted spans of interest according to the
        possible value of `on_ents_only`.
        Includes `doc.ents` by default, and adds eventual SpanGroups.
        """
        ents = list(doc.ents) + list(doc.spans.get("discarded", []))

        on_ents_only = getattr(self, "on_ents_only", None)

        if isinstance(on_ents_only, str):
            on_ents_only = [on_ents_only]
        if isinstance(on_ents_only, (set, list)):
            for spankey in set(on_ents_only) & set(doc.spans.keys()):
                ents.extend(doc.spans.get(spankey, []))

        return sorted(list(set(ents)), key=(attrgetter("start", "end")))

    def _boundaries(
        self, doc: Doc, terminations: Optional[List[Span]] = None
    ) -> List[Tuple[int, int]]:
        """
        Create sub sentences based sentences and terminations found in text.

        Parameters
        ----------
        doc:
            spaCy Doc object
        terminations:
            List of tuples with (match_id, start, end)

        Returns
        -------
        boundaries:
            List of tuples with (start, end) of spans
        """

        if terminations is None:
            terminations = []

        sent_starts = [sent.start for sent in doc.sents]
        termination_starts = [t.start for t in terminations]

        starts = sent_starts + termination_starts + [len(doc)]

        # Remove duplicates
        starts = list(set(starts))

        # Sort starts
        starts.sort()

        boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]

        return boundaries

set_extensions() classmethod

Set Doc, Span and Token extensions.

Source code in edsnlp/pipelines/base.py
23
24
25
26
27
28
@classmethod
def set_extensions(cls) -> None:
    """
    Set `Doc`, `Span` and `Token` extensions.

    The default implementation does nothing; subclasses override it
    to register the extensions they rely on.
    """

get_spans(doc)

Returns sorted spans of interest according to the possible value of on_ents_only. Includes doc.ents by default, and adds eventual SpanGroups.

Source code in edsnlp/pipelines/base.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def get_spans(self, doc: Doc):
    """
    Returns sorted spans of interest according to the
    possible value of `on_ents_only`.
    Includes `doc.ents` by default, and adds eventual SpanGroups.
    """
    ents = list(doc.ents) + list(doc.spans.get("discarded", []))

    on_ents_only = getattr(self, "on_ents_only", None)

    if isinstance(on_ents_only, str):
        on_ents_only = [on_ents_only]
    if isinstance(on_ents_only, (set, list)):
        for spankey in set(on_ents_only) & set(doc.spans.keys()):
            ents.extend(doc.spans.get(spankey, []))

    return sorted(list(set(ents)), key=(attrgetter("start", "end")))