edsnlp.pipelines.base

BaseComponent

Bases: object

The BaseComponent class adds a set_extensions method, called when the object is created.

It helps decouple pipeline initialisation from the creation of extensions, and is particularly useful when distributing EDSNLP on a cluster, since the serialisation mechanism requires that extensions be reset.
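For illustration, here is a minimal sketch of a subclass (the class name and extension name are hypothetical, not part of EDSNLP). Because __init__ calls set_extensions, re-instantiating the component after deserialisation on a worker recreates the extension:

from spacy.tokens import Doc

from edsnlp.pipelines.base import BaseComponent


class CustomComponent(BaseComponent):
    @staticmethod
    def set_extensions() -> None:
        # Guard against double registration: this method may run several
        # times, once per instantiation (e.g. on every worker process).
        if not Doc.has_extension("my_attribute"):
            Doc.set_extension("my_attribute", default=None)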

Source code in edsnlp/pipelines/base.py
class BaseComponent(object):
    """
    The `BaseComponent` adds a `set_extensions` method,
    called at the creation of the object.

    It helps decouple the initialisation of the pipeline from
    the creation of extensions, and is particularly useful when
    distributing EDSNLP on a cluster, since the serialisation mechanism
    requires that extensions be reset.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """
        Set `Doc`, `Span` and `Token` extensions.
        """
        pass

    def _boundaries(
        self, doc: Doc, terminations: Optional[List[Span]] = None
    ) -> List[Tuple[int, int]]:
        """
        Create sub-sentences based on the sentences and terminations found in the text.

        Parameters
        ----------
        doc:
            spaCy Doc object
        terminations:
            List of termination spans, used to split sentences into smaller chunks

        Returns
        -------
        boundaries:
            List of (start, end) tuples delimiting the sub-sentences
        """

        if terminations is None:
            terminations = []

        sent_starts = [sent.start for sent in doc.sents]
        termination_starts = [t.start for t in terminations]

        starts = sent_starts + termination_starts + [len(doc)]

        # Remove duplicates
        starts = list(set(starts))

        # Sort starts
        starts.sort()

        boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]

        return boundaries

__init__(*args, **kwargs)

Source code in edsnlp/pipelines/base.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.set_extensions()

set_extensions()

Set Doc, Span and Token extensions.

Source code in edsnlp/pipelines/base.py
@staticmethod
def set_extensions() -> None:
    """
    Set `Doc`, `Span` and `Token` extensions.
    """
    pass

_boundaries(doc, terminations=None)

Create sub-sentences based on the sentences and terminations found in the text.

PARAMETER DESCRIPTION

doc

    spaCy Doc object

    TYPE: Doc

terminations

    List of termination spans, used to split sentences into smaller chunks

    TYPE: Optional[List[Span]] DEFAULT: None

RETURNS DESCRIPTION

boundaries

    List of (start, end) tuples delimiting the sub-sentences

Source code in edsnlp/pipelines/base.py
def _boundaries(
    self, doc: Doc, terminations: Optional[List[Span]] = None
) -> List[Tuple[int, int]]:
    """
    Create sub-sentences based on the sentences and terminations found in the text.

    Parameters
    ----------
    doc:
        spaCy Doc object
    terminations:
        List of termination spans, used to split sentences into smaller chunks

    Returns
    -------
    boundaries:
        List of (start, end) tuples delimiting the sub-sentences
    """

    if terminations is None:
        terminations = []

    sent_starts = [sent.start for sent in doc.sents]
    termination_starts = [t.start for t in terminations]

    starts = sent_starts + termination_starts + [len(doc)]

    # Remove duplicates
    starts = list(set(starts))

    # Sort starts
    starts.sort()

    boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]

    return boundaries
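
For a concrete picture, here is a hedged usage sketch. The example text and pipeline are made up; _boundaries is a private helper, called directly here only for illustration, and it assumes sentence boundaries are available (hence the sentencizer):

import spacy

from edsnlp.pipelines.base import BaseComponent

nlp = spacy.blank("fr")
nlp.add_pipe("sentencizer")
doc = nlp("Pas de fièvre. Douleur abdominale, mais pas de nausée.")

component = BaseComponent()

# Without terminations, boundaries coincide with the sentences:
print(component._boundaries(doc))
# [(0, 4), (4, 12)] with the default French tokenisation

# A termination span (here, the single token "mais") splits the
# second sentence into two sub-sentences:
mais = next(t for t in doc if t.text == "mais")
print(component._boundaries(doc, terminations=[doc[mais.i : mais.i + 1]]))
# [(0, 4), (4, 7), (7, 12)]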