
edsnlp.pipelines.core.sentences

factory

DEFAULT_CONFIG = dict(punct_chars=None, use_endlines=True) module-attribute

create_component(nlp, name, punct_chars, use_endlines)

Source code in edsnlp/pipelines/core/sentences/factory.py
@deprecated_factory("sentences", "eds.sentences", default_config=DEFAULT_CONFIG)
@Language.factory("eds.sentences", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    punct_chars: Optional[List[str]],
    use_endlines: bool,
):
    return SentenceSegmenter(
        punct_chars=punct_chars,
        use_endlines=use_endlines,
    )
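
A minimal usage sketch (not part of the source above): it assumes that the EDS-NLP factories have been registered, for instance by importing edsnlp.components, and adds the component to a blank French pipeline with an explicit config.

import spacy
import edsnlp.components  # noqa: F401  (assumption: importing this module registers the "eds.sentences" factory)

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences", config=dict(punct_chars=None, use_endlines=True))

doc = nlp("Consultation du 12/05\nLe patient présente une toux fébrile.")
print([sent.text for sent in doc.sents])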

sentences

SentenceSegmenter

Bases: object

Segments the Doc into sentences using a rule-based strategy, specific to AP-HP documents.

Applies the same rule-based pipeline as spaCy's sentencizer, and adds a simple rule on newlines: if a newline is followed by a capitalised word, it is also treated as an end of sentence.

DOCS: https://spacy.io/api/sentencizer

Arguments

punct_chars : Optional[List[str]]
    Punctuation characters.
use_endlines : bool
    Whether to use endlines prediction.
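
To illustrate the newline rule, here is a hypothetical sketch that instantiates the segmenter directly on a tokenised document, assuming a blank French tokenizer is enough to reproduce the behaviour:

import spacy
from edsnlp.pipelines.core.sentences.sentences import SentenceSegmenter

nlp = spacy.blank("fr")
doc = nlp("Patient admis le 3 mars\nIl présente une toux fébrile.")

segmenter = SentenceSegmenter(punct_chars=None, use_endlines=False)
doc = segmenter(doc)

# The newline is followed by a capitalised word ("Il"), so it is treated as a
# sentence boundary: two sentences are expected.
print([sent.text for sent in doc.sents])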

Source code in edsnlp/pipelines/core/sentences/sentences.py
class SentenceSegmenter(object):
    """
    Segments the Doc into sentences using a rule-based strategy,
    specific to AP-HP documents.

    Applies the same rule-based pipeline as spaCy's sentencizer,
    and adds a simple rule on newlines: if a newline is followed by a
    capitalised word, it is also treated as an end of sentence.

    DOCS: https://spacy.io/api/sentencizer

    Arguments
    ---------
    punct_chars : Optional[List[str]]
        Punctuation characters.
    use_endlines : bool
        Whether to use endlines prediction.
    """

    def __init__(
        self,
        punct_chars: Optional[List[str]],
        use_endlines: bool,
    ):

        if punct_chars is None:
            punct_chars = punctuation

        self.punct_chars = set(punct_chars)
        self.use_endlines = use_endlines

    def __call__(self, doc: Doc) -> Doc:
        """
        Segments the document into sentences.

        Arguments
        ---------
        doc:
            A spaCy Doc object.

        Returns
        -------
        doc:
            A spaCy Doc object, annotated for sentences.
        """

        if not doc:
            return doc

        doc[0].sent_start = True

        seen_period = False
        seen_newline = False

        for i, token in enumerate(doc):
            is_in_punct_chars = token.text in self.punct_chars
            is_newline = token.is_space and "\n" in token.text

            if self.use_endlines:
                end_line = getattr(token._, "end_line", None)
                is_newline = is_newline and (end_line or end_line is None)

            token.sent_start = (
                i == 0
            )  # To set the attributes at False by default for the other tokens
            if seen_period or seen_newline:
                if token.is_punct or is_in_punct_chars or is_newline:
                    continue
                if seen_period:
                    token.sent_start = True
                    seen_newline = False
                    seen_period = False
                else:
                    token.sent_start = token.shape_.startswith("Xx")
                    seen_newline = False
                    seen_period = False
            elif is_in_punct_chars:
                seen_period = True
            elif is_newline:
                seen_newline = True

        return doc
punct_chars = set(punct_chars) instance-attribute
use_endlines = use_endlines instance-attribute
__init__(punct_chars, use_endlines)
Source code in edsnlp/pipelines/core/sentences/sentences.py
def __init__(
    self,
    punct_chars: Optional[List[str]],
    use_endlines: bool,
):

    if punct_chars is None:
        punct_chars = punctuation

    self.punct_chars = set(punct_chars)
    self.use_endlines = use_endlines
__call__(doc)

Segments the document into sentences.

Arguments

doc
    A spaCy Doc object.

Returns

doc
    A spaCy Doc object, annotated for sentences.

Source code in edsnlp/pipelines/core/sentences/sentences.py
def __call__(self, doc: Doc) -> Doc:
    """
    Segments the document into sentences.

    Arguments
    ---------
    doc:
        A spaCy Doc object.

    Returns
    -------
    doc:
        A spaCy Doc object, annotated for sentences.
    """

    if not doc:
        return doc

    doc[0].sent_start = True

    seen_period = False
    seen_newline = False

    for i, token in enumerate(doc):
        is_in_punct_chars = token.text in self.punct_chars
        is_newline = token.is_space and "\n" in token.text

        if self.use_endlines:
            end_line = getattr(token._, "end_line", None)
            is_newline = is_newline and (end_line or end_line is None)

        token.sent_start = (
            i == 0
        )  # To set the attributes at False by default for the other tokens
        if seen_period or seen_newline:
            if token.is_punct or is_in_punct_chars or is_newline:
                continue
            if seen_period:
                token.sent_start = True
                seen_newline = False
                seen_period = False
            else:
                token.sent_start = token.shape_.startswith("Xx")
                seen_newline = False
                seen_period = False
        elif is_in_punct_chars:
            seen_period = True
        elif is_newline:
            seen_newline = True

    return doc
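
The use_endlines flag only changes how newline tokens are handled: when it is set, a newline whose end_line extension is False is no longer considered a potential sentence boundary, while an unset attribute (None) keeps the default behaviour. A minimal sketch, with the extension registered and filled by hand purely for illustration (in a real pipeline an upstream component is expected to set it):

import spacy
from spacy.tokens import Token
from edsnlp.pipelines.core.sentences.sentences import SentenceSegmenter

if not Token.has_extension("end_line"):
    Token.set_extension("end_line", default=None)

nlp = spacy.blank("fr")
doc = nlp("Scanner thoracique\nSans anomalie")

# Mark the newline token as a spurious line break
for token in doc:
    if token.is_space:
        token._.end_line = False

segmenter = SentenceSegmenter(punct_chars=None, use_endlines=True)
doc = segmenter(doc)

print(len(list(doc.sents)))  # 1: the newline no longer starts a new sentence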

terms

punctuation = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] module-attribute
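
As a sanity check (a sketch, not part of the module), passing punct_chars=None makes the segmenter fall back to this list, while an explicit list restricts the boundary characters:

from edsnlp.pipelines.core.sentences.sentences import SentenceSegmenter
from edsnlp.pipelines.core.sentences.terms import punctuation

default_segmenter = SentenceSegmenter(punct_chars=None, use_endlines=False)
assert default_segmenter.punct_chars == set(punctuation)

ascii_segmenter = SentenceSegmenter(punct_chars=[".", "!", "?"], use_endlines=False)
assert ascii_segmenter.punct_chars == {".", "!", "?"}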
