Bases: object
Segments the Doc into sentences using a rule-based strategy,
specific to AP-HP documents.
Applies the same rule-based pipeline as spaCy's sentencizer,
and adds a simple rule on new lines: if a new line is followed by a
capitalised word, then it is also treated as an end of sentence.
DOCS: https://spacy.io/api/sentencizer
Arguments
punct_chars : Optional[List[str]]
Punctuation characters.
use_endlines : bool
Whether to use endlines prediction.
Source code in edsnlp/pipelines/core/sentences/sentences.py
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class SentenceSegmenter(object):
    """
    Segments the Doc into sentences using a rule-based strategy,
    specific to AP-HP documents.

    Applies the same rule-based pipeline as spaCy's sentencizer,
    and adds a simple rule on new lines: if a new line is followed by a
    capitalised word, then it is also treated as an end of sentence.

    DOCS: https://spacy.io/api/sentencizer

    Arguments
    ---------
    punct_chars : Optional[List[str]]
        Punctuation characters. Falls back to the module-level
        ``punctuation`` default when ``None``.
    use_endlines : bool
        Whether to use endlines prediction (the ``end_line`` extension).
    """

    def __init__(
        self,
        punct_chars: Optional[List[str]],
        use_endlines: bool,
    ):
        chars = punctuation if punct_chars is None else punct_chars
        self.punct_chars = set(chars)
        self.use_endlines = use_endlines

    def __call__(self, doc: Doc) -> Doc:
        """
        Segments the document in sentences.

        Arguments
        ---------
        doc:
            A spaCy Doc object.

        Returns
        -------
        doc:
            A spaCy Doc object, annotated for sentences.
        """
        if not doc:
            return doc

        doc[0].sent_start = True
        pending_period = False
        pending_newline = False

        for index, token in enumerate(doc):
            matches_punct = token.text in self.punct_chars
            ends_line = token.is_space and "\n" in token.text
            if self.use_endlines:
                # Keep the newline only when the endlines model flagged it
                # (or when no prediction is available on this token).
                predicted = getattr(token._, "end_line", None)
                ends_line = ends_line and (predicted or predicted is None)

            # Default every token to "not a sentence start" except the first.
            token.sent_start = index == 0

            if pending_period or pending_newline:
                # Runs of punctuation / newlines extend the pending boundary.
                if not (token.is_punct or matches_punct or ends_line):
                    if pending_period:
                        token.sent_start = True
                    else:
                        # After a bare newline, only a capitalised word
                        # (shape "Xx...") opens a new sentence.
                        token.sent_start = token.shape_.startswith("Xx")
                    pending_period = False
                    pending_newline = False
            elif matches_punct:
                pending_period = True
            elif ends_line:
                pending_newline = True

        return doc
|
__call__(doc)
Segments the document in sentences.
Arguments
doc:
A spaCy Doc object.
Source code in edsnlp/pipelines/core/sentences/sentences.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def __call__(self, doc: Doc) -> Doc:
    """
    Segments the document in sentences.

    Arguments
    ---------
    doc:
        A spaCy Doc object.

    Returns
    -------
    doc:
        A spaCy Doc object, annotated for sentences.
    """
    if not doc:
        return doc

    doc[0].sent_start = True
    pending_period = False
    pending_newline = False

    for index, token in enumerate(doc):
        matches_punct = token.text in self.punct_chars
        ends_line = token.is_space and "\n" in token.text
        if self.use_endlines:
            # Keep the newline only when the endlines model flagged it
            # (or when no prediction is available on this token).
            predicted = getattr(token._, "end_line", None)
            ends_line = ends_line and (predicted or predicted is None)

        # Default every token to "not a sentence start" except the first.
        token.sent_start = index == 0

        if pending_period or pending_newline:
            # Runs of punctuation / newlines extend the pending boundary.
            if not (token.is_punct or matches_punct or ends_line):
                if pending_period:
                    token.sent_start = True
                else:
                    # After a bare newline, only a capitalised word
                    # (shape "Xx...") opens a new sentence.
                    token.sent_start = token.shape_.startswith("Xx")
                pending_period = False
                pending_newline = False
        elif matches_punct:
            pending_period = True
        elif ends_line:
            pending_newline = True

    return doc
|