Bases: GenericMatcher
Class to extract consultation dates from "CR-CONS" documents.
The pipeline populates the `doc.spans['consultation_dates']` list.
For each extraction `s` in this list, the corresponding date is available
as `s._.consultation_date`.
town_mention : Union[List[str], bool]
List of RegEx for all AP-HP hospitals' towns mentions.
- If `type==list`: Overrides the default list
- If `type==bool`: Uses the default list if True, disabled if False
document_date_mention : Union[List[str], bool]
List of RegEx for document date.
- If `type==list`: Overrides the default list
- If `type==bool`: Uses the default list if True, disabled if False
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
class ConsultationDates(GenericMatcher):
    """
    Extract consultation dates from "CR-CONS" documents.

    The component fills the `#!python doc.spans['consultation_dates']` list.
    Each span `s` of that list runs from a matched mention to the end of the
    date that follows it; the date itself is exposed as
    `s._.consultation_date`.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    consultation_mention : Union[List[str], bool]
        List of RegEx for consultation mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if True, no pattern if False
    town_mention : Union[List[str], bool]
        List of RegEx for all AP-HP hospitals' town mentions.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if True, no pattern if False
    document_date_mention : Union[List[str], bool]
        List of RegEx for document date.

        - If `type==list`: Overrides the default list
        - If `type==bool`: Uses the default list if True, no pattern if False
    attr : str
        Forwarded unchanged to the `GenericMatcher` constructor.
    **kwargs
        Extra keyword arguments forwarded to the `GenericMatcher` constructor.
    """

    def __init__(
        self,
        nlp: Language,
        consultation_mention: Union[List[str], bool],
        town_mention: Union[List[str], bool],
        document_date_mention: Union[List[str], bool],
        attr: str,
        **kwargs,
    ):
        logger.warning("This pipeline is still in beta")
        logger.warning(
            "This pipeline should ONLY be used on notes "
            "where `note_class_source_value == 'CR-CONS'`"
        )
        # NOTE: the continuation lines below belong to the string literal and
        # are reproduced verbatim, hence the missing indentation.
        logger.warning(
            """This pipeline requires to use the normalizer pipeline with:
lowercase=True,
accents=True,
quotes=True"""
        )

        # An already-registered "dates" pipe whose `on_ents_only` is False is
        # relied upon as is; otherwise a private matcher is built with
        # `on_ents_only` pointing at the "consultation_mentions" span key.
        existing_dates_usable = (
            nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False
        )
        if existing_dates_usable:
            self.date_matcher = None
        else:
            config = {**DEFAULT_CONFIG, "on_ents_only": "consultation_mentions"}
            self.date_matcher = Dates(nlp, **config)

        # `True` selects the packaged defaults, any falsy value yields an
        # empty pattern list, and a user-supplied list is kept untouched.
        if consultation_mention is True:
            consultation_mention = consult_regex.consultation_mention
        elif not consultation_mention:
            consultation_mention = []

        if document_date_mention is True:
            document_date_mention = consult_regex.document_date_mention
        elif not document_date_mention:
            document_date_mention = []

        if town_mention is True:
            town_mention = consult_regex.town_mention
        elif not town_mention:
            town_mention = []

        regex = {
            "consultation_mention": consultation_mention,
            "town_mention": town_mention,
            "document_date_mention": document_date_mention,
        }

        super().__init__(
            nlp,
            regex=regex,
            terms={},
            attr=attr,
            ignore_excluded=False,
            **kwargs,
        )

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """Register the `consultation_date` extension on `Span` if missing."""
        if Span.has_extension("consultation_date"):
            return
        Span.set_extension("consultation_date", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """
        Pair each matched mention with the first date that follows it.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: Doc
            spaCy Doc object with additional
            `doc.spans['consultation_dates']` `SpanGroup`
        """
        mentions = self.process(doc)

        # Temporary span key, removed again before returning.
        doc.spans["consultation_mentions"] = mentions
        doc.spans["consultation_dates"] = []

        if self.date_matcher is not None:
            doc = self.date_matcher(doc)

        for mention in mentions:
            # Candidate dates must be
            # - in the same sentence as the mention
            # - started after the mention starts
            # - started at most 10 tokens after the mention ends
            first_date = min(
                (
                    date
                    for date in doc.spans["dates"]
                    if mention.sent == date.sent
                    and mention.start < date.start <= mention.end + 10
                ),
                key=lambda d: d.start,
                default=None,
            )
            if first_date is None:
                continue

            # The stored span stretches from the mention to the kept date.
            span = doc[mention.start : first_date.end]
            span.label_ = mention.label_
            span._.consultation_date = first_date._.date
            doc.spans["consultation_dates"].append(span)

        del doc.spans["consultation_mentions"]
        return doc
|
date_matcher = Dates(nlp, **config)
instance-attribute
__init__(nlp, consultation_mention, town_mention, document_date_mention, attr, **kwargs)
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
def __init__(
    self,
    nlp: Language,
    consultation_mention: Union[List[str], bool],
    town_mention: Union[List[str], bool],
    document_date_mention: Union[List[str], bool],
    attr: str,
    **kwargs,
):
    """
    Configure the regex patterns and the date matcher.

    Parameters
    ----------
    nlp : Language
        Language pipeline object
    consultation_mention : Union[List[str], bool]
        RegEx list for consultation mentions; `True` selects
        `consult_regex.consultation_mention`, a falsy value gives no pattern.
    town_mention : Union[List[str], bool]
        RegEx list for town mentions; `True` selects
        `consult_regex.town_mention`, a falsy value gives no pattern.
    document_date_mention : Union[List[str], bool]
        RegEx list for document dates; `True` selects
        `consult_regex.document_date_mention`, a falsy value gives no pattern.
    attr : str
        Forwarded unchanged to the parent constructor.
    **kwargs
        Extra keyword arguments forwarded to the parent constructor.
    """
    logger.warning("This pipeline is still in beta")
    logger.warning(
        "This pipeline should ONLY be used on notes "
        "where `note_class_source_value == 'CR-CONS'`"
    )
    # NOTE: the continuation lines below belong to the string literal and
    # are reproduced verbatim, hence the missing indentation.
    logger.warning(
        """This pipeline requires to use the normalizer pipeline with:
lowercase=True,
accents=True,
quotes=True"""
    )

    # An already-registered "dates" pipe whose `on_ents_only` is False is
    # relied upon as is; otherwise a private matcher is built with
    # `on_ents_only` pointing at the "consultation_mentions" span key.
    existing_dates_usable = (
        nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False
    )
    if existing_dates_usable:
        self.date_matcher = None
    else:
        config = {**DEFAULT_CONFIG, "on_ents_only": "consultation_mentions"}
        self.date_matcher = Dates(nlp, **config)

    # `True` selects the packaged defaults, any falsy value yields an empty
    # pattern list, and a user-supplied list is kept untouched.
    if consultation_mention is True:
        consultation_mention = consult_regex.consultation_mention
    elif not consultation_mention:
        consultation_mention = []

    if document_date_mention is True:
        document_date_mention = consult_regex.document_date_mention
    elif not document_date_mention:
        document_date_mention = []

    if town_mention is True:
        town_mention = consult_regex.town_mention
    elif not town_mention:
        town_mention = []

    regex = {
        "consultation_mention": consultation_mention,
        "town_mention": town_mention,
        "document_date_mention": document_date_mention,
    }

    super().__init__(
        nlp,
        regex=regex,
        terms={},
        attr=attr,
        ignore_excluded=False,
        **kwargs,
    )

    self.set_extensions()
|
set_extensions()
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
@staticmethod
def set_extensions() -> None:
    """Register the `consultation_date` extension on `Span` if missing."""
    # Guard clause: nothing to do when the extension already exists.
    if Span.has_extension("consultation_date"):
        return
    Span.set_extension("consultation_date", default=None)
|
__call__(doc)
Finds entities
Source code in edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def __call__(self, doc: Doc) -> Doc:
    """
    Pair each matched mention with the first date that follows it.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: Doc
        spaCy Doc object with additional
        `doc.spans['consultation_dates']` `SpanGroup`
    """
    mentions = self.process(doc)

    # Temporary span key, removed again before returning.
    doc.spans["consultation_mentions"] = mentions
    doc.spans["consultation_dates"] = []

    if self.date_matcher is not None:
        doc = self.date_matcher(doc)

    for mention in mentions:
        # Candidate dates must be
        # - in the same sentence as the mention
        # - started after the mention starts
        # - started at most 10 tokens after the mention ends
        first_date = min(
            (
                date
                for date in doc.spans["dates"]
                if mention.sent == date.sent
                and mention.start < date.start <= mention.end + 10
            ),
            key=lambda d: d.start,
            default=None,
        )
        if first_date is None:
            continue

        # The stored span stretches from the mention to the kept date.
        span = doc[mention.start : first_date.end]
        span.label_ = mention.label_
        span._.consultation_date = first_date._.date
        doc.spans["consultation_dates"].append(span)

    del doc.spans["consultation_mentions"]
    return doc
|