Skip to content

edsnlp.pipelines.qualifiers.reported_speech

factory

DEFAULT_CONFIG = dict(pseudo=None, preceding=None, following=None, quotation=None, verbs=None, attr='NORM', on_ents_only=True, within_ents=False, explain=False) module-attribute

create_component(nlp, name, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)

Source code in edsnlp/pipelines/qualifiers/reported_speech/factory.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
@deprecated_factory("rspeech", "eds.reported_speech", default_config=DEFAULT_CONFIG)
@deprecated_factory(
    "reported_speech", "eds.reported_speech", default_config=DEFAULT_CONFIG
)
@Language.factory("eds.reported_speech", default_config=DEFAULT_CONFIG)
def create_component(
    nlp: Language,
    name: str,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    """
    Build the ``eds.reported_speech`` pipeline component.

    Registered under ``eds.reported_speech`` with deprecated aliases
    ``rspeech`` and ``reported_speech``. The ``name`` argument is required
    by spaCy's factory protocol but is not forwarded to the component.
    """
    # Gather every forwarded keyword once, then hand them over as a batch.
    config = dict(
        attr=attr,
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
        on_ents_only=on_ents_only,
        within_ents=within_ents,
        explain=explain,
    )
    return ReportedSpeech(nlp=nlp, **config)

reported_speech

ReportedSpeech

Bases: Qualifier

Implements a reported speech detection algorithm.

The component looks for terms indicating patient statements, and quotations to detect patient speech.

PARAMETER DESCRIPTION
nlp

spaCy nlp pipeline to use for matching.

TYPE: Language

quotation

String gathering all quotation cues.

TYPE: str

verbs

List of reported speech verbs.

TYPE: List[str]

following

List of terms following a reported speech.

TYPE: List[str]

preceding

List of terms preceding a reported speech.

TYPE: List[str]

filter_matches

Whether to filter out overlapping matches.

TYPE: bool

attr

spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr' we can also add a key for each regex.

TYPE: str

on_ents_only

Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks.

TYPE: bool

within_ents

Whether to consider cues within entities.

TYPE: bool

explain

Whether to keep track of cues for each entity.

TYPE: bool

Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
class ReportedSpeech(Qualifier):
    """
    Implements a reported speech detection algorithm.

    The component looks for terms indicating patient statements,
    and quotations to detect patient speech.

    Parameters
    ----------
    nlp : Language
        spaCy nlp pipeline to use for matching.
    quotation : str
        String gathering all quotation cues.
    verbs : List[str]
        List of reported speech verbs.
    following : List[str]
        List of terms following a reported speech.
    preceding : List[str]
        List of terms preceding a reported speech.
    filter_matches : bool
        Whether to filter out overlapping matches.
    attr : str
        spaCy's attribute to use:
        a string with the value "TEXT" or "NORM",
        or a dict with the key 'term_attr'
        we can also add a key for each regex.
    on_ents_only : bool
        Whether to look for matches around detected entities only.
        Useful for faster inference in downstream tasks.
    within_ents : bool
        Whether to consider cues within entities.
    explain : bool
        Whether to keep track of cues for each entity.
    """

    # Default cue lists, taken from this module's `patterns`.
    defaults = dict(
        following=following,
        preceding=preceding,
        verbs=verbs,
        quotation=quotation,
    )

    def __init__(
        self,
        nlp: Language,
        attr: str,
        pseudo: Optional[List[str]],
        preceding: Optional[List[str]],
        following: Optional[List[str]],
        quotation: Optional[List[str]],
        verbs: Optional[List[str]],
        on_ents_only: bool,
        within_ents: bool,
        explain: bool,
    ):

        # Merge user-provided cue lists with the class-level defaults
        # (presumably `None` values fall back to `defaults` inside
        # `get_defaults` — confirm in the `Qualifier` base class).
        terms = self.get_defaults(
            pseudo=pseudo,
            preceding=preceding,
            following=following,
            quotation=quotation,
            verbs=verbs,
        )
        # Replace verb lemmas with their conjugated forms.
        terms["verbs"] = self.load_verbs(terms["verbs"])

        # Quotation cues are regex patterns, matched separately below
        # rather than through the base class' term matcher.
        quotation = terms.pop("quotation")

        super().__init__(
            nlp=nlp,
            attr=attr,
            on_ents_only=on_ents_only,
            explain=explain,
            **terms,
        )

        self.regex_matcher = RegexMatcher(attr=attr)
        self.regex_matcher.build_patterns(dict(quotation=quotation))

        self.within_ents = within_ents

        self.set_extensions()

    @staticmethod
    def set_extensions() -> None:
        """Register the custom spaCy extensions, skipping existing ones."""

        if not Token.has_extension("reported_speech"):
            Token.set_extension("reported_speech", default=False)

        if not Token.has_extension("reported_speech_"):
            Token.set_extension(
                "reported_speech_",
                getter=lambda token: "REPORTED"
                if token._.reported_speech
                else "DIRECT",
            )

        if not Span.has_extension("reported_speech"):
            Span.set_extension("reported_speech", default=False)

        if not Span.has_extension("reported_speech_"):
            Span.set_extension(
                "reported_speech_",
                getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
            )

        if not Span.has_extension("reported_speech_cues"):
            Span.set_extension("reported_speech_cues", default=[])

        if not Doc.has_extension("rspeechs"):
            Doc.set_extension("rspeechs", default=[])

    def load_verbs(self, verbs: List[str]) -> List[str]:
        """
        Conjugate reporting verbs to specific tenses (third person)

        Parameters
        ----------
        verbs: list of reporting verbs to conjugate

        Returns
        -------
        list_rep_verbs: List of reporting verbs conjugated to specific tenses.
        """

        rep_verbs = get_verbs(verbs)

        # Keep third-person present indicative forms, plus present and
        # past participles.
        rep_verbs = rep_verbs.loc[
            (
                (rep_verbs["mode"] == "Indicatif")
                & (rep_verbs["tense"] == "Présent")
                & (rep_verbs["person"].isin(["3s", "3p"]))
            )
            | (rep_verbs["tense"] == "Participe Présent")
            | (rep_verbs["tense"] == "Participe Passé")
        ]

        list_rep_verbs = list(rep_verbs["term"].unique())

        return list_rep_verbs

    def process(self, doc: Doc) -> Doc:
        """
        Finds entities related to reported speech.

        Parameters
        ----------
        doc: spaCy Doc object

        Returns
        -------
        doc: spaCy Doc object, annotated for reported speech
        """

        # Cue matches: terms/verbs from the base matcher plus quotation
        # regex matches.
        matches = self.get_matches(doc)
        matches += list(self.regex_matcher(doc, as_spans=True))

        boundaries = self._boundaries(doc)

        entities = list(doc.ents) + list(doc.spans.get("discarded", []))
        ents = None

        # Removes duplicate matches and pseudo-expressions in one statement
        matches = filter_spans(matches, label_to_remove="pseudo")

        for start, end in boundaries:

            # Entities falling inside the current boundary window.
            ents, entities = consume_spans(
                entities,
                filter=lambda s: check_inclusion(s, start, end),
                second_chance=ents,
            )

            # Cue matches starting inside the current window.
            sub_matches, matches = consume_spans(
                matches, lambda s: start <= s.start < end
            )

            if self.on_ents_only and not ents:
                continue

            sub_preceding = get_spans(sub_matches, "preceding")
            sub_following = get_spans(sub_matches, "following")
            sub_verbs = get_spans(sub_matches, "verbs")
            sub_quotation = get_spans(sub_matches, "quotation")

            if not sub_preceding + sub_following + sub_verbs + sub_quotation:
                continue

            if not self.on_ents_only:
                # Token-level annotation: a token is reported speech if it
                # follows a preceding cue or verb, precedes a following
                # cue, or lies strictly inside a quotation.
                for token in doc[start:end]:
                    token._.reported_speech = (
                        any(m.end <= token.i for m in sub_preceding + sub_verbs)
                        or any(m.start > token.i for m in sub_following)
                        or any(
                            ((m.start < token.i) & (m.end > token.i + 1))
                            for m in sub_quotation
                        )
                    )
            for ent in ents:

                if self.within_ents:
                    # Cues may overlap the entity itself.
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                    cues += [m for m in sub_following if m.start >= ent.start]
                else:
                    # Cues must lie strictly outside the entity.
                    cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                    cues += [m for m in sub_following if m.start >= ent.end]

                # A quotation counts only if it fully encloses the entity.
                cues += [
                    m
                    for m in sub_quotation
                    if (m.start < ent.start) & (m.end > ent.end)
                ]

                reported_speech = ent._.reported_speech or bool(cues)
                ent._.reported_speech = reported_speech

                if self.explain:
                    ent._.reported_speech_cues += cues

                if not self.on_ents_only and reported_speech:
                    for token in ent:
                        token._.reported_speech = True
        return doc
defaults = dict(following=following, preceding=preceding, verbs=verbs, quotation=quotation) class-attribute
regex_matcher = RegexMatcher(attr=attr) instance-attribute
within_ents = within_ents instance-attribute
__init__(nlp, attr, pseudo, preceding, following, quotation, verbs, on_ents_only, within_ents, explain)
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def __init__(
    self,
    nlp: Language,
    attr: str,
    pseudo: Optional[List[str]],
    preceding: Optional[List[str]],
    following: Optional[List[str]],
    quotation: Optional[List[str]],
    verbs: Optional[List[str]],
    on_ents_only: bool,
    within_ents: bool,
    explain: bool,
):
    """
    Initialise the reported speech qualifier.

    See the class docstring for parameter descriptions.
    """

    # Merge user-provided cue lists with the class-level defaults
    # (presumably `None` values fall back to `defaults` inside
    # `get_defaults` — confirm in the `Qualifier` base class).
    terms = self.get_defaults(
        pseudo=pseudo,
        preceding=preceding,
        following=following,
        quotation=quotation,
        verbs=verbs,
    )
    # Replace verb lemmas with their conjugated forms.
    terms["verbs"] = self.load_verbs(terms["verbs"])

    # Quotation cues are regex patterns, matched separately below
    # rather than through the base class' term matcher.
    quotation = terms.pop("quotation")

    super().__init__(
        nlp=nlp,
        attr=attr,
        on_ents_only=on_ents_only,
        explain=explain,
        **terms,
    )

    self.regex_matcher = RegexMatcher(attr=attr)
    self.regex_matcher.build_patterns(dict(quotation=quotation))

    self.within_ents = within_ents

    self.set_extensions()
set_extensions()
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
@staticmethod
def set_extensions() -> None:
    """
    Register the custom spaCy attributes used by the qualifier.

    Each extension is declared only when it is not already registered,
    so repeated calls are harmless.
    """

    def as_label(obj) -> str:
        # Human-readable counterpart of the boolean flag.
        return "REPORTED" if obj._.reported_speech else "DIRECT"

    for cls in (Token, Span):
        if not cls.has_extension("reported_speech"):
            cls.set_extension("reported_speech", default=False)
        if not cls.has_extension("reported_speech_"):
            cls.set_extension("reported_speech_", getter=as_label)

    if not Span.has_extension("reported_speech_cues"):
        Span.set_extension("reported_speech_cues", default=[])

    if not Doc.has_extension("rspeechs"):
        Doc.set_extension("rspeechs", default=[])
load_verbs(verbs)

Conjugate reporting verbs to specific tenses (third person)

PARAMETER DESCRIPTION
verbs

TYPE: List[str]

RETURNS DESCRIPTION
list_rep_verbs
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def load_verbs(self, verbs: List[str]) -> List[str]:
    """
    Conjugate reporting verbs to specific tenses (third person)

    Parameters
    ----------
    verbs: list of reporting verbs to conjugate

    Returns
    -------
    list_rep_verbs: List of reporting verbs conjugated to specific tenses.
    """

    conjugated = get_verbs(verbs)

    # Third-person (singular/plural) present indicative forms…
    third_person_present = (
        (conjugated["mode"] == "Indicatif")
        & (conjugated["tense"] == "Présent")
        & (conjugated["person"].isin(["3s", "3p"]))
    )
    # …plus present and past participles.
    participles = conjugated["tense"].isin(
        ["Participe Présent", "Participe Passé"]
    )

    selected = conjugated.loc[third_person_present | participles]

    return list(selected["term"].unique())
process(doc)

Finds entities related to reported speech.

PARAMETER DESCRIPTION
doc

TYPE: Doc

RETURNS DESCRIPTION
doc
Source code in edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def process(self, doc: Doc) -> Doc:
    """
    Finds entities related to reported speech.

    Parameters
    ----------
    doc: spaCy Doc object

    Returns
    -------
    doc: spaCy Doc object, annotated for reported speech
    """

    # Cue matches: terms/verbs from the base matcher plus quotation
    # regex matches.
    matches = self.get_matches(doc)
    matches += list(self.regex_matcher(doc, as_spans=True))

    boundaries = self._boundaries(doc)

    entities = list(doc.ents) + list(doc.spans.get("discarded", []))
    ents = None

    # Removes duplicate matches and pseudo-expressions in one statement
    matches = filter_spans(matches, label_to_remove="pseudo")

    for start, end in boundaries:

        # Entities falling inside the current boundary window.
        ents, entities = consume_spans(
            entities,
            filter=lambda s: check_inclusion(s, start, end),
            second_chance=ents,
        )

        # Cue matches starting inside the current window.
        sub_matches, matches = consume_spans(
            matches, lambda s: start <= s.start < end
        )

        if self.on_ents_only and not ents:
            continue

        sub_preceding = get_spans(sub_matches, "preceding")
        sub_following = get_spans(sub_matches, "following")
        sub_verbs = get_spans(sub_matches, "verbs")
        sub_quotation = get_spans(sub_matches, "quotation")

        if not sub_preceding + sub_following + sub_verbs + sub_quotation:
            continue

        if not self.on_ents_only:
            # Token-level annotation: a token is reported speech if it
            # follows a preceding cue or verb, precedes a following cue,
            # or lies strictly inside a quotation.
            for token in doc[start:end]:
                token._.reported_speech = (
                    any(m.end <= token.i for m in sub_preceding + sub_verbs)
                    or any(m.start > token.i for m in sub_following)
                    or any(
                        ((m.start < token.i) & (m.end > token.i + 1))
                        for m in sub_quotation
                    )
                )
        for ent in ents:

            if self.within_ents:
                # Cues may overlap the entity itself.
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
                cues += [m for m in sub_following if m.start >= ent.start]
            else:
                # Cues must lie strictly outside the entity.
                cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.start]
                cues += [m for m in sub_following if m.start >= ent.end]

            # A quotation counts only if it fully encloses the entity.
            cues += [
                m
                for m in sub_quotation
                if (m.start < ent.start) & (m.end > ent.end)
            ]

            reported_speech = ent._.reported_speech or bool(cues)
            ent._.reported_speech = reported_speech

            if self.explain:
                ent._.reported_speech_cues += cues

            if not self.on_ents_only and reported_speech:
                for token in ent:
                    token._.reported_speech = True
    return doc

patterns

verbs: List[str] = ['affirmer', 'ajouter', 'assurer', 'confirmer', 'demander', 'dire', 'déclarer', 'décrire', 'décrire', 'démontrer', 'expliquer', 'faire remarquer', 'indiquer', 'informer', 'insinuer', 'insister', 'jurer', 'nier', 'nier', 'noter', 'objecter', 'observer', 'parler', 'promettre', 'préciser', 'prétendre', 'prévenir', 'raconter', 'rappeler', 'rapporter', 'reconnaître', 'réfuter', 'répliquer', 'répondre', 'répéter', 'révéler', 'se plaindre', 'souhaiter', 'souligner', 'supplier', 'verbaliser', 'vouloir', 'vouloir'] module-attribute

following: List[str] = ["d'après le patient", "d'après la patiente"] module-attribute

preceding: List[str] = ['pas de critique de', 'crainte de', 'menace de', 'insiste sur le fait que', "d'après le patient", "d'après la patiente", 'peur de'] module-attribute

quotation: str = '(\\".+\\")|(\\«.+\\»)' module-attribute

Back to top