Skip to content

eds_scikit.event.consultations

get_consultation_dates

get_consultation_dates(vo: DataFrame, note: DataFrame, note_nlp: Optional[DataFrame] = None, algo: Union[str, List[str]] = ['nlp'], max_timedelta: timedelta = timedelta(days=7), structured_config: Dict[str, Any] = dict(), nlp_config: Dict[str, Any] = dict()) -> DataFrame

Extract consultation dates. See the implementation details of the algo(s) you want to use

PARAMETER DESCRIPTION
vo

visit_occurrence DataFrame

TYPE: DataFrame

note

note DataFrame

TYPE: DataFrame

note_nlp

note_nlp DataFrame, used only with the "nlp" algo

TYPE: Optional[DataFrame] DEFAULT: None

algo

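Algorithm(s) to use to determine consultation dates. Multiple algorithms can be provided as a list. Accepted values are "structured" (see get_consultation_dates_structured()) and "nlp" (see get_consultation_dates_nlp()).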

TYPE: Union[str, List[str]] DEFAULT: ['nlp']

max_timedelta

If two extracted consultations are spaced by less than max_timedelta, we consider that they correspond to the same event and only keep the first one.

TYPE: timedelta DEFAULT: timedelta(days=7)

structured_config

A dictionary of parameters passed to the structured algorithm

TYPE: Dict[str, Any] DEFAULT: dict()

nlp_config

A dictionary of parameters passed to the nlp algorithm

TYPE: Dict[str, Any] DEFAULT: dict()

RETURNS DESCRIPTION
DataFrame

Event type DataFrame with the following columns:

  • person_id
  • visit_occurrence_id
  • CONSULTATION_DATE: corresponds to the note_datetime value of a consultation report coming from the considered visit.
  • CONSULTATION_NOTE_ID: the note_id of the corresponding report.
  • CONSULTATION_DATE_EXTRACTION: the method of extraction
Source code in eds_scikit/event/consultations.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
@concept_checker(
    concepts=[
        "CONSULTATION_DATE",
        "CONSULTATION_ID",
        "CONSULTATION_DATE_EXTRACTION",
    ]
)
def get_consultation_dates(
    vo: DataFrame,
    note: DataFrame,
    note_nlp: Optional[DataFrame] = None,
    algo: Union[str, List[str]] = ["nlp"],
    max_timedelta: timedelta = timedelta(days=7),
    structured_config: Dict[str, Any] = dict(),
    nlp_config: Dict[str, Any] = dict(),
) -> DataFrame:
    """
    Extract consultation dates.
    See the implementation details of the algo(s) you want to use

    Parameters
    ----------
    vo : DataFrame
        `visit_occurrence` DataFrame
    note : DataFrame
        `note` DataFrame. Its `note_id` and `visit_occurrence_id` columns are used
        to attach each extracted date to a visit.
    note_nlp : Optional[DataFrame]
        `note_nlp` DataFrame, required when `"nlp"` is one of the requested algos
    algo: Union[str, List[str]] = ["nlp"]
        Algorithm(s) to use to determine consultation dates.
        Multiple algorithms can be provided as a list. Accepted values are:

        - `"structured"`: See [get_consultation_dates_structured()][eds_scikit.event.consultations.get_consultation_dates_structured]
        - `"nlp"`: See [get_consultation_dates_nlp()][eds_scikit.event.consultations.get_consultation_dates_nlp]
    max_timedelta: timedelta = timedelta(days=7)
        If two extracted consultations are spaced by less than `max_timedelta`,
        we consider that they correspond to the same event and only keep the first one.
    structured_config : Dict[str, Any] = dict()
        A dictionary of keyword arguments forwarded to the [`structured`][eds_scikit.event.consultations.get_consultation_dates_structured] algorithm
    nlp_config : Dict[str, Any] = dict()
        A dictionary of keyword arguments forwarded to the [`nlp`][eds_scikit.event.consultations.get_consultation_dates_nlp] algorithm

    Returns
    -------
    DataFrame
        Event type DataFrame, as returned by `clean_consultations`. The
        `concept_checker` declaration above lists the following concepts:

        - `CONSULTATION_DATE`: the consultation date retained for the visit.
        - `CONSULTATION_ID`: identifier of the consultation event
          (NOTE(review): earlier documentation called this `CONSULTATION_NOTE_ID`;
          confirm the final column name against `clean_consultations`).
        - `CONSULTATION_DATE_EXTRACTION`: the method(s) of extraction, joined
          with `"+"` when several algorithms found the same date for a visit.

    Raises
    ------
    ValueError
        If `algo` is empty, contains an unknown algorithm name, or contains
        `"nlp"` while `note_nlp` is `None`.
    """

    fw = get_framework(vo)

    # The list/dict defaults in the signature are only read, never mutated,
    # so sharing them between calls is harmless here.
    algo = [algo] if isinstance(algo, str) else list(algo)

    # Validate the request up front so that a typo fails immediately instead
    # of being silently ignored (or failing later inside `concat`).
    if not algo:
        raise ValueError("`algo` should contain at least one algorithm name.")
    unknown_algos = [a for a in algo if a not in ("structured", "nlp")]
    if unknown_algos:
        raise ValueError(
            f"Unknown algo(s) {unknown_algos!r}: "
            "accepted values are 'structured' and 'nlp'."
        )
    if "nlp" in algo and note_nlp is None:
        raise ValueError("`note_nlp` should be provided when using the 'nlp' algo.")

    dates = []

    for a in algo:
        if a == "structured":
            dates.append(
                get_consultation_dates_structured(
                    vo=vo,
                    note=note,
                    **structured_config,
                )
            )
        elif a == "nlp":
            dates.append(
                get_consultation_dates_nlp(
                    note_nlp=note_nlp,
                    **nlp_config,
                )
            )

    # Each algorithm returns a frame indexed by `note_id`; bring the index back
    # as a column and attach the visit each note belongs to.
    dates_per_note = (
        fw.concat(dates)
        .reset_index()
        .merge(note[["note_id", "visit_occurrence_id"]], on="note_id", how="inner")
    )

    # Remove timezone errors from spark
    dates_per_note["CONSULTATION_DATE"] = dates_per_note["CONSULTATION_DATE"].astype(
        str
    )

    # One row per (visit, date); the extraction methods that produced this
    # date are deduplicated, sorted and joined, e.g. "NLP+STRUCTURED".
    dates_per_visit = (
        dates_per_note.groupby(["visit_occurrence_id", "CONSULTATION_DATE"])[
            "CONSULTATION_DATE_EXTRACTION"
        ]
        .unique()
        .apply(sorted)
        .str.join("+")
    )

    dates_per_visit.name = "CONSULTATION_DATE_EXTRACTION"

    dates_per_visit = bd.add_unique_id(
        dates_per_visit.reset_index(), col_name="TMP_CONSULTATION_ID"
    )

    # Convert back to datetime format
    dates_per_visit["CONSULTATION_DATE"] = bd.to_datetime(
        dates_per_visit["CONSULTATION_DATE"], errors="coerce"
    )

    dates_per_visit = clean_consultations(
        dates_per_visit,
        max_timedelta,
    )

    # Equivalent to df.spark.cache() for ks.DataFrame
    bd.cache(dates_per_visit)

    return dates_per_visit

get_consultation_dates_structured

get_consultation_dates_structured(note: DataFrame, vo: Optional[DataFrame] = None, kept_note_class_source_value: Optional[Union[str, List[str]]] = 'CR-CONS', kept_visit_source_value: Optional[Union[str, List[str]]] = 'consultation externe') -> DataFrame

Uses note_datetime value to infer true consultation dates

PARAMETER DESCRIPTION
note

A note DataFrame with at least the following columns:

  • note_id
  • note_datetime
  • note_class_source_value if kept_note_class_source_value is not None
  • visit_occurrence_id if kept_visit_source_value is not None

TYPE: DataFrame

vo

A visit_occurrence DataFrame to provide if kept_visit_source_value is not None, with at least the following columns:

  • visit_occurrence_id
  • visit_source_value if kept_visit_source_value is not None

TYPE: Optional[DataFrame] DEFAULT: None

kept_note_class_source_value

Value(s) allowed for the note_class_source_value column.

TYPE: Optional[Union[str, List[str]]] DEFAULT: 'CR-CONS'

kept_visit_source_value

Value(s) allowed for the visit_source_value column.

TYPE: Optional[Union[str, List[str]]], optional DEFAULT: 'consultation externe'

RETURNS DESCRIPTION
Dataframe

With 2 added columns corresponding to the following concept:

  • CONSULTATION_DATE, containing the date
  • CONSULTATION_DATE_EXTRACTION, containing "STRUCTURED"
Source code in eds_scikit/event/consultations.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
def get_consultation_dates_structured(
    note: DataFrame,
    vo: Optional[DataFrame] = None,
    kept_note_class_source_value: Optional[Union[str, List[str]]] = "CR-CONS",
    kept_visit_source_value: Optional[Union[str, List[str]]] = "consultation externe",
) -> DataFrame:
    """
    Uses `note_datetime` value to infer *true* consultation dates

    Parameters
    ----------
    note : DataFrame
        A `note` DataFrame with at least the following columns:

        - `note_id`
        - `note_datetime`
        - `note_class_source_value` **if** `kept_note_class_source_value is not None`
        - `visit_occurrence_id` **if** `kept_visit_source_value is not None`
    vo : Optional[DataFrame]
        A visit_occurrence DataFrame to provide **if** `kept_visit_source_value is not None`,
        with at least the following columns:

        - `visit_occurrence_id`
        - `visit_source_value`
    kept_note_class_source_value : Optional[Union[str, List[str]]]
        Value(s) allowed for the `note_class_source_value` column.
        `None` disables this filter.
    kept_visit_source_value : Optional[Union[str, List[str]]], optional
        Value(s) allowed for the `visit_source_value` column.
        `None` disables this filter (and `vo` is then not needed).

    Returns
    -------
    Dataframe
        Indexed by `note_id`, with one row per kept note and 2 columns:

        - `CONSULTATION_DATE`, containing the note's `note_datetime`
        - `CONSULTATION_DATE_EXTRACTION`, containing `"STRUCTURED"`

    Raises
    ------
    ValueError
        If `kept_visit_source_value` is not `None` but `vo` was not provided.
    """

    # Fail early with an explicit message: with the default arguments the
    # visit filter is active, and `vo=None` would otherwise crash on indexing.
    if kept_visit_source_value is not None and vo is None:
        raise ValueError(
            "`vo` should be provided when `kept_visit_source_value` is not None."
        )

    kept_note = note

    if kept_note_class_source_value is not None:
        if isinstance(kept_note_class_source_value, str):
            kept_note_class_source_value = [kept_note_class_source_value]
        kept_note = note[
            note.note_class_source_value.isin(set(kept_note_class_source_value))
        ]

    if kept_visit_source_value is not None:
        if isinstance(kept_visit_source_value, str):
            kept_visit_source_value = [kept_visit_source_value]
        # Inner merge: only notes attached to a visit of an allowed type remain.
        kept_note = kept_note.merge(
            vo[
                [
                    "visit_occurrence_id",
                    "visit_source_value",
                ]
            ][vo.visit_source_value.isin(set(kept_visit_source_value))],
            on="visit_occurrence_id",
        )

    dates_per_note = kept_note[["note_datetime", "note_id"]].rename(
        columns={
            "note_datetime": "CONSULTATION_DATE",
        }
    )

    dates_per_note["CONSULTATION_DATE_EXTRACTION"] = "STRUCTURED"

    return dates_per_note.set_index("note_id")

get_consultation_dates_nlp

get_consultation_dates_nlp(note_nlp: DataFrame, dates_to_keep: str = 'min') -> DataFrame

Uses consultation dates extracted a priori in consultation reports to infer true consultation dates

PARAMETER DESCRIPTION
note_nlp

A DataFrame with (at least) the following columns:

  • note_id
  • consultation_date
  • start if using dates_to_keep=first: start should store the character offset of the extracted date (rows are sorted on this column to find the first occurrence).

TYPE: DataFrame

dates_to_keep

How to handle multiple consultation dates found in the document:

  • min: keep the oldest one
  • first: keep the occurrence that appeared first in the text
  • all: keep all dates

TYPE: str, optional DEFAULT: 'min'

RETURNS DESCRIPTION
Dataframe

With 2 added columns corresponding to the following concept:

  • CONSULTATION_DATE, containing the date
  • CONSULTATION_DATE_EXTRACTION, containing "NLP"
Source code in eds_scikit/event/consultations.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
def get_consultation_dates_nlp(
    note_nlp: DataFrame,
    dates_to_keep: str = "min",
) -> DataFrame:
    """
    Uses consultation dates extracted *a priori* in consultation reports to infer *true* consultation dates

    Parameters
    ----------
    note_nlp : DataFrame
        A DataFrame with (at least) the following columns:

        - `note_id`
        - `consultation_date`
        - `start` **if** using `dates_to_keep="first"`:
          rows are sorted on this column to find the first occurrence in the text
          (presumably the character offset of the extracted date; verify against
          the NLP pipeline output).
    dates_to_keep : str, optional
        How to handle multiple consultation dates found in the document:

        - `min`: keep the oldest one
        - `first`: keep the occurrence that appeared first in the text
        - `all`: keep all dates

    Returns
    -------
    Dataframe
        Indexed by `note_id`, with 2 columns:

        - `CONSULTATION_DATE`, containing the date
        - `CONSULTATION_DATE_EXTRACTION`, containing `"NLP"`

    Raises
    ------
    ValueError
        If `dates_to_keep` is not one of `"min"`, `"first"` or `"all"`.
    """

    if dates_to_keep == "min":
        dates_per_note = note_nlp.groupby("note_id").agg(
            CONSULTATION_DATE=("consultation_date", "min"),
        )
    elif dates_to_keep == "first":
        # Sorting before grouping makes "first" pick the lowest `start` per note.
        dates_per_note = (
            note_nlp.sort_values(by="start")
            .groupby("note_id")
            .agg(CONSULTATION_DATE=("consultation_date", "first"))
        )
    elif dates_to_keep == "all":
        dates_per_note = note_nlp[["consultation_date", "note_id"]].set_index("note_id")
        dates_per_note = dates_per_note.rename(
            columns={"consultation_date": "CONSULTATION_DATE"}
        )
    else:
        # Without this branch an unknown value surfaced as an UnboundLocalError
        # on `dates_per_note`, which hides the actual mistake from the caller.
        raise ValueError(
            f"Unknown value for `dates_to_keep`: {dates_to_keep!r}. "
            "Accepted values are 'min', 'first' and 'all'."
        )

    dates_per_note["CONSULTATION_DATE_EXTRACTION"] = "NLP"

    return dates_per_note