def get_consultation_dates_nlp(
    note_nlp: DataFrame,
    dates_to_keep: str = "min",
) -> DataFrame:
    """
    Infer consultation dates from dates extracted *a priori* by NLP.

    Parameters
    ----------
    note_nlp : DataFrame
        A DataFrame with (at least) the following columns:

        - `note_id`
        - `consultation_date`
        - `start` **if** using `dates_to_keep="first"`:
          rows are sorted by this column to decide which extracted date
          comes first (presumably the character offset of the extracted
          date in the note -- confirm against the NLP pipeline output).
    dates_to_keep : str, optional
        How to handle multiple consultation dates found in the document:

        - `min`: keep the oldest one
        - `first`: keep the occurrence that appeared first in the text
        - `all`: keep all dates

    Returns
    -------
    DataFrame
        Indexed by `note_id`, with the 2 following columns:

        - `CONSULTATION_DATE`, containing the date
        - `CONSULTATION_DATE_EXTRACTION`, containing `"NLP"`

        With `min` and `first` there is one row per note; with `all`
        there is one row per extracted date.

    Raises
    ------
    ValueError
        If `dates_to_keep` is not one of `"min"`, `"first"` or `"all"`.
    """
    # Validate up front: otherwise an unknown mode would fall through every
    # branch and fail later with an unrelated UnboundLocalError.
    allowed_modes = ("min", "first", "all")
    if dates_to_keep not in allowed_modes:
        raise ValueError(
            f"Unknown value for `dates_to_keep`: {dates_to_keep!r}. "
            f"Expected one of {allowed_modes}."
        )

    if dates_to_keep == "min":
        dates_per_note = note_nlp.groupby("note_id").agg(
            CONSULTATION_DATE=("consultation_date", "min"),
        )
    elif dates_to_keep == "first":
        # Sorting by `start` before grouping makes the "first" aggregation
        # pick the date with the smallest `start` in each note.
        dates_per_note = (
            note_nlp.sort_values(by="start")
            .groupby("note_id")
            .agg(CONSULTATION_DATE=("consultation_date", "first"))
        )
    else:  # dates_to_keep == "all"
        # No aggregation: keep every extracted date, indexed by note.
        dates_per_note = note_nlp[["consultation_date", "note_id"]].set_index("note_id")
        dates_per_note = dates_per_note.rename(
            columns={"consultation_date": "CONSULTATION_DATE"}
        )

    dates_per_note["CONSULTATION_DATE_EXTRACTION"] = "NLP"

    return dates_per_note