Skip to content

eds_scikit.utils.process_table

tag_table_by_type

tag_table_by_type(table: DataFrame, type_groups: Union[str, Dict], source_col: str, target_col: str, filter_table: bool = False)

Add a tag column to a table based on the values of one of its columns (ex : condition_occurrence -> "DIABETIC", "NOT DIABETIC")

PARAMETER DESCRIPTION
table

Table (must contain column source_col; the column target_col is added by the function)

TYPE: DataFrame

type_groups

Regex or Dict of regex to define tags and associated regex.

TYPE: Union[str, Dict]

source_col

Column on which the tagging is applied.

TYPE: str

target_col

Label column name

TYPE: str

filter_table

If True, remove untagged rows (rows whose tag is still "OTHER")

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
DataFrame

Input dataframe with tag column target_col

Output
person_id condition_source_value DIABETIC_CONDITION
001 E100 DIABETES_TYPE_I
002 E101 DIABETES_TYPE_I
003 E110 DIABETES_TYPE_II
004 E113 DIABETES_TYPE_II
005 A001 OTHER
Source code in eds_scikit/utils/process_table.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def tag_table_by_type(
    table: DataFrame,
    type_groups: Union[str, Dict],
    source_col: str,
    target_col: str,
    filter_table: bool = False,
):
    """Add a tag column to a table based on regex matches on one of its columns.

    Example : condition_occurrence -> "DIABETIC", "NOT DIABETIC"

    Every row is first tagged ``"OTHER"``. Then, for each ``tag -> regex``
    pair of `type_groups` (in dict order), rows whose `source_col` value
    (cast to ``str``) contains the regex, case-insensitively, receive that
    tag. When several regexes match the same row, the last one wins.

    Parameters
    ----------
    table : DataFrame
        Table (must contain column `source_col`). The input object is left
        untouched: the tag column is added on a new DataFrame.
    type_groups : Union[str, Dict]
        Regex or Dict of regex to define tags and associated regex.
        A single regex string is used both as the tag and as the pattern.
    source_col : str
        Column on which the tagging is applied.
    target_col : str
        Label column name (created, or overwritten if already present).
    filter_table : bool
        If True, remove untagged rows (rows whose tag is still "OTHER").

    Returns
    -------
    DataFrame
        Copy of the input dataframe with tag column `target_col`

    Output
    -------
    | person_id                   |   condition_source_value | DIABETIC_CONDITION    |
    |:---------------------------:|-------------------------:|:---------------------:|
    | 001                         |                     E100 | DIABETES_TYPE_I       |
    | 002                         |                     E101 | DIABETES_TYPE_I       |
    | 003                         |                     E110 | DIABETES_TYPE_II      |
    | 004                         |                     E113 | DIABETES_TYPE_II      |
    | 005                         |                     A001 | OTHER                 |


    """
    if isinstance(type_groups, str):
        type_groups = {type_groups: type_groups}

    # `assign` returns a new DataFrame, so the caller's table is not mutated
    # (same approach as the other tagging helpers of this module).
    table = table.assign(**{target_col: "OTHER"})

    for type_name, type_value in type_groups.items():
        # The source column is re-read on every iteration on purpose: it keeps
        # the behaviour unchanged when `source_col` and `target_col` coincide.
        is_match = (
            table[source_col]
            .astype(str)
            .str.contains(
                type_value,
                case=False,
                regex=True,
                na=False,
            )
        )
        table.loc[is_match, target_col] = type_name

    logger.debug(
        "The following {} : {} have been tagged on table.",
        target_col,
        type_groups,
    )

    if filter_table:
        table = table[table[target_col] != "OTHER"]

    return table

tag_table_period_length

tag_table_period_length(table: DataFrame, length_of_stays: List[float], start_date_col: str = 'visit_start_datetime', end_date_col: str = 'visit_end_datetime') -> DataFrame

Tag table by length of stays (can be applied to visit_occurrence table)

Example : length_of_stays = [7, 14]

Output
person_id visit_start_datetime visit_end_datetime length_of_stay
001 2020-04-01 2020-04-12 "7 days - 14 days"
002 2020-04-01 2020-04-03 "<= 7 days"
003 2020-04-01 2020-04-25 ">= 14 days"
PARAMETER DESCRIPTION
table

TYPE: DataFrame

length_of_stays

Example : [7 , 14]

TYPE: List[float]

start_date_col

by default "visit_start_datetime"

TYPE: str, optional DEFAULT: 'visit_start_datetime'

end_date_col

by default "visit_end_datetime"

TYPE: str, optional DEFAULT: 'visit_end_datetime'

RETURNS DESCRIPTION
DataFrame
Source code in eds_scikit/utils/process_table.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def tag_table_period_length(
    table: DataFrame,
    length_of_stays: List[float],
    start_date_col: str = "visit_start_datetime",
    end_date_col: str = "visit_end_datetime",
) -> DataFrame:
    """Tag table by length of stays (can be applied to visit_occurrence table)

    The length of a stay is ``end_date_col - start_date_col`` expressed in days.
    A ``length_of_stay`` label column is added; labels are applied in the
    following order, each one overriding the previous ones:

    1. ``"Not specified"`` for every row,
    2. ``"Incomplete stay"`` where `end_date_col` is missing,
    3. ``"<= {min} days"`` where length <= smallest threshold,
    4. ``">= {max} days"`` where length >= largest threshold,
    5. ``"{a} days - {b} days"`` where a <= length < b, for each pair of
       consecutive thresholds.

    Example : length_of_stays = [7, 14]

    Output
    -------
    | person_id                   |   visit_start_datetime   | visit_end_datetime    | length_of_stay        |
    |:---------------------------:|-------------------------:|:---------------------:|:---------------------:|
    | 001                         |               2020-04-01 | 2020-04-12            | "7 days - 14 days"    |
    | 002                         |               2020-04-01 | 2020-04-03            | "<= 7 days"           |
    | 003                         |               2020-04-01 | 2020-04-25            | ">= 14 days"          |


    Parameters
    ----------
    table : DataFrame
        Must contain `start_date_col` and `end_date_col`. A temporary column
        named ``length`` is created and dropped before returning.
    length_of_stays : List[float]
        Thresholds in days. Example : [7 , 14]. They are sorted internally
        (the caller's list is not modified). If empty, only the
        "Not specified" / "Incomplete stay" labels are set.
    start_date_col : str, optional
        by default "visit_start_datetime"
    end_date_col : str, optional
        by default "visit_end_datetime"

    Returns
    -------
    DataFrame
        New dataframe with the additional ``length_of_stay`` column.
    """
    # Work on a sorted copy: the first/last items are used as min/max and
    # consecutive items as range bounds, which is only meaningful when ordered.
    thresholds = sorted(length_of_stays)

    table = table.assign(
        length=(table[end_date_col] - table[start_date_col])
        / np.timedelta64(timedelta(days=1))
    )

    # Incomplete stays
    table = table.assign(length_of_stay="Not specified")
    table["length_of_stay"] = table["length_of_stay"].mask(
        table[end_date_col].isna(),
        "Incomplete stay",
    )

    # Complete stays
    if thresholds:
        min_duration = thresholds[0]
        max_duration = thresholds[-1]
        table["length_of_stay"] = table["length_of_stay"].mask(
            (table["length"] <= min_duration),
            "<= {} days".format(min_duration),
        )
        table["length_of_stay"] = table["length_of_stay"].mask(
            (table["length"] >= max_duration),
            ">= {} days".format(max_duration),
        )
        # Intermediate ranges are applied last, so a length equal to a lower
        # bound ends up in the range starting at that bound.
        for min_length, max_length in zip(thresholds[:-1], thresholds[1:]):
            table["length_of_stay"] = table["length_of_stay"].mask(
                (table["length"] >= min_length) & (table["length"] < max_length),
                "{} days - {} days".format(min_length, max_length),
            )

    return table.drop(columns="length")

tag_table_with_age

tag_table_with_age(table: DataFrame, date_col: str, person: DataFrame, age_ranges: List[int] = None)

Tag table with person age

PARAMETER DESCRIPTION
table

must contain person_id and date_col

TYPE: DataFrame

date_col

date column from table on which to compute age

person

must contain person_id

TYPE: DataFrame

age_ranges

if None, simply compute age. example : None, [18], [18, 60]

TYPE: List[int] DEFAULT: None

RETURNS DESCRIPTION
DataFrame
Source code in eds_scikit/utils/process_table.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def tag_table_with_age(
    table: DataFrame, date_col: str, person: DataFrame, age_ranges: List[int] = None
):
    """Tag table with person age

    The table is merged with `person` on ``person_id`` (default merge, so rows
    without a matching person are dropped), then two columns are added:

    - ``age``: whole number of years between ``birth_datetime`` and `date_col`
      (fractional part truncated),
    - ``age_range``: label built from `age_ranges`, or ``"Not specified"``
      when `age_ranges` is None or empty.

    The ``birth_datetime`` column brought in by the merge is kept in the result.

    Parameters
    ----------
    table : DataFrame
        must contain person_id and date_col
    date_col : str
        date column from table on which to compute age
    person : DataFrame
        must contain person_id and birth_datetime
    age_ranges : List[int]
        if None, simply compute age.
        example : None, [18], [18, 60]
        With [18, 60] the labels are "age <= 18", "18 < age <= 60" and
        "age > 60". The bounds are sorted internally; the caller's list is
        not modified.

    Returns
    -------
    DataFrame
    """
    check_columns(df=person, required_columns=["person_id", "birth_datetime"])
    check_columns(df=table, required_columns=[date_col, "person_id"])

    table = table.merge(person[["person_id", "birth_datetime"]], on="person_id")

    # Mean length of a year in days, leap years included. The previous
    # divisor (356) was a typo that inflated every age by about 2.5 %.
    days_per_year = 365.25
    table["age"] = (table[date_col] - table["birth_datetime"]) / (
        np.timedelta64(timedelta(days=1)) * days_per_year
    )
    table["age"] = table["age"].astype(int)

    table["age_range"] = "Not specified"
    if age_ranges:
        # sorted() instead of list.sort(): do not reorder the caller's list
        bounds = sorted(age_ranges)
        table.loc[table.age <= bounds[0], "age_range"] = f"age <= {bounds[0]}"

        for age_min, age_max in zip(bounds[:-1], bounds[1:]):
            in_range = (table.age > age_min) & (table.age <= age_max)
            table.loc[in_range, "age_range"] = f"{age_min} < age <= {age_max}"

        table.loc[table.age > bounds[-1], "age_range"] = f"age > {bounds[-1]}"

    return table
Back to top