Skip to content

eds_scikit.phenotype.base

Features

Features()

Class used to store features (i.e. DataFrames). Features are stored in the self._features dictionary.

Source code in eds_scikit/phenotype/base.py
22
23
24
def __init__(self):
    """Create an empty feature store."""
    # No feature has been recorded yet.
    self.last_feature = None
    # Backing dictionary in which the features are stored.
    self._features = dict()

Phenotype

Phenotype(data: BaseData, name: Optional[str] = None, **kwargs)

Base class for phenotyping

PARAMETER DESCRIPTION
data

A BaseData object

TYPE: BaseData

name

Name of the phenotype. If left to None, the name of the class will be used instead

TYPE: Optional[str] DEFAULT: None

Source code in eds_scikit/phenotype/base.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def __init__(
    self,
    data: BaseData,
    name: Optional[str] = None,
    **kwargs,
):
    """
    Set up the phenotype: data handle, empty feature store, name and logger.

    Parameters
    ----------
    data : BaseData
        A BaseData object
    name : Optional[str]
        Name of the phenotype. When None, the name of the class is used
        as is; otherwise the given name is passed through
        `to_valid_variable_name`.
    **kwargs
        Accepted but not used by this constructor.
    """
    self.data = data
    self.features = Features()

    if name is None:
        # No explicit name: fall back on the name of the (sub)class.
        self.name = self.__class__.__name__
    else:
        self.name = to_valid_variable_name(name)

    # Logger bound to the phenotype name.
    self.logger = logger.bind(classname=self.name, sep=".")
    self.logger = logger.bind(classname=self.name, sep=".")

add_code_feature

add_code_feature(output_feature: str, codes: dict, source: str = 'icd10', additional_filtering: Optional[dict] = None)

Adds a feature from either ICD10 or CCAM codes

PARAMETER DESCRIPTION
output_feature

Name of the feature

TYPE: str

codes

Dictionary of codes to provide to the from_codes function

TYPE: dict

source

Either 'icd10' or 'ccam', by default 'icd10'

TYPE: str DEFAULT: 'icd10'

additional_filtering

Dictionary passed to the from_codes functions for filtering

TYPE: Optional[dict] DEFAULT: None

RETURNS DESCRIPTION
Phenotype

The current Phenotype object with an additional feature stored in self.features[output_feature]

Source code in eds_scikit/phenotype/base.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def add_code_feature(
    self,
    output_feature: str,
    codes: dict,
    source: str = "icd10",
    additional_filtering: Optional[dict] = None,
):
    """
    Build a feature from ICD10 or CCAM codes and store it.

    Parameters
    ----------
    output_feature : str
        Name under which the feature is stored in self.features
    codes : dict
        Dictionary of codes, forwarded to the `from_codes` function
    source : str
        Either 'icd10' or 'ccam', by default 'icd10'
    additional_filtering : Optional[dict]
        Filtering dictionary forwarded to the `from_codes` function.
        A falsy value (None, empty dict) is replaced by a new empty dict.

    Returns
    -------
    Phenotype
        The current Phenotype object with an additional feature
        stored in self.features[output_feature]

    Raises
    ------
    ValueError
        If `source` is neither 'icd10' nor 'ccam'
    """
    # Guard clause: reject unknown sources before doing anything else.
    if source not in ["icd10", "ccam"]:
        raise ValueError(f"source should be either 'icd10' or 'ccam', got {source}")

    if not additional_filtering:
        additional_filtering = dict()

    self.logger.info(f"Getting {source.upper()} features...")

    # Pick the extraction function together with the table it reads from.
    if source == "icd10":
        extract = conditions_from_icd10
        events = self.data.condition_occurrence
    else:
        extract = procedures_from_ccam
        events = self.data.procedure_occurrence

    df = extract(
        events,
        codes=codes,
        additional_filtering=additional_filtering,
        date_from_visit=False,
    )

    # Tag every row with the phenotype name; the extracted "concept"
    # column becomes the subphenotype.
    df["phenotype"] = self.name
    df = df.rename(columns={"concept": "subphenotype"})

    bd.cache(df)

    self.features[output_feature] = df

    self.logger.info(
        f"{source.upper()} features stored in self.features['{output_feature}'] (N = {len(df)})"
    )

    return self

agg_single_feature

agg_single_feature(input_feature: str, output_feature: Optional[str] = None, level: str = 'patient', subphenotype: bool = True, threshold: int = 1) -> Phenotype

Simple aggregation rule on a feature:

  • If level="patient", keeps patients with at least threshold visits showing the (sub)phenotype
  • If level="visit", keeps visits with at least threshold events (could be ICD10 codes, NLP features, biology, etc) showing the (sub)phenotype
PARAMETER DESCRIPTION
input_feature

Name of the input feature

TYPE: str

output_feature

Name of the output feature. If None, will be set to input_feature + "_agg"

TYPE: Optional[str] DEFAULT: None

level

On which level to do the aggregation, either "patient" or "visit"

TYPE: str DEFAULT: 'patient'

subphenotype

Whether the threshold should apply to the phenotype ("phenotype" column) or the subphenotype ("subphenotype" column)

TYPE: bool DEFAULT: True

threshold

Minimal number of events (which definition depends on the level value)

TYPE: int, optional DEFAULT: 1

RETURNS DESCRIPTION
Phenotype

The current Phenotype object with an additional feature stored in self.features[output_feature]

Source code in eds_scikit/phenotype/base.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def agg_single_feature(
    self,
    input_feature: str,
    output_feature: Optional[str] = None,
    level: str = "patient",
    subphenotype: bool = True,
    threshold: int = 1,
) -> "Phenotype":
    """
    Simple aggregation rule on a feature:

    - If level="patient", keeps patients with at least `threshold`
      visits showing the (sub)phenotype
    - If level="visit", keeps visits with at least `threshold` events
      (could be ICD10 codes, NLP features, biology, etc) showing the (sub)phenotype

    Parameters
    ----------
    input_feature : str
        Name of the input feature
    output_feature : Optional[str]
        Name of the output feature. If None, will be set to
        input_feature + "_agg"
    level : str
        On which level to do the aggregation,
        either "patient" or "visit"
    subphenotype : bool
        Whether the threshold should apply to the phenotype
        ("phenotype" column) or the subphenotype ("subphenotype" column)
    threshold : int, optional
        Minimal number of *events* (which definition depends on the `level` value)

    Returns
    -------
    Phenotype
        The current Phenotype object with an additional feature
        stored in self.features[output_feature]

    Raises
    ------
    AssertionError
        If `level` is neither "patient" nor "visit"
    ValueError
        If `input_feature` is not found in self.features
    """
    assert level in {"patient", "visit"}

    if not output_feature:
        output_feature = f"{input_feature}_agg"

    if input_feature not in self.features:
        raise ValueError(
            f"Input feature {input_feature} not found in self.features. "
            "Maybe you forgot to call self.get_features() ?"
        )

    if subphenotype:
        label_col = "subphenotype"
    else:
        label_col = "phenotype"
    keys = ["person_id", label_col]

    # Counting rows per (keys, visit) serves two purposes:
    # - at level == "visit", the count is compared to `threshold` directly
    # - it leaves exactly one row per (keys, visit), so the second count
    #   below is a number of distinct visits
    per_visit = self.features[input_feature].groupby(keys + ["visit_occurrence_id"])
    counts = per_visit.size().rename("N").reset_index()  # events per visit

    if level == "patient":
        per_person = counts.groupby(keys)
        counts = per_person.size().rename("N").reset_index()  # visits per person

    # Keep the groups reaching the threshold; the counter is not kept.
    enough = counts["N"] >= threshold
    group_visit = counts[enough].drop(columns="N")
    group_visit["phenotype"] = self.name

    bd.cache(group_visit)

    self.features[output_feature] = group_visit

    self.logger.info(
        f"Aggregation from {input_feature} stored in self.features['{output_feature}'] "
        f"(N = {len(group_visit)})"
    )

    return self

agg_two_features

agg_two_features(input_feature_1: str, input_feature_2: str, output_feature: str = None, how: str = 'AND', level: str = 'patient', subphenotype: bool = True, thresholds: Tuple[int, int] = (1, 1)) -> Phenotype
  • If level='patient', keeps a specific patient if

    • At least thresholds[0] visits are found in feature_1 AND/OR
    • At least thresholds[1] visits are found in feature_2
  • If level='visit', keeps a specific visit if

    • At least thresholds[0] events are found in feature_1 AND/OR
    • At least thresholds[1] events are found in feature_2
PARAMETER DESCRIPTION
input_feature_1

Name of the first input feature

TYPE: str

input_feature_2

Name of the second input feature

TYPE: str

output_feature

Name of the output feature. If None, will be set to input_feature_1 + "_" + how + "_" + input_feature_2

TYPE: str DEFAULT: None

how

Whether to perform a boolean "AND" or "OR" aggregation

TYPE: str, optional DEFAULT: 'AND'

level

On which level to do the aggregation, either "patient" or "visit"

TYPE: str DEFAULT: 'patient'

subphenotype

Whether the threshold should apply to the phenotype ("phenotype" column) or the subphenotype ("subphenotype" column)

TYPE: bool DEFAULT: True

thresholds

Respective thresholds for the first and second feature

TYPE: Tuple[int, int], optional DEFAULT: (1, 1)

RETURNS DESCRIPTION
Phenotype

The current Phenotype object with an additional feature stored in self.features[output_feature]

Source code in eds_scikit/phenotype/base.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
def agg_two_features(
    self,
    input_feature_1: str,
    input_feature_2: str,
    output_feature: Optional[str] = None,
    how: str = "AND",
    level: str = "patient",
    subphenotype: bool = True,
    thresholds: Tuple[int, int] = (1, 1),
) -> "Phenotype":
    """
    Aggregate two features with a boolean rule.

    - If level='patient', keeps a specific patient if
        - At least `thresholds[0]` visits are found in feature_1 AND/OR
        - At least `thresholds[1]` visits are found in feature_2

    - If level='visit', keeps a specific visit if
        - At least `thresholds[0]` events are found in feature_1 AND/OR
        - At least `thresholds[1]` events are found in feature_2

    Each input is first aggregated with `agg_single_feature`, which also
    stores the intermediate results in self.features under
    input_feature_1 + "_agg" and input_feature_2 + "_agg".

    Parameters
    ----------
    input_feature_1 : str
        Name of the first input feature
    input_feature_2 : str
        Name of the second input feature
    output_feature : Optional[str]
        Name of the output feature. If None, will be set to
        input_feature_1 + "_" + how + "_" + input_feature_2
    how : str, optional
        Whether to perform a boolean "AND" or "OR" aggregation
    level : str
        On which level to do the aggregation,
        either "patient" or "visit"
    subphenotype : bool
        Whether the threshold should apply to the phenotype
        ("phenotype" column) or the subphenotype ("subphenotype" column)
    thresholds : Tuple[int, int], optional
        Respective thresholds for the first and second feature

    Returns
    -------
    Phenotype
        The current Phenotype object with an additional feature
        stored in self.features[output_feature]

    Raises
    ------
    ValueError
        If `how` is neither "AND" nor "OR"
    """
    # Fail fast: reject an invalid `how` before running (and storing)
    # any intermediate aggregation.
    if how not in ("AND", "OR"):
        raise ValueError(f"'how' options are ('AND', 'OR'), got {how}.")

    self.agg_single_feature(
        input_feature=input_feature_1,
        level=level,
        subphenotype=subphenotype,
        threshold=thresholds[0],
    )
    # Read the first result right away: when both inputs are the same
    # feature, the second call below overwrites the same "<name>_agg" entry.
    results_1 = self.features[f"{input_feature_1}_agg"]

    self.agg_single_feature(
        input_feature=input_feature_2,
        level=level,
        subphenotype=subphenotype,
        threshold=thresholds[1],
    )
    results_2 = self.features[f"{input_feature_2}_agg"]

    # Both aggregations use the same level/subphenotype settings, so they
    # are expected to expose the same columns.
    assert set(results_1.columns) == set(results_2.columns)

    if how == "AND":
        # Rows present in both results (join on every column)
        result = results_1.merge(results_2, on=list(results_1.columns), how="inner")
    else:
        # how == "OR": rows present in at least one result
        result = bd.concat(
            [
                results_1,
                results_2,
            ]
        ).drop_duplicates()

    bd.cache(result)

    output_feature = output_feature or f"{input_feature_1}_{how}_{input_feature_2}"
    self.features[output_feature] = result

    self.logger.info(
        f"Aggregation from {input_feature_1} {how} {input_feature_2} stored in self.features['{output_feature}'] "
        f"(N = {len(result)})"
    )
    return self

compute

compute(**kwargs)

Fetch all necessary features and perform aggregation

Source code in eds_scikit/phenotype/base.py
325
326
327
328
329
def compute(self, **kwargs):
    """
    Fetch all necessary features and perform aggregation.

    This implementation is a placeholder: it always raises
    NotImplementedError.
    """
    raise NotImplementedError

to_data

to_data(key: Optional[str] = None) -> BaseData

Appends the feature found in self.features[key] to the data object. If no key is provided, uses the last added feature

PARAMETER DESCRIPTION
key

Key of the self.features dictionary

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
BaseData

The data object with phenotype added to data.computed

Source code in eds_scikit/phenotype/base.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
def to_data(self, key: Optional[str] = None) -> BaseData:
    """
    Appends the feature found in self.features[key] to the data object.
    If no key is provided, uses the last added feature.

    If self.features is falsy when the method is called, `self.compute()`
    is run first.

    Parameters
    ----------
    key : Optional[str]
        Key of the self.features dictionary

    Returns
    -------
    BaseData
        The data object with phenotype added to `data.computed`

    Raises
    ------
    AssertionError
        If `key` is provided but not found in self.features
    """

    if not self.features:
        self.compute()

    if key is None:
        self.logger.info("No key provided: Using last added feature.")
        return self._set(self.features.last())

    assert (
        key in self.features
    ), f"Key {key} not found in features. Available {self.features}"
    # f-string so that the key is actually interpolated in the log
    # message (previously the literal text "{key}" was passed as is).
    self.logger.info(f"Using feature {key}")
    return self._set(self.features[key])

to_valid_variable_name

to_valid_variable_name(s: str)

Converts a string to a valid variable name

Source code in eds_scikit/phenotype/base.py
415
416
417
418
419
420
421
422
423
424
425
426
def to_valid_variable_name(s: str):
    """
    Turn an arbitrary string into a valid variable name.

    Each run of non-word characters becomes a single underscore, leading
    underscores are dropped, and one underscore is prepended when the
    result is empty or starts with a digit.
    """
    # Each run of non-word characters collapses into one underscore
    collapsed = re.sub(r"\W+", "_", s)
    # Leading underscores are dropped
    stripped = re.sub(r"^_+", "", collapsed)
    # An empty name, or one starting with a digit, gets an underscore prefix
    needs_prefix = stripped == "" or stripped[0].isdigit()
    return "_" + stripped if needs_prefix else stripped
Back to top