Skip to content

eds_scikit.biology.cleaning.main

bioclean

bioclean(data: Data, concepts_sets: List[ConceptsSet] = None, start_date: datetime = None, end_date: datetime = None, convert_units: bool = False, studied_cohort: Union[DataFrame, List[int]] = None) -> Data

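Clean the biology measurements of `data` following the pipeline described in the "Cleaning" section of the documentation, store the result in `data.bioclean`, and plot a summary of the values.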

PARAMETER DESCRIPTION
data

Instantiated HiveData, PostgresData or PandasData

TYPE: Data

concepts_sets

List of concepts-sets to select

TYPE: List[ConceptsSet], optional DEFAULT: None

start_date

EXAMPLE: "2019-05-01"

TYPE: datetime, optional DEFAULT: None

end_date

EXAMPLE: "2022-05-01"

TYPE: datetime, optional DEFAULT: None

convert_units

If True, convert units based on the ConceptsSets Units object (eager execution). By default False.

TYPE: bool, optional DEFAULT: False

studied_cohort

List of patient_ids to select

TYPE: Union[DataFrame, List[int]], optional DEFAULT: None

RETURNS DESCRIPTION
Data

Same as the input with the transformed bioclean table

Source code in eds_scikit/biology/cleaning/main.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def _cohort_is_provided(studied_cohort) -> bool:
    """Tell whether `studied_cohort` should trigger the cohort filtering."""
    if studied_cohort is None:
        return False
    try:
        # Plain containers keep their usual truthiness (an empty list means
        # "no filtering", exactly as before).
        return bool(studied_cohort)
    except ValueError:
        # DataFrames (and multi-element arrays) refuse truth-testing with a
        # ValueError; such an object was explicitly passed, so use it.
        return True


def bioclean(
    data: Data,
    concepts_sets: List[ConceptsSet] = None,
    start_date: datetime = None,
    end_date: datetime = None,
    convert_units: bool = False,
    studied_cohort: Union[DataFrame, List[int]] = None,
) -> Data:
    """Clean the biology measurements of `data` and plot a summary of them.

    It follows the pipeline explained [here][cleaning]: the measurement table
    is prepared with `prepare_measurement_table`, optionally restricted to
    `studied_cohort`, and stored in `data.bioclean`. A copy enriched with the
    care site of each visit is then passed to `plot_biology_summary`.

    Parameters
    ----------
    data : Data
        Instantiated [``HiveData``][eds_scikit.io.hive.HiveData], [``PostgresData``][eds_scikit.io.postgres.PostgresData] or [``PandasData``][eds_scikit.io.files.PandasData]
    concepts_sets : List[ConceptsSet], optional
        List of concepts-sets to select. If None, the default concepts-sets
        returned by `fetch_all_concepts_set` are used.
    start_date : datetime, optional
        Start date forwarded to `prepare_measurement_table`.
        **EXAMPLE**: `"2019-05-01"`
    end_date : datetime, optional
        End date forwarded to `prepare_measurement_table`.
        **EXAMPLE**: `"2022-05-01"`
    convert_units : bool, optional
        If True, convert units based on ConceptsSets Units object (eager
        execution) and plot the normalized value and unit columns.
        By default False.
    studied_cohort : Union[DataFrame, List[int]], optional
        Patient ids to select, handed to `select_cohort`. If None (or an
        empty list), no cohort filtering is applied.

    Returns
    -------
    Data
        Same as the input with the transformed `bioclean` table
    """

    if concepts_sets is None:
        logger.info("No concepts sets provided. Loading default concepts sets.")
        concepts_sets = fetch_all_concepts_set()

    # NOTE(review): the fifth positional argument (False) is kept exactly as
    # in the original call; its meaning is defined by
    # prepare_measurement_table - confirm against that signature.
    measurements = prepare_measurement_table(
        data, start_date, end_date, concepts_sets, False, convert_units
    )

    # Restrict the measurements to the studied cohort. A bare
    # `if studied_cohort:` raises ValueError for a DataFrame, hence the helper.
    if _cohort_is_provided(studied_cohort):
        measurements = select_cohort(measurements, studied_cohort)

    # Expose the cleaned table before the care-site columns are merged in.
    data.bioclean = measurements

    # Attach the care site of each visit; this enriched table only feeds the
    # summary plot.
    summary_table = measurements.merge(
        data.visit_occurrence[["care_site_id", "visit_occurrence_id"]],
        on="visit_occurrence_id",
    )
    summary_table = summary_table.merge(
        data.care_site[["care_site_id", "care_site_short_name"]], on="care_site_id"
    )

    # Plot the normalized columns when units were converted, raw ones otherwise.
    value_column = "value_as_number_normalized" if convert_units else "value_as_number"
    unit_column = (
        "unit_source_value_normalized" if convert_units else "unit_source_value"
    )

    plot_biology_summary(summary_table, value_column, unit_column)

    # Honor the documented `-> Data` contract (the function used to return None).
    return data
Back to top