Skip to content

eds_scikit.biology.cleaning.transform

transform_measurement

transform_measurement(measurement: DataFrame, clip: bool = False, config_name: str = 'all_aphp') -> DataFrame

Normalize measurement units and flag outliers according to the selected configuration file.

PARAMETER DESCRIPTION
measurement

Target DataFrame to transform

TYPE: DataFrame

clip

If True, extreme values are clipped to the corresponding minimum/maximum thresholds

TYPE: bool, optional DEFAULT: False

config_name

Name of the configuration file

TYPE: str, optional DEFAULT: 'all_aphp'

RETURNS DESCRIPTION
DataFrame

Transformed DataFrame with normalized units and flagged outliers

Source code in eds_scikit/biology/cleaning/transform.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def transform_measurement(
    measurement: DataFrame, clip: bool = False, config_name: str = "all_aphp"
) -> DataFrame:
    """Convert measurements to their configured units and mark out-of-range values

    The biology configuration registered under ``get_biology_config.<config_name>``
    is joined to the measurements on every ``*concept_code*`` column plus
    ``unit_source_value``. Rows whose configured ``Action`` is ``"Delete"`` are
    removed. For rows whose ``Action`` is ``"Transform"``, the value and both
    thresholds are multiplied by the configured ``Coefficient``.

    Parameters
    ----------
    measurement : DataFrame
        Measurements to process. The code reads its ``value_as_number`` and
        ``unit_source_value`` columns and every column whose name contains
        ``"concept_code"``.
    clip : bool, optional
        When `True`, values at or beyond a threshold are replaced by that threshold
    config_name : str, optional
        Suffix of the registered configuration to load

    Returns
    -------
    DataFrame
        The merged measurements with the added ``transformed_value`` and
        ``outlier`` columns; the ``Action`` and ``Coefficient`` configuration
        columns are dropped before returning.
    """
    code_columns = [name for name in measurement.columns if "concept_code" in name]
    merge_keys = code_columns + ["unit_source_value"]

    # Load the registered configuration and keep only the columns used below.
    biology_config = registry.get("data", f"get_biology_config.{config_name}")()
    biology_config = biology_config[
        merge_keys
        + [
            "max_threshold",
            "min_threshold",
            "transformed_unit",
            "Action",
            "Coefficient",
        ]
    ]
    # Convert the configuration to the same framework as the measurements.
    biology_config = to(get_framework(measurement), biology_config)

    logger.info("Normalizing units...")
    merged = normalize_unit(measurement).merge(biology_config, on=merge_keys)

    # Discard the rows that the configuration marks for deletion.
    marked_for_deletion = merged["Action"] == "Delete"
    cleaned = merged[~marked_for_deletion]

    # Rescale the raw value where the configuration asks for a transformation.
    raw_values = cleaned["value_as_number"]
    cleaned["transformed_value"] = raw_values.mask(
        cleaned["Action"] == "Transform",
        raw_values * cleaned["Coefficient"],
    )

    # Apply the same rescaling to both thresholds. The condition is rebuilt
    # from the current frame on each pass rather than reused across assignments.
    for bound in ("max_threshold", "min_threshold"):
        cleaned[bound] = cleaned[bound].mask(
            cleaned["Action"] == "Transform",
            cleaned[bound] * cleaned["Coefficient"],
        )

    cleaned = cleaned.drop(columns=["Action", "Coefficient"])

    logger.info("Flagging outliers...")
    # Start from an all-False flag, then switch it on strictly outside the bounds.
    cleaned["outlier"] = False
    above_max = cleaned["transformed_value"] > cleaned["max_threshold"]
    below_min = cleaned["transformed_value"] < cleaned["min_threshold"]
    cleaned["outlier"] = cleaned["outlier"].mask(above_max | below_min, True)

    if clip:
        logger.info("Clipping extreme values...")
        # Upper bound first, then lower bound, each read from the current frame.
        current = cleaned["transformed_value"]
        upper = cleaned["max_threshold"]
        cleaned["transformed_value"] = current.mask(current >= upper, upper)

        current = cleaned["transformed_value"]
        lower = cleaned["min_threshold"]
        cleaned["transformed_value"] = current.mask(current <= lower, lower)

    return cleaned