Skip to content

eds_scikit.biology.cleaning.main

bioclean

bioclean(data: Data, concepts_sets: List[ConceptsSet] = None, config_name: str = None, start_date: datetime = None, end_date: datetime = None, studied_cohort: Union[DataFrame, List[int]] = None, clip: bool = False, standard_terminologies: List[str] = default_standard_terminologies, standard_concept_regex: dict = default_standard_concept_regex) -> Data

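Cleans the measurement table and stores the result as the bioclean table, following the cleaning pipeline described in the biology cleaning documentation.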

PARAMETER DESCRIPTION
data

Instantiated HiveData, PostgresData or PandasData

TYPE: Data

concepts_sets

List of concepts-sets to select

TYPE: List[ConceptsSet], optional DEFAULT: None

config_name

Name of the dataset used to transform the data.

TYPE: str, optional DEFAULT: None

start_date

EXAMPLE: "2019-05-01"

TYPE: datetime, optional DEFAULT: None

end_date

EXAMPLE: "2022-05-01"

TYPE: datetime, optional DEFAULT: None

studied_cohort

List of patient_ids to select

TYPE: Union[DataFrame, List[int]], optional DEFAULT: None

clip

If True extreme values are set equal to the thresholds

TYPE: bool, optional DEFAULT: False

standard_terminologies

EXAMPLE: ["LOINC", "AnaBio"]

TYPE: List[str], optional DEFAULT: default_standard_terminologies

standard_concept_regex

EXAMPLE: {"LOINC": "[0-9]{2,5}[-][0-9]","AnaBio": "[A-Z][0-9]{4}"}

TYPE: dict, optional DEFAULT: default_standard_concept_regex

RETURNS DESCRIPTION
Data

Same as the input with the transformed bioclean table

Source code in eds_scikit/biology/cleaning/main.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def bioclean(
    data: Data,
    concepts_sets: List[ConceptsSet] = None,
    config_name: str = None,
    start_date: datetime = None,
    end_date: datetime = None,
    studied_cohort: Union[DataFrame, List[int]] = None,
    clip: bool = False,
    standard_terminologies: List[str] = default_standard_terminologies,
    standard_concept_regex: dict = default_standard_concept_regex,
) -> Data:
    """Clean the measurements of ``data`` and store them in ``data.bioclean``.

    It follows the pipeline explained [here][cleaning].

    Parameters
    ----------
    data : Data
        Instantiated [``HiveData``][eds_scikit.io.hive.HiveData], [``PostgresData``][eds_scikit.io.postgres.PostgresData] or [``PandasData``][eds_scikit.io.files.PandasData]
    concepts_sets : List[ConceptsSet], optional
        List of concepts-sets to select. When ``None``, the concepts-sets
        returned by ``fetch_all_concepts_set()`` are used.
    config_name : str, optional
        Name of the dataset used to [transform][eds_scikit.biology.cleaning.transform.transform_measurement] the data.
    start_date : datetime, optional
        Start date forwarded to the measurement date filter.
        **EXAMPLE**: `"2019-05-01"`
    end_date : datetime, optional
        End date forwarded to the measurement date filter.
        **EXAMPLE**: `"2022-05-01"`
    studied_cohort : Union[DataFrame, List[int]], optional
        List of patient_ids to select. ``None`` or an empty list/tuple/set
        disables the cohort filter.
    clip : bool, optional
        If `True` extreme values are set equal to the thresholds
    standard_terminologies : List[str], optional
        **EXAMPLE**: `["LOINC", "AnaBio"]`
    standard_concept_regex : dict, optional
        **EXAMPLE**: `{"LOINC": "[0-9]{2,5}[-][0-9]","AnaBio": "[A-Z][0-9]{4}"}`

    Returns
    -------
    Data
        Same as the input with the transformed `bioclean` table
    """
    # Check the data
    check_the_data_for_cleaning(data)

    # Extract tables. Only the columns actually present in the measurement
    # table are kept, so a missing optional column does not raise a KeyError.
    wanted_measurement_columns = [
        "measurement_id",
        "person_id",
        "visit_occurrence_id",
        "measurement_date",
        "measurement_datetime",
        "value_source_value",
        "value_as_number",
        "unit_source_value",
        "row_status_source_value",
        "measurement_source_concept_id",
    ]
    measurement_columns = data.measurement.columns
    measurement = data.measurement[
        list(measurement_columns[measurement_columns.isin(wanted_measurement_columns)])
    ]
    concept = data.concept[
        [
            "concept_id",
            "concept_name",
            "concept_code",
            "vocabulary_id",
        ]
    ]
    concept_relationship = data.concept_relationship[
        ["concept_id_1", "concept_id_2", "relationship_id"]
    ]

    # Filter valid measurement
    measurement_valid = get_valid_measurement(measurement)

    # Filter measurement by date
    measurement_timed = filter_measurement_by_date(
        measurement_valid, start_date, end_date
    )

    # Query concepts-set information
    if concepts_sets is None:
        concepts_sets = fetch_all_concepts_set()

    src_to_std = get_concept_src_to_std(
        concept=concept,
        concept_relationship=concept_relationship,
        concepts_sets=concepts_sets,
        standard_concept_regex=standard_concept_regex,
        standard_terminologies=standard_terminologies,
    )
    # Extract concept-set
    measurement_std_filtered = get_measurement_std(measurement_timed, src_to_std)

    # Filter Measurement
    # NOTE: do not rely on the truthiness of ``studied_cohort``: a DataFrame
    # (and a multi-element numpy array) raises ``ValueError`` when evaluated
    # as a boolean. Empty builtin collections still mean "no cohort filter".
    cohort_is_empty_collection = (
        isinstance(studied_cohort, (list, tuple, set, frozenset))
        and not studied_cohort
    )
    if studied_cohort is not None and not cohort_is_empty_collection:
        measurement_std_filtered = select_cohort(
            measurement_std_filtered, studied_cohort
        )

    # Transform values
    data.bioclean = transform_measurement(measurement_std_filtered, clip, config_name)

    # Return the input object, as promised by the signature and the docstring.
    return data