Skip to content

eds_scikit.biology.viz.aggregate

aggregate_concepts_set

aggregate_concepts_set(data: Data, concepts_set: ConceptsSet, start_date: datetime = None, end_date: datetime = None, number_of_concept: Tuple[str, int] = None, limit_count: Tuple[str, int] = None, standard_terminologies: List[str] = default_standard_terminologies, standard_concept_regex: dict = default_standard_concept_regex, pd_limit_size: int = 100000, stats_only: bool = False) -> Dict[str, pd.DataFrame]

Aggregates the data for visualization.

PARAMETER DESCRIPTION
data

Instantiated HiveData, PostgresData or PandasData

TYPE: Data

concepts_set

List of concepts-sets to select

TYPE: ConceptsSet

start_date

EXAMPLE: "2019-05-01"

TYPE: datetime, optional DEFAULT: None

end_date

EXAMPLE: "2022-01-01"

TYPE: datetime, optional DEFAULT: None

number_of_concept

The maximum number of concepts for a given terminology EXAMPLE: ("LOINC", 5)

TYPE: Tuple[str, int], optional DEFAULT: None

limit_count

The minimum number of observations per concept for a given terminology EXAMPLE: ("LOINC", 5)

TYPE: Tuple[str, int], optional DEFAULT: None

standard_terminologies

EXAMPLE: ["LOINC", "AnaBio"]

TYPE: List[str], optional DEFAULT: default_standard_terminologies

standard_concept_regex

EXAMPLE: {"LOINC": "[0-9]{2,5}[-][0-9]","AnaBio": "[A-Z][0-9]{4}"}

TYPE: dict, optional DEFAULT: default_standard_concept_regex

pd_limit_size

The row-count limit for converting a Koalas DataFrame into a Pandas DataFrame

TYPE: int, optional DEFAULT: 100000

stats_only

If True, it will only aggregate the data for the summary table.

TYPE: bool, optional DEFAULT: False

RETURNS DESCRIPTION
Dict[str, pd.DataFrame]

Aggregated tables for visualization

Source code in eds_scikit/biology/viz/aggregate.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def aggregate_concepts_set(
    data: Data,
    concepts_set: ConceptsSet,
    start_date: datetime = None,
    end_date: datetime = None,
    number_of_concept: Tuple[str, int] = None,
    limit_count: Tuple[str, int] = None,
    standard_terminologies: List[str] = default_standard_terminologies,
    standard_concept_regex: dict = default_standard_concept_regex,
    pd_limit_size: int = 100000,
    stats_only: bool = False,
) -> Dict[str, pd.DataFrame]:
    """Build the aggregated tables used for [visualization][visualization].

    The measurements are taken from ``data.bioclean`` when that attribute
    exists, otherwise from ``data.measurement``. They are filtered by date,
    restricted to the requested concepts-set, optionally filtered by number
    of concepts and by observation count, enriched with the care site,
    unit-normalized and finally aggregated.

    Parameters
    ----------
    data : Data
         Instantiated [``HiveData``][eds_scikit.io.hive.HiveData], [``PostgresData``][eds_scikit.io.postgres.PostgresData] or [``PandasData``][eds_scikit.io.files.PandasData]
    concepts_set : ConceptsSet
        Concepts-set to select
    start_date : datetime, optional
        Passed to the date filter together with ``end_date``.
        **EXAMPLE**: `"2019-05-01"`
    end_date : datetime, optional
        Passed to the date filter together with ``start_date``.
        **EXAMPLE**: `"2022-01-01"`
    number_of_concept : Tuple[str, int], optional
        The maximum number of concepts for a given terminology.
        The filter is skipped when the value is falsy.
        **EXAMPLE**: `("LOINC", 5)`
    limit_count : Tuple[str, int], optional
        The minimum number of observations per concept for a given terminology.
        The filter is skipped when the value is falsy.
        **EXAMPLE**: `("LOINC", 5)`
    standard_terminologies : List[str], optional
        Only used when ``data`` has no ``bioclean`` attribute.
        **EXAMPLE**: `["LOINC", "AnaBio"]`
    standard_concept_regex : dict, optional
        Only used when ``data`` has no ``bioclean`` attribute.
        **EXAMPLE**: `{"LOINC": "[0-9]{2,5}[-][0-9]","AnaBio": "[A-Z][0-9]{4}"}`
    pd_limit_size : int, optional
        The row-count limit for converting a [Koalas](https://koalas.readthedocs.io/en/latest/) DataFrame into a [Pandas](https://pandas.pydata.org/) DataFrame
    stats_only : bool, optional
        If ``True``, it will only aggregate the data for the [summary table][summary-table].

    Returns
    -------
    Dict[str, pd.DataFrame]
        Aggregated tables for visualization
    """
    _check_the_data_for_aggregation(data)

    # A "bioclean" attribute on data is used directly instead of the raw
    # measurement table.
    has_bioclean = "bioclean" in dir(data)

    # --- Table extraction -------------------------------------------------
    if has_bioclean:
        measurement = data.bioclean
    else:
        wanted_columns = [
            "measurement_id",
            "visit_occurrence_id",
            "measurement_date",
            "measurement_datetime",
            "value_as_number",
            "unit_source_value",
            "row_status_source_value",
            "measurement_source_concept_id",
        ]
        raw_measurement = data.measurement
        # Keep only the wanted columns that actually exist in the table.
        available = raw_measurement.columns[
            raw_measurement.columns.isin(wanted_columns)
        ]
        measurement = raw_measurement[list(available)]

    concept_columns = ["concept_id", "concept_name", "concept_code", "vocabulary_id"]
    concept = data.concept[concept_columns]

    relationship_columns = ["concept_id_1", "concept_id_2", "relationship_id"]
    concept_relationship = data.concept_relationship[relationship_columns]

    visit = data.visit_occurrence[["visit_occurrence_id", "care_site_id"]]
    care_site = data.care_site[["care_site_short_name", "care_site_id"]]

    # --- Date filtering ---------------------------------------------------
    measurement = filter_measurement_by_date(measurement, start_date, end_date)

    # --- Concepts-set selection -------------------------------------------
    if has_bioclean:
        selected = _extract_concepts_set(measurement, concepts_set)
    else:
        valid_measurement = get_valid_measurement(measurement)

        mapping = get_concept_src_to_std(
            concept,
            concept_relationship,
            concepts_set,
            standard_concept_regex,
            standard_terminologies,
        )
        if "concepts_set" in mapping.columns:
            mapping = mapping.drop(columns="concepts_set")

        selected = get_measurement_std(valid_measurement, mapping).drop(
            columns="source_concept_id"
        )

    # --- Optional filters (skipped when the argument is falsy) ------------
    if number_of_concept:
        selected = filter_concept_by_number(selected, number_of_concept)

    if limit_count:
        selected = filter_concept_by_count(selected, limit_count)

    # --- Enrichment and normalization -------------------------------------
    selected = _add_hospital(selected, visit, care_site)
    selected = normalize_unit(selected)

    # --- Aggregation --------------------------------------------------------
    # stats_only drives both the stats_only and overall_only switches.
    return aggregate_measurement(
        measurement=selected,
        pd_limit_size=pd_limit_size,
        stats_only=stats_only,
        overall_only=stats_only,
    )