Skip to content

eds_scikit.datasets.synthetic.biology

load_biology_data

load_biology_data(n_entity: int = 5, mean_measurement: int = 10000, n_care_site: int = 5, n_person: int = 5, n_visit_occurrence: int = 5, units: List[str] = ['g', 'g/l', 'mol', 's'], row_status_source_values: List[str] = ['Validé', 'Discontinué', 'Disponible', 'Attendu', 'Confirmé', 'Initial'], t_start: datetime = datetime(2017, 1, 1), t_end: datetime = datetime(2022, 1, 1), seed: int = None)

Create a minimalistic dataset for the bioclean function.

RETURNS DESCRIPTION
biology_dataset

A dataclass comprised of the measurement, concept, concept_relationship, visit_occurrence and care_site tables.

TYPE: BiologyDataset

Source code in eds_scikit/datasets/synthetic/biology.py
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def load_biology_data(
    n_entity: int = 5,
    mean_measurement: int = 10000,
    n_care_site: int = 5,
    n_person: int = 5,
    n_visit_occurrence: int = 5,
    units: List[str] = ["g", "g/l", "mol", "s"],
    row_status_source_values: List[str] = [
        "Validé",
        "Discontinué",
        "Disponible",
        "Attendu",
        "Confirmé",
        "Initial",
    ],
    t_start: datetime = datetime(2017, 1, 1),
    t_end: datetime = datetime(2022, 1, 1),
    seed: int = None,
):
    """
    Create a minimalistic dataset for the `bioclean` function.

    The tables are produced by the private `_generate_*` helpers of this
    module; this function only forwards its arguments to them and bundles
    the results.

    Parameters
    ----------
    n_entity : int
        Forwarded to `_generate_concept` (presumably the number of
        biological entities to generate -- confirm against the helper).
    mean_measurement : int
        Forwarded to `_generate_measurement` (presumably the average number
        of measurements generated -- confirm against the helper).
    n_care_site : int
        Forwarded to `_generate_care_site` and `_generate_visit_occurrence`.
    n_person : int
        Forwarded to `_generate_measurement`.
    n_visit_occurrence : int
        Forwarded to `_generate_measurement` and
        `_generate_visit_occurrence`.
    units : List[str]
        Forwarded to `_generate_concept` and `_generate_measurement`.
    row_status_source_values : List[str]
        Forwarded to `_generate_measurement`.
    t_start : datetime
        Forwarded to `_generate_measurement` and stored on the returned
        dataset.
    t_end : datetime
        Forwarded to `_generate_measurement` and stored on the returned
        dataset.
    seed : int, optional
        If not None, used to seed NumPy's global random state through
        `np.random.seed` before any table is generated. `0` is a valid seed.

    Returns
    -------
    biology_dataset: BiologyDataset, a dataclass comprised of
        measurement, concept, concept_relationship, visit_occurrence
        and care_site.
    """
    # NOTE: the list defaults above are shared objects across calls. They are
    # only forwarded here, never mutated by this function.

    # Compare against None rather than relying on truthiness, so that
    # `seed=0` is honoured instead of being silently ignored.
    if seed is not None:
        np.random.seed(seed=seed)

    concept, concept_relationship, src_concept_name = _generate_concept(
        n_entity=n_entity, units=units
    )
    measurement = _generate_measurement(
        t_start=t_start,
        t_end=t_end,
        mean_measurement=mean_measurement,
        units=units,
        src_concept_name=src_concept_name,
        n_visit_occurrence=n_visit_occurrence,
        n_person=n_person,
        row_status_source_values=row_status_source_values,
    )
    care_site = _generate_care_site(n_care_site=n_care_site)
    visit_occurrence = _generate_visit_occurrence(
        n_visit_occurrence=n_visit_occurrence, n_care_site=n_care_site
    )

    return BiologyDataset(
        measurement=measurement,
        concept=concept,
        concept_relationship=concept_relationship,
        visit_occurrence=visit_occurrence,
        care_site=care_site,
        available_tables=[
            "measurement",
            "concept",
            "concept_relationship",
            "visit_occurrence",
            "care_site",
        ],
        t_start=t_start,
        t_end=t_end,
        module="pandas",
    )