Metadata Module

  • Logic for getting, formatting, and loading metadata from OpenML.
  • To modify the logic of the data ingestion pipeline, refer to OpenMLObjectHandler and OpenMLDatasetHandler.
  • To change the pipeline itself, refer to OpenMLMetadataProcessor (a usage sketch follows this list).
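
A minimal end-to-end sketch of the pipeline. The config keys are the ones read by OpenMLMetadataProcessor and the handlers documented below; the values here are hypothetical.

config = {
    "data_dir": "./data/",                    # where the .pkl / .csv files land
    "type_of_data": "dataset",                # "dataset" or "flow"
    "training": True,                         # False -> load from files instead
    "test_subset": False,                     # True -> keep only the first 500 objects
    "data_download_n_jobs": 8,                # parallelism for pqdm
    "persist_dir": "./chroma/",               # ChromaDB location
    "use_chroma_for_saving_metadata": False,
}

processor = OpenMLMetadataProcessor(config)
openml_objects, data_id, all_metadata, handler = processor.get_all_metadata_from_openml()
subset_df, full_df = processor.create_metadata_dataframe(
    handler, openml_objects, data_id, all_metadata
)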

OpenMLDatasetHandler

Bases: OpenMLObjectHandler

Description: The class for handling OpenML dataset objects.

Source code in backend/modules/metadata_utils.py
class OpenMLDatasetHandler(OpenMLObjectHandler):
    """
    Description: The class for handling OpenML dataset objects.
    """

    def get_description(self, data_id: int):
        return openml.datasets.get_dataset(
            dataset_id=data_id,
            download_data=False,
            download_qualities=True,
            download_features_meta_data=True,
        )

    def get_openml_objects(self):
        return openml.datasets.list_datasets(output_format="dataframe")

    def process_metadata(
        self,
        openml_data_object: Sequence[openml.datasets.dataset.OpenMLDataset],
        data_id: Sequence[int],
        all_dataset_metadata: pd.DataFrame,
        file_path: str,
        subset_ids=None,
    ):
        """
        Description: Combine the metadata attributes into a single string and save it to a CSV file (and to ChromaDB if configured). If given a list of IDs, subset the data to those IDs.
        """

        # Metadata
        descriptions = [
            self.extract_attribute(attr, "description") for attr in openml_data_object
        ]
        joined_qualities = [
            self.join_attributes(attr, "qualities") for attr in openml_data_object
        ]
        joined_features = [
            self.join_attributes(attr, "features") for attr in openml_data_object
        ]

        # Combine them

        all_data_description_df = self.create_combined_information_df_for_datasets(
            data_id, descriptions, joined_qualities, joined_features
        )
        all_dataset_metadata = self.combine_metadata(
            all_dataset_metadata, all_data_description_df
        )

        # subset the metadata if subset_ids is not None
        all_dataset_metadata = self.subset_metadata(subset_ids, all_dataset_metadata)

        # Save to a CSV
        all_dataset_metadata.to_csv(file_path)

        # Save to chroma if needed
        if self.config.get("use_chroma_for_saving_metadata"):
            client = chromadb.PersistentClient(
                path=self.config["persist_dir"] + "metadata_db"
            )
            vecmanager = VectorStoreManager(client, self.config)
            vecmanager.add_df_chunks_to_db(all_dataset_metadata)

        return (
            all_dataset_metadata[["did", "name", "Combined_information"]],
            all_dataset_metadata,
        )

process_metadata(openml_data_object, data_id, all_dataset_metadata, file_path, subset_ids=None)

Description: Combine the metadata attributes into a single string and save it to a CSV file (and to ChromaDB if configured). If given a list of IDs, subset the data to those IDs.

Source code in backend/modules/metadata_utils.py
def process_metadata(
    self,
    openml_data_object: Sequence[openml.datasets.dataset.OpenMLDataset],
    data_id: Sequence[int],
    all_dataset_metadata: pd.DataFrame,
    file_path: str,
    subset_ids=None,
):
    """
    Description: Combine the metadata attributes into a single string and save it to a CSV file (and to ChromaDB if configured). If given a list of IDs, subset the data to those IDs.
    """

    # Metadata
    descriptions = [
        self.extract_attribute(attr, "description") for attr in openml_data_object
    ]
    joined_qualities = [
        self.join_attributes(attr, "qualities") for attr in openml_data_object
    ]
    joined_features = [
        self.join_attributes(attr, "features") for attr in openml_data_object
    ]

    # Combine them

    all_data_description_df = self.create_combined_information_df_for_datasets(
        data_id, descriptions, joined_qualities, joined_features
    )
    all_dataset_metadata = self.combine_metadata(
        all_dataset_metadata, all_data_description_df
    )

    # subset the metadata if subset_ids is not None
    all_dataset_metadata = self.subset_metadata(subset_ids, all_dataset_metadata)

    # Save to a CSV
    all_dataset_metadata.to_csv(file_path)

    # Save to chroma if needed
    if self.config.get("use_chroma_for_saving_metadata"):
        client = chromadb.PersistentClient(
            path=self.config["persist_dir"] + "metadata_db"
        )
        vecmanager = VectorStoreManager(client, self.config)
        vecmanager.add_df_chunks_to_db(all_dataset_metadata)

    return (
        all_dataset_metadata[["did", "name", "Combined_information"]],
        all_dataset_metadata,
    )
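
A hedged sketch of calling process_metadata directly for a handful of datasets. The dataset IDs and file path are made up; all_objects stands for the dataframe returned by get_openml_objects().

dataset_ids = [2, 31, 61]
datasets = [handler.get_description(did) for did in dataset_ids]
subset_df, full_df = handler.process_metadata(
    openml_data_object=datasets,
    data_id=dataset_ids,
    all_dataset_metadata=all_objects,
    file_path="./data/all_dataset_description.csv",
    subset_ids=[31, 61],  # keep only these dids in the saved output
)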

OpenMLFlowHandler

Bases: OpenMLObjectHandler

Description: The class for handling OpenML flow objects.

Source code in backend/modules/metadata_utils.py
class OpenMLFlowHandler(OpenMLObjectHandler):
    """
    Description: The class for handling OpenML flow objects.
    """

    def get_description(self, data_id: int):
        return openml.flows.get_flow(flow_id=data_id)

    def get_openml_objects(self):
        all_objects = openml.flows.list_flows(output_format="dataframe")
        return all_objects.rename(columns={"id": "did"})

    def process_metadata(
        self,
        openml_data_object: Sequence[openml.flows.flow.OpenMLFlow],
        data_id: Sequence[int],
        all_dataset_metadata: pd.DataFrame,
        file_path: str,
        subset_ids=None,
    ):
        descriptions = [
            self.extract_attribute(attr, "description") for attr in openml_data_object
        ]
        names = [self.extract_attribute(attr, "name") for attr in openml_data_object]
        tags = [self.extract_attribute(attr, "tags") for attr in openml_data_object]

        all_data_description_df = pd.DataFrame(
            {
                "did": data_id,
                "description": descriptions,
                "name": names,
                "tags": tags,
            }
        )

        all_data_description_df["Combined_information"] = all_data_description_df.apply(
            self.merge_all_columns_to_string, axis=1
        )
        # subset the metadata if subset_ids is not None

        all_dataset_metadata = self.subset_metadata(subset_ids, all_dataset_metadata)

        all_data_description_df.to_csv(file_path)

        return (
            all_data_description_df[["did", "name", "Combined_information"]],
            all_data_description_df,
        )
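
The same pattern applies to flows; note that get_openml_objects renames the listing's "id" column to "did" so downstream merges on "did" keep working. A small sketch (the flow ID is hypothetical):

flow_handler = OpenMLFlowHandler(config)
all_flows = flow_handler.get_openml_objects()  # dataframe with a "did" column
one_flow = flow_handler.get_description(5981)  # an OpenMLFlow object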

OpenMLMetadataProcessor

Description: Process metadata using the OpenML object handlers

Source code in backend/modules/metadata_utils.py
class OpenMLMetadataProcessor:
    """
    Description: Process metadata using the OpenML object handlers
    """

    def __init__(self, config: dict):
        self.config = config
        self.save_filename = os.path.join(
            config["data_dir"], f"all_{config['type_of_data']}_metadata.pkl"
        )
        self.description_filename = os.path.join(
            config["data_dir"], f"all_{config['type_of_data']}_description.csv"
        )

    def get_all_metadata_from_openml(self):
        """
        Description: Gets all the metadata from OpenML for the type of data specified in the config.
        If training is set to False, it loads the metadata from the files. If training is set to True, it gets the metadata from OpenML.

        This uses parallel threads (pqdm), so install the oslo.concurrency package to ensure thread safety.
        """
        if not self.config.get("training", False) or self.config.get(
            "ignore_downloading_data", False
        ):
            if not os.path.exists(self.save_filename):
                raise Exception(
                    "Metadata files do not exist. Please run the training pipeline first."
                )
            print("[INFO] Loading metadata from file.")
            return load_metadata_from_file(self.save_filename)

        print("[INFO] Training is set to True.")
        handler = (
            OpenMLDatasetHandler(self.config)
            if self.config["type_of_data"] == "dataset"
            else OpenMLFlowHandler(self.config)
        )

        all_objects = handler.get_openml_objects()

        if self.config.get("test_subset", False):
            print("[INFO] Subsetting the data.")
            all_objects = all_objects[:500]

        data_id = [int(all_objects.iloc[i]["did"]) for i in range(len(all_objects))]

        print("[INFO] Initializing cache.")
        handler.initialize_cache(data_id)

        print(f"[INFO] Getting {self.config['type_of_data']} metadata from OpenML.")
        openml_data_object = handler.get_metadata(data_id)

        print("[INFO] Saving metadata to file.")
        save_metadata_to_file(
            (openml_data_object, data_id, all_objects, handler), self.save_filename
        )

        return openml_data_object, data_id, all_objects, handler

    def create_metadata_dataframe(
        self,
        handler: Union["OpenMLDatasetHandler", "OpenMLFlowHandler"],
        openml_data_object: Sequence[
            Union[openml.datasets.dataset.OpenMLDataset, openml.flows.flow.OpenMLFlow]
        ],
        data_id: Sequence[int],
        all_dataset_metadata: pd.DataFrame,
        subset_ids=None,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Description: Creates a dataframe with all the metadata, with the columns joined
        into a single combined-information string, for the type of data specified in the
        config. If training is set to False, the dataframes are loaded from files; if
        training is set to True, they are created and then saved to files.
        """
        if not self.config.get("training", False):
            return (
                handler.load_metadata(self.description_filename),
                all_dataset_metadata,
            )

        return handler.process_metadata(
            openml_data_object,
            data_id,
            all_dataset_metadata,
            self.description_filename,
            subset_ids,
        )

create_metadata_dataframe(handler, openml_data_object, data_id, all_dataset_metadata, subset_ids=None)

Description: Creates a dataframe with all the metadata, with the columns joined into a single combined-information string, for the type of data specified in the config. If training is set to False, the dataframes are loaded from files; if training is set to True, they are created and then saved to files.

Source code in backend/modules/metadata_utils.py
def create_metadata_dataframe(
    self,
    handler: Union["OpenMLDatasetHandler", "OpenMLFlowHandler"],
    openml_data_object: Sequence[
        Union[openml.datasets.dataset.OpenMLDataset, openml.flows.flow.OpenMLFlow]
    ],
    data_id: Sequence[int],
    all_dataset_metadata: pd.DataFrame,
    subset_ids=None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Description: Creates a dataframe with all the metadata, with the columns joined
    into a single combined-information string, for the type of data specified in the
    config. If training is set to False, the dataframes are loaded from files; if
    training is set to True, they are created and then saved to files.
    """
    if not self.config.get("training", False):
        return (
            handler.load_metadata(self.description_filename),
            all_dataset_metadata,
        )

    return handler.process_metadata(
        openml_data_object,
        data_id,
        all_dataset_metadata,
        self.description_filename,
        subset_ids,
    )
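
Once a training run has written the description CSV, later runs can reuse it by flipping the training flag; a sketch under the same hypothetical config as above:

config["training"] = False
subset_df, full_df = processor.create_metadata_dataframe(
    handler, openml_objects, data_id, all_metadata
)
# subset_df is now read back from all_<type_of_data>_description.csv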

get_all_metadata_from_openml()

Description: Gets all the metadata from OpenML for the type of data specified in the config. If training is set to False, it loads the metadata from the files. If training is set to True, it gets the metadata from OpenML.

This uses parallel threads (pqdm), so install the oslo.concurrency package to ensure thread safety.

Source code in backend/modules/metadata_utils.py
def get_all_metadata_from_openml(self):
    """
    Description: Gets all the metadata from OpenML for the type of data specified in the config.
    If training is set to False, it loads the metadata from the files. If training is set to True, it gets the metadata from OpenML.

    This uses parallel threads (pqdm), so install the oslo.concurrency package to ensure thread safety.
    """
    if not self.config.get("training", False) or self.config.get(
        "ignore_downloading_data", False
    ):
        if not os.path.exists(self.save_filename):
            raise Exception(
                "Metadata files do not exist. Please run the training pipeline first."
            )
        print("[INFO] Loading metadata from file.")
        return load_metadata_from_file(self.save_filename)

    print("[INFO] Training is set to True.")
    handler = (
        OpenMLDatasetHandler(self.config)
        if self.config["type_of_data"] == "dataset"
        else OpenMLFlowHandler(self.config)
    )

    all_objects = handler.get_openml_objects()

    if self.config.get("test_subset", False):
        print("[INFO] Subsetting the data.")
        all_objects = all_objects[:500]

    data_id = [int(all_objects.iloc[i]["did"]) for i in range(len(all_objects))]

    print("[INFO] Initializing cache.")
    handler.initialize_cache(data_id)

    print(f"[INFO] Getting {self.config['type_of_data']} metadata from OpenML.")
    openml_data_object = handler.get_metadata(data_id)

    print("[INFO] Saving metadata to file.")
    save_metadata_to_file(
        (openml_data_object, data_id, all_objects, handler), self.save_filename
    )

    return openml_data_object, data_id, all_objects, handler
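
Two config flags gate whether metadata is fetched fresh or reloaded; a sketch of both paths (the pickle name follows the all_<type_of_data>_metadata.pkl pattern set in __init__):

# First (training) run: list objects, fetch metadata in parallel, pickle everything.
config["training"] = True
objs, ids, frame, handler = processor.get_all_metadata_from_openml()

# Later runs: skip the download and load the pickle instead.
config["ignore_downloading_data"] = True
objs, ids, frame, handler = processor.get_all_metadata_from_openml()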

OpenMLObjectHandler

Description: The base class for handling OpenML objects. The logic for handling datasets and flows lives in subclasses of this class.

Source code in backend/modules/metadata_utils.py
class OpenMLObjectHandler:
    """
    Description: The base class for handling OpenML objects. The logic for handling datasets and flows lives in subclasses of this class.
    """

    def __init__(self, config):
        self.config = config

    def get_description(self, data_id: int):
        """
        Description: Get the description of the OpenML object.
        """
        raise NotImplementedError

    def get_openml_objects(self):
        """
        Description: Get the OpenML objects.
        """
        raise NotImplementedError

    def initialize_cache(self, data_id: Sequence[int]) -> None:
        """
        Description: Initialize the cache for the OpenML objects.
        """
        self.get_description(data_id[0])

    def get_metadata(self, data_id: Sequence[int]):
        """
        Description: Get metadata from OpenML using parallel processing.
        """
        return pqdm(
            data_id, self.get_description, n_jobs=self.config["data_download_n_jobs"]
        )

    def process_metadata(
        self,
        openml_data_object,
        data_id: Sequence[int],
        all_dataset_metadata: pd.DataFrame,
        file_path: str,
        subset_ids=None,
    ):
        """
        Description: Process the metadata.
        """
        raise NotImplementedError

    @staticmethod
    def load_metadata(file_path: str):
        """
        Description: Load metadata from a file.
        """
        try:
            return pd.read_csv(file_path)
        except FileNotFoundError:
            raise Exception(
                "Metadata files do not exist. Please run the training pipeline first."
            )

    @staticmethod
    def extract_attribute(attribute: object, attr_name: str) -> str:
        """
        Description: Extract an attribute from the OpenML object.
        """
        return getattr(attribute, attr_name, "")

    @staticmethod
    def join_attributes(attribute: object, attr_name: str) -> str:
        """
        Description: Join the attributes of the OpenML object.
        """
        return (
            " ".join(
                [f"{k} : {v}," for k, v in getattr(attribute, attr_name, {}).items()]
            )
            if hasattr(attribute, attr_name)
            else ""
        )

    @staticmethod
    def create_combined_information_df_for_datasets(
        data_id: int | Sequence[int],
        descriptions: Sequence[str],
        joined_qualities: Sequence[str],
        joined_features: Sequence[str],
    ) -> pd.DataFrame:
        """
        Description: Create a dataframe with the combined information of the OpenML object.
        """
        return pd.DataFrame(
            {
                "did": data_id,
                "description": descriptions,
                "qualities": joined_qualities,
                "features": joined_features,
            }
        )

    @staticmethod
    def merge_all_columns_to_string(row: pd.Series) -> str:
        """
        Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
        """
        return " ".join([f"{col} - {val}," for col, val in zip(row.index, row.values)])

    def combine_metadata(
        self, all_dataset_metadata: pd.DataFrame, all_data_description_df: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Description: Combine the descriptions with the metadata table.
        """
        all_dataset_metadata = pd.merge(
            all_dataset_metadata, all_data_description_df, on="did", how="inner"
        )
        all_dataset_metadata["Combined_information"] = all_dataset_metadata.apply(
            self.merge_all_columns_to_string, axis=1
        )
        return all_dataset_metadata

    @staticmethod
    def subset_metadata(
        subset_ids: Sequence[int] | None, all_dataset_metadata: pd.DataFrame
    ):
        if subset_ids is not None:
            subset_ids = [int(x) for x in subset_ids]
            all_dataset_metadata = all_dataset_metadata[
                all_dataset_metadata["did"].isin(subset_ids)
            ]
        return all_dataset_metadata
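
To support a new OpenML object type, subclass OpenMLObjectHandler and implement the methods that raise NotImplementedError. A hypothetical handler for OpenML tasks, for illustration only (process_metadata is omitted; it would mirror the dataset/flow versions):

class OpenMLTaskHandler(OpenMLObjectHandler):
    """Hypothetical handler for OpenML task objects."""

    def get_description(self, data_id: int):
        return openml.tasks.get_task(task_id=data_id)

    def get_openml_objects(self):
        all_objects = openml.tasks.list_tasks(output_format="dataframe")
        return all_objects.rename(columns={"tid": "did"})  # align on "did"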

combine_metadata(all_dataset_metadata, all_data_description_df)

Description: Combine the descriptions with the metadata table.

Source code in backend/modules/metadata_utils.py
def combine_metadata(
    self, all_dataset_metadata: pd.DataFrame, all_data_description_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Description: Combine the descriptions with the metadata table.
    """
    all_dataset_metadata = pd.merge(
        all_dataset_metadata, all_data_description_df, on="did", how="inner"
    )
    all_dataset_metadata["Combined_information"] = all_dataset_metadata.apply(
        self.merge_all_columns_to_string, axis=1
    )
    return all_dataset_metadata
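
Because the merge is an inner join on "did", only IDs present in both frames survive. A tiny sketch with made-up rows:

left = pd.DataFrame({"did": [1, 2], "name": ["iris", "anneal"]})
right = pd.DataFrame({"did": [2], "description": ["steel annealing data"]})
combined = handler.combine_metadata(left, right)
# one row (did=2) whose Combined_information reads roughly:
# "did - 2, name - anneal, description - steel annealing data,"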

create_combined_information_df_for_datasets(data_id, descriptions, joined_qualities, joined_features) staticmethod

Description: Create a dataframe with the combined information of the OpenML object.

Source code in backend/modules/metadata_utils.py
@staticmethod
def create_combined_information_df_for_datasets(
    data_id: int | Sequence[int],
    descriptions: Sequence[str],
    joined_qualities: Sequence[str],
    joined_features: Sequence[str],
) -> pd.DataFrame:
    """
    Description: Create a dataframe with the combined information of the OpenML object.
    """
    return pd.DataFrame(
        {
            "did": data_id,
            "description": descriptions,
            "qualities": joined_qualities,
            "features": joined_features,
        }
    )

extract_attribute(attribute, attr_name) staticmethod

Description: Extract an attribute from the OpenML object.

Source code in backend/modules/metadata_utils.py
@staticmethod
def extract_attribute(attribute: object, attr_name: str) -> str:
    """
    Description: Extract an attribute from the OpenML object.
    """
    return getattr(attribute, attr_name, "")

get_description(data_id)

Description: Get the description of the OpenML object.

Source code in backend/modules/metadata_utils.py
def get_description(self, data_id: int):
    """
    Description: Get the description of the OpenML object.
    """
    raise NotImplementedError

get_metadata(data_id)

Description: Get metadata from OpenML using parallel processing.

Source code in backend/modules/metadata_utils.py
def get_metadata(self, data_id: Sequence[int]):
    """
    Description: Get metadata from OpenML using parallel processing.
    """
    return pqdm(
        data_id, self.get_description, n_jobs=self.config["data_download_n_jobs"]
    )
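
pqdm maps get_description over the IDs with data_download_n_jobs workers, so this behaves roughly like a parallel list comprehension; a sketch with hypothetical dataset IDs:

# roughly [handler.get_description(i) for i in (2, 31, 61)], run in parallel
metadata = handler.get_metadata([2, 31, 61])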

get_openml_objects()

Description: Get the OpenML objects.

Source code in backend/modules/metadata_utils.py
def get_openml_objects(self):
    """
    Description: Get the OpenML objects.
    """
    raise NotImplementedError

initialize_cache(data_id)

Description: Initialize the cache for the OpenML objects.

Source code in backend/modules/metadata_utils.py
def initialize_cache(self, data_id: Sequence[int]) -> None:
    """
    Description: Initialize the cache for the OpenML objects.
    """
    self.get_description(data_id[0])

join_attributes(attribute, attr_name) staticmethod

Description: Join the attributes of the OpenML object.

Source code in backend/modules/metadata_utils.py
@staticmethod
def join_attributes(attribute: object, attr_name: str) -> str:
    """
    Description: Join the attributes of the OpenML object.
    """
    return (
        " ".join(
            [f"{k} : {v}," for k, v in getattr(attribute, attr_name, {}).items()]
        )
        if hasattr(attribute, attr_name)
        else ""
    )
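
The joined string is a flat "key : value," list, and a missing attribute yields an empty string. A sketch with a stand-in object:

class _Stub:  # stand-in for an OpenML object, for illustration only
    qualities = {"NumberOfInstances": 150, "NumberOfFeatures": 5}

OpenMLObjectHandler.join_attributes(_Stub(), "qualities")
# -> "NumberOfInstances : 150, NumberOfFeatures : 5,"
OpenMLObjectHandler.join_attributes(_Stub(), "features")
# -> "" (no such attribute)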

load_metadata(file_path) staticmethod

Description: Load metadata from a file.

Source code in backend/modules/metadata_utils.py
@staticmethod
def load_metadata(file_path: str):
    """
    Description: Load metadata from a file.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        raise Exception(
            "Metadata files do not exist. Please run the training pipeline first."
        )

merge_all_columns_to_string(row) staticmethod

Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"

Source code in backend/modules/metadata_utils.py
@staticmethod
def merge_all_columns_to_string(row: pd.Series) -> str:
    """
    Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
    """
    return " ".join([f"{col} - {val}," for col, val in zip(row.index, row.values)])

process_metadata(openml_data_object, data_id, all_dataset_metadata, file_path, subset_ids=None)

Description: Process the metadata.

Source code in backend/modules/metadata_utils.py
def process_metadata(
    self,
    openml_data_object,
    data_id: Sequence[int],
    all_dataset_metadata: pd.DataFrame,
    file_path: str,
    subset_ids=None,
):
    """
    Description: Process the metadata.
    """
    raise NotImplementedError