Skip to content

Change data input

1
2
3
4
5
6
from __future__ import annotations
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
import os
import sys
import chromadb
1
2
3
from backend.modules.utils import *
from backend.modules.rag_llm import *
from backend.modules.results_gen import *
/Users/smukherjee/.pyenv/versions/3.10.14/envs/openml/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Load the project configuration (device detection happens inside the loader)
# and override paths/flags so the demo runs on the small doc-example data.
config = load_config_and_device("../../../backend/config.json")
config["persist_dir"] = "../../data/doc_examples/chroma_db/"  # where ChromaDB persists its collections
config["data_dir"] = "../../data/doc_examples/"  # source metadata for the demo
config["type_of_data"] = "dataset"  # operate on OpenML datasets (not flows/tasks)
config["training"] = False
config["testing_flag"] = True  # set this to false while training, this is for demo
config["test_subset"] = True  # set this to false while training, this is for demo

# load the persistent database using ChromaDB
client = chromadb.PersistentClient(path=config["persist_dir"])
print(config)
[INFO] Finding device.
[INFO] Device found: mps
{'rqa_prompt_template': 'This database is a list of metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the {question} given. If you do not know the answer, say you do not know. {context}', 'llm_prompt_template': 'The following is a set of documents {docs}. Based on these docs, please summarize the content concisely. Also give a list of main concepts found in the documents. Do not add any new information. Helpful Answer: ', 'num_return_documents': 30, 'embedding_model': 'BAAI/bge-large-en-v1.5', 'llm_model': 'llama3', 'num_documents_for_llm': 30, 'data_dir': '../../data/doc_examples/', 'persist_dir': '../../data/doc_examples/chroma_db/', 'testing_flag': True, 'ignore_downloading_data': False, 'test_subset': True, 'data_download_n_jobs': 20, 'training': False, 'temperature': 0.95, 'top_p': 0.95, 'search_type': 'similarity', 'reranking': False, 'long_context_reorder': False, 'structure_query': False, 'use_chroma_for_saving_metadata': False, 'device': 'mps', 'type_of_data': 'dataset'}

Change the way the data is combined

  • To pass to the RAG, all the metadata is combined into a single string. This is done by concatenating all the metadata fields with a space separator.
  • We can change the way the data is combined in whatever way we want. For example, we can concatenate all the metadata fields with a "~" separator.
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def join_attributes(attribute: object, attr_name: str) -> str:
    """
    Description: Join a dict-valued attribute of an OpenML object into a single
    string of "key : value," pairs separated by " ~ ".

    Returns an empty string when the attribute is missing, empty, or None.
    """
    # getattr with a default already covers the missing-attribute case, so the
    # previous hasattr() check was redundant. `or {}` additionally guards
    # against a present-but-None attribute, which used to raise on .items().
    items = getattr(attribute, attr_name, None) or {}
    return " ~ ".join(f"{k} : {v}," for k, v in items.items())


def combine_metadata(
    self, all_dataset_metadata: pd.DataFrame, all_data_description_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Description: Merge the free-text descriptions into the metadata table
    (inner join on "did") and add a "Combined_information" column built by
    row-wise application of self.merge_all_columns_to_string.
    """
    merged = all_dataset_metadata.merge(
        all_data_description_df, on="did", how="inner"
    )
    # One combined string per row, used downstream as the RAG document text.
    merged["Combined_information"] = merged.apply(
        self.merge_all_columns_to_string, axis=1
    )
    return merged
1
2
# Monkeypatch the redefined helpers onto OpenMLObjectHandler so the pipeline
# below combines metadata with the custom " ~ " separator instead of the default.
OpenMLObjectHandler.join_attributes = join_attributes
OpenMLObjectHandler.combine_metadata = combine_metadata
1
2
3
4
5
6
# Setup llm chain, initialize the retriever and llm, and setup Retrieval QA
qa_dataset_handler = QASetup(
    config=config,
    data_type=config["type_of_data"],  # "dataset", set in the config cell above
    client=client,  # persistent ChromaDB client created above
)
1
qa_dataset, _ = qa_dataset_handler.setup_vector_db_and_qa()