class QueryProcessor:
    """Run a user query through a RetrievalQA chain and shape the results
    into a dataframe for the frontend.

    Args:
        query: Raw user query string.
        qa: RetrievalQA chain used to fetch matching documents.
        type_of_query: Either "dataset" or "flow".
        config: Settings dict; keys read here are "temperature", "top_p",
            "long_context_reorder" and "reranking".
    """

    def __init__(self, query: str, qa: "RetrievalQA", type_of_query: str, config: dict):
        self.query = query
        self.qa = qa
        self.type_of_query = type_of_query
        self.config = config

    def fetch_results(self):
        """Fetch documents for the query from the QA chain.

        Optionally applies long-context reordering and FlashRank reranking,
        depending on ``self.config``. On reranking failure the unranked
        results are returned (best-effort).
        """
        results = self.qa.invoke(
            input=self.query,
            config={
                "temperature": self.config["temperature"],
                "top-p": self.config["top_p"],
            },
        )
        if self.config["long_context_reorder"]:
            results = long_context_reorder(results)
        id_column = {"dataset": "did", "flow": "id", "data": "did"}[self.type_of_query]
        if not self.config["reranking"]:
            return results
        try:
            print("[INFO] Reranking results...")
            ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="/tmp/")
            rerankrequest = RerankRequest(
                query=self.query,
                passages=[
                    {"id": result.metadata[id_column], "text": result.page_content}
                    for result in results
                ],
            )
            ranking = ranker.rerank(rerankrequest)
            # BUGFIX: the previous code only *filtered* the retrieval-ordered
            # list by the reranked ids (which contained every id), so the
            # reranker had no visible effect. Sort by rank position instead;
            # the sort is stable, so duplicates keep their relative order.
            rank_of = {entry["id"]: pos for pos, entry in enumerate(ranking)}
            ranked_results = sorted(
                (doc for doc in results if doc.metadata[id_column] in rank_of),
                key=lambda doc: rank_of[doc.metadata[id_column]],
            )
            print("[INFO] Reranking complete.")
            return ranked_results
        except Exception as e:
            # Best-effort: any reranking failure falls back to the raw results.
            print(f"[ERROR] Reranking failed: {e}")
            return results

    @staticmethod
    def process_documents(
        source_documents: "Sequence[Document]",
    ) -> "Tuple[OrderedDict, list]":
        """Map each document's "did" metadata key to its name and page content.

        Returns:
            The ordered mapping plus the list of ids in retrieval order
            (ids may repeat in the list; the mapping keeps the last entry).
        """
        dict_results = OrderedDict()
        for result in source_documents:
            dict_results[result.metadata["did"]] = {
                "name": result.metadata["name"],
                "page_content": result.page_content,
            }
        ids = [result.metadata["did"] for result in source_documents]
        return dict_results, ids

    @staticmethod
    def make_clickable(val: str) -> str:
        """Wrap a URL in an HTML anchor so it renders as a link."""
        return '<a href="{}">{}</a>'.format(val, val)

    def create_output_dataframe(
        self, dict_results: dict, type_of_data: str, ids_order: list
    ) -> "pd.DataFrame":
        """Build the frontend dataframe: id, name, description, OpenML URL
        and the python command to fetch the object, ordered by ``ids_order``.
        """
        output_df = pd.DataFrame(dict_results).T.reset_index()
        output_df["index"] = output_df["index"].astype(int)
        # Re-impose the (possibly reranked) retrieval order.
        output_df = output_df.set_index("index").loc[ids_order].reset_index()
        output_df["urls"] = output_df["index"].apply(
            lambda x: f"https://www.openml.org/search?type={type_of_data}&id={x}"
        )
        output_df["urls"] = output_df["urls"].apply(self.make_clickable)
        if type_of_data == "data":
            output_df["command"] = output_df["index"].apply(
                lambda x: f"dataset = openml.datasets.get_dataset({x})"
            )
        elif type_of_data == "flow":
            output_df["command"] = output_df["index"].apply(
                lambda x: f"flow = openml.flows.get_flow({x})"
            )
        output_df = output_df.drop_duplicates(subset=["name"])
        # rename() silently skips absent columns, so no per-column loop needed.
        output_df = output_df.rename(
            columns={
                "index": "id",
                "command": "Command",
                "urls": "OpenML URL",
                "page_content": "Description",
            }
        )
        return output_df

    @staticmethod
    def check_query(query: str) -> str:
        """Sanitize the query: decode %20, strip whitespace, cap at 200 chars.

        Raises:
            ValueError: if the query is the empty string.
        """
        if query == "":
            raise ValueError("Query cannot be empty.")
        query = query.replace("%20", " ")
        query = query.strip()
        query = query[:200]
        return query

    def get_result_from_query(self) -> "Tuple[pd.DataFrame, Sequence[Document]]":
        """Run the full pipeline and return (dataframe, ids in final order)."""
        if self.type_of_query == "dataset":
            type_of_query = "data"
        elif self.type_of_query == "flow":
            type_of_query = "flow"
        else:
            raise ValueError(f"Unsupported type_of_data: {self.type_of_query}")
        query = self.check_query(self.query)
        if query == "":
            # e.g. a query of "%20" sanitizes down to the empty string.
            return pd.DataFrame(), []
        source_documents = self.fetch_results()
        dict_results, ids_order = self.process_documents(source_documents)
        output_df = self.create_output_dataframe(dict_results, type_of_query, ids_order)
        return output_df, ids_order
Performs checks on the query:
- Replaces %20 with space character (browsers do this automatically when spaces are in the URL)
- Removes leading and trailing spaces
- Limits the query to 200 characters
Source code in backend/modules/results_gen.py
Source lines 138–151.
@staticmethoddefcheck_query(query:str)->str:""" Performs checks on the query: - Replaces %20 with space character (browsers do this automatically when spaces are in the URL) - Removes leading and trailing spaces - Limits the query to 200 characters """ifquery=="":raiseValueError("Query cannot be empty.")query=query.replace("%20"," ")query=query.strip()query=query[:200]returnquery
def create_output_dataframe(
    self, dict_results: dict, type_of_data: str, ids_order: list
) -> "pd.DataFrame":
    """Create the output dataframe sent to the frontend.

    The URLs are links into the OpenML search UI for the specific type of
    data, and the Command column holds the python snippet that fetches the
    object via the OpenML API.

    Args:
        dict_results: Mapping of id -> {"name": ..., "page_content": ...}.
        type_of_data: "data" or "flow"; selects URL type and command text.
        ids_order: Desired row order (ids as ints).
    """
    output_df = pd.DataFrame(dict_results).T.reset_index()
    output_df["index"] = output_df["index"].astype(int)
    # Re-impose the (possibly reranked) retrieval order.
    output_df = output_df.set_index("index").loc[ids_order].reset_index()
    output_df["urls"] = output_df["index"].apply(
        lambda x: f"https://www.openml.org/search?type={type_of_data}&id={x}"
    )
    output_df["urls"] = output_df["urls"].apply(self.make_clickable)
    if type_of_data == "data":
        output_df["command"] = output_df["index"].apply(
            lambda x: f"dataset = openml.datasets.get_dataset({x})"
        )
    elif type_of_data == "flow":
        output_df["command"] = output_df["index"].apply(
            lambda x: f"flow = openml.flows.get_flow({x})"
        )
    output_df = output_df.drop_duplicates(subset=["name"])
    # IDIOM: DataFrame.rename ignores columns that are absent, so the
    # original's per-column presence-check loop is unnecessary.
    output_df = output_df.rename(
        columns={
            "index": "id",
            "command": "Command",
            "urls": "OpenML URL",
            "page_content": "Description",
        }
    )
    return output_df
def fetch_results(self):
    """Fetch documents for the query from the QA chain.

    Invokes the chain with the configured sampling settings, then
    optionally applies long-context reordering and FlashRank reranking
    based on ``self.config``. On reranking failure the unranked results
    are returned (best-effort).
    """
    results = self.qa.invoke(
        input=self.query,
        config={
            "temperature": self.config["temperature"],
            "top-p": self.config["top_p"],
        },
    )
    if self.config["long_context_reorder"]:
        results = long_context_reorder(results)
    id_column = {"dataset": "did", "flow": "id", "data": "did"}[self.type_of_query]
    if not self.config["reranking"]:
        return results
    try:
        print("[INFO] Reranking results...")
        ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="/tmp/")
        rerankrequest = RerankRequest(
            query=self.query,
            passages=[
                {"id": result.metadata[id_column], "text": result.page_content}
                for result in results
            ],
        )
        ranking = ranker.rerank(rerankrequest)
        # BUGFIX: the original kept the documents in *retrieval* order (a
        # list-membership filter over ids that contained every result), so
        # reranking had no effect — and the list membership test was O(n^2).
        # Order by rank position instead; the sort is stable so duplicate
        # ids keep their relative order.
        rank_of = {entry["id"]: pos for pos, entry in enumerate(ranking)}
        ranked_results = sorted(
            (doc for doc in results if doc.metadata[id_column] in rank_of),
            key=lambda doc: rank_of[doc.metadata[id_column]],
        )
        print("[INFO] Reranking complete.")
        return ranked_results
    except Exception as e:  # NOTE(review): broad by design — fall back to unranked
        print(f"[ERROR] Reranking failed: {e}")
        return results
def get_result_from_query(self) -> "Tuple[pd.DataFrame, Sequence[Document]]":
    """Run the query through the QA chain and package the results.

    Returns the frontend dataframe together with the ordered list of
    result ids.

    Raises:
        ValueError: for any ``type_of_query`` other than "dataset"/"flow".
    """
    if self.type_of_query == "dataset":
        data_type = "data"
    elif self.type_of_query == "flow":
        data_type = "flow"
    else:
        raise ValueError(f"Unsupported type_of_data: {self.type_of_query}")
    sanitized = self.check_query(self.query)
    if not sanitized:
        # A query such as "%20" sanitizes down to the empty string.
        return pd.DataFrame(), []
    docs = self.fetch_results()
    dict_results, ids_order = self.process_documents(docs)
    frame = self.create_output_dataframe(dict_results, data_type, ids_order)
    return frame, ids_order
Process the source documents and create a dictionary with the key_name as the key and the name and page content as the values.
Source code in backend/modules/results_gen.py
Source lines 81–95.
@staticmethoddefprocess_documents(source_documents:Sequence[Document],)->Tuple[OrderedDict,list]:""" Process the source documents and create a dictionary with the key_name as the key and the name and page content as the values. """dict_results=OrderedDict()forresultinsource_documents:dict_results[result.metadata["did"]]={"name":result.metadata["name"],"page_content":result.page_content,}ids=[result.metadata["did"]forresultinsource_documents]returndict_results,ids
Description: Lost in the middle reorder: the less relevant documents will be at the
middle of the list and more relevant elements at beginning / end.
See: https://arxiv.org/abs/2307.03172
Source code in backend/modules/results_gen.py
Source lines 20–32.
def long_context_reorder(results):
    """Apply the "lost in the middle" reordering.

    Less relevant documents end up in the middle of the list while the
    most relevant ones are placed at the beginning and end.
    See: https://arxiv.org/abs/2307.03172
    """
    print("[INFO] Reordering results...")
    reordered = LongContextReorder().transform_documents(results)
    print("[INFO] Reordering complete.")
    return reordered