[Feature] Add document retrieval QA (#5020)

* add langchain * add langchain * Add files via upload * add langchain * fix style * fix style: remove extra space * add pytest; modified retriever * add pytest; modified retriever * add tests to build_on_pr.yml * fix build_on_pr.yml * fix build on pr; fix environ vars * seperate unit tests for colossalqa from build from pr * fix container setting; fix environ vars * commented dev code * add incremental update * remove stale code * fix style * change to sha3 224 * fix retriever; fix style; add unit test for document loader * fix ci workflow config * fix ci workflow config * add set cuda visible device script in ci * fix doc string * fix style; update readme; refactored * add force log info * change build on pr, ignore colossalqa * fix docstring, captitalize all initial letters * fix indexing; fix text-splitter * remove debug code, update reference * reset previous commit * update LICENSE update README add key-value mode, fix bugs * add files back * revert force push * remove junk file * add test files * fix retriever bug, add intent classification * change conversation chain design * rewrite prompt and conversation chain * add ui v1 * ui v1 * fix atavar * add header * Refactor the RAG Code and support Pangu * Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo. * resolved conversation. tested scripts under examples. web demo still buggy * fix ci tests * Some modifications to add ChatGPT api * modify llm.py and remove unnecessary files * Delete applications/ColossalQA/examples/ui/test_frontend_input.json * Remove OpenAI api key * add colossalqa * move files * move files * move files * move files * fix style * Add Readme and fix some bugs. 
* Add something to readme and modify some code * modify a directory name for clarity * remove redundant directory * Correct a type in llm.py * fix AI prefix * fix test_memory.py * fix conversation * fix some erros and typos * Fix a missing import in RAG_ChatBot.py * add colossalcloud LLM wrapper, correct issues in code review --------- Co-authored-by: YeAnbang <anbangy2@outlook.com> Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu> Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com> Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
2025-09-02 09:38:05 +00:00 · 2023-11-23 10:33:48 +08:00
parent 3acbf6d496
commit e53e729d8e
69 changed files with 6758 additions and 0 deletions
--- a/applications/ColossalQA/colossalqa/data_loader/init.py
+++ b/applications/ColossalQA/colossalqa/data_loader/init.py
--- a/applications/ColossalQA/colossalqa/data_loader/document_loader.py
+++ b/applications/ColossalQA/colossalqa/data_loader/document_loader.py
@@ -0,0 +1,128 @@
+"""
+Class for loading document type data
+"""
+
+import glob
+from typing import List
+
+from colossalqa.mylogging import get_logger
+from langchain.document_loaders import (
+    JSONLoader,
+    PyPDFLoader,
+    TextLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+)
+from langchain.document_loaders.csv_loader import CSVLoader
+
+# Module-level logger shared by everything in this file
+logger = get_logger()
+
+# File extensions DocumentLoader can dispatch on; also listed in the
+# "Unsupported file format" error message raised by the loader
+SUPPORTED_DATA_FORMAT = [".csv", ".json", ".html", ".md", ".pdf", ".txt", ".jsonl"]
+
+
+class DocumentLoader:
+    """
+    Load documents from different files into list of langchain Documents
+    """
+
+    def __init__(self, files: List, **kwargs) -> None:
+        """
+        Args:
+            files: list of files (list[file path, name])
+            **kwargs: keyword type arguments, useful for certain document types
+        """
+        self.data = {}
+        self.kwargs = kwargs
+
+        for item in files:
+            path = item[0] if isinstance(item, list) else item
+            logger.info(f"Loading data from {path}")
+            self.load_data(path)
+            logger.info("Data loaded")
+
+        self.all_data = []
+        for key in self.data:
+            if isinstance(self.data[key], list):
+                for item in self.data[key]:
+                    if isinstance(item, list):
+                        self.all_data.extend(item)
+                    else:
+                        self.all_data.append(item)
+
+    def load_data(self, path: str) -> None:
+        """
+        Load data. Please refer to https://python.langchain.com/docs/modules/data_connection/document_loaders/
+            for sepcific format requirements.
+        Args:
+            path: path to a file
+                To load files with glob path, here are some examples.
+                    Load all file from directory: folder1/folder2/*
+                    Load all pdf file from directory: folder1/folder2/*.pdf
+        """
+        files = []
+
+        # Handle glob expression
+        try:
+            files = glob.glob(path)
+        except Exception as e:
+            logger.error(e)
+        if len(files) == 0:
+            raise ValueError("Unsupported file/directory format. For directories, please use glob expression")
+        elif len(files) == 1:
+            path = files[0]
+        else:
+            for file in files:
+                self.load_data(file)
+            return
+
+        # Load data if the path is a file
+        logger.info(f"load {path}", verbose=True)
+        if path.endswith(".csv"):
+            # Load csv
+            loader = CSVLoader(file_path=path, encoding="utf8")
+            data = loader.load()
+            self.data[path] = data
+        elif path.endswith(".txt"):
+            # Load txt
+            loader = TextLoader(path, encoding="utf8")
+            data = loader.load()
+            self.data[path] = data
+        elif path.endswith("html"):
+            # Load html
+            loader = UnstructuredHTMLLoader(path, encoding="utf8")
+            data = loader.load()
+            self.data[path] = data
+        elif path.endswith("json"):
+            # Load json
+            loader = JSONLoader(
+                file_path=path,
+                jq_schema=self.kwargs.get("jq_schema", ".data[]"),
+                content_key=self.kwargs.get("content_key", "content"),
+                metadata_func=self.kwargs.get("metadata_func", None),
+            )
+
+            data = loader.load()
+            self.data[path] = data
+        elif path.endswith("jsonl"):
+            # Load jsonl
+            loader = JSONLoader(
+                file_path=path, jq_schema=self.kwargs.get("jq_schema", ".data[].content"), json_lines=True
+            )
+            data = loader.load()
+            self.data[path] = data
+        elif path.endswith(".md"):
+            # Load markdown
+            loader = UnstructuredMarkdownLoader(path)
+            data = loader.load()
+            self.data[path] = data
+        elif path.endswith(".pdf"):
+            # Load pdf
+            loader = PyPDFLoader(path)
+            data = loader.load_and_split()
+            self.data[path] = data
+        else:
+            if "." in path.split("/")[-1]:
+                raise ValueError(f"Unsupported file format {path}. Supported formats: {SUPPORTED_DATA_FORMAT}")
+            else:
+                # May ba a directory, we strictly follow the glob path and will not load files in subdirectories
+                pass
--- a/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py
+++ b/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py
@@ -0,0 +1,119 @@
+'''
+Class for loading table type data. Please refer to the pandas Input/Output documentation for file format details.
+'''
+
+
+import os
+import glob
+import pandas as pd
+from sqlalchemy import create_engine
+from colossalqa.utils import drop_table
+from colossalqa.mylogging import get_logger
+
+# Module-level logger shared by everything in this file
+logger = get_logger()
+
+# File extensions TableLoader accepts; paths with any other suffix are rejected in __init__
+SUPPORTED_DATA_FORMAT = ['.csv','.xlsx', '.xls','.json','.html','.h5', '.hdf5','.parquet','.feather','.dta']
+
+class TableLoader:
+    '''
+    Load tables from different files and serve a sql database for database operations
+    '''
+    def __init__(self, files: str, 
+                 sql_path:str='sqlite:///mydatabase.db', 
+                 verbose=False, **kwargs) -> None:
+        '''
+        Args:
+            files: list of files (list[file path, name])
+            sql_path: how to serve the sql database
+            **kwargs: keyword type arguments, useful for certain document types 
+        '''
+        self.data = {}
+        self.verbose = verbose
+        self.sql_path = sql_path
+        self.kwargs = kwargs
+        self.sql_engine = create_engine(self.sql_path)
+        drop_table(self.sql_engine)
+        
+        self.sql_engine = create_engine(self.sql_path)
+        for item in files:
+            path = item[0]
+            dataset_name = item[1]
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"{path} doesn't exists")
+            if not any([path.endswith(i) for i in SUPPORTED_DATA_FORMAT]):
+                raise TypeError(f"{path} not supported. Supported type {SUPPORTED_DATA_FORMAT}")
+            
+            logger.info("loading data", verbose=self.verbose)
+            self.load_data(path)
+            logger.info("data loaded", verbose=self.verbose)
+            self.to_sql(path, dataset_name)
+
+    def load_data(self, path):
+        '''
+        Load data and serve the data as sql database.
+        Data must be in pandas format
+        '''
+        files = []
+        # Handle glob expression
+        try:
+            files = glob.glob(path)
+        except Exception as e:
+            logger.error(e)
+        if len(files)==0:
+            raise ValueError("Unsupported file/directory format. For directories, please use glob expression")
+        elif len(files)==1:
+            path = files[0]
+        else:
+            for file in files:
+                self.load_data(file)
+
+        if path.endswith('.csv'):
+            # Load csv
+            self.data[path] = pd.read_csv(path)
+        elif path.endswith('.xlsx') or path.endswith('.xls'):
+            # Load excel
+            self.data[path] = pd.read_excel(path)  # You can adjust the sheet_name as needed
+        elif path.endswith('.json'):
+            # Load json
+            self.data[path] = pd.read_json(path)
+        elif path.endswith('.html'):
+            # Load html
+            html_tables = pd.read_html(path)
+            # Choose the desired table from the list of DataFrame objects
+            self.data[path] = html_tables[0]  # You may need to adjust this index
+        elif path.endswith('.h5') or path.endswith('.hdf5'):
+            # Load h5
+            self.data[path] = pd.read_hdf(path, key=self.kwargs.get('key', 'data'))  # You can adjust the key as needed
+        elif path.endswith('.parquet'):
+            # Load parquet
+            self.data[path] = pd.read_parquet(path, engine='fastparquet')
+        elif path.endswith('.feather'):
+            # Load feather
+            self.data[path] = pd.read_feather(path)
+        elif path.endswith('.dta'):
+            # Load dta
+            self.data[path] = pd.read_stata(path)
+        else:
+            raise ValueError("Unsupported file format")
+        
+    def to_sql(self, path, table_name):
+        '''
+        Serve the data as sql database.
+        '''
+        self.data[path].to_sql(table_name, con=self.sql_engine, if_exists='replace', index=False)
+        logger.info(f"Loaded to Sqlite3\nPath: {path}", verbose=self.verbose)
+        return self.sql_path
+    
+    def get_sql_path(self):
+        return self.sql_path
+
+    def __del__(self):
+        if self.sql_engine:
+            drop_table(self.sql_engine)
+            self.sql_engine.dispose()
+            del self.data
+            del self.sql_engine
+
+
+
+