[Feature] Add document retrieval QA (#5020)

* add langchain * add langchain * Add files via upload * add langchain * fix style * fix style: remove extra space * add pytest; modified retriever * add pytest; modified retriever * add tests to build_on_pr.yml * fix build_on_pr.yml * fix build on pr; fix environ vars * seperate unit tests for colossalqa from build from pr * fix container setting; fix environ vars * commented dev code * add incremental update * remove stale code * fix style * change to sha3 224 * fix retriever; fix style; add unit test for document loader * fix ci workflow config * fix ci workflow config * add set cuda visible device script in ci * fix doc string * fix style; update readme; refactored * add force log info * change build on pr, ignore colossalqa * fix docstring, captitalize all initial letters * fix indexing; fix text-splitter * remove debug code, update reference * reset previous commit * update LICENSE update README add key-value mode, fix bugs * add files back * revert force push * remove junk file * add test files * fix retriever bug, add intent classification * change conversation chain design * rewrite prompt and conversation chain * add ui v1 * ui v1 * fix atavar * add header * Refactor the RAG Code and support Pangu * Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo. * resolved conversation. tested scripts under examples. web demo still buggy * fix ci tests * Some modifications to add ChatGPT api * modify llm.py and remove unnecessary files * Delete applications/ColossalQA/examples/ui/test_frontend_input.json * Remove OpenAI api key * add colossalqa * move files * move files * move files * move files * fix style * Add Readme and fix some bugs. * Add something to readme and modify some code * modify a directory name for clarity * remove redundant directory * Correct a type in llm.py * fix AI prefix * fix test_memory.py * fix conversation * fix some erros and typos * Fix a missing import in RAG_ChatBot.py * add colossalcloud LLM wrapper, correct issues in code review --------- Co-authored-by: YeAnbang <anbangy2@outlook.com> Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu> Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com> Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
2025-09-03 01:55:12 +00:00 · 2023-11-23 10:33:48 +08:00
parent 3acbf6d496
commit e53e729d8e
69 changed files with 6758 additions and 0 deletions
--- a/applications/ColossalQA/colossalqa/retriever.py
+++ b/applications/ColossalQA/colossalqa/retriever.py
@@ -0,0 +1,166 @@
+"""
+Code for custom retriver with incremental update
+"""
+import copy
+import hashlib
+import os
+from collections import defaultdict
+from typing import Any, Callable, Dict, List
+
+from colossalqa.mylogging import get_logger
+from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain.embeddings.base import Embeddings
+from langchain.indexes import SQLRecordManager, index
+from langchain.schema.retriever import BaseRetriever, Document
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.chroma import Chroma
+
+logger = get_logger()
+
+
+class CustomRetriever(BaseRetriever):
+    """
+    Custom retriever class with support for incremental update of indexes
+    """
+
+    vector_stores: Dict[str, VectorStore] = {}
+    sql_index_database: Dict[str, str] = {}
+    record_managers: Dict[str, SQLRecordManager] = {}
+    sql_db_chains = []
+    k = 3
+    rephrase_handler: Callable = None
+    buffer: Dict = []
+    buffer_size: int = 5
+    verbose: bool = False
+    sql_file_path: str = None
+
+    @classmethod
+    def from_documents(
+        cls,
+        documents: List[Document],
+        embeddings: Embeddings,
+        **kwargs: Any,
+    ) -> BaseRetriever:
+        k = kwargs.pop("k", 3)
+        cleanup = kwargs.pop("cleanup", "incremental")
+        mode = kwargs.pop("mode", "by_source")
+        ret = cls(k=k)
+        ret.add_documents(documents, embedding=embeddings, cleanup=cleanup, mode=mode)
+        return ret
+
+    def add_documents(
+        self,
+        docs: Dict[str, Document] = [],
+        cleanup: str = "incremental",
+        mode: str = "by_source",
+        embedding: Embeddings = None,
+    ) -> None:
+        """
+        Add documents to retriever
+        Args:
+            docs: the documents to add
+            cleanup: choose from "incremental" (update embeddings, skip existing embeddings) and "full" (destory and rebuild retriever)
+            mode: choose from "by source" (documents are grouped by source) and "merge" (documents are merged into one vector store)
+        """
+        if cleanup == "full":
+            # Cleanup
+            for source in self.vector_stores:
+                os.remove(self.sql_index_database[source])
+        # Add documents
+        data_by_source = defaultdict(list)
+        if mode == "by_source":
+            for doc in docs:
+                data_by_source[doc.metadata["source"]].append(doc)
+        elif mode == "merge":
+            data_by_source["merged"] = docs
+        for source in data_by_source:
+            if source not in self.vector_stores:
+                hash_encoding = hashlib.sha3_224(source.encode()).hexdigest()
+                if os.path.exists(f"{self.sql_file_path}/{hash_encoding}.db"):
+                    # Remove the stale file
+                    os.remove(f"{self.sql_file_path}/{hash_encoding}.db")
+                # Create a new sql database to store indexes, sql files are stored in the same directory as the source file
+                sql_path = f"sqlite:///{self.sql_file_path}/{hash_encoding}.db"
+                self.vector_stores[source] = Chroma(embedding_function=embedding, collection_name=hash_encoding)
+                self.sql_index_database[source] = f"{self.sql_file_path}/{hash_encoding}.db"
+                self.record_managers[source] = SQLRecordManager(source, db_url=sql_path)
+                self.record_managers[source].create_schema()
+            index(
+                data_by_source[source],
+                self.record_managers[source],
+                self.vector_stores[source],
+                cleanup=cleanup,
+                source_id_key="source",
+            )
+
+    def __del__(self):
+        for source in self.sql_index_database:
+            if os.path.exists(self.sql_index_database[source]):
+                os.remove(self.sql_index_database[source])
+
+    def set_sql_database_chain(self, db_chains) -> None:
+        """
+        set sql agent chain to retrieve information from sql database
+        Not used in this version
+        """
+        self.sql_db_chains = db_chains
+
+    def set_rephrase_handler(self, handler: Callable = None) -> None:
+        """
+        Set a handler to preprocess the input str before feed into the retriever
+        """
+        self.rephrase_handler = handler
+
+    def _get_relevant_documents(
+        self,
+        query: str,
+        *,
+        run_manager: CallbackManagerForRetrieverRun = None,
+        score_threshold: float = None,
+        return_scores: bool = False,
+    ) -> List[Document]:
+        """
+        This function is called by the retriever to get the relevant documents.
+        recent vistied queries are stored in buffer, if the query is in buffer, return the documents directly
+
+        Args:
+            query: the query to be searched
+            run_manager: the callback manager for retriever run
+        Returns:
+            documents: the relevant documents
+        """
+        for buffered_doc in self.buffer:
+            if buffered_doc[0] == query:
+                return buffered_doc[1]
+        query_ = str(query)
+        # Use your existing retriever to get the documents
+        if self.rephrase_handler:
+            query = self.rephrase_handler(query)
+        documents = []
+        for k in self.vector_stores:
+            # Retrieve documents from each retriever
+            vectorstore = self.vector_stores[k]
+            documents.extend(vectorstore.similarity_search_with_score(query, self.k, score_threshold=score_threshold))
+            # print(documents)
+        # Return the top k documents among all retrievers
+        documents = sorted(documents, key=lambda x: x[1], reverse=False)[: self.k]
+        if return_scores:
+            # Return score
+            documents = copy.deepcopy(documents)
+            for doc in documents:
+                doc[0].metadata["score"] = doc[1]
+        documents = [doc[0] for doc in documents]
+        # Retrieve documents from sql database (not applicable for the local chains)
+        for sql_chain in self.sql_db_chains:
+            documents.append(
+                Document(
+                    page_content=f"Query: {query}  Answer: {sql_chain.run(query)}", metadata={"source": "sql_query"}
+                )
+            )
+        if len(self.buffer) < self.buffer_size:
+            self.buffer.append([query_, documents])
+        else:
+            self.buffer.pop(0)
+            self.buffer.append([query_, documents])
+        logger.info(f"retrieved documents:\n{str(documents)}", verbose=self.verbose)
+        return documents