[Feature] Add document retrieval QA (#5020)

* add langchain * add langchain * Add files via upload * add langchain * fix style * fix style: remove extra space * add pytest; modified retriever * add pytest; modified retriever * add tests to build_on_pr.yml * fix build_on_pr.yml * fix build on pr; fix environ vars * separate unit tests for colossalqa from build from pr * fix container setting; fix environ vars * commented dev code * add incremental update * remove stale code * fix style * change to sha3 224 * fix retriever; fix style; add unit test for document loader * fix ci workflow config * fix ci workflow config * add set cuda visible device script in ci * fix doc string * fix style; update readme; refactored * add force log info * change build on pr, ignore colossalqa * fix docstring, capitalize all initial letters * fix indexing; fix text-splitter * remove debug code, update reference * reset previous commit * update LICENSE update README add key-value mode, fix bugs * add files back * revert force push * remove junk file * add test files * fix retriever bug, add intent classification * change conversation chain design * rewrite prompt and conversation chain * add ui v1 * ui v1 * fix avatar * add header * Refactor the RAG Code and support Pangu * Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo. * resolved conversation. tested scripts under examples. web demo still buggy * fix ci tests * Some modifications to add ChatGPT api * modify llm.py and remove unnecessary files * Delete applications/ColossalQA/examples/ui/test_frontend_input.json * Remove OpenAI api key * add colossalqa * move files * move files * move files * move files * fix style * Add Readme and fix some bugs. 
* Add something to readme and modify some code * modify a directory name for clarity * remove redundant directory * Correct a type in llm.py * fix AI prefix * fix test_memory.py * fix conversation * fix some errors and typos * Fix a missing import in RAG_ChatBot.py * add colossalcloud LLM wrapper, correct issues in code review --------- Co-authored-by: YeAnbang <anbangy2@outlook.com> Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu> Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com> Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
2025-09-02 17:46:42 +00:00 · 2023-11-23 10:33:48 +08:00
parent 3acbf6d496
commit e53e729d8e
69 changed files with 6758 additions and 0 deletions
--- a/applications/ColossalQA/colossalqa/chain/retrieval_qa/load_chain.py
+++ b/applications/ColossalQA/colossalqa/chain/retrieval_qa/load_chain.py
@@ -0,0 +1,87 @@
+"""
+Load question answering chains.
+For now, only the stuffed chain is modified
+
+Modified from Original Source
+
+This code is based on LangChain AI's langchain, which can be found at
+https://github.com/langchain-ai/langchain
+The original code is licensed under the MIT license.
+"""
+import copy
+from typing import Any, Mapping, Optional, Protocol
+
+from colossalqa.chain.retrieval_qa.stuff import CustomStuffDocumentsChain
+from langchain.callbacks.base import BaseCallbackManager
+from langchain.callbacks.manager import Callbacks
+from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
+from langchain.chains.llm import LLMChain
+from langchain.chains.question_answering import stuff_prompt
+from langchain.schema.language_model import BaseLanguageModel
+from langchain.schema.prompt_template import BasePromptTemplate
+
+
+class LoadingCallable(Protocol):
+    """Interface for loading the combine documents chain.
+
+    Structural type for chain-loader functions: any callable that accepts a
+    language model plus arbitrary keyword arguments and returns a
+    ``BaseCombineDocumentsChain`` satisfies it.
+    """
+
+    def __call__(self, llm: BaseLanguageModel, **kwargs: Any) -> BaseCombineDocumentsChain:
+        """Callable to load the combine documents chain.
+
+        Args:
+            llm: Language model the returned chain should use.
+            **kwargs: Loader-specific keyword options.
+
+        Returns:
+            The constructed combine-documents chain.
+        """
+
+
+def _load_stuff_chain(
+    llm: BaseLanguageModel,
+    prompt: Optional[BasePromptTemplate] = None,
+    document_variable_name: str = "context",
+    verbose: Optional[bool] = None,
+    callback_manager: Optional[BaseCallbackManager] = None,
+    callbacks: Callbacks = None,
+    **kwargs: Any,
+) -> CustomStuffDocumentsChain:
+    _prompt = prompt or stuff_prompt.PROMPT_SELECTOR.get_prompt(llm)
+    if "llm_kwargs" in kwargs:
+        llm_kwargs = copy.deepcopy(kwargs["llm_kwargs"])
+        del kwargs["llm_kwargs"]
+    else:
+        llm_kwargs = {}
+    llm_chain = LLMChain(
+        llm=llm,
+        prompt=_prompt,
+        verbose=verbose,
+        callback_manager=callback_manager,
+        callbacks=callbacks,
+        llm_kwargs=llm_kwargs,
+    )
+    return CustomStuffDocumentsChain(
+        llm_chain=llm_chain,
+        document_variable_name=document_variable_name,
+        verbose=verbose,
+        callback_manager=callback_manager,
+        callbacks=callbacks,
+        **kwargs,
+    )
+
+
+def load_qa_chain(
+    llm: BaseLanguageModel,
+    chain_type: str = "stuff",
+    verbose: Optional[bool] = None,
+    callback_manager: Optional[BaseCallbackManager] = None,
+    **kwargs: Any,
+) -> BaseCombineDocumentsChain:
+    """Load question answering chain.
+
+    Args:
+        llm: Language Model to use in the chain.
+        chain_type: Type of document combining chain to use. Should be one of "stuff",
+            "map_reduce", "map_rerank", and "refine".
+        verbose: Whether chains should be run in verbose mode or not. Note that this
+            applies to all chains that make up the final chain.
+        callback_manager: Callback manager to use for the chain.
+
+    Returns:
+        A chain to use for question answering.
+    """
+    loader_mapping: Mapping[str, LoadingCallable] = {"stuff": _load_stuff_chain}
+    if chain_type not in loader_mapping:
+        raise ValueError(f"Got unsupported chain type: {chain_type}. " f"Should be one of {loader_mapping.keys()}")
+    return loader_mapping[chain_type](llm, verbose=verbose, callback_manager=callback_manager, **kwargs)