[Feature] Add document retrieval QA (#5020)

* add langchain * add langchain * Add files via upload * add langchain * fix style * fix style: remove extra space * add pytest; modified retriever * add pytest; modified retriever * add tests to build_on_pr.yml * fix build_on_pr.yml * fix build on pr; fix environ vars * separate unit tests for colossalqa from build from pr * fix container setting; fix environ vars * commented dev code * add incremental update * remove stale code * fix style * change to sha3 224 * fix retriever; fix style; add unit test for document loader * fix ci workflow config * fix ci workflow config * add set cuda visible device script in ci * fix doc string * fix style; update readme; refactored * add force log info * change build on pr, ignore colossalqa * fix docstring, capitalize all initial letters * fix indexing; fix text-splitter * remove debug code, update reference * reset previous commit * update LICENSE update README add key-value mode, fix bugs * add files back * revert force push * remove junk file * add test files * fix retriever bug, add intent classification * change conversation chain design * rewrite prompt and conversation chain * add ui v1 * ui v1 * fix avatar * add header * Refactor the RAG Code and support Pangu * Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo. * resolved conversation. tested scripts under examples. web demo still buggy * fix ci tests * Some modifications to add ChatGPT api * modify llm.py and remove unnecessary files * Delete applications/ColossalQA/examples/ui/test_frontend_input.json * Remove OpenAI api key * add colossalqa * move files * move files * move files * move files * fix style * Add Readme and fix some bugs. 
* Add something to readme and modify some code * modify a directory name for clarity * remove redundant directory * Correct a type in llm.py * fix AI prefix * fix test_memory.py * fix conversation * fix some errors and typos * Fix a missing import in RAG_ChatBot.py * add colossalcloud LLM wrapper, correct issues in code review --------- Co-authored-by: YeAnbang <anbangy2@outlook.com> Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu> Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com> Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
2025-09-01 09:07:51 +00:00 · 2023-11-23 10:33:48 +08:00
parent 3acbf6d496
commit e53e729d8e
69 changed files with 6758 additions and 0 deletions
--- a/applications/ColossalQA/tests/__init__.py
+++ b/applications/ColossalQA/tests/__init__.py
--- a/applications/ColossalQA/tests/test_document_loader.py
+++ b/applications/ColossalQA/tests/test_document_loader.py
@@ -0,0 +1,21 @@
+import os
+from colossalqa.data_loader.document_loader import DocumentLoader
+
+
+def test_add_document():
+    """Load the test documents and check their content type and source count.
+
+    The data location is read from the ``TEST_DOCUMENT_LOADER_DATA_PATH``
+    environment variable. Every loaded document must have string
+    ``page_content``, and exactly six distinct ``source`` metadata values are
+    expected across all documents.
+    """
+    data_path = os.environ.get('TEST_DOCUMENT_LOADER_DATA_PATH')
+    # Fail with a clear message rather than an obscure error inside the loader.
+    assert data_path is not None, 'TEST_DOCUMENT_LOADER_DATA_PATH is not set'
+    document_loader = DocumentLoader([[data_path, 'all data']])
+    documents = document_loader.all_data
+    # A dict de-duplicates in O(1) per document while keeping first-seen order
+    # for the printout below.
+    seen_sources = {}
+    for doc in documents:
+        assert isinstance(doc.page_content, str)
+        seen_sources[doc.metadata['source']] = None
+    all_files = list(seen_sources)
+    print(all_files)
+    assert len(all_files) == 6
+
+
+# Allow running this test directly as a script, without pytest.
+if __name__ == '__main__':
+    test_add_document()
+
--- a/applications/ColossalQA/tests/test_memory.py
+++ b/applications/ColossalQA/tests/test_memory.py
@@ -0,0 +1,117 @@
+import os
+
+from colossalqa.data_loader.document_loader import DocumentLoader
+from colossalqa.local.llm import ColossalAPI, ColossalLLM
+from colossalqa.memory import ConversationBufferWithSummary
+from colossalqa.prompt.prompt import PROMPT_RETRIEVAL_QA_ZH
+from colossalqa.retriever import CustomRetriever
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+def test_memory_long():
+    """Check that a long conversation stays within the token budget via summarization.
+
+    Builds a retrieval chain over the English test data, then saves 40
+    identical dialogue turns. On every turn the stored conversation length must
+    not exceed the room left after the dialogue-free part of the prompt, and at
+    least once the loaded chat history must contain a summarization marker.
+
+    Model, data and SQL locations are read from the ``EN_MODEL_PATH``,
+    ``TEST_DATA_PATH_EN``, ``EN_MODEL_NAME`` and ``SQL_FILE_PATH`` environment
+    variables.
+    """
+    model_path = os.environ.get("EN_MODEL_PATH")
+    data_path = os.environ.get("TEST_DATA_PATH_EN")
+    model_name = os.environ.get("EN_MODEL_NAME")
+    sql_file_path = os.environ.get("SQL_FILE_PATH")
+
+    # Single source of truth for the budget given to the memory and used below.
+    max_tokens = 600
+    question = "this is a test input."
+
+    # exist_ok replaces the separate existence check and its check-then-create race.
+    os.makedirs(sql_file_path, exist_ok=True)
+
+    colossal_api = ColossalAPI.get_api(model_name, model_path)
+    llm = ColossalLLM(n=4, api=colossal_api)
+    memory = ConversationBufferWithSummary(
+        llm=llm, max_tokens=max_tokens, llm_kwargs={"max_new_tokens": 50, "temperature": 0.6, "do_sample": True}
+    )
+    retriever_data = DocumentLoader([[data_path, "company information"]]).all_data
+
+    # Split
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
+    splits = text_splitter.split_documents(retriever_data)
+
+    embedding = HuggingFaceEmbeddings(
+        model_name="moka-ai/m3e-base", model_kwargs={"device": "cpu"}, encode_kwargs={"normalize_embeddings": False}
+    )
+
+    # Create retriever
+    information_retriever = CustomRetriever(k=3, sql_file_path=sql_file_path)
+    information_retriever.add_documents(docs=splits, cleanup="incremental", mode="by_source", embedding=embedding)
+
+    memory.initiate_document_retrieval_chain(
+        llm,
+        PROMPT_RETRIEVAL_QA_ZH,
+        information_retriever,
+        chain_type_kwargs={
+            "chat_history": "",
+        },
+    )
+
+    # The question and documents are the same on every turn, so the prompt
+    # length excluding dialogue is measured once, with an empty history.
+    docs = information_retriever.get_relevant_documents(question)
+    prompt_length = memory.chain.prompt_length(docs, question=question, chat_history="")
+    remain = max_tokens - prompt_length
+    have_summarization_flag = False
+    for _ in range(40):
+        chat_history = memory.load_memory_variables({"question": question, "input_documents": docs})[
+            "chat_history"
+        ]
+
+        assert memory.get_conversation_length() <= remain
+        memory.save_context({"question": question}, {"output": "this is a test output."})
+        if "A summarization of historical conversation:" in chat_history:
+            have_summarization_flag = True
+    assert have_summarization_flag
+
+
+def test_memory_short():
+    """Check that a short conversation is kept turn-by-turn in the chat history.
+
+    Builds a retrieval chain over the English test data, then saves four
+    identical dialogue turns. Before the i-th save, the loaded chat history
+    must contain exactly ``i`` human lines and ``i`` assistant lines.
+
+    Model, data and SQL locations are read from the ``EN_MODEL_PATH``,
+    ``TEST_DATA_PATH_EN``, ``EN_MODEL_NAME`` and ``SQL_FILE_PATH`` environment
+    variables.
+    """
+    model_path = os.environ.get("EN_MODEL_PATH")
+    data_path = os.environ.get("TEST_DATA_PATH_EN")
+    model_name = os.environ.get("EN_MODEL_NAME")
+    sql_file_path = os.environ.get("SQL_FILE_PATH")
+
+    question = "this is a test input."
+    answer = "this is a test output."
+
+    # exist_ok replaces the separate existence check and its check-then-create race.
+    os.makedirs(sql_file_path, exist_ok=True)
+
+    colossal_api = ColossalAPI.get_api(model_name, model_path)
+    llm = ColossalLLM(n=4, api=colossal_api)
+    memory = ConversationBufferWithSummary(
+        llm=llm, llm_kwargs={"max_new_tokens": 50, "temperature": 0.6, "do_sample": True}
+    )
+    retriever_data = DocumentLoader([[data_path, "company information"]]).all_data
+
+    # Split
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
+    splits = text_splitter.split_documents(retriever_data)
+
+    embedding = HuggingFaceEmbeddings(
+        model_name="moka-ai/m3e-base", model_kwargs={"device": "cpu"}, encode_kwargs={"normalize_embeddings": False}
+    )
+
+    # Create retriever
+    information_retriever = CustomRetriever(k=3, sql_file_path=sql_file_path)
+    information_retriever.add_documents(docs=splits, cleanup="incremental", mode="by_source", embedding=embedding)
+
+    memory.initiate_document_retrieval_chain(
+        llm,
+        PROMPT_RETRIEVAL_QA_ZH,
+        information_retriever,
+        chain_type_kwargs={
+            "chat_history": "",
+        },
+    )
+
+    # NOTE(review): test_memory_long calls this without return_scores; confirm
+    # that the value returned with return_scores=True is what
+    # load_memory_variables expects as "input_documents".
+    docs = information_retriever.get_relevant_documents(question, return_scores=True)
+
+    for i in range(4):
+        chat_history = memory.load_memory_variables({"question": question, "input_documents": docs})[
+            "chat_history"
+        ]
+        # History is loaded before saving, so it holds exactly the i earlier turns.
+        assert chat_history.count("Assistant: this is a test output.") == i
+        assert chat_history.count("Human: this is a test input.") == i
+        memory.save_context({"question": question}, {"output": answer})
+
+
+# Allow running these tests directly as a script, without pytest.
+if __name__ == "__main__":
+    test_memory_short()
+    test_memory_long()
--- a/applications/ColossalQA/tests/test_retrieval_qa.py
+++ b/applications/ColossalQA/tests/test_retrieval_qa.py
@@ -0,0 +1,62 @@
+import os
+
+from colossalqa.retrieval_conversation_universal import UniversalRetrievalConversation
+
+
+def test_en_retrievalQA():
+    """Smoke-test an English query through ``UniversalRetrievalConversation``.
+
+    All data paths, model paths and model names come from environment
+    variables. The answer is only printed; nothing about it is asserted.
+    """
+    env = os.environ
+    english_files = [{
+        'data_path': env.get('TEST_DATA_PATH_EN'),
+        'name': 'company information',
+        'separator': '\n',
+    }]
+    chinese_files = [{
+        'data_path': env.get('TEST_DATA_PATH_ZH'),
+        'name': 'company information',
+        'separator': '\n',
+    }]
+    qa_session = UniversalRetrievalConversation(
+        files_en=english_files,
+        files_zh=chinese_files,
+        zh_model_path=env.get('ZH_MODEL_PATH'),
+        en_model_path=env.get('EN_MODEL_PATH'),
+        zh_model_name=env.get('ZH_MODEL_NAME'),
+        en_model_name=env.get('EN_MODEL_NAME'),
+        sql_file_path=env.get('SQL_FILE_PATH'),
+    )
+    ans = qa_session.run("which company runs business in hotel industry?", which_language='en')
+    print(ans)
+
+
+def test_zh_retrievalQA():
+    """Smoke-test a Chinese query through ``UniversalRetrievalConversation``.
+
+    All data paths, model paths and model names come from environment
+    variables. The answer is only printed; nothing about it is asserted.
+    """
+    env = os.environ
+    english_files = [{
+        'data_path': env.get('TEST_DATA_PATH_EN'),
+        'name': 'company information',
+        'separator': '\n',
+    }]
+    chinese_files = [{
+        'data_path': env.get('TEST_DATA_PATH_ZH'),
+        'name': 'company information',
+        'separator': '\n',
+    }]
+    qa_session = UniversalRetrievalConversation(
+        files_en=english_files,
+        files_zh=chinese_files,
+        zh_model_path=env.get('ZH_MODEL_PATH'),
+        en_model_path=env.get('EN_MODEL_PATH'),
+        zh_model_name=env.get('ZH_MODEL_NAME'),
+        en_model_name=env.get('EN_MODEL_NAME'),
+        sql_file_path=env.get('SQL_FILE_PATH'),
+    )
+    ans = qa_session.run("哪家公司在经营酒店业务？", which_language='zh')
+    print(ans)
+
+
+# Allow running these tests directly as a script, without pytest.
+if __name__ == "__main__":
+    test_en_retrievalQA()
+    test_zh_retrievalQA()
--- a/applications/ColossalQA/tests/test_text_splitter.py
+++ b/applications/ColossalQA/tests/test_text_splitter.py
@@ -0,0 +1,11 @@
+from colossalqa.text_splitter.chinese_text_splitter import ChineseTextSplitter
+
+
+def test_text_splitter():
+    """Split a fixed Chinese paragraph and check the number of chunks produced."""
+    sample_text = "移动端语音唤醒模型，检测关键词为“小云小云”。模型主体为4层FSMN结构，使用CTC训练准则，参数量750K，适用于移动端设备运行。模型输入为Fbank特征，输出为基于char建模的中文全集token预测，测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式，basetrain过程使用大量内部移动端数据，在此基础上，使用1万条设备端录制安静场景“小云小云”数据进行微调，得到最终面向业务的模型。后续用户可在basetrain模型基础上，使用其他关键词数据进行微调，得到新的语音唤醒模型，但暂时未开放模型finetune功能。"
+    splitter = ChineseTextSplitter(chunk_size=30, chunk_overlap=0)
+    chunks = splitter.split_text(sample_text)
+    chunk_count = len(chunks)
+    print(chunk_count)
+    # Per the original note, ChineseTextSplitter does not break sentences, so
+    # the actual chunk size is not 30; four chunks are expected for this text.
+    assert chunk_count == 4