diff --git a/docs/getting_started/getting_started.md b/docs/getting_started/getting_started.md index 672f0f74e..946448853 100644 --- a/docs/getting_started/getting_started.md +++ b/docs/getting_started/getting_started.md @@ -25,22 +25,25 @@ $ docker run --name=mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=aa12345678 -dit my We use [Chroma embedding database](https://github.com/chroma-core/chroma) as the default for our vector database, so there is no need for special installation. If you choose to connect to other databases, you can follow our tutorial for installation and configuration. For the entire installation process of DB-GPT, we use the miniconda3 virtual environment. Create a virtual environment and install the Python dependencies. -```{tip} +```bash python>=3.10 conda create -n dbgpt_env python=3.10 conda activate dbgpt_env pip install -r requirements.txt ``` Before use DB-GPT Knowledge Management -```{tip} +```bash python -m spacy download zh_core_web_sm ``` Once the environment is installed, we have to create a new folder "models" in the DB-GPT project, and then we can put all the models downloaded from huggingface in this directory -Notice make sure you have install git-lfs ```{tip} +Notice: make sure you have installed git-lfs +``` + +```bash git clone https://huggingface.co/Tribbiani/vicuna-13b git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md index 2919cb667..786889ba4 100644 --- a/docs/getting_started/installation.md +++ b/docs/getting_started/installation.md @@ -4,11 +4,13 @@ DB-GPT provides a third-party Python API package that you can integrate into you ### Installation from Pip You can simply pip install: -```{tip} +```bash pip install -i https://pypi.org/ db-gpt==0.3.0 ``` -Notice:make sure python>=3.10 +```{tip} +Notice: make sure python>=3.10 +``` ### Environment Setup @@ 
-16,8 +18,11 @@ By default, if you use the EmbeddingEngine api you will prepare embedding models from huggingface -Notice make sure you have install git-lfs ```{tip} +Notice: make sure you have installed git-lfs +``` + +```bash git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese diff --git a/docs/modules/knowledge.rst b/docs/modules/knowledge.rst index 47d4a628f..05d716749 100644 --- a/docs/modules/knowledge.rst +++ b/docs/modules/knowledge.rst @@ -4,13 +4,13 @@ Knowledge | As the knowledge base is currently the most significant user demand scenario, we natively support the construction and processing of knowledge bases. At the same time, we also provide multiple knowledge base management strategies in this project, such as pdf knowledge,md knowledge, txt knowledge, word knowledge, ppt knowledge: We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url. - +In the future, we will continue to support more types of knowledge, including audio, video, various databases, and big data sources. Of course, we look forward to your active participation in contributing code. **Create your own knowledge repository** 1.prepare -We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url. +We currently support many document formats: TEXT (raw text), DOCUMENT (.txt, .pdf, .md, .doc, .ppt, .html), and URL. 
before execution: @@ -72,12 +72,13 @@ eg: git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 vector_store_config=vector_store_config) embedding_engine.knowledge_embedding() -If you want to add your text_splitter, do this: +If you want to add your source_reader or text_splitter, do this: :: url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html" + source_reader = WebBaseLoader(web_path=url) text_splitter = RecursiveCharacterTextSplitter( chunk_size=100, chunk_overlap=50 ) @@ -86,6 +87,7 @@ If you want to add your text_splitter, do this: knowledge_type=KnowledgeType.URL.value, model_name=embedding_model, vector_store_config=vector_store_config, + source_reader=source_reader, text_splitter=text_splitter ) diff --git a/pilot/embedding_engine/csv_embedding.py b/pilot/embedding_engine/csv_embedding.py index 9ba28459b..af092274f 100644 --- a/pilot/embedding_engine/csv_embedding.py +++ b/pilot/embedding_engine/csv_embedding.py @@ -2,7 +2,7 @@ from typing import Dict, List, Optional from langchain.document_loaders import CSVLoader from langchain.schema import Document -from langchain.text_splitter import TextSplitter +from langchain.text_splitter import TextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter from pilot.embedding_engine import SourceEmbedding, register @@ -14,19 +14,34 @@ class CSVEmbedding(SourceEmbedding): self, file_path, vector_store_config, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize with csv path.""" - super().__init__(file_path, vector_store_config, text_splitter=None) + super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None) self.file_path = file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None @register def read(self): """Load from csv path.""" - loader = CSVLoader(file_path=self.file_path) - return 
loader.load() + if self.source_reader is None: + self.source_reader = CSVLoader(self.file_path) + if self.text_splitter is None: + try: + self.text_splitter = SpacyTextSplitter( + pipeline="zh_core_web_sm", + chunk_size=100, + chunk_overlap=100, + ) + except Exception: + self.text_splitter = RecursiveCharacterTextSplitter( + chunk_size=100, chunk_overlap=50 + ) + + return self.source_reader.load_and_split(self.text_splitter) @register def data_process(self, documents: List[Document]): diff --git a/pilot/embedding_engine/embedding_engine.py b/pilot/embedding_engine/embedding_engine.py index d52dbab76..f3b09ae5d 100644 --- a/pilot/embedding_engine/embedding_engine.py +++ b/pilot/embedding_engine/embedding_engine.py @@ -22,6 +22,7 @@ class EmbeddingEngine: vector_store_config, knowledge_type: Optional[str] = KnowledgeType.DOCUMENT.value, knowledge_source: Optional[str] = None, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source""" @@ -31,6 +32,7 @@ class EmbeddingEngine: self.knowledge_type = knowledge_type self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name) self.vector_store_config["embeddings"] = self.embeddings + self.source_reader = source_reader self.text_splitter = text_splitter def knowledge_embedding(self): @@ -53,6 +55,7 @@ class EmbeddingEngine: self.knowledge_type, self.knowledge_source, self.vector_store_config, + self.source_reader, self.text_splitter, ) diff --git a/pilot/embedding_engine/knowledge_type.py b/pilot/embedding_engine/knowledge_type.py index 4fe00ca62..77fb98666 100644 --- a/pilot/embedding_engine/knowledge_type.py +++ b/pilot/embedding_engine/knowledge_type.py @@ -41,7 +41,7 @@ class KnowledgeType(Enum): def get_knowledge_embedding( - knowledge_type, knowledge_source, vector_store_config, text_splitter + knowledge_type, knowledge_source, vector_store_config, source_reader, 
text_splitter ): match knowledge_type: case KnowledgeType.DOCUMENT.value: @@ -51,6 +51,7 @@ def get_knowledge_embedding( embedding = knowledge_class( knowledge_source, vector_store_config=vector_store_config, + source_reader=source_reader, text_splitter=text_splitter, **knowledge_args, ) @@ -60,6 +61,7 @@ def get_knowledge_embedding( embedding = URLEmbedding( file_path=knowledge_source, vector_store_config=vector_store_config, + source_reader=source_reader, text_splitter=text_splitter, ) return embedding @@ -67,6 +69,7 @@ def get_knowledge_embedding( embedding = StringEmbedding( file_path=knowledge_source, vector_store_config=vector_store_config, + source_reader=source_reader, text_splitter=text_splitter, ) return embedding diff --git a/pilot/embedding_engine/markdown_embedding.py b/pilot/embedding_engine/markdown_embedding.py index fa2ddc914..b57037257 100644 --- a/pilot/embedding_engine/markdown_embedding.py +++ b/pilot/embedding_engine/markdown_embedding.py @@ -24,19 +24,21 @@ class MarkdownEmbedding(SourceEmbedding): self, file_path, vector_store_config, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize raw text word path.""" - super().__init__(file_path, vector_store_config, text_splitter=None) + super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None) self.file_path = file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None - # self.encoding = encoding @register def read(self): """Load from markdown path.""" - loader = EncodeTextLoader(self.file_path) + if self.source_reader is None: + self.source_reader = EncodeTextLoader(self.file_path) if self.text_splitter is None: try: self.text_splitter = SpacyTextSplitter( @@ -49,7 +51,7 @@ class MarkdownEmbedding(SourceEmbedding): chunk_size=100, chunk_overlap=50 ) - return loader.load_and_split(self.text_splitter) + return 
self.source_reader.load_and_split(self.text_splitter) @register def data_process(self, documents: List[Document]): diff --git a/pilot/embedding_engine/pdf_embedding.py b/pilot/embedding_engine/pdf_embedding.py index cbe68da1b..a85b2985b 100644 --- a/pilot/embedding_engine/pdf_embedding.py +++ b/pilot/embedding_engine/pdf_embedding.py @@ -20,18 +20,21 @@ class PDFEmbedding(SourceEmbedding): self, file_path, vector_store_config, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize pdf word path.""" - super().__init__(file_path, vector_store_config, text_splitter=None) + super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None) self.file_path = file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None @register def read(self): """Load from pdf path.""" - loader = PyPDFLoader(self.file_path) + if self.source_reader is None: + self.source_reader = PyPDFLoader(self.file_path) if self.text_splitter is None: try: self.text_splitter = SpacyTextSplitter( @@ -44,7 +47,7 @@ class PDFEmbedding(SourceEmbedding): chunk_size=100, chunk_overlap=50 ) - return loader.load_and_split(self.text_splitter) + return self.source_reader.load_and_split(self.text_splitter) @register def data_process(self, documents: List[Document]): diff --git a/pilot/embedding_engine/ppt_embedding.py b/pilot/embedding_engine/ppt_embedding.py index 59de18392..80dc366ce 100644 --- a/pilot/embedding_engine/ppt_embedding.py +++ b/pilot/embedding_engine/ppt_embedding.py @@ -20,18 +20,21 @@ class PPTEmbedding(SourceEmbedding): self, file_path, vector_store_config, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize ppt word path.""" - super().__init__(file_path, vector_store_config, text_splitter=None) + super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None) self.file_path = 
file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None @register def read(self): """Load from ppt path.""" - loader = UnstructuredPowerPointLoader(self.file_path) + if self.source_reader is None: + self.source_reader = UnstructuredPowerPointLoader(self.file_path) if self.text_splitter is None: try: self.text_splitter = SpacyTextSplitter( @@ -44,7 +47,7 @@ class PPTEmbedding(SourceEmbedding): chunk_size=100, chunk_overlap=50 ) - return loader.load_and_split(self.text_splitter) + return self.source_reader.load_and_split(self.text_splitter) @register def data_process(self, documents: List[Document]): diff --git a/pilot/embedding_engine/source_embedding.py b/pilot/embedding_engine/source_embedding.py index c1ceabed1..36df75b19 100644 --- a/pilot/embedding_engine/source_embedding.py +++ b/pilot/embedding_engine/source_embedding.py @@ -26,12 +26,14 @@ class SourceEmbedding(ABC): self, file_path, vector_store_config: {}, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, embedding_args: Optional[Dict] = None, ): """Initialize with Loader url, model_name, vector_store_config""" self.file_path = file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None self.embedding_args = embedding_args self.embeddings = vector_store_config["embeddings"] diff --git a/pilot/embedding_engine/string_embedding.py b/pilot/embedding_engine/string_embedding.py index 6a7b0c959..2fdaf1414 100644 --- a/pilot/embedding_engine/string_embedding.py +++ b/pilot/embedding_engine/string_embedding.py @@ -1,7 +1,7 @@ from typing import List, Optional from langchain.schema import Document -from langchain.text_splitter import TextSplitter +from langchain.text_splitter import TextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter from pilot.embedding_engine import SourceEmbedding, register @@ -13,19 
+13,35 @@ class StringEmbedding(SourceEmbedding): self, file_path, vector_store_config, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize raw text word path.""" - super().__init__(file_path=file_path, vector_store_config=vector_store_config) + super().__init__(file_path=file_path, vector_store_config=vector_store_config, source_reader=None, text_splitter=None) self.file_path = file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None @register def read(self): """Load from String path.""" metadata = {"source": "raw text"} - return [Document(page_content=self.file_path, metadata=metadata)] + docs = [Document(page_content=self.file_path, metadata=metadata)] + if self.text_splitter is None: + try: + self.text_splitter = SpacyTextSplitter( + pipeline="zh_core_web_sm", + chunk_size=100, + chunk_overlap=100, + ) + except Exception: + self.text_splitter = RecursiveCharacterTextSplitter( + chunk_size=100, chunk_overlap=50 + ) + + return self.text_splitter.split_documents(docs) + @register def data_process(self, documents: List[Document]): diff --git a/pilot/embedding_engine/url_embedding.py b/pilot/embedding_engine/url_embedding.py index e748d2d59..4413fc104 100644 --- a/pilot/embedding_engine/url_embedding.py +++ b/pilot/embedding_engine/url_embedding.py @@ -19,18 +19,22 @@ class URLEmbedding(SourceEmbedding): self, file_path, vector_store_config, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize url word path.""" - super().__init__(file_path, vector_store_config, text_splitter=None) + super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None) self.file_path = file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None + @register def read(self): """Load from url path.""" - loader = 
WebBaseLoader(web_path=self.file_path) + if self.source_reader is None: + self.source_reader = WebBaseLoader(web_path=self.file_path) if self.text_splitter is None: try: self.text_splitter = SpacyTextSplitter( @@ -43,7 +47,7 @@ class URLEmbedding(SourceEmbedding): chunk_size=100, chunk_overlap=50 ) - return loader.load_and_split(self.text_splitter) + return self.source_reader.load_and_split(self.text_splitter) @register def data_process(self, documents: List[Document]): diff --git a/pilot/embedding_engine/word_embedding.py b/pilot/embedding_engine/word_embedding.py index 98bebec3a..91692b338 100644 --- a/pilot/embedding_engine/word_embedding.py +++ b/pilot/embedding_engine/word_embedding.py @@ -20,18 +20,21 @@ class WordEmbedding(SourceEmbedding): self, file_path, vector_store_config, + source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): """Initialize with word path.""" - super().__init__(file_path, vector_store_config, text_splitter=None) + super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None) self.file_path = file_path self.vector_store_config = vector_store_config + self.source_reader = source_reader or None self.text_splitter = text_splitter or None @register def read(self): """Load from word path.""" - loader = UnstructuredWordDocumentLoader(self.file_path) + if self.source_reader is None: + self.source_reader = UnstructuredWordDocumentLoader(self.file_path) if self.text_splitter is None: try: self.text_splitter = SpacyTextSplitter( @@ -44,7 +47,7 @@ class WordEmbedding(SourceEmbedding): chunk_size=100, chunk_overlap=50 ) - return loader.load_and_split(self.text_splitter) + return self.source_reader.load_and_split(self.text_splitter) @register def data_process(self, documents: List[Document]): diff --git a/pilot/server/dbgpt_server.py b/pilot/server/dbgpt_server.py index 5a84dcd14..def585069 100644 --- a/pilot/server/dbgpt_server.py +++ b/pilot/server/dbgpt_server.py @@ -1,3 +1,4 @@ +import atexit 
import traceback import os import shutil @@ -36,7 +37,7 @@ CFG = Config() logger = build_logger("webserver", LOGDIR + "webserver.log") -def signal_handler(sig, frame): +def signal_handler(): print("in order to avoid chroma db atexit problem") os._exit(0) @@ -96,7 +97,6 @@ if __name__ == "__main__": action="store_true", help="enable light mode", ) - signal.signal(signal.SIGINT, signal_handler) # init server config args = parser.parse_args() @@ -114,3 +114,4 @@ if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=args.port) + signal.signal(signal.SIGINT, signal_handler()) diff --git a/pilot/summary/db_summary_client.py b/pilot/summary/db_summary_client.py index 710c2101b..825999907 100644 --- a/pilot/summary/db_summary_client.py +++ b/pilot/summary/db_summary_client.py @@ -124,7 +124,6 @@ class DBSummaryClient: "chroma_persist_path": KNOWLEDGE_UPLOAD_ROOT_PATH, } knowledge_embedding_client = EmbeddingEngine( - file_path="", model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL], vector_store_config=vector_store_config, )