feat: embedding api

1. embedding_engine: add source_reader param
2. update docs
3. fix chroma exit bug
aries_ckt 2023-07-13 15:45:25 +08:00
parent 56c1947eda
commit 6404bfe63a
15 changed files with 100 additions and 36 deletions

View File

@@ -25,22 +25,25 @@ $ docker run --name=mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=aa12345678 -dit mysql:latest
 We use [Chroma embedding database](https://github.com/chroma-core/chroma) as the default for our vector database, so there is no need for special installation. If you choose to connect to other databases, you can follow our tutorial for installation and configuration.
 For the entire installation process of DB-GPT, we use the miniconda3 virtual environment. Create a virtual environment and install the Python dependencies.
-```{tip}
+```bash
 python>=3.10
 conda create -n dbgpt_env python=3.10
 conda activate dbgpt_env
 pip install -r requirements.txt
 ```
 Before using DB-GPT Knowledge Management:
-```{tip}
+```bash
 python -m spacy download zh_core_web_sm
 ```
 Once the environment is installed, create a new folder named "models" in the DB-GPT project, then put all the models downloaded from huggingface into this directory.
-Notice make sure you have install git-lfs
+```{tip}
+Notice: make sure you have installed git-lfs
+```
 ```bash
 git clone https://huggingface.co/Tribbiani/vicuna-13b
 git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
 git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese

View File

@@ -4,11 +4,13 @@ DB-GPT provides a third-party Python API package that you can integrate into you
 ### Installation from Pip
 You can simply pip install:
-```{tip}
+```bash
 pip install -i https://pypi.org/simple db-gpt==0.3.0
 ```
-Notice:make sure python>=3.10
+```{tip}
+Notice: make sure python>=3.10
+```
 ### Environment Setup
@@ -16,8 +18,11 @@ By default, if you use the EmbeddingEngine api
 you will need to prepare embedding models from huggingface
-Notice make sure you have install git-lfs
+```{tip}
+Notice: make sure you have installed git-lfs
+```
 ```bash
 git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
 git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese

View File

@@ -4,13 +4,13 @@ Knowledge
 | As the knowledge base is currently the most significant user demand scenario, we natively support the construction and processing of knowledge bases. At the same time, we also provide multiple knowledge base management strategies in this project, such as pdf knowledge, md knowledge, txt knowledge, word knowledge, ppt knowledge:
 We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
 In the future, we will continue to support more types of knowledge, including audio, video, various databases, and big data sources. Of course, we look forward to your active participation in contributing code.
 **Create your own knowledge repository**
 1. Prepare
-We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
+We currently support many document formats: TEXT(raw text), DOCUMENT(.txt, .pdf, .md, .doc, .ppt, .html), and URL.
 Before execution:
@@ -72,12 +72,13 @@ eg: git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
         vector_store_config=vector_store_config)
     embedding_engine.knowledge_embedding()
-If you want to add your text_splitter, do this:
+If you want to add your source_reader or text_splitter, do this:
 ::
     url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
+    source_reader = WebBaseLoader(web_path=url)
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=100, chunk_overlap=50
     )
@@ -86,6 +87,7 @@ If you want to add your text_splitter, do this:
     knowledge_type=KnowledgeType.URL.value,
     model_name=embedding_model,
    vector_store_config=vector_store_config,
+    source_reader=source_reader,
     text_splitter=text_splitter
 )
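Pieced together, the full flow with the new parameter looks roughly like this. This is a minimal sketch, not lifted from the repo: the `pilot` import paths, the `vector_store_config` keys, and the model path are assumptions based on the hunks in this commit.

```python
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# assumed import paths; the commit only shows `from pilot.embedding_engine import ...`
from pilot.embedding_engine.embedding_engine import EmbeddingEngine
from pilot.embedding_engine.knowledge_type import KnowledgeType

url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
embedding_model = "models/text2vec-large-chinese"  # local clone from huggingface
vector_store_config = {
    "vector_store_name": "url_demo",      # assumed key
    "vector_store_type": "Chroma",        # assumed key; Chroma is the documented default
    "chroma_persist_path": "pilot/data",  # key seen in the DBSummaryClient hunk below
}

embedding_engine = EmbeddingEngine(
    model_name=embedding_model,
    vector_store_config=vector_store_config,
    knowledge_type=KnowledgeType.URL.value,
    knowledge_source=url,
    # both are optional; omit them to get the defaults chosen in each read()
    source_reader=WebBaseLoader(web_path=url),
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50),
)
embedding_engine.knowledge_embedding()  # read -> split -> embed -> persist
```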

View File

@@ -2,7 +2,7 @@ from typing import Dict, List, Optional
 from langchain.document_loaders import CSVLoader
 from langchain.schema import Document
-from langchain.text_splitter import TextSplitter
+from langchain.text_splitter import TextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
 from pilot.embedding_engine import SourceEmbedding, register
@@ -14,19 +14,34 @@ class CSVEmbedding(SourceEmbedding):
         self,
         file_path,
         vector_store_config,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize with csv path."""
-        super().__init__(file_path, vector_store_config, text_splitter=None)
+        super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None

     @register
     def read(self):
         """Load from csv path."""
-        loader = CSVLoader(file_path=self.file_path)
-        return loader.load()
+        if self.source_reader is None:
+            self.source_reader = CSVLoader(self.file_path)
+        if self.text_splitter is None:
+            try:
+                self.text_splitter = SpacyTextSplitter(
+                    pipeline="zh_core_web_sm",
+                    chunk_size=100,
+                    chunk_overlap=100,
+                )
+            except Exception:
+                self.text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=100, chunk_overlap=50
+                )
+        return self.source_reader.load_and_split(self.text_splitter)

     @register
     def data_process(self, documents: List[Document]):
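A caller can now swap in a configured reader instead of the default CSVLoader. A sketch; the module path, the vector_store_config keys, and the file names are assumptions, and note that constructing a SourceEmbedding directly requires the "embeddings" entry that EmbeddingEngine normally injects (see the SourceEmbedding hunk below):

```python
from langchain.document_loaders import CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings

from pilot.embedding_engine.csv_embedding import CSVEmbedding  # assumed path

vector_store_config = {
    "vector_store_name": "csv_demo",  # assumed key
    "embeddings": HuggingFaceEmbeddings(model_name="models/text2vec-large-chinese"),
}

embedding = CSVEmbedding(
    file_path="data/prices.csv",  # hypothetical file
    vector_store_config=vector_store_config,
    # the injected reader wins over the default CSVLoader(self.file_path)
    source_reader=CSVLoader("data/prices.csv", csv_args={"delimiter": ";"}),
)
docs = embedding.read()  # load_and_split via the injected reader
```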

View File

@@ -22,6 +22,7 @@ class EmbeddingEngine:
         vector_store_config,
         knowledge_type: Optional[str] = KnowledgeType.DOCUMENT.value,
         knowledge_source: Optional[str] = None,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source"""
@@ -31,6 +32,7 @@
         self.knowledge_type = knowledge_type
         self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
         self.vector_store_config["embeddings"] = self.embeddings
+        self.source_reader = source_reader
         self.text_splitter = text_splitter

     def knowledge_embedding(self):
@@ -53,6 +55,7 @@
             self.knowledge_type,
             self.knowledge_source,
             self.vector_store_config,
+            self.source_reader,
             self.text_splitter,
         )
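Since source_reader, like text_splitter, defaults to None, call sites that predate this commit keep working unchanged; a sketch, reusing the assumed names from the example above:

```python
# defaults: each SourceEmbedding subclass picks its own loader and splitter
engine = EmbeddingEngine(
    model_name=embedding_model,
    vector_store_config=vector_store_config,
    knowledge_type=KnowledgeType.DOCUMENT.value,
    knowledge_source="docs/modules/knowledge.md",  # hypothetical path
)
engine.knowledge_embedding()
```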

View File

@@ -41,7 +41,7 @@ class KnowledgeType(Enum):
 def get_knowledge_embedding(
-    knowledge_type, knowledge_source, vector_store_config, text_splitter
+    knowledge_type, knowledge_source, vector_store_config, source_reader, text_splitter
 ):
     match knowledge_type:
         case KnowledgeType.DOCUMENT.value:
@@ -51,6 +51,7 @@ def get_knowledge_embedding(
             embedding = knowledge_class(
                 knowledge_source,
                 vector_store_config=vector_store_config,
+                source_reader=source_reader,
                 text_splitter=text_splitter,
                 **knowledge_args,
             )
@@ -60,6 +61,7 @@ def get_knowledge_embedding(
             embedding = URLEmbedding(
                 file_path=knowledge_source,
                 vector_store_config=vector_store_config,
+                source_reader=source_reader,
                 text_splitter=text_splitter,
             )
             return embedding
@@ -67,6 +69,7 @@ def get_knowledge_embedding(
             embedding = StringEmbedding(
                 file_path=knowledge_source,
                 vector_store_config=vector_store_config,
+                source_reader=source_reader,
                 text_splitter=text_splitter,
             )
             return embedding
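The factory can also be exercised directly; a sketch, with the import path and enum values assumed from the hunks above:

```python
from pilot.embedding_engine.knowledge_type import KnowledgeType, get_knowledge_embedding  # assumed path

embedding = get_knowledge_embedding(
    KnowledgeType.URL.value,                      # dispatches to URLEmbedding via the match above
    "https://db-gpt.readthedocs.io/en/latest/",   # knowledge_source
    vector_store_config,                          # must already contain the "embeddings" entry
    None,                                         # source_reader=None -> WebBaseLoader default
    None,                                         # text_splitter=None -> Spacy, else Recursive fallback
)
docs = embedding.read()
```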

View File

@@ -24,19 +24,21 @@ class MarkdownEmbedding(SourceEmbedding):
         self,
         file_path,
         vector_store_config,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize raw text word path."""
-        super().__init__(file_path, vector_store_config, text_splitter=None)
+        super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None
         # self.encoding = encoding

     @register
     def read(self):
         """Load from markdown path."""
-        loader = EncodeTextLoader(self.file_path)
+        if self.source_reader is None:
+            self.source_reader = EncodeTextLoader(self.file_path)
         if self.text_splitter is None:
             try:
                 self.text_splitter = SpacyTextSplitter(
@@ -49,7 +51,7 @@ class MarkdownEmbedding(SourceEmbedding):
                     chunk_size=100, chunk_overlap=50
                 )
-        return loader.load_and_split(self.text_splitter)
+        return self.source_reader.load_and_split(self.text_splitter)

     @register
     def data_process(self, documents: List[Document]):
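The try/except that recurs in every read() above is a constructor-time fallback: SpacyTextSplitter loads its pipeline when instantiated, so a missing zh_core_web_sm model raises immediately and the recursive splitter takes over. Isolated as a sketch:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter, SpacyTextSplitter, TextSplitter

def default_text_splitter() -> TextSplitter:
    """Mirror of the fallback used by the read() methods in this commit."""
    try:
        # raises at construction if the zh_core_web_sm spaCy model is absent
        return SpacyTextSplitter(pipeline="zh_core_web_sm", chunk_size=100, chunk_overlap=100)
    except Exception:
        return RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50)
```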

View File

@@ -20,18 +20,21 @@ class PDFEmbedding(SourceEmbedding):
         self,
         file_path,
         vector_store_config,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize pdf word path."""
-        super().__init__(file_path, vector_store_config, text_splitter=None)
+        super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None

     @register
     def read(self):
         """Load from pdf path."""
-        loader = PyPDFLoader(self.file_path)
+        if self.source_reader is None:
+            self.source_reader = PyPDFLoader(self.file_path)
         if self.text_splitter is None:
             try:
                 self.text_splitter = SpacyTextSplitter(
@@ -44,7 +47,7 @@ class PDFEmbedding(SourceEmbedding):
                     chunk_size=100, chunk_overlap=50
                 )
-        return loader.load_and_split(self.text_splitter)
+        return self.source_reader.load_and_split(self.text_splitter)

     @register
     def data_process(self, documents: List[Document]):

View File

@@ -20,18 +20,21 @@ class PPTEmbedding(SourceEmbedding):
         self,
         file_path,
         vector_store_config,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize ppt word path."""
-        super().__init__(file_path, vector_store_config, text_splitter=None)
+        super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None

     @register
     def read(self):
         """Load from ppt path."""
-        loader = UnstructuredPowerPointLoader(self.file_path)
+        if self.source_reader is None:
+            self.source_reader = UnstructuredPowerPointLoader(self.file_path)
         if self.text_splitter is None:
             try:
                 self.text_splitter = SpacyTextSplitter(
@@ -44,7 +47,7 @@ class PPTEmbedding(SourceEmbedding):
                     chunk_size=100, chunk_overlap=50
                 )
-        return loader.load_and_split(self.text_splitter)
+        return self.source_reader.load_and_split(self.text_splitter)

     @register
     def data_process(self, documents: List[Document]):

View File

@@ -26,12 +26,14 @@ class SourceEmbedding(ABC):
         self,
         file_path,
         vector_store_config: {},
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
         embedding_args: Optional[Dict] = None,
     ):
         """Initialize with Loader url, model_name, vector_store_config"""
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None
         self.embedding_args = embedding_args
         self.embeddings = vector_store_config["embeddings"]
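Adding a new source type now follows a fixed recipe against this base class. A hedged sketch of a hypothetical subclass (LogEmbedding and its TextLoader default are illustrative, not from the repo; the register usage mirrors the sibling files):

```python
from typing import List, Optional

from langchain.document_loaders import TextLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

from pilot.embedding_engine import SourceEmbedding, register


class LogEmbedding(SourceEmbedding):
    """Hypothetical source type; not part of this commit."""

    def __init__(
        self,
        file_path,
        vector_store_config,
        source_reader: Optional = None,
        text_splitter: Optional[TextSplitter] = None,
    ):
        super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
        self.file_path = file_path
        self.vector_store_config = vector_store_config
        self.source_reader = source_reader or None
        self.text_splitter = text_splitter or None

    @register
    def read(self) -> List[Document]:
        # default to a plain text loader unless one was injected
        if self.source_reader is None:
            self.source_reader = TextLoader(self.file_path)
        if self.text_splitter is None:
            self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50)
        return self.source_reader.load_and_split(self.text_splitter)
```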

View File

@@ -1,7 +1,7 @@
 from typing import List, Optional
 from langchain.schema import Document
-from langchain.text_splitter import TextSplitter
+from langchain.text_splitter import TextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
 from pilot.embedding_engine import SourceEmbedding, register
@@ -13,19 +13,35 @@ class StringEmbedding(SourceEmbedding):
         self,
         file_path,
         vector_store_config,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize raw text word path."""
-        super().__init__(file_path=file_path, vector_store_config=vector_store_config)
+        super().__init__(file_path=file_path, vector_store_config=vector_store_config, source_reader=None, text_splitter=None)
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None

     @register
     def read(self):
         """Load from String path."""
         metadata = {"source": "raw text"}
-        return [Document(page_content=self.file_path, metadata=metadata)]
+        docs = [Document(page_content=self.file_path, metadata=metadata)]
+        if self.text_splitter is None:
+            try:
+                self.text_splitter = SpacyTextSplitter(
+                    pipeline="zh_core_web_sm",
+                    chunk_size=100,
+                    chunk_overlap=100,
+                )
+            except Exception:
+                self.text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=100, chunk_overlap=50
+                )
+        return self.text_splitter.split_documents(docs)

     @register
     def data_process(self, documents: List[Document]):
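With raw text there is no loader to swap, so source_reader is accepted only for signature symmetry; the splitter is what matters here. A sketch, reusing names from the examples above:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pilot.embedding_engine.string_embedding import StringEmbedding  # assumed path

embedding = StringEmbedding(
    file_path="DB-GPT is an experimental open-source project ...",  # the raw text rides in file_path
    vector_store_config=vector_store_config,  # must carry an "embeddings" entry, as above
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50),
)
chunks = embedding.read()  # Document chunks, already split
```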

View File

@@ -19,18 +19,22 @@ class URLEmbedding(SourceEmbedding):
         self,
         file_path,
         vector_store_config,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize url word path."""
-        super().__init__(file_path, vector_store_config, text_splitter=None)
+        super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None

     @register
     def read(self):
         """Load from url path."""
-        loader = WebBaseLoader(web_path=self.file_path)
+        if self.source_reader is None:
+            self.source_reader = WebBaseLoader(web_path=self.file_path)
         if self.text_splitter is None:
             try:
                 self.text_splitter = SpacyTextSplitter(
@@ -43,7 +47,7 @@ class URLEmbedding(SourceEmbedding):
                     chunk_size=100, chunk_overlap=50
                 )
-        return loader.load_and_split(self.text_splitter)
+        return self.source_reader.load_and_split(self.text_splitter)

     @register
     def data_process(self, documents: List[Document]):

View File

@@ -20,18 +20,21 @@ class WordEmbedding(SourceEmbedding):
         self,
         file_path,
         vector_store_config,
+        source_reader: Optional = None,
         text_splitter: Optional[TextSplitter] = None,
     ):
         """Initialize with word path."""
-        super().__init__(file_path, vector_store_config, text_splitter=None)
+        super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
         self.file_path = file_path
         self.vector_store_config = vector_store_config
+        self.source_reader = source_reader or None
         self.text_splitter = text_splitter or None

     @register
     def read(self):
         """Load from word path."""
-        loader = UnstructuredWordDocumentLoader(self.file_path)
+        if self.source_reader is None:
+            self.source_reader = UnstructuredWordDocumentLoader(self.file_path)
         if self.text_splitter is None:
             try:
                 self.text_splitter = SpacyTextSplitter(
@@ -44,7 +47,7 @@ class WordEmbedding(SourceEmbedding):
                     chunk_size=100, chunk_overlap=50
                 )
-        return loader.load_and_split(self.text_splitter)
+        return self.source_reader.load_and_split(self.text_splitter)

     @register
     def data_process(self, documents: List[Document]):

View File

@@ -1,3 +1,4 @@
+import atexit
 import traceback
 import os
 import shutil
@@ -36,7 +37,7 @@ CFG = Config()
 logger = build_logger("webserver", LOGDIR + "webserver.log")

-def signal_handler(sig, frame):
+def signal_handler():
     print("in order to avoid chroma db atexit problem")
     os._exit(0)
@@ -96,7 +97,6 @@ if __name__ == "__main__":
         action="store_true",
         help="enable light mode",
     )
-    signal.signal(signal.SIGINT, signal_handler)
     # init server config
     args = parser.parse_args()
@@ -114,3 +114,4 @@ if __name__ == "__main__":
     import uvicorn

     uvicorn.run(app, host="0.0.0.0", port=args.port)
+    signal.signal(signal.SIGINT, signal_handler())
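On the "chroma exit bug": uvicorn handles Ctrl-C itself while run() blocks, and Chroma registers an atexit hook that is the stated problem here. Once run() returns, the expression signal_handler() in the last added line executes immediately (it is a call, not a handler reference), so the process hard-exits via os._exit(0) and atexit hooks never run. A condensed sketch of the resulting shutdown path, with app and args as in the file:

```python
# sketch of the shutdown path after this change
import os
import uvicorn

def signal_handler():
    print("in order to avoid chroma db atexit problem")
    os._exit(0)  # hard exit: atexit hooks (Chroma's included) never run

uvicorn.run(app, host="0.0.0.0", port=args.port)  # blocks; uvicorn catches Ctrl-C itself
# `signal.signal(signal.SIGINT, signal_handler())` evaluates signal_handler()
# first, so in practice the process force-exits right here, before any
# handler registration takes place.
signal_handler()
```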

View File

@@ -124,7 +124,6 @@ class DBSummaryClient:
             "chroma_persist_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
         }
         knowledge_embedding_client = EmbeddingEngine(
-            file_path="",
             model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
             vector_store_config=vector_store_config,
         )