mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-06 02:46:40 +00:00
feat:embedding api
1.embedding_engine add source_reader param 2.docs update 3.fix chroma exit bug
This commit is contained in:
parent
56c1947eda
commit
6404bfe63a
@ -25,22 +25,25 @@ $ docker run --name=mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=aa12345678 -dit my
|
|||||||
We use [Chroma embedding database](https://github.com/chroma-core/chroma) as the default for our vector database, so there is no need for special installation. If you choose to connect to other databases, you can follow our tutorial for installation and configuration.
|
We use [Chroma embedding database](https://github.com/chroma-core/chroma) as the default for our vector database, so there is no need for special installation. If you choose to connect to other databases, you can follow our tutorial for installation and configuration.
|
||||||
For the entire installation process of DB-GPT, we use the miniconda3 virtual environment. Create a virtual environment and install the Python dependencies.
|
For the entire installation process of DB-GPT, we use the miniconda3 virtual environment. Create a virtual environment and install the Python dependencies.
|
||||||
|
|
||||||
```{tip}
|
```bash
|
||||||
python>=3.10
|
python>=3.10
|
||||||
conda create -n dbgpt_env python=3.10
|
conda create -n dbgpt_env python=3.10
|
||||||
conda activate dbgpt_env
|
conda activate dbgpt_env
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
Before use DB-GPT Knowledge Management
|
Before using DB-GPT Knowledge Management:
|
||||||
```{tip}
|
```bash
|
||||||
python -m spacy download zh_core_web_sm
|
python -m spacy download zh_core_web_sm
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Once the environment is installed, we have to create a new folder "models" in the DB-GPT project, and then we can put all the models downloaded from huggingface in this directory
|
Once the environment is installed, create a new folder named "models" in the DB-GPT project and put all the models downloaded from Hugging Face in this directory.
|
||||||
|
|
||||||
Notice make sure you have install git-lfs
|
|
||||||
```{tip}
|
```{tip}
|
||||||
|
Note: make sure you have installed git-lfs.
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
git clone https://huggingface.co/Tribbiani/vicuna-13b
|
git clone https://huggingface.co/Tribbiani/vicuna-13b
|
||||||
git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
||||||
git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese
|
git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese
|
||||||
|
@ -4,11 +4,13 @@ DB-GPT provides a third-party Python API package that you can integrate into you
|
|||||||
### Installation from Pip
|
### Installation from Pip
|
||||||
|
|
||||||
You can simply pip install:
|
You can simply pip install:
|
||||||
```{tip}
|
```bash
|
||||||
pip install -i https://pypi.org/ db-gpt==0.3.0
|
pip install -i https://pypi.org/ db-gpt==0.3.0
|
||||||
```
|
```
|
||||||
Notice:make sure python>=3.10
|
|
||||||
|
|
||||||
|
```{tip}
|
||||||
|
Note: make sure python>=3.10
|
||||||
|
```
|
||||||
|
|
||||||
### Environment Setup
|
### Environment Setup
|
||||||
|
|
||||||
@ -16,8 +18,11 @@ By default, if you use the EmbeddingEngine api
|
|||||||
|
|
||||||
you will prepare embedding models from huggingface
|
you will need to prepare embedding models from Hugging Face
|
||||||
|
|
||||||
Notice make sure you have install git-lfs
|
|
||||||
```{tip}
|
```{tip}
|
||||||
|
Note: make sure you have installed git-lfs.
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
||||||
|
|
||||||
git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese
|
git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese
|
||||||
|
@ -4,13 +4,13 @@ Knowledge
|
|||||||
| As the knowledge base is currently the most significant user demand scenario, we natively support the construction and processing of knowledge bases. At the same time, we also provide multiple knowledge base management strategies in this project, such as pdf knowledge,md knowledge, txt knowledge, word knowledge, ppt knowledge:
|
| As the knowledge base is currently the most significant user demand scenario, we natively support the construction and processing of knowledge bases. At the same time, we also provide multiple knowledge base management strategies in this project, such as pdf knowledge, md knowledge, txt knowledge, word knowledge, ppt knowledge:
|
||||||
|
|
||||||
We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
|
We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
|
||||||
|
In the future, we will continue to support more types of knowledge, including audio, video, various databases, and big data sources. Of course, we look forward to your active participation in contributing code.
|
||||||
|
|
||||||
**Create your own knowledge repository**
|
**Create your own knowledge repository**
|
||||||
|
|
||||||
1.prepare
|
1.prepare
|
||||||
|
|
||||||
We currently support many document formats: raw text, txt, pdf, md, html, doc, ppt, and url.
|
We currently support many document formats: TEXT(raw text), DOCUMENT(.txt, .pdf, .md, .doc, .ppt, .html), and URL.
|
||||||
|
|
||||||
before execution:
|
before execution:
|
||||||
|
|
||||||
@ -72,12 +72,13 @@ eg: git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
|||||||
vector_store_config=vector_store_config)
|
vector_store_config=vector_store_config)
|
||||||
embedding_engine.knowledge_embedding()
|
embedding_engine.knowledge_embedding()
|
||||||
|
|
||||||
If you want to add your text_splitter, do this:
|
If you want to add your source_reader or text_splitter, do this:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
|
url = "https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html"
|
||||||
|
|
||||||
|
source_reader = WebBaseLoader(web_path=self.file_path)
|
||||||
text_splitter = RecursiveCharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=100, chunk_overlap=50
|
chunk_size=100, chunk_overlap=50
|
||||||
)
|
)
|
||||||
@ -86,6 +87,7 @@ If you want to add your text_splitter, do this:
|
|||||||
knowledge_type=KnowledgeType.URL.value,
|
knowledge_type=KnowledgeType.URL.value,
|
||||||
model_name=embedding_model,
|
model_name=embedding_model,
|
||||||
vector_store_config=vector_store_config,
|
vector_store_config=vector_store_config,
|
||||||
|
source_reader=source_reader,
|
||||||
text_splitter=text_splitter
|
text_splitter=text_splitter
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ from typing import Dict, List, Optional
|
|||||||
|
|
||||||
from langchain.document_loaders import CSVLoader
|
from langchain.document_loaders import CSVLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import TextSplitter
|
from langchain.text_splitter import TextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
|
|
||||||
@ -14,19 +14,34 @@ class CSVEmbedding(SourceEmbedding):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config,
|
vector_store_config,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with csv path."""
|
"""Initialize with csv path."""
|
||||||
super().__init__(file_path, vector_store_config, text_splitter=None)
|
super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from csv path."""
|
"""Load from csv path."""
|
||||||
loader = CSVLoader(file_path=self.file_path)
|
if self.source_reader is None:
|
||||||
return loader.load()
|
self.source_reader = CSVLoader(self.file_path)
|
||||||
|
if self.text_splitter is None:
|
||||||
|
try:
|
||||||
|
self.text_splitter = SpacyTextSplitter(
|
||||||
|
pipeline="zh_core_web_sm",
|
||||||
|
chunk_size=100,
|
||||||
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
self.text_splitter = RecursiveCharacterTextSplitter(
|
||||||
|
chunk_size=100, chunk_overlap=50
|
||||||
|
)
|
||||||
|
|
||||||
|
return self.source_reader.load_and_split(self.text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def data_process(self, documents: List[Document]):
|
def data_process(self, documents: List[Document]):
|
||||||
|
@ -22,6 +22,7 @@ class EmbeddingEngine:
|
|||||||
vector_store_config,
|
vector_store_config,
|
||||||
knowledge_type: Optional[str] = KnowledgeType.DOCUMENT.value,
|
knowledge_type: Optional[str] = KnowledgeType.DOCUMENT.value,
|
||||||
knowledge_source: Optional[str] = None,
|
knowledge_source: Optional[str] = None,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source"""
|
"""Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source"""
|
||||||
@ -31,6 +32,7 @@ class EmbeddingEngine:
|
|||||||
self.knowledge_type = knowledge_type
|
self.knowledge_type = knowledge_type
|
||||||
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
|
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
|
||||||
self.vector_store_config["embeddings"] = self.embeddings
|
self.vector_store_config["embeddings"] = self.embeddings
|
||||||
|
self.source_reader = source_reader
|
||||||
self.text_splitter = text_splitter
|
self.text_splitter = text_splitter
|
||||||
|
|
||||||
def knowledge_embedding(self):
|
def knowledge_embedding(self):
|
||||||
@ -53,6 +55,7 @@ class EmbeddingEngine:
|
|||||||
self.knowledge_type,
|
self.knowledge_type,
|
||||||
self.knowledge_source,
|
self.knowledge_source,
|
||||||
self.vector_store_config,
|
self.vector_store_config,
|
||||||
|
self.source_reader,
|
||||||
self.text_splitter,
|
self.text_splitter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ class KnowledgeType(Enum):
|
|||||||
|
|
||||||
|
|
||||||
def get_knowledge_embedding(
|
def get_knowledge_embedding(
|
||||||
knowledge_type, knowledge_source, vector_store_config, text_splitter
|
knowledge_type, knowledge_source, vector_store_config, source_reader, text_splitter
|
||||||
):
|
):
|
||||||
match knowledge_type:
|
match knowledge_type:
|
||||||
case KnowledgeType.DOCUMENT.value:
|
case KnowledgeType.DOCUMENT.value:
|
||||||
@ -51,6 +51,7 @@ def get_knowledge_embedding(
|
|||||||
embedding = knowledge_class(
|
embedding = knowledge_class(
|
||||||
knowledge_source,
|
knowledge_source,
|
||||||
vector_store_config=vector_store_config,
|
vector_store_config=vector_store_config,
|
||||||
|
source_reader=source_reader,
|
||||||
text_splitter=text_splitter,
|
text_splitter=text_splitter,
|
||||||
**knowledge_args,
|
**knowledge_args,
|
||||||
)
|
)
|
||||||
@ -60,6 +61,7 @@ def get_knowledge_embedding(
|
|||||||
embedding = URLEmbedding(
|
embedding = URLEmbedding(
|
||||||
file_path=knowledge_source,
|
file_path=knowledge_source,
|
||||||
vector_store_config=vector_store_config,
|
vector_store_config=vector_store_config,
|
||||||
|
source_reader=source_reader,
|
||||||
text_splitter=text_splitter,
|
text_splitter=text_splitter,
|
||||||
)
|
)
|
||||||
return embedding
|
return embedding
|
||||||
@ -67,6 +69,7 @@ def get_knowledge_embedding(
|
|||||||
embedding = StringEmbedding(
|
embedding = StringEmbedding(
|
||||||
file_path=knowledge_source,
|
file_path=knowledge_source,
|
||||||
vector_store_config=vector_store_config,
|
vector_store_config=vector_store_config,
|
||||||
|
source_reader=source_reader,
|
||||||
text_splitter=text_splitter,
|
text_splitter=text_splitter,
|
||||||
)
|
)
|
||||||
return embedding
|
return embedding
|
||||||
|
@ -24,19 +24,21 @@ class MarkdownEmbedding(SourceEmbedding):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config,
|
vector_store_config,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize raw text word path."""
|
"""Initialize raw text word path."""
|
||||||
super().__init__(file_path, vector_store_config, text_splitter=None)
|
super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
# self.encoding = encoding
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from markdown path."""
|
"""Load from markdown path."""
|
||||||
loader = EncodeTextLoader(self.file_path)
|
if self.source_reader is None:
|
||||||
|
self.source_reader = EncodeTextLoader(self.file_path)
|
||||||
if self.text_splitter is None:
|
if self.text_splitter is None:
|
||||||
try:
|
try:
|
||||||
self.text_splitter = SpacyTextSplitter(
|
self.text_splitter = SpacyTextSplitter(
|
||||||
@ -49,7 +51,7 @@ class MarkdownEmbedding(SourceEmbedding):
|
|||||||
chunk_size=100, chunk_overlap=50
|
chunk_size=100, chunk_overlap=50
|
||||||
)
|
)
|
||||||
|
|
||||||
return loader.load_and_split(self.text_splitter)
|
return self.source_reader.load_and_split(self.text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def data_process(self, documents: List[Document]):
|
def data_process(self, documents: List[Document]):
|
||||||
|
@ -20,18 +20,21 @@ class PDFEmbedding(SourceEmbedding):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config,
|
vector_store_config,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize pdf word path."""
|
"""Initialize pdf word path."""
|
||||||
super().__init__(file_path, vector_store_config, text_splitter=None)
|
super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from pdf path."""
|
"""Load from pdf path."""
|
||||||
loader = PyPDFLoader(self.file_path)
|
if self.source_reader is None:
|
||||||
|
self.source_reader = PyPDFLoader(self.file_path)
|
||||||
if self.text_splitter is None:
|
if self.text_splitter is None:
|
||||||
try:
|
try:
|
||||||
self.text_splitter = SpacyTextSplitter(
|
self.text_splitter = SpacyTextSplitter(
|
||||||
@ -44,7 +47,7 @@ class PDFEmbedding(SourceEmbedding):
|
|||||||
chunk_size=100, chunk_overlap=50
|
chunk_size=100, chunk_overlap=50
|
||||||
)
|
)
|
||||||
|
|
||||||
return loader.load_and_split(self.text_splitter)
|
return self.source_reader.load_and_split(self.text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def data_process(self, documents: List[Document]):
|
def data_process(self, documents: List[Document]):
|
||||||
|
@ -20,18 +20,21 @@ class PPTEmbedding(SourceEmbedding):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config,
|
vector_store_config,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize ppt word path."""
|
"""Initialize ppt word path."""
|
||||||
super().__init__(file_path, vector_store_config, text_splitter=None)
|
super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from ppt path."""
|
"""Load from ppt path."""
|
||||||
loader = UnstructuredPowerPointLoader(self.file_path)
|
if self.source_reader is None:
|
||||||
|
self.source_reader = UnstructuredPowerPointLoader(self.file_path)
|
||||||
if self.text_splitter is None:
|
if self.text_splitter is None:
|
||||||
try:
|
try:
|
||||||
self.text_splitter = SpacyTextSplitter(
|
self.text_splitter = SpacyTextSplitter(
|
||||||
@ -44,7 +47,7 @@ class PPTEmbedding(SourceEmbedding):
|
|||||||
chunk_size=100, chunk_overlap=50
|
chunk_size=100, chunk_overlap=50
|
||||||
)
|
)
|
||||||
|
|
||||||
return loader.load_and_split(self.text_splitter)
|
return self.source_reader.load_and_split(self.text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def data_process(self, documents: List[Document]):
|
def data_process(self, documents: List[Document]):
|
||||||
|
@ -26,12 +26,14 @@ class SourceEmbedding(ABC):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config: {},
|
vector_store_config: {},
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
embedding_args: Optional[Dict] = None,
|
embedding_args: Optional[Dict] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with Loader url, model_name, vector_store_config"""
|
"""Initialize with Loader url, model_name, vector_store_config"""
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
self.embedding_args = embedding_args
|
self.embedding_args = embedding_args
|
||||||
self.embeddings = vector_store_config["embeddings"]
|
self.embeddings = vector_store_config["embeddings"]
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import TextSplitter
|
from langchain.text_splitter import TextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
|
|
||||||
@ -13,19 +13,35 @@ class StringEmbedding(SourceEmbedding):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config,
|
vector_store_config,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize raw text word path."""
|
"""Initialize raw text word path."""
|
||||||
super().__init__(file_path=file_path, vector_store_config=vector_store_config)
|
super().__init__(file_path=file_path, vector_store_config=vector_store_config, source_reader=None, text_splitter=None)
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from String path."""
|
"""Load from String path."""
|
||||||
metadata = {"source": "raw text"}
|
metadata = {"source": "raw text"}
|
||||||
return [Document(page_content=self.file_path, metadata=metadata)]
|
docs = [Document(page_content=self.file_path, metadata=metadata)]
|
||||||
|
if self.text_splitter is None:
|
||||||
|
try:
|
||||||
|
self.text_splitter = SpacyTextSplitter(
|
||||||
|
pipeline="zh_core_web_sm",
|
||||||
|
chunk_size=100,
|
||||||
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
self.text_splitter = RecursiveCharacterTextSplitter(
|
||||||
|
chunk_size=100, chunk_overlap=50
|
||||||
|
)
|
||||||
|
|
||||||
|
return self.text_splitter.split_documents(docs)
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def data_process(self, documents: List[Document]):
|
def data_process(self, documents: List[Document]):
|
||||||
|
@ -19,18 +19,22 @@ class URLEmbedding(SourceEmbedding):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config,
|
vector_store_config,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize url word path."""
|
"""Initialize url word path."""
|
||||||
super().__init__(file_path, vector_store_config, text_splitter=None)
|
super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from url path."""
|
"""Load from url path."""
|
||||||
loader = WebBaseLoader(web_path=self.file_path)
|
if self.source_reader is None:
|
||||||
|
self.source_reader = WebBaseLoader(web_path=self.file_path)
|
||||||
if self.text_splitter is None:
|
if self.text_splitter is None:
|
||||||
try:
|
try:
|
||||||
self.text_splitter = SpacyTextSplitter(
|
self.text_splitter = SpacyTextSplitter(
|
||||||
@ -43,7 +47,7 @@ class URLEmbedding(SourceEmbedding):
|
|||||||
chunk_size=100, chunk_overlap=50
|
chunk_size=100, chunk_overlap=50
|
||||||
)
|
)
|
||||||
|
|
||||||
return loader.load_and_split(self.text_splitter)
|
return self.source_reader.load_and_split(self.text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def data_process(self, documents: List[Document]):
|
def data_process(self, documents: List[Document]):
|
||||||
|
@ -20,18 +20,21 @@ class WordEmbedding(SourceEmbedding):
|
|||||||
self,
|
self,
|
||||||
file_path,
|
file_path,
|
||||||
vector_store_config,
|
vector_store_config,
|
||||||
|
source_reader: Optional = None,
|
||||||
text_splitter: Optional[TextSplitter] = None,
|
text_splitter: Optional[TextSplitter] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with word path."""
|
"""Initialize with word path."""
|
||||||
super().__init__(file_path, vector_store_config, text_splitter=None)
|
super().__init__(file_path, vector_store_config, source_reader=None, text_splitter=None)
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.vector_store_config = vector_store_config
|
self.vector_store_config = vector_store_config
|
||||||
|
self.source_reader = source_reader or None
|
||||||
self.text_splitter = text_splitter or None
|
self.text_splitter = text_splitter or None
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Load from word path."""
|
"""Load from word path."""
|
||||||
loader = UnstructuredWordDocumentLoader(self.file_path)
|
if self.source_reader is None:
|
||||||
|
self.source_reader = UnstructuredWordDocumentLoader(self.file_path)
|
||||||
if self.text_splitter is None:
|
if self.text_splitter is None:
|
||||||
try:
|
try:
|
||||||
self.text_splitter = SpacyTextSplitter(
|
self.text_splitter = SpacyTextSplitter(
|
||||||
@ -44,7 +47,7 @@ class WordEmbedding(SourceEmbedding):
|
|||||||
chunk_size=100, chunk_overlap=50
|
chunk_size=100, chunk_overlap=50
|
||||||
)
|
)
|
||||||
|
|
||||||
return loader.load_and_split(self.text_splitter)
|
return self.source_reader.load_and_split(self.text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
def data_process(self, documents: List[Document]):
|
def data_process(self, documents: List[Document]):
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import atexit
|
||||||
import traceback
|
import traceback
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
@ -36,7 +37,7 @@ CFG = Config()
|
|||||||
logger = build_logger("webserver", LOGDIR + "webserver.log")
|
logger = build_logger("webserver", LOGDIR + "webserver.log")
|
||||||
|
|
||||||
|
|
||||||
def signal_handler(sig, frame):
|
def signal_handler():
|
||||||
print("in order to avoid chroma db atexit problem")
|
print("in order to avoid chroma db atexit problem")
|
||||||
os._exit(0)
|
os._exit(0)
|
||||||
|
|
||||||
@ -96,7 +97,6 @@ if __name__ == "__main__":
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="enable light mode",
|
help="enable light mode",
|
||||||
)
|
)
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
|
||||||
|
|
||||||
# init server config
|
# init server config
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -114,3 +114,4 @@ if __name__ == "__main__":
|
|||||||
import uvicorn
|
import uvicorn
|
||||||
|
|
||||||
uvicorn.run(app, host="0.0.0.0", port=args.port)
|
uvicorn.run(app, host="0.0.0.0", port=args.port)
|
||||||
|
signal.signal(signal.SIGINT, signal_handler())
|
||||||
|
@ -124,7 +124,6 @@ class DBSummaryClient:
|
|||||||
"chroma_persist_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
|
"chroma_persist_path": KNOWLEDGE_UPLOAD_ROOT_PATH,
|
||||||
}
|
}
|
||||||
knowledge_embedding_client = EmbeddingEngine(
|
knowledge_embedding_client = EmbeddingEngine(
|
||||||
file_path="",
|
|
||||||
model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
|
model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL],
|
||||||
vector_store_config=vector_store_config,
|
vector_store_config=vector_store_config,
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user