mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-05 11:01:09 +00:00
doc:db-gpt doc
This commit is contained in:
42
docs/modules/knowledge/markdown/markdown_embedding.md
Normal file
42
docs/modules/knowledge/markdown/markdown_embedding.md
Normal file
@@ -0,0 +1,42 @@
|
||||
MarkdownEmbedding
|
||||
==================================
|
||||
markdown embedding can import md text into a vector knowledge base. The entire embedding process includes the read (loading data), data_process (data processing), and index_to_store (embedding to the vector database) methods.
|
||||
|
||||
inheriting the SourceEmbedding
|
||||
|
||||
```
|
||||
class MarkdownEmbedding(SourceEmbedding):
|
||||
"""pdf embedding for read pdf document."""
|
||||
|
||||
def __init__(self, file_path, vector_store_config):
|
||||
"""Initialize with pdf path."""
|
||||
super().__init__(file_path, vector_store_config)
|
||||
self.file_path = file_path
|
||||
self.vector_store_config = vector_store_config
|
||||
```
|
||||
implement read() and data_process()
|
||||
read() method allows you to read data and split data into chunk
|
||||
|
||||
```
|
||||
@register
|
||||
def read(self):
|
||||
"""Load from markdown path."""
|
||||
loader = EncodeTextLoader(self.file_path)
|
||||
textsplitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
return loader.load_and_split(textsplitter)
|
||||
```
|
||||
|
||||
data_process() method allows you to pre processing your ways
|
||||
```
|
||||
@register
|
||||
def data_process(self, documents: List[Document]):
|
||||
i = 0
|
||||
for d in documents:
|
||||
documents[i].page_content = d.page_content.replace("\n", "")
|
||||
i += 1
|
||||
return documents
|
||||
```
|
43
docs/modules/knowledge/pdf/pdf_embedding.md
Normal file
43
docs/modules/knowledge/pdf/pdf_embedding.md
Normal file
@@ -0,0 +1,43 @@
|
||||
PDFEmbedding
|
||||
==================================
|
||||
pdfembedding can import PDF text into a vector knowledge base. The entire embedding process includes the read (loading data), data_process (data processing), and index_to_store (embedding to the vector database) methods.
|
||||
|
||||
inheriting the SourceEmbedding
|
||||
```
|
||||
class PDFEmbedding(SourceEmbedding):
|
||||
"""pdf embedding for read pdf document."""
|
||||
|
||||
def __init__(self, file_path, vector_store_config):
|
||||
"""Initialize with pdf path."""
|
||||
super().__init__(file_path, vector_store_config)
|
||||
self.file_path = file_path
|
||||
self.vector_store_config = vector_store_config
|
||||
```
|
||||
|
||||
implement read() and data_process()
|
||||
read() method allows you to read data and split data into chunk
|
||||
```
|
||||
@register
|
||||
def read(self):
|
||||
"""Load from pdf path."""
|
||||
loader = PyPDFLoader(self.file_path)
|
||||
# textsplitter = CHNDocumentSplitter(
|
||||
# pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
|
||||
# )
|
||||
textsplitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
return loader.load_and_split(textsplitter)
|
||||
```
|
||||
data_process() method allows you to pre processing your ways
|
||||
```
|
||||
@register
|
||||
def data_process(self, documents: List[Document]):
|
||||
i = 0
|
||||
for d in documents:
|
||||
documents[i].page_content = d.page_content.replace("\n", "")
|
||||
i += 1
|
||||
return documents
|
||||
```
|
40
docs/modules/knowledge/ppt/ppt_embedding.md
Normal file
40
docs/modules/knowledge/ppt/ppt_embedding.md
Normal file
@@ -0,0 +1,40 @@
|
||||
PPTEmbedding
|
||||
==================================
|
||||
ppt embedding can import ppt text into a vector knowledge base. The entire embedding process includes the read (loading data), data_process (data processing), and index_to_store (embedding to the vector database) methods.
|
||||
|
||||
inheriting the SourceEmbedding
|
||||
```
|
||||
class PPTEmbedding(SourceEmbedding):
|
||||
"""ppt embedding for read ppt document."""
|
||||
|
||||
def __init__(self, file_path, vector_store_config):
|
||||
"""Initialize with pdf path."""
|
||||
super().__init__(file_path, vector_store_config)
|
||||
self.file_path = file_path
|
||||
self.vector_store_config = vector_store_config
|
||||
```
|
||||
|
||||
implement read() and data_process()
|
||||
read() method allows you to read data and split data into chunk
|
||||
```
|
||||
@register
|
||||
def read(self):
|
||||
"""Load from ppt path."""
|
||||
loader = UnstructuredPowerPointLoader(self.file_path)
|
||||
textsplitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=200,
|
||||
)
|
||||
return loader.load_and_split(textsplitter)
|
||||
```
|
||||
data_process() method allows you to pre processing your ways
|
||||
```
|
||||
@register
|
||||
def data_process(self, documents: List[Document]):
|
||||
i = 0
|
||||
for d in documents:
|
||||
documents[i].page_content = d.page_content.replace("\n", "")
|
||||
i += 1
|
||||
return documents
|
||||
```
|
47
docs/modules/knowledge/url/url_embedding.md
Normal file
47
docs/modules/knowledge/url/url_embedding.md
Normal file
@@ -0,0 +1,47 @@
|
||||
URL Embedding
|
||||
==================================
|
||||
url embedding can import PDF text into a vector knowledge base. The entire embedding process includes the read (loading data), data_process (data processing), and index_to_store (embedding to the vector database) methods.
|
||||
|
||||
inheriting the SourceEmbedding
|
||||
```
|
||||
class URLEmbedding(SourceEmbedding):
|
||||
"""url embedding for read url document."""
|
||||
|
||||
def __init__(self, file_path, vector_store_config):
|
||||
"""Initialize with url path."""
|
||||
super().__init__(file_path, vector_store_config)
|
||||
self.file_path = file_path
|
||||
self.vector_store_config = vector_store_config
|
||||
```
|
||||
|
||||
implement read() and data_process()
|
||||
read() method allows you to read data and split data into chunk
|
||||
```
|
||||
@register
|
||||
def read(self):
|
||||
"""Load from url path."""
|
||||
loader = WebBaseLoader(web_path=self.file_path)
|
||||
if CFG.LANGUAGE == "en":
|
||||
text_splitter = CharacterTextSplitter(
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=20,
|
||||
length_function=len,
|
||||
)
|
||||
else:
|
||||
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
|
||||
return loader.load_and_split(text_splitter)
|
||||
```
|
||||
data_process() method allows you to pre processing your ways
|
||||
```
|
||||
@register
|
||||
def data_process(self, documents: List[Document]):
|
||||
i = 0
|
||||
for d in documents:
|
||||
content = d.page_content.replace("\n", "")
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
for tag in soup(["!doctype", "meta"]):
|
||||
tag.extract()
|
||||
documents[i].page_content = soup.get_text()
|
||||
i += 1
|
||||
return documents
|
||||
```
|
38
docs/modules/knowledge/word/word_embedding.md
Normal file
38
docs/modules/knowledge/word/word_embedding.md
Normal file
@@ -0,0 +1,38 @@
|
||||
WordEmbedding
|
||||
==================================
|
||||
word embedding can import word doc/docx text into a vector knowledge base. The entire embedding process includes the read (loading data), data_process (data processing), and index_to_store (embedding to the vector database) methods.
|
||||
|
||||
inheriting the SourceEmbedding
|
||||
```
|
||||
class WordEmbedding(SourceEmbedding):
|
||||
"""word embedding for read word document."""
|
||||
|
||||
def __init__(self, file_path, vector_store_config):
|
||||
"""Initialize with word path."""
|
||||
super().__init__(file_path, vector_store_config)
|
||||
self.file_path = file_path
|
||||
self.vector_store_config = vector_store_config
|
||||
```
|
||||
|
||||
implement read() and data_process()
|
||||
read() method allows you to read data and split data into chunk
|
||||
```
|
||||
@register
|
||||
def read(self):
|
||||
"""Load from word path."""
|
||||
loader = UnstructuredWordDocumentLoader(self.file_path)
|
||||
textsplitter = CHNDocumentSplitter(
|
||||
pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
|
||||
)
|
||||
return loader.load_and_split(textsplitter)
|
||||
```
|
||||
data_process() method allows you to pre processing your ways
|
||||
```
|
||||
@register
|
||||
def data_process(self, documents: List[Document]):
|
||||
i = 0
|
||||
for d in documents:
|
||||
documents[i].page_content = d.page_content.replace("\n", "")
|
||||
i += 1
|
||||
return documents
|
||||
```
|
Reference in New Issue
Block a user