mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-14 21:51:25 +00:00
feat(RAG):add metadata properties filters (#1395)
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Any, List, Optional, Tuple, Type
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.rag.text_splitter.text_splitter import (
|
||||
@@ -147,16 +147,18 @@ class Knowledge(ABC):
|
||||
self,
|
||||
path: Optional[str] = None,
|
||||
knowledge_type: Optional[KnowledgeType] = None,
|
||||
data_loader: Optional[Any] = None,
|
||||
loader: Optional[Any] = None,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize with Knowledge arguments."""
|
||||
self._path = path
|
||||
self._type = knowledge_type
|
||||
self._data_loader = data_loader
|
||||
self._loader = loader
|
||||
self._metadata = metadata
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load knowledge from data_loader."""
|
||||
"""Load knowledge from data loader."""
|
||||
documents = self._load()
|
||||
return self._postprocess(documents)
|
||||
|
||||
@@ -171,12 +173,12 @@ class Knowledge(ABC):
|
||||
return None
|
||||
|
||||
def _postprocess(self, docs: List[Document]) -> List[Document]:
|
||||
"""Post process knowledge from data_loader."""
|
||||
"""Post process knowledge from data loader."""
|
||||
return docs
|
||||
|
||||
@abstractmethod
|
||||
def _load(self) -> List[Document]:
|
||||
"""Preprocess knowledge from data_loader."""
|
||||
"""Preprocess knowledge from data loader."""
|
||||
|
||||
@classmethod
|
||||
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
|
||||
|
@@ -1,6 +1,6 @@
|
||||
"""CSV Knowledge."""
|
||||
import csv
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.rag.knowledge.base import (
|
||||
@@ -21,6 +21,7 @@ class CSVKnowledge(Knowledge):
|
||||
source_column: Optional[str] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
loader: Optional[Any] = None,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create CSV Knowledge with Knowledge arguments.
|
||||
@@ -32,9 +33,13 @@ class CSVKnowledge(Knowledge):
|
||||
encoding(str, optional): csv encoding
|
||||
loader(Any, optional): loader
|
||||
"""
|
||||
self._path = file_path
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=file_path,
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
self._encoding = encoding
|
||||
self._source_column = source_column
|
||||
|
||||
@@ -67,6 +72,8 @@ class CSVKnowledge(Knowledge):
|
||||
f"file."
|
||||
)
|
||||
metadata = {"source": source, "row": i}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
doc = Document(content=content, metadata=metadata)
|
||||
docs.append(doc)
|
||||
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""Datasource Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.datasource import BaseConnector
|
||||
@@ -16,26 +16,29 @@ class DatasourceKnowledge(Knowledge):
|
||||
connector: BaseConnector,
|
||||
summary_template: str = "{table_name}({columns})",
|
||||
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create Datasource Knowledge with Knowledge arguments.
|
||||
|
||||
Args:
|
||||
path(str, optional): file path
|
||||
connector(BaseConnector): connector
|
||||
summary_template(str, optional): summary template
|
||||
knowledge_type(KnowledgeType, optional): knowledge type
|
||||
data_loader(Any, optional): loader
|
||||
metadata(Dict[str, Union[str, List[str]], optional): metadata
|
||||
"""
|
||||
self._connector = connector
|
||||
self._summary_template = summary_template
|
||||
super().__init__(knowledge_type=knowledge_type, **kwargs)
|
||||
super().__init__(knowledge_type=knowledge_type, metadata=metadata, **kwargs)
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
"""Load datasource document from data_loader."""
|
||||
docs = []
|
||||
for table_summary in _parse_db_summary(self._connector, self._summary_template):
|
||||
docs.append(
|
||||
Document(content=table_summary, metadata={"source": "database"})
|
||||
)
|
||||
metadata = {"source": "database"}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
docs.append(Document(content=table_summary, metadata=metadata))
|
||||
return docs
|
||||
|
||||
@classmethod
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""Docx Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import docx
|
||||
|
||||
@@ -21,6 +21,7 @@ class DocxKnowledge(Knowledge):
|
||||
knowledge_type: Any = KnowledgeType.DOCUMENT,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
loader: Optional[Any] = None,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create Docx Knowledge with Knowledge arguments.
|
||||
@@ -31,9 +32,13 @@ class DocxKnowledge(Knowledge):
|
||||
encoding(str, optional): csv encoding
|
||||
loader(Any, optional): loader
|
||||
"""
|
||||
self._path = file_path
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=file_path,
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
self._encoding = encoding
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
@@ -48,9 +53,10 @@ class DocxKnowledge(Knowledge):
|
||||
para = doc.paragraphs[i]
|
||||
text = para.text
|
||||
content.append(text)
|
||||
docs.append(
|
||||
Document(content="\n".join(content), metadata={"source": self._path})
|
||||
)
|
||||
metadata = {"source": self._path}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
docs.append(Document(content="\n".join(content), metadata=metadata))
|
||||
return docs
|
||||
return [Document.langchain2doc(lc_document) for lc_document in documents]
|
||||
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""Knowledge Factory to create knowledge from file path and url."""
|
||||
from typing import List, Optional, Type
|
||||
from typing import Dict, List, Optional, Type, Union
|
||||
|
||||
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
|
||||
from dbgpt.rag.knowledge.string import StringKnowledge
|
||||
@@ -13,6 +13,7 @@ class KnowledgeFactory:
|
||||
self,
|
||||
file_path: Optional[str] = None,
|
||||
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
):
|
||||
"""Create Knowledge Factory with file path and knowledge type.
|
||||
|
||||
@@ -22,18 +23,21 @@ class KnowledgeFactory:
|
||||
"""
|
||||
self._file_path = file_path
|
||||
self._knowledge_type = knowledge_type
|
||||
self._metadata = metadata
|
||||
|
||||
@classmethod
|
||||
def create(
|
||||
cls,
|
||||
datasource: str = "",
|
||||
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
):
|
||||
"""Create knowledge from file path, url or text.
|
||||
|
||||
Args:
|
||||
datasource: path of the file to convert
|
||||
knowledge_type: type of knowledge
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]]
|
||||
|
||||
Examples:
|
||||
.. code-block:: python
|
||||
@@ -52,12 +56,16 @@ class KnowledgeFactory:
|
||||
match knowledge_type:
|
||||
case KnowledgeType.DOCUMENT:
|
||||
return cls.from_file_path(
|
||||
file_path=datasource, knowledge_type=knowledge_type
|
||||
file_path=datasource,
|
||||
knowledge_type=knowledge_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
case KnowledgeType.URL:
|
||||
return cls.from_url(url=datasource, knowledge_type=knowledge_type)
|
||||
case KnowledgeType.TEXT:
|
||||
return cls.from_text(text=datasource, knowledge_type=knowledge_type)
|
||||
return cls.from_text(
|
||||
text=datasource, knowledge_type=knowledge_type, metadata=metadata
|
||||
)
|
||||
case _:
|
||||
raise Exception(f"Unsupported knowledge type '{knowledge_type}'")
|
||||
|
||||
@@ -66,6 +74,7 @@ class KnowledgeFactory:
|
||||
cls,
|
||||
file_path: str = "",
|
||||
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
) -> Knowledge:
|
||||
"""Create knowledge from path.
|
||||
|
||||
@@ -82,10 +91,11 @@ class KnowledgeFactory:
|
||||
datasource="path/to/document.pdf",
|
||||
knowledge_type=KnowledgeType.DOCUMENT,
|
||||
)
|
||||
|
||||
"""
|
||||
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
|
||||
return factory._select_document_knowledge(
|
||||
file_path=file_path, knowledge_type=knowledge_type
|
||||
file_path=file_path, knowledge_type=knowledge_type, metadata=metadata
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -117,6 +127,7 @@ class KnowledgeFactory:
|
||||
def from_text(
|
||||
text: str = "",
|
||||
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
) -> Knowledge:
|
||||
"""Create knowledge from text.
|
||||
|
||||
@@ -127,6 +138,7 @@ class KnowledgeFactory:
|
||||
return StringKnowledge(
|
||||
text=text,
|
||||
knowledge_type=knowledge_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
def _select_document_knowledge(self, **kwargs):
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""HTML Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import chardet
|
||||
|
||||
@@ -20,6 +20,7 @@ class HTMLKnowledge(Knowledge):
|
||||
file_path: Optional[str] = None,
|
||||
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
|
||||
loader: Optional[Any] = None,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create HTML Knowledge with Knowledge arguments.
|
||||
@@ -29,9 +30,13 @@ class HTMLKnowledge(Knowledge):
|
||||
knowledge_type(KnowledgeType, optional): knowledge type
|
||||
loader(Any, optional): loader
|
||||
"""
|
||||
self._path = file_path
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=file_path,
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
"""Load html document from loader."""
|
||||
@@ -48,6 +53,8 @@ class HTMLKnowledge(Knowledge):
|
||||
else:
|
||||
text = raw_text.decode(result["encoding"])
|
||||
metadata = {"source": self._path}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
return [Document(content=text, metadata=metadata)]
|
||||
|
||||
return [Document.langchain2doc(lc_document) for lc_document in documents]
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""Markdown Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.rag.knowledge.base import (
|
||||
@@ -19,6 +19,7 @@ class MarkdownKnowledge(Knowledge):
|
||||
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
loader: Optional[Any] = None,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create Markdown Knowledge with Knowledge arguments.
|
||||
@@ -29,9 +30,13 @@ class MarkdownKnowledge(Knowledge):
|
||||
encoding(str, optional): csv encoding
|
||||
loader(Any, optional): loader
|
||||
"""
|
||||
self._path = file_path
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=file_path,
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
self._encoding = encoding
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
@@ -44,6 +49,8 @@ class MarkdownKnowledge(Knowledge):
|
||||
with open(self._path, encoding=self._encoding, errors="ignore") as f:
|
||||
markdown_text = f.read()
|
||||
metadata = {"source": self._path}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
documents = [Document(content=markdown_text, metadata=metadata)]
|
||||
return documents
|
||||
return [Document.langchain2doc(lc_document) for lc_document in documents]
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""PDF Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.rag.knowledge.base import (
|
||||
@@ -19,6 +19,7 @@ class PDFKnowledge(Knowledge):
|
||||
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
|
||||
loader: Optional[Any] = None,
|
||||
language: Optional[str] = "zh",
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create PDF Knowledge with Knowledge arguments.
|
||||
@@ -29,9 +30,13 @@ class PDFKnowledge(Knowledge):
|
||||
loader(Any, optional): loader
|
||||
language(str, optional): language
|
||||
"""
|
||||
self._path = file_path
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=file_path,
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
self._language = language
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
@@ -65,6 +70,8 @@ class PDFKnowledge(Knowledge):
|
||||
page = "\n".join(cleaned_lines)
|
||||
# cleaned_pages.append(page)
|
||||
metadata = {"source": self._path, "page": page_num}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
# text = "\f".join(cleaned_pages)
|
||||
document = Document(content=page, metadata=metadata)
|
||||
documents.append(document)
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""PPTX Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.rag.knowledge.base import (
|
||||
@@ -19,6 +19,7 @@ class PPTXKnowledge(Knowledge):
|
||||
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
|
||||
loader: Optional[Any] = None,
|
||||
language: Optional[str] = "zh",
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create PPTX knowledge with PDF Knowledge arguments.
|
||||
@@ -28,9 +29,13 @@ class PPTXKnowledge(Knowledge):
|
||||
knowledge_type:(KnowledgeType) knowledge type
|
||||
loader:(Optional[Any]) loader
|
||||
"""
|
||||
self._path = file_path
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=file_path,
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
self._language = language
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
@@ -47,9 +52,10 @@ class PPTXKnowledge(Knowledge):
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text") and shape.text:
|
||||
content += shape.text
|
||||
docs.append(
|
||||
Document(content=content, metadata={"source": slide.slide_id})
|
||||
)
|
||||
metadata = {"source": self._path}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
docs.append(Document(content=content, metadata=metadata))
|
||||
return docs
|
||||
return [Document.langchain2doc(lc_document) for lc_document in documents]
|
||||
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""String Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from dbgpt.core import Document
|
||||
from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
|
||||
@@ -14,6 +14,7 @@ class StringKnowledge(Knowledge):
|
||||
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
loader: Optional[Any] = None,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create String knowledge parameters.
|
||||
@@ -24,14 +25,20 @@ class StringKnowledge(Knowledge):
|
||||
encoding(str): encoding
|
||||
loader(Any): loader
|
||||
"""
|
||||
super().__init__(
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
self._text = text
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
self._encoding = encoding
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
"""Load raw text from loader."""
|
||||
metadata = {"source": "raw text"}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
docs = [Document(content=self._text, metadata=metadata)]
|
||||
return docs
|
||||
|
||||
|
@@ -1,5 +1,5 @@
|
||||
"""TXT Knowledge."""
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import chardet
|
||||
|
||||
@@ -20,6 +20,7 @@ class TXTKnowledge(Knowledge):
|
||||
file_path: Optional[str] = None,
|
||||
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
|
||||
loader: Optional[Any] = None,
|
||||
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create TXT Knowledge with Knowledge arguments.
|
||||
@@ -29,9 +30,13 @@ class TXTKnowledge(Knowledge):
|
||||
knowledge_type(KnowledgeType, optional): knowledge type
|
||||
loader(Any, optional): loader
|
||||
"""
|
||||
self._path = file_path
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=file_path,
|
||||
knowledge_type=knowledge_type,
|
||||
data_loader=loader,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _load(self) -> List[Document]:
|
||||
"""Load txt document from loader."""
|
||||
@@ -48,6 +53,8 @@ class TXTKnowledge(Knowledge):
|
||||
else:
|
||||
text = raw_text.decode(result["encoding"])
|
||||
metadata = {"source": self._path}
|
||||
if self._metadata:
|
||||
metadata.update(self._metadata) # type: ignore
|
||||
return [Document(content=text, metadata=metadata)]
|
||||
|
||||
return [Document.langchain2doc(lc_document) for lc_document in documents]
|
||||
|
@@ -26,9 +26,9 @@ class URLKnowledge(Knowledge):
|
||||
encoding(str, optional): csv encoding
|
||||
loader(Any, optional): loader
|
||||
"""
|
||||
self._path = url or None
|
||||
self._type = knowledge_type
|
||||
self._loader = loader
|
||||
super().__init__(
|
||||
path=url, knowledge_type=knowledge_type, loader=loader, **kwargs
|
||||
)
|
||||
self._encoding = encoding
|
||||
self._source_column = source_column
|
||||
|
||||
|
Reference in New Issue
Block a user