feat(RAG):add metadata properties filters (#1395)

This commit is contained in:
Aries-ckt
2024-04-10 14:33:24 +08:00
committed by GitHub
parent 0f2b46da62
commit 37e7c0151b
26 changed files with 619 additions and 166 deletions

View File

@@ -2,7 +2,7 @@
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, List, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type, Union
from dbgpt.core import Document
from dbgpt.rag.text_splitter.text_splitter import (
@@ -147,16 +147,18 @@ class Knowledge(ABC):
self,
path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = None,
data_loader: Optional[Any] = None,
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments."""
self._path = path
self._type = knowledge_type
self._data_loader = data_loader
self._loader = loader
self._metadata = metadata
def load(self) -> List[Document]:
"""Load knowledge from data_loader."""
"""Load knowledge from data loader."""
documents = self._load()
return self._postprocess(documents)
@@ -171,12 +173,12 @@ class Knowledge(ABC):
return None
def _postprocess(self, docs: List[Document]) -> List[Document]:
"""Post process knowledge from data_loader."""
"""Post process knowledge from data loader."""
return docs
@abstractmethod
def _load(self) -> List[Document]:
"""Preprocess knowledge from data_loader."""
"""Preprocess knowledge from data loader."""
@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:

View File

@@ -1,6 +1,6 @@
"""CSV Knowledge."""
import csv
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
@@ -21,6 +21,7 @@ class CSVKnowledge(Knowledge):
source_column: Optional[str] = None,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create CSV Knowledge with Knowledge arguments.
@@ -32,9 +33,13 @@ class CSVKnowledge(Knowledge):
encoding(str, optional): csv encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._encoding = encoding
self._source_column = source_column
@@ -67,6 +72,8 @@ class CSVKnowledge(Knowledge):
f"file."
)
metadata = {"source": source, "row": i}
if self._metadata:
metadata.update(self._metadata) # type: ignore
doc = Document(content=content, metadata=metadata)
docs.append(doc)

View File

@@ -1,5 +1,5 @@
"""Datasource Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
from dbgpt.core import Document
from dbgpt.datasource import BaseConnector
@@ -16,26 +16,29 @@ class DatasourceKnowledge(Knowledge):
connector: BaseConnector,
summary_template: str = "{table_name}({columns})",
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Datasource Knowledge with Knowledge arguments.
Args:
path(str, optional): file path
connector(BaseConnector): connector
summary_template(str, optional): summary template
knowledge_type(KnowledgeType, optional): knowledge type
data_loader(Any, optional): loader
metadata(Dict[str, Union[str, List[str]]], optional): metadata
"""
self._connector = connector
self._summary_template = summary_template
super().__init__(knowledge_type=knowledge_type, **kwargs)
super().__init__(knowledge_type=knowledge_type, metadata=metadata, **kwargs)
def _load(self) -> List[Document]:
"""Load datasource document from data_loader."""
docs = []
for table_summary in _parse_db_summary(self._connector, self._summary_template):
docs.append(
Document(content=table_summary, metadata={"source": "database"})
)
metadata = {"source": "database"}
if self._metadata:
metadata.update(self._metadata) # type: ignore
docs.append(Document(content=table_summary, metadata=metadata))
return docs
@classmethod

View File

@@ -1,5 +1,5 @@
"""Docx Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
import docx
@@ -21,6 +21,7 @@ class DocxKnowledge(Knowledge):
knowledge_type: Any = KnowledgeType.DOCUMENT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Docx Knowledge with Knowledge arguments.
@@ -31,9 +32,13 @@ class DocxKnowledge(Knowledge):
encoding(str, optional): docx file encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._encoding = encoding
def _load(self) -> List[Document]:
@@ -48,9 +53,10 @@ class DocxKnowledge(Knowledge):
para = doc.paragraphs[i]
text = para.text
content.append(text)
docs.append(
Document(content="\n".join(content), metadata={"source": self._path})
)
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
docs.append(Document(content="\n".join(content), metadata=metadata))
return docs
return [Document.langchain2doc(lc_document) for lc_document in documents]

View File

@@ -1,5 +1,5 @@
"""Knowledge Factory to create knowledge from file path and url."""
from typing import List, Optional, Type
from typing import Dict, List, Optional, Type, Union
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
from dbgpt.rag.knowledge.string import StringKnowledge
@@ -13,6 +13,7 @@ class KnowledgeFactory:
self,
file_path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
):
"""Create Knowledge Factory with file path and knowledge type.
@@ -22,18 +23,21 @@ class KnowledgeFactory:
"""
self._file_path = file_path
self._knowledge_type = knowledge_type
self._metadata = metadata
@classmethod
def create(
cls,
datasource: str = "",
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
):
"""Create knowledge from file path, url or text.
Args:
datasource: path of the file to convert
knowledge_type: type of knowledge
metadata(Optional[Dict[str, Union[str, List[str]]]]): metadata properties to attach to the loaded documents
Examples:
.. code-block:: python
@@ -52,12 +56,16 @@ class KnowledgeFactory:
match knowledge_type:
case KnowledgeType.DOCUMENT:
return cls.from_file_path(
file_path=datasource, knowledge_type=knowledge_type
file_path=datasource,
knowledge_type=knowledge_type,
metadata=metadata,
)
case KnowledgeType.URL:
return cls.from_url(url=datasource, knowledge_type=knowledge_type)
case KnowledgeType.TEXT:
return cls.from_text(text=datasource, knowledge_type=knowledge_type)
return cls.from_text(
text=datasource, knowledge_type=knowledge_type, metadata=metadata
)
case _:
raise Exception(f"Unsupported knowledge type '{knowledge_type}'")
@@ -66,6 +74,7 @@ class KnowledgeFactory:
cls,
file_path: str = "",
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
) -> Knowledge:
"""Create knowledge from path.
@@ -82,10 +91,11 @@ class KnowledgeFactory:
datasource="path/to/document.pdf",
knowledge_type=KnowledgeType.DOCUMENT,
)
"""
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
return factory._select_document_knowledge(
file_path=file_path, knowledge_type=knowledge_type
file_path=file_path, knowledge_type=knowledge_type, metadata=metadata
)
@staticmethod
@@ -117,6 +127,7 @@ class KnowledgeFactory:
def from_text(
text: str = "",
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
) -> Knowledge:
"""Create knowledge from text.
@@ -127,6 +138,7 @@ class KnowledgeFactory:
return StringKnowledge(
text=text,
knowledge_type=knowledge_type,
metadata=metadata,
)
def _select_document_knowledge(self, **kwargs):

View File

@@ -1,5 +1,5 @@
"""HTML Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
import chardet
@@ -20,6 +20,7 @@ class HTMLKnowledge(Knowledge):
file_path: Optional[str] = None,
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create HTML Knowledge with Knowledge arguments.
@@ -29,9 +30,13 @@ class HTMLKnowledge(Knowledge):
knowledge_type(KnowledgeType, optional): knowledge type
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
def _load(self) -> List[Document]:
"""Load html document from loader."""
@@ -48,6 +53,8 @@ class HTMLKnowledge(Knowledge):
else:
text = raw_text.decode(result["encoding"])
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
return [Document(content=text, metadata=metadata)]
return [Document.langchain2doc(lc_document) for lc_document in documents]

View File

@@ -1,5 +1,5 @@
"""Markdown Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
@@ -19,6 +19,7 @@ class MarkdownKnowledge(Knowledge):
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Markdown Knowledge with Knowledge arguments.
@@ -29,9 +30,13 @@ class MarkdownKnowledge(Knowledge):
encoding(str, optional): markdown file encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._encoding = encoding
def _load(self) -> List[Document]:
@@ -44,6 +49,8 @@ class MarkdownKnowledge(Knowledge):
with open(self._path, encoding=self._encoding, errors="ignore") as f:
markdown_text = f.read()
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
documents = [Document(content=markdown_text, metadata=metadata)]
return documents
return [Document.langchain2doc(lc_document) for lc_document in documents]

View File

@@ -1,5 +1,5 @@
"""PDF Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
@@ -19,6 +19,7 @@ class PDFKnowledge(Knowledge):
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
language: Optional[str] = "zh",
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create PDF Knowledge with Knowledge arguments.
@@ -29,9 +30,13 @@ class PDFKnowledge(Knowledge):
loader(Any, optional): loader
language(str, optional): language
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._language = language
def _load(self) -> List[Document]:
@@ -65,6 +70,8 @@ class PDFKnowledge(Knowledge):
page = "\n".join(cleaned_lines)
# cleaned_pages.append(page)
metadata = {"source": self._path, "page": page_num}
if self._metadata:
metadata.update(self._metadata) # type: ignore
# text = "\f".join(cleaned_pages)
document = Document(content=page, metadata=metadata)
documents.append(document)

View File

@@ -1,5 +1,5 @@
"""PPTX Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
@@ -19,6 +19,7 @@ class PPTXKnowledge(Knowledge):
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
language: Optional[str] = "zh",
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create PPTX knowledge with PDF Knowledge arguments.
@@ -28,9 +29,13 @@ class PPTXKnowledge(Knowledge):
knowledge_type:(KnowledgeType) knowledge type
loader:(Optional[Any]) loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._language = language
def _load(self) -> List[Document]:
@@ -47,9 +52,10 @@ class PPTXKnowledge(Knowledge):
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text:
content += shape.text
docs.append(
Document(content=content, metadata={"source": slide.slide_id})
)
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
docs.append(Document(content=content, metadata=metadata))
return docs
return [Document.langchain2doc(lc_document) for lc_document in documents]

View File

@@ -1,5 +1,5 @@
"""String Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
from dbgpt.core import Document
from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
@@ -14,6 +14,7 @@ class StringKnowledge(Knowledge):
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create String knowledge parameters.
@@ -24,14 +25,20 @@ class StringKnowledge(Knowledge):
encoding(str): encoding
loader(Any): loader
"""
super().__init__(
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._text = text
self._type = knowledge_type
self._loader = loader
self._encoding = encoding
def _load(self) -> List[Document]:
"""Load raw text from loader."""
metadata = {"source": "raw text"}
if self._metadata:
metadata.update(self._metadata) # type: ignore
docs = [Document(content=self._text, metadata=metadata)]
return docs

View File

@@ -1,5 +1,5 @@
"""TXT Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union
import chardet
@@ -20,6 +20,7 @@ class TXTKnowledge(Knowledge):
file_path: Optional[str] = None,
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create TXT Knowledge with Knowledge arguments.
@@ -29,9 +30,13 @@ class TXTKnowledge(Knowledge):
knowledge_type(KnowledgeType, optional): knowledge type
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
def _load(self) -> List[Document]:
"""Load txt document from loader."""
@@ -48,6 +53,8 @@ class TXTKnowledge(Knowledge):
else:
text = raw_text.decode(result["encoding"])
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
return [Document(content=text, metadata=metadata)]
return [Document.langchain2doc(lc_document) for lc_document in documents]

View File

@@ -26,9 +26,9 @@ class URLKnowledge(Knowledge):
encoding(str, optional): encoding of the fetched url content
loader(Any, optional): loader
"""
self._path = url or None
self._type = knowledge_type
self._loader = loader
super().__init__(
path=url, knowledge_type=knowledge_type, loader=loader, **kwargs
)
self._encoding = encoding
self._source_column = source_column