diff --git a/examples/knowledge_embedding/csv_embedding_test.py b/examples/knowledge_embedding/csv_embedding_test.py index 3f08422f7..dcf4873b2 100644 --- a/examples/knowledge_embedding/csv_embedding_test.py +++ b/examples/knowledge_embedding/csv_embedding_test.py @@ -1,4 +1,4 @@ -from pilot.source_embedding.csv_embedding import CSVEmbedding +from pilot.embedding_engine.csv_embedding import CSVEmbedding # path = "/Users/chenketing/Downloads/share_ireserve双写数据异常2.xlsx" path = "xx.csv" diff --git a/examples/knowledge_embedding/pdf_embedding_test.py b/examples/knowledge_embedding/pdf_embedding_test.py index 660b811ee..ef0e1d87e 100644 --- a/examples/knowledge_embedding/pdf_embedding_test.py +++ b/examples/knowledge_embedding/pdf_embedding_test.py @@ -1,4 +1,4 @@ -from pilot.source_embedding.pdf_embedding import PDFEmbedding +from pilot.embedding_engine.pdf_embedding import PDFEmbedding path = "xxx.pdf" path = "your_path/OceanBase-数据库-V4.1.0-应用开发.pdf" diff --git a/examples/knowledge_embedding/url_embedding_test.py b/examples/knowledge_embedding/url_embedding_test.py index aeb353c89..c702fd1f7 100644 --- a/examples/knowledge_embedding/url_embedding_test.py +++ b/examples/knowledge_embedding/url_embedding_test.py @@ -1,4 +1,4 @@ -from pilot.source_embedding.url_embedding import URLEmbedding +from pilot.embedding_engine.url_embedding import URLEmbedding path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023" model_name = "your_path/all-MiniLM-L6-v2" diff --git a/pilot/__init__.py b/pilot/__init__.py index f44b2e809..86aa3585f 100644 --- a/pilot/__init__.py +++ b/pilot/__init__.py @@ -1,3 +1,3 @@ -from pilot.source_embedding import SourceEmbedding, register +from pilot.embedding_engine import SourceEmbedding, register __all__ = ["SourceEmbedding", "register"] diff --git a/pilot/source_embedding/EncodeTextLoader.py b/pilot/embedding_engine/EncodeTextLoader.py similarity index 100% rename from pilot/source_embedding/EncodeTextLoader.py rename to pilot/embedding_engine/EncodeTextLoader.py diff --git a/pilot/embedding_engine/__init__.py b/pilot/embedding_engine/__init__.py new file mode 100644 index 000000000..ac54efd20 --- /dev/null +++ b/pilot/embedding_engine/__init__.py @@ -0,0 +1,3 @@ +from pilot.embedding_engine.source_embedding import SourceEmbedding, register + +__all__ = ["SourceEmbedding", "register"] diff --git a/pilot/source_embedding/chn_document_splitter.py b/pilot/embedding_engine/chn_document_splitter.py similarity index 100% rename from pilot/source_embedding/chn_document_splitter.py rename to pilot/embedding_engine/chn_document_splitter.py diff --git a/pilot/source_embedding/csv_embedding.py b/pilot/embedding_engine/csv_embedding.py similarity index 94% rename from pilot/source_embedding/csv_embedding.py rename to pilot/embedding_engine/csv_embedding.py index 0e69574b4..0e0aa54ec 100644 --- a/pilot/source_embedding/csv_embedding.py +++ b/pilot/embedding_engine/csv_embedding.py @@ -3,7 +3,7 @@ from typing import Dict, List, Optional from langchain.document_loaders import CSVLoader from langchain.schema import Document -from pilot.source_embedding import SourceEmbedding, register +from pilot.embedding_engine import SourceEmbedding, register class CSVEmbedding(SourceEmbedding): diff --git a/pilot/source_embedding/external/__init__.py b/pilot/embedding_engine/external/__init__.py similarity index 100% rename from pilot/source_embedding/external/__init__.py rename to pilot/embedding_engine/external/__init__.py diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/embedding_engine/knowledge_embedding.py similarity index 88% rename from pilot/source_embedding/knowledge_embedding.py rename to pilot/embedding_engine/knowledge_embedding.py index 97b515897..2d7780510 100644 --- a/pilot/source_embedding/knowledge_embedding.py +++ b/pilot/embedding_engine/knowledge_embedding.py @@ -4,12 +4,12 @@ from chromadb.errors import NotEnoughElementsException from langchain.embeddings import HuggingFaceEmbeddings from pilot.configs.config import Config -from pilot.source_embedding.csv_embedding import CSVEmbedding -from pilot.source_embedding.markdown_embedding import MarkdownEmbedding -from pilot.source_embedding.pdf_embedding import PDFEmbedding -from pilot.source_embedding.ppt_embedding import PPTEmbedding -from pilot.source_embedding.url_embedding import URLEmbedding -from pilot.source_embedding.word_embedding import WordEmbedding +from pilot.embedding_engine.csv_embedding import CSVEmbedding +from pilot.embedding_engine.markdown_embedding import MarkdownEmbedding +from pilot.embedding_engine.pdf_embedding import PDFEmbedding +from pilot.embedding_engine.ppt_embedding import PPTEmbedding +from pilot.embedding_engine.url_embedding import URLEmbedding +from pilot.embedding_engine.word_embedding import WordEmbedding from pilot.vector_store.connector import VectorStoreConnector CFG = Config() diff --git a/pilot/source_embedding/markdown_embedding.py b/pilot/embedding_engine/markdown_embedding.py similarity index 88% rename from pilot/source_embedding/markdown_embedding.py rename to pilot/embedding_engine/markdown_embedding.py index d8caee959..e9a97dce9 100644 --- a/pilot/source_embedding/markdown_embedding.py +++ b/pilot/embedding_engine/markdown_embedding.py @@ -9,9 +9,9 @@ from langchain.schema import Document from langchain.text_splitter import SpacyTextSplitter from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register -from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader -from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter +from pilot.embedding_engine import SourceEmbedding, register +from pilot.embedding_engine.EncodeTextLoader import EncodeTextLoader +from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter CFG = Config() diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/embedding_engine/pdf_embedding.py similarity index 95% rename from pilot/source_embedding/pdf_embedding.py rename to pilot/embedding_engine/pdf_embedding.py index dd8c39c03..ea4276460 100644 --- a/pilot/source_embedding/pdf_embedding.py +++ b/pilot/embedding_engine/pdf_embedding.py @@ -7,7 +7,7 @@ from langchain.schema import Document from langchain.text_splitter import SpacyTextSplitter from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register +from pilot.embedding_engine import SourceEmbedding, register CFG = Config() diff --git a/pilot/source_embedding/pdf_loader.py b/pilot/embedding_engine/pdf_loader.py similarity index 100% rename from pilot/source_embedding/pdf_loader.py rename to pilot/embedding_engine/pdf_loader.py diff --git a/pilot/source_embedding/ppt_embedding.py b/pilot/embedding_engine/ppt_embedding.py similarity index 95% rename from pilot/source_embedding/ppt_embedding.py rename to pilot/embedding_engine/ppt_embedding.py index 583b29ed1..485083d1c 100644 --- a/pilot/source_embedding/ppt_embedding.py +++ b/pilot/embedding_engine/ppt_embedding.py @@ -7,7 +7,7 @@ from langchain.schema import Document from langchain.text_splitter import SpacyTextSplitter from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register +from pilot.embedding_engine import SourceEmbedding, register CFG = Config() diff --git a/pilot/source_embedding/search_milvus.py b/pilot/embedding_engine/search_milvus.py similarity index 100% rename from pilot/source_embedding/search_milvus.py rename to pilot/embedding_engine/search_milvus.py diff --git a/pilot/source_embedding/source_embedding.py b/pilot/embedding_engine/source_embedding.py similarity index 100% rename from pilot/source_embedding/source_embedding.py rename to pilot/embedding_engine/source_embedding.py diff --git a/pilot/source_embedding/string_embedding.py b/pilot/embedding_engine/string_embedding.py similarity index 100% rename from pilot/source_embedding/string_embedding.py rename to pilot/embedding_engine/string_embedding.py diff --git a/pilot/source_embedding/url_embedding.py b/pilot/embedding_engine/url_embedding.py similarity index 92% rename from pilot/source_embedding/url_embedding.py rename to pilot/embedding_engine/url_embedding.py index a315e6e45..113e2985e 100644 --- a/pilot/source_embedding/url_embedding.py +++ b/pilot/embedding_engine/url_embedding.py @@ -7,8 +7,8 @@ from langchain.text_splitter import CharacterTextSplitter from pilot.configs.config import Config from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE -from pilot.source_embedding import SourceEmbedding, register -from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter +from pilot.embedding_engine import SourceEmbedding, register +from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter CFG = Config() diff --git a/pilot/source_embedding/word_embedding.py b/pilot/embedding_engine/word_embedding.py similarity index 90% rename from pilot/source_embedding/word_embedding.py rename to pilot/embedding_engine/word_embedding.py index 1f30f241c..34fc48450 100644 --- a/pilot/source_embedding/word_embedding.py +++ b/pilot/embedding_engine/word_embedding.py @@ -6,8 +6,8 @@ from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoad from langchain.schema import Document from pilot.configs.config import Config -from pilot.source_embedding import SourceEmbedding, register -from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter +from pilot.embedding_engine import SourceEmbedding, register +from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter CFG = Config() diff --git a/pilot/model/cache/__init__.py b/pilot/model/cache/__init__.py new file mode 100644 index 000000000..6a2d6fc7e --- /dev/null +++ b/pilot/model/cache/__init__.py @@ -0,0 +1,4 @@ +from .base import Cache +from .disk_cache import DiskCache +from .memory_cache import InMemoryCache +from .gpt_cache import GPTCache \ No newline at end of file diff --git a/pilot/model/cache/base.py b/pilot/model/cache/base.py new file mode 100644 index 000000000..d8c3d5851 --- /dev/null +++ b/pilot/model/cache/base.py @@ -0,0 +1,27 @@ +import json +import hashlib +from typing import Any, Dict +from abc import ABC, abstractmethod + +class Cache(ABC): + + def create(self, key: str) -> bool: + pass + + def clear(self): + pass + + @abstractmethod + def __getitem__(self, key: str) -> str: + """get an item from the cache or throw key error""" + pass + + @abstractmethod + def __setitem__(self, key: str, value: str) -> None: + """set an item in the cache""" + pass + + @abstractmethod + def __contains__(self, key: str) -> bool: + """see if we can return a cached value for the passed key""" + pass \ No newline at end of file diff --git a/pilot/model/cache/disk_cache.py b/pilot/model/cache/disk_cache.py new file mode 100644 index 000000000..c461a37ae --- /dev/null +++ b/pilot/model/cache/disk_cache.py @@ -0,0 +1,27 @@ +import os +import diskcache +import platformdirs +from pilot.model.cache import Cache + +class DiskCache(Cache): + """DiskCache is a cache that uses diskcache lib. + https://github.com/grantjenks/python-diskcache + """ + def __init__(self, llm_name: str): + self._diskcache = diskcache.Cache( + os.path.join( + platformdirs.user_cache_dir("dbgpt"), f"_{llm_name}.diskcache" + ) + ) + + def __getitem__(self, key: str) -> str: + return self._diskcache[key] + + def __setitem__(self, key: str, value: str) -> None: + self._diskcache[key] = value + + def __contains__(self, key: str) -> bool: + return key in self._diskcache + + def clear(self): + self._diskcache.clear() \ No newline at end of file diff --git a/pilot/model/cache/gpt_cache.py b/pilot/model/cache/gpt_cache.py new file mode 100644 index 000000000..0fc680510 --- /dev/null +++ b/pilot/model/cache/gpt_cache.py @@ -0,0 +1,44 @@ +import os +from typing import Dict, Any +import platformdirs + +from pilot.model.cache import Cache + +try: + from gptcache.adapter.api import get, put, init_similar_cache +except ImportError: + pass + +class GPTCache(Cache): + + """ + GPTCache is a semantic cache that uses + """ + + def __init__(self, cache) -> None: + """GPT Cache is a semantic cache that uses GPTCache lib.""" + + if isinstance(cache, str): + _cache = Cache() + init_similar_cache( + data_dir=os.path.join( + platformdirs.user_cache_dir("dbgpt"), f"_{cache}.gptcache" + ), + cache_obj=_cache + ) + else: + _cache = cache + + self._cache_obj = _cache + + def __getitem__(self, key: str) -> str: + return get(key) + + def __setitem__(self, key: str, value: str) -> None: + put(key, value) + + def __contains__(self, key: str) -> bool: + return get(key) is not None + + def create(self, llm: str, **kwargs: Dict[str, Any]) -> str: + pass \ No newline at end of file diff --git a/pilot/model/cache/memory_cache.py b/pilot/model/cache/memory_cache.py new file mode 100644 index 000000000..b5311a341 --- /dev/null +++ b/pilot/model/cache/memory_cache.py @@ -0,0 +1,24 @@ +from typing import Dict, Any +from pilot.model.cache import Cache + +class InMemoryCache(Cache): + + def __init__(self) -> None: + "Initialize that stores things in memory." + self._cache: Dict[str, Any] = {} + + def create(self, key: str) -> bool: + pass + + def clear(self): + return self._cache.clear() + + def __setitem__(self, key: str, value: str) -> None: + self._cache[key] = value + + def __getitem__(self, key: str) -> str: + return self._cache[key] + + def __contains__(self, key: str) -> bool: + return self._cache.get(key, None) is not None + diff --git a/pilot/model/proxy/__init__.py b/pilot/model/proxy/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pilot/openapi/__init__.py b/pilot/openapi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pilot/scene/chat_knowledge/custom/chat.py b/pilot/scene/chat_knowledge/custom/chat.py index 214bf1656..85d48a657 100644 --- a/pilot/scene/chat_knowledge/custom/chat.py +++ b/pilot/scene/chat_knowledge/custom/chat.py @@ -19,7 +19,7 @@ from pilot.configs.model_config import ( ) from pilot.scene.chat_knowledge.custom.prompt import prompt -from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding +from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding CFG = Config() diff --git a/pilot/scene/chat_knowledge/default/chat.py b/pilot/scene/chat_knowledge/default/chat.py index 6116deecd..838ff834c 100644 --- a/pilot/scene/chat_knowledge/default/chat.py +++ b/pilot/scene/chat_knowledge/default/chat.py @@ -19,7 +19,7 @@ from pilot.configs.model_config import ( ) from pilot.scene.chat_knowledge.default.prompt import prompt -from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding +from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding CFG = Config() diff --git a/pilot/scene/chat_knowledge/url/chat.py b/pilot/scene/chat_knowledge/url/chat.py index ce45602a2..57fb8b618 100644 --- a/pilot/scene/chat_knowledge/url/chat.py +++ b/pilot/scene/chat_knowledge/url/chat.py @@ -17,7 +17,7 @@ from pilot.configs.model_config import ( ) from pilot.scene.chat_knowledge.url.prompt import prompt -from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding +from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding CFG = Config() diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py index c7a033336..7cc32bbad 100644 --- a/pilot/server/webserver.py +++ b/pilot/server/webserver.py @@ -39,7 +39,7 @@ from pilot.common.plugins import scan_plugins, load_native_plugins from pilot.server.gradio_css import code_highlight_css from pilot.server.gradio_patch import Chatbot as grChatbot -from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding +from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding from pilot.utils import build_logger from pilot.vector_store.extract_tovec import ( get_vector_storelist, diff --git a/pilot/source_embedding/__init__.py b/pilot/source_embedding/__init__.py deleted file mode 100644 index 464ff11b1..000000000 --- a/pilot/source_embedding/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from pilot.source_embedding.source_embedding import SourceEmbedding, register - -__all__ = ["SourceEmbedding", "register"] diff --git a/pilot/summary/db_summary_client.py b/pilot/summary/db_summary_client.py index 5e551514b..2a15b55c5 100644 --- a/pilot/summary/db_summary_client.py +++ b/pilot/summary/db_summary_client.py @@ -7,8 +7,8 @@ from pilot.configs.config import Config from pilot.configs.model_config import LLM_MODEL_CONFIG from pilot.scene.base import ChatScene from pilot.scene.base_chat import BaseChat -from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding -from pilot.source_embedding.string_embedding import StringEmbedding +from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding +from pilot.embedding_engine.string_embedding import StringEmbedding from pilot.summary.mysql_db_summary import MysqlSummary from pilot.scene.chat_factory import ChatFactory diff --git a/requirements.txt b/requirements.txt index 469dbd29a..bcc4879d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -50,6 +50,7 @@ pymysql unstructured==0.6.3 grpcio==1.47.5 gpt4all==0.3.0 +diskcache==5.6.1 auto-gpt-plugin-template pymdown-extensions diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py index c9a0c5457..e72c13aeb 100644 --- a/tools/knowlege_init.py +++ b/tools/knowlege_init.py @@ -11,7 +11,7 @@ from pilot.configs.model_config import ( DATASETS_DIR, LLM_MODEL_CONFIG, ) -from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding +from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding CFG = Config()