diff --git a/.env.template b/.env.template
index ca3751f5c..3b7c8e887 100644
--- a/.env.template
+++ b/.env.template
@@ -55,6 +55,7 @@ EMBEDDING_MODEL=text2vec
 #EMBEDDING_MODEL=bge-large-zh
 KNOWLEDGE_CHUNK_SIZE=500
 KNOWLEDGE_SEARCH_TOP_SIZE=5
+#KNOWLEDGE_CHUNK_OVERLAP=50
 # Control whether to display the source document of knowledge on the front end.
 KNOWLEDGE_CHAT_SHOW_RELATIONS=False
 ## EMBEDDING_TOKENIZER - Tokenizer to use for chunking large inputs
diff --git a/pilot/configs/config.py b/pilot/configs/config.py
index a25462c5e..680f9f856 100644
--- a/pilot/configs/config.py
+++ b/pilot/configs/config.py
@@ -228,7 +228,7 @@ class Config(metaclass=Singleton):
         ### EMBEDDING Configuration
         self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec")
         self.KNOWLEDGE_CHUNK_SIZE = int(os.getenv("KNOWLEDGE_CHUNK_SIZE", 100))
-        self.KNOWLEDGE_CHUNK_OVERLAP = int(os.getenv("KNOWLEDGE_CHUNK_OVERLAP", 100))
+        self.KNOWLEDGE_CHUNK_OVERLAP = int(os.getenv("KNOWLEDGE_CHUNK_OVERLAP", 50))
         self.KNOWLEDGE_SEARCH_TOP_SIZE = int(os.getenv("KNOWLEDGE_SEARCH_TOP_SIZE", 5))
         self.KNOWLEDGE_SEARCH_MAX_TOKEN = int(
             os.getenv("KNOWLEDGE_SEARCH_MAX_TOKEN", 2000)