mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-06-28 16:26:56 +00:00
Use chunk_size variable in logs. Make vectorstore check more flexible
This commit is contained in:
parent
fca1128fba
commit
4a0e0d2e70
@ -37,6 +37,8 @@ embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
|
||||
chunk_size = 500
|
||||
chunk_overlap = 50
|
||||
|
||||
|
||||
# Custom document loaders
|
||||
class MyElmLoader(UnstructuredEmailLoader):
|
||||
"""Wrapper to fallback to text/plain when default does not work"""
|
||||
|
||||
@ -79,8 +81,6 @@ LOADER_MAPPING = {
|
||||
}
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
def load_single_document(file_path: str) -> Document:
|
||||
ext = "." + file_path.rsplit(".", 1)[-1]
|
||||
if ext in LOADER_MAPPING:
|
||||
@ -123,7 +123,7 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
|
||||
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
texts = text_splitter.split_documents(documents)
|
||||
print(f"Split into {len(texts)} chunks of text (max. 500 tokens each)")
|
||||
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
||||
return texts
|
||||
|
||||
def does_vectorstore_exist(persist_directory: str) -> bool:
|
||||
@ -134,7 +134,8 @@ def does_vectorstore_exist(persist_directory: str) -> bool:
|
||||
if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
|
||||
list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
|
||||
list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
|
||||
if len(list_index_files) == 4:
|
||||
# At least 3 documents are needed in a working vectorstore
|
||||
if len(list_index_files) > 3:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user