Use chunk_size variable in logs. Make vectorstore check more flexible

2025-06-28 16:26:56 +00:00 · 2023-05-20 12:02:40 +02:00 · 2023-05-20 12:02:40 +02:00 · 4a0e0d2e70
commit 4a0e0d2e70
parent fca1128fba
1 changed file with 6 additions and 5 deletions
--- a/ingest.py
+++ b/ingest.py
@@ -37,6 +37,8 @@ embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
 chunk_size = 500
 chunk_overlap = 50

+
+# Custom document loaders
 class MyElmLoader(UnstructuredEmailLoader):
    """Wrapper to fallback to text/plain when default does not work"""

@@ -79,8 +81,6 @@ LOADER_MAPPING = {
 }


-load_dotenv()
-
 def load_single_document(file_path: str) -> Document:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
@@ -123,7 +123,7 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
-    print(f"Split into {len(texts)} chunks of text (max. 500 tokens each)")
+    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts

 def does_vectorstore_exist(persist_directory: str) -> bool:
@@ -134,7 +134,8 @@ def does_vectorstore_exist(persist_directory: str) -> bool:
        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
-            if len(list_index_files) == 4:
+            # Treat the vectorstore as working when index/ holds more than 3 .bin/.pkl files
+            if len(list_index_files) > 3:
                return True
    return False