Use chunk_size variable in logs. Make vectorstore check more flexible

This commit is contained in:
Iván Martínez 2023-05-20 12:02:40 +02:00
parent fca1128fba
commit 4a0e0d2e70

View File

@ -37,6 +37,8 @@ embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
chunk_size = 500
chunk_overlap = 50
# Custom document loaders
class MyElmLoader(UnstructuredEmailLoader):
"""Wrapper to fallback to text/plain when default does not work"""
@ -79,8 +81,6 @@ LOADER_MAPPING = {
}
load_dotenv()
def load_single_document(file_path: str) -> Document:
ext = "." + file_path.rsplit(".", 1)[-1]
if ext in LOADER_MAPPING:
@ -123,7 +123,7 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
print(f"Loaded {len(documents)} new documents from {source_directory}")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)
print(f"Split into {len(texts)} chunks of text (max. 500 tokens each)")
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
return texts
def does_vectorstore_exist(persist_directory: str) -> bool:
@ -134,7 +134,8 @@ def does_vectorstore_exist(persist_directory: str) -> bool:
if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
if len(list_index_files) == 4:
# A working vectorstore must have more than 3 index files (.bin/.pkl) in its index directory
if len(list_index_files) > 3:
return True
return False