mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-06-28 16:26:56 +00:00
Use chunk_size variable in logs. Make vectorstore check more flexible
This commit is contained in:
parent
fca1128fba
commit
4a0e0d2e70
11
ingest.py
11
ingest.py
@ -37,6 +37,8 @@ embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
|
|||||||
chunk_size = 500
|
chunk_size = 500
|
||||||
chunk_overlap = 50
|
chunk_overlap = 50
|
||||||
|
|
||||||
|
|
||||||
|
# Custom document loaders
|
||||||
class MyElmLoader(UnstructuredEmailLoader):
|
class MyElmLoader(UnstructuredEmailLoader):
|
||||||
"""Wrapper to fall back to text/plain when default does not work"""
|
"""Wrapper to fall back to text/plain when default does not work"""
|
||||||
|
|
||||||
@ -79,8 +81,6 @@ LOADER_MAPPING = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
def load_single_document(file_path: str) -> Document:
|
def load_single_document(file_path: str) -> Document:
|
||||||
ext = "." + file_path.rsplit(".", 1)[-1]
|
ext = "." + file_path.rsplit(".", 1)[-1]
|
||||||
if ext in LOADER_MAPPING:
|
if ext in LOADER_MAPPING:
|
||||||
@ -123,7 +123,7 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
|
|||||||
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
||||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||||
texts = text_splitter.split_documents(documents)
|
texts = text_splitter.split_documents(documents)
|
||||||
print(f"Split into {len(texts)} chunks of text (max. 500 tokens each)")
|
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
||||||
return texts
|
return texts
|
||||||
|
|
||||||
def does_vectorstore_exist(persist_directory: str) -> bool:
|
def does_vectorstore_exist(persist_directory: str) -> bool:
|
||||||
@ -131,10 +131,11 @@ def does_vectorstore_exist(persist_directory: str) -> bool:
|
|||||||
Checks if vectorstore exists
|
Checks if vectorstore exists
|
||||||
"""
|
"""
|
||||||
if os.path.exists(os.path.join(persist_directory, 'index')):
|
if os.path.exists(os.path.join(persist_directory, 'index')):
|
||||||
if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
|
if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
|
||||||
list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
|
list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
|
||||||
list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
|
list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
|
||||||
if len(list_index_files) == 4:
|
# A working vectorstore has more than 3 index files (.bin/.pkl) in its index directory
|
||||||
|
if len(list_index_files) > 3:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user