Merge pull request #999 from imartinez/990-cannot-submit-more-than-166-embeddings-at-once-while-ingesting

Batch embeddings to be processed by chromadb
This commit is contained in:
Iván Martínez 2023-09-25 11:59:19 +02:00 committed by GitHub
commit 0b5a6687e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 83 additions and 21 deletions

3
.gitignore vendored
View File

@ -7,6 +7,7 @@ models/
# Local Chroma db
.chroma/
db/
persist_directory/chroma.sqlite
# Byte-compiled / optimized / DLL files
__pycache__/
@ -169,5 +170,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# vscode
.vscode/launch.json
persist_directory/chroma.sqlite3

View File

@ -31,6 +31,7 @@ if not load_dotenv():
from typing import Iterator
import chromadb
from chromadb.api.segment import API
from constants import CHROMA_SETTINGS
# Load environment variables
persist_directory = os.environ.get('PERSIST_DIRECTORY')
@ -126,9 +127,19 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
exit(0)
print(f"Loaded {len(documents)} new documents from {source_directory}")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
return texts
documents = text_splitter.split_documents(documents)
print(f"Split into {len(documents)} chunks of text (max. {chunk_size} tokens each)")
return documents
def batch_chromadb_insertions(chroma_client: API, documents: List[Document]) -> Iterator[List[Document]]:
    """
    Lazily split the documents to be inserted into batches that the local chroma client can process.

    The batch size is read from ``chroma_client.max_batch_size``. Every yielded batch is a
    slice of ``documents`` holding at most that many items; the last batch may be shorter.
    An empty ``documents`` list yields no batches at all.

    Because this is a generator, nothing runs until the first batch is requested.

    Raises:
        ValueError: on first iteration, if the client reports a non-positive batch size.
    """
    max_batch_size = chroma_client.max_batch_size
    if max_batch_size <= 0:
        # A zero step makes range() fail with an unrelated message, and a negative
        # step would yield nothing, silently dropping every document.
        raise ValueError(f"Invalid chroma max_batch_size: {max_batch_size}")
    for start in range(0, len(documents), max_batch_size):
        yield documents[start:start + max_batch_size]
def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
"""
@ -150,17 +161,22 @@ def main():
print(f"Appending to existing vectorstore at {persist_directory}")
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
collection = db.get()
texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
documents = process_documents([metadata['source'] for metadata in collection['metadatas']])
print(f"Creating embeddings. May take some minutes...")
db.add_documents(texts)
for batched_chromadb_insertion in batch_chromadb_insertions(chroma_client, documents):
db.add_documents(batched_chromadb_insertion)
else:
# Create and store locally vectorstore
print("Creating new vectorstore")
texts = process_documents()
documents = process_documents()
print(f"Creating embeddings. May take some minutes...")
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
db.persist()
db = None
# Create the db with the first batch of documents to insert
batched_chromadb_insertions = batch_chromadb_insertions(chroma_client, documents)
first_insertion = next(batched_chromadb_insertions)
db = Chroma.from_documents(first_insertion, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
# Add the rest of batches of documents
for batched_chromadb_insertion in batched_chromadb_insertions:
db.add_documents(batched_chromadb_insertion)
print(f"Ingestion complete! You can now run privateGPT.py to query your documents")

63
poetry.lock generated
View File

@ -419,13 +419,36 @@ files = [
[[package]]
name = "chroma-hnswlib"
version = "0.7.2"
version = "0.7.3"
description = "Chromas fork of hnswlib"
optional = false
python-versions = "*"
files = [
{file = "chroma-hnswlib-0.7.2.tar.gz", hash = "sha256:87c6a0ced9e52ac7c8ca01ded25bb70c4a7f63f5871181eb18bea9111ce786c4"},
{file = "chroma_hnswlib-0.7.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:3ffbb542cada959771ae4b8394f8cee1ef76bd17950adb592531433e912377db"},
{file = "chroma-hnswlib-0.7.3.tar.gz", hash = "sha256:b6137bedde49fffda6af93b0297fe00429fc61e5a072b1ed9377f909ed95a932"},
{file = "chroma_hnswlib-0.7.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:59d6a7c6f863c67aeb23e79a64001d537060b6995c3eca9a06e349ff7b0998ca"},
{file = "chroma_hnswlib-0.7.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d71a3f4f232f537b6152947006bd32bc1629a8686df22fd97777b70f416c127a"},
{file = "chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c92dc1ebe062188e53970ba13f6b07e0ae32e64c9770eb7f7ffa83f149d4210"},
{file = "chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49da700a6656fed8753f68d44b8cc8ae46efc99fc8a22a6d970dc1697f49b403"},
{file = "chroma_hnswlib-0.7.3-cp310-cp310-win_amd64.whl", hash = "sha256:108bc4c293d819b56476d8f7865803cb03afd6ca128a2a04d678fffc139af029"},
{file = "chroma_hnswlib-0.7.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:11e7ca93fb8192214ac2b9c0943641ac0daf8f9d4591bb7b73be808a83835667"},
{file = "chroma_hnswlib-0.7.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6f552e4d23edc06cdeb553cdc757d2fe190cdeb10d43093d6a3319f8d4bf1c6b"},
{file = "chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f96f4d5699e486eb1fb95849fe35ab79ab0901265805be7e60f4eaa83ce263ec"},
{file = "chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:368e57fe9ebae05ee5844840fa588028a023d1182b0cfdb1d13f607c9ea05756"},
{file = "chroma_hnswlib-0.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:b7dca27b8896b494456db0fd705b689ac6b73af78e186eb6a42fea2de4f71c6f"},
{file = "chroma_hnswlib-0.7.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:70f897dc6218afa1d99f43a9ad5eb82f392df31f57ff514ccf4eeadecd62f544"},
{file = "chroma_hnswlib-0.7.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5aef10b4952708f5a1381c124a29aead0c356f8d7d6e0b520b778aaa62a356f4"},
{file = "chroma_hnswlib-0.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee2d8d1529fca3898d512079144ec3e28a81d9c17e15e0ea4665697a7923253"},
{file = "chroma_hnswlib-0.7.3-cp37-cp37m-win_amd64.whl", hash = "sha256:a4021a70e898783cd6f26e00008b494c6249a7babe8774e90ce4766dd288c8ba"},
{file = "chroma_hnswlib-0.7.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a8f61fa1d417fda848e3ba06c07671f14806a2585272b175ba47501b066fe6b1"},
{file = "chroma_hnswlib-0.7.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d7563be58bc98e8f0866907368e22ae218d6060601b79c42f59af4eccbbd2e0a"},
{file = "chroma_hnswlib-0.7.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51b8d411486ee70d7b66ec08cc8b9b6620116b650df9c19076d2d8b6ce2ae914"},
{file = "chroma_hnswlib-0.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d706782b628e4f43f1b8a81e9120ac486837fbd9bcb8ced70fe0d9b95c72d77"},
{file = "chroma_hnswlib-0.7.3-cp38-cp38-win_amd64.whl", hash = "sha256:54f053dedc0e3ba657f05fec6e73dd541bc5db5b09aa8bc146466ffb734bdc86"},
{file = "chroma_hnswlib-0.7.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e607c5a71c610a73167a517062d302c0827ccdd6e259af6e4869a5c1306ffb5d"},
{file = "chroma_hnswlib-0.7.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c2358a795870156af6761890f9eb5ca8cade57eb10c5f046fe94dae1faa04b9e"},
{file = "chroma_hnswlib-0.7.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cea425df2e6b8a5e201fff0d922a1cc1d165b3cfe762b1408075723c8892218"},
{file = "chroma_hnswlib-0.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:454df3dd3e97aa784fba7cf888ad191e0087eef0fd8c70daf28b753b3b591170"},
{file = "chroma_hnswlib-0.7.3-cp39-cp39-win_amd64.whl", hash = "sha256:df587d15007ca701c6de0ee7d5585dd5e976b7edd2b30ac72bc376b3c3f85882"},
]
[package.dependencies]
@ -433,21 +456,21 @@ numpy = "*"
[[package]]
name = "chromadb"
version = "0.4.7"
version = "0.4.12"
description = "Chroma."
optional = false
python-versions = ">=3.7"
files = [
{file = "chromadb-0.4.7-py3-none-any.whl", hash = "sha256:e928406410efdd1e5550cb456a3f4c40774aec1efcd95011389483fa0ae3c472"},
{file = "chromadb-0.4.7.tar.gz", hash = "sha256:7282aab8fd7cf81f0bf55f5a056bdc3aca15bf56a37b711ec53fab1440b5e6f7"},
{file = "chromadb-0.4.12-py3-none-any.whl", hash = "sha256:2a9d99945c25049ce8b8d2896ef296909f42ba2f5dca983a496adae0a0deb64a"},
{file = "chromadb-0.4.12.tar.gz", hash = "sha256:430585725e1f2f43f51ef3d0d7a41d99d0cdc4635264e75aaf1e303ab48ae616"},
]
[package.dependencies]
bcrypt = ">=4.0.1"
chroma-hnswlib = "0.7.2"
chroma-hnswlib = "0.7.3"
fastapi = ">=0.95.2,<0.100.0"
importlib-resources = "*"
numpy = ">=1.21.6"
numpy = {version = ">=1.22.5", markers = "python_version >= \"3.8\""}
onnxruntime = ">=1.14.1"
overrides = ">=7.3.1"
posthog = ">=2.4.0"
@ -457,6 +480,7 @@ pypika = ">=0.48.9"
requests = ">=2.28"
tokenizers = ">=0.13.2"
tqdm = ">=4.65.0"
typer = ">=0.9.0"
typing-extensions = ">=4.5.0"
uvicorn = {version = ">=0.18.3", extras = ["standard"]}
@ -3038,6 +3062,27 @@ torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata",
video = ["av (==9.2.0)", "decord (==0.6.0)"]
vision = ["Pillow (<10.0.0)"]
[[package]]
name = "typer"
version = "0.9.0"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
optional = false
python-versions = ">=3.6"
files = [
{file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"},
{file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"},
]
[package.dependencies]
click = ">=7.1.1,<9.0.0"
typing-extensions = ">=3.7.4.3"
[package.extras]
all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
[[package]]
name = "typing-extensions"
version = "4.7.1"
@ -3455,4 +3500,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "9772f4040d3a2152ec06db1ec709509e0f05815c2ddc3cba9ed974ce183e2691"
content-hash = "111b08c8b4a98f2efb0ad223dab9777c171cea626211aa7efef03a4e4605bc08"

View File

@ -10,7 +10,7 @@ readme = "README.md"
python = "^3.10"
langchain = "0.0.274"
gpt4all = "1.0.8"
chromadb = "0.4.7"
chromadb = "0.4.12"
llama-cpp-python = "0.1.81"
urllib3 = "2.0.4"
PyMuPDF = "1.23.1"

View File

@ -1,6 +1,6 @@
langchain==0.0.274
gpt4all==1.0.8
chromadb==0.4.7
chromadb==0.4.12
llama-cpp-python==0.1.81
urllib3==2.0.4
PyMuPDF==1.23.1