diff --git a/.gitignore b/.gitignore index 1f7d3a10..eb26736b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ models/ # Local Chroma db .chroma/ db/ +persist_directory/chroma.sqlite3 # Byte-compiled / optimized / DLL files __pycache__/ @@ -169,5 +170,5 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +# vscode .vscode/launch.json -persist_directory/chroma.sqlite3 diff --git a/ingest.py b/ingest.py index d7a66351..77e1040e 100755 --- a/ingest.py +++ b/ingest.py @@ -31,6 +31,7 @@ if not load_dotenv(): from constants import CHROMA_SETTINGS import chromadb +from chromadb.api.segment import API # Load environment variables persist_directory = os.environ.get('PERSIST_DIRECTORY') @@ -126,9 +127,19 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]: exit(0) print(f"Loaded {len(documents)} new documents from {source_directory}") text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) - texts = text_splitter.split_documents(documents) - print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)") - return texts + documents = text_splitter.split_documents(documents) + print(f"Split into {len(documents)} chunks of text (max. {chunk_size} tokens each)") + return documents + +def batch_chromadb_insertions(chroma_client: API, documents: List[Document]) -> List[Document]: + """ + Split the total documents to be inserted into batches of documents that the local chroma client can process + """ + # Get max batch size. 
+ max_batch_size = chroma_client.max_batch_size + for i in range(0, len(documents), max_batch_size): + yield documents[i:i + max_batch_size] + def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool: """ @@ -150,17 +161,22 @@ def main(): print(f"Appending to existing vectorstore at {persist_directory}") db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client) collection = db.get() - texts = process_documents([metadata['source'] for metadata in collection['metadatas']]) + documents = process_documents([metadata['source'] for metadata in collection['metadatas']]) print(f"Creating embeddings. May take some minutes...") - db.add_documents(texts) + for batched_chromadb_insertion in batch_chromadb_insertions(chroma_client, documents): + db.add_documents(batched_chromadb_insertion) else: # Create and store locally vectorstore print("Creating new vectorstore") - texts = process_documents() + documents = process_documents() print(f"Creating embeddings. May take some minutes...") - db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client) - db.persist() - db = None + # Create the db with the first batch of documents to insert + batched_chromadb_insertions = batch_chromadb_insertions(chroma_client, documents) + first_insertion = next(batched_chromadb_insertions) + db = Chroma.from_documents(first_insertion, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client) + # Add the rest of batches of documents + for batched_chromadb_insertion in batched_chromadb_insertions: + db.add_documents(batched_chromadb_insertion) print(f"Ingestion complete! 
You can now run privateGPT.py to query your documents") diff --git a/poetry.lock b/poetry.lock index a1c6edbb..4f3c9c6f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -419,13 +419,36 @@ files = [ [[package]] name = "chroma-hnswlib" -version = "0.7.2" +version = "0.7.3" description = "Chromas fork of hnswlib" optional = false python-versions = "*" files = [ - {file = "chroma-hnswlib-0.7.2.tar.gz", hash = "sha256:87c6a0ced9e52ac7c8ca01ded25bb70c4a7f63f5871181eb18bea9111ce786c4"}, - {file = "chroma_hnswlib-0.7.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:3ffbb542cada959771ae4b8394f8cee1ef76bd17950adb592531433e912377db"}, + {file = "chroma-hnswlib-0.7.3.tar.gz", hash = "sha256:b6137bedde49fffda6af93b0297fe00429fc61e5a072b1ed9377f909ed95a932"}, + {file = "chroma_hnswlib-0.7.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:59d6a7c6f863c67aeb23e79a64001d537060b6995c3eca9a06e349ff7b0998ca"}, + {file = "chroma_hnswlib-0.7.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d71a3f4f232f537b6152947006bd32bc1629a8686df22fd97777b70f416c127a"}, + {file = "chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c92dc1ebe062188e53970ba13f6b07e0ae32e64c9770eb7f7ffa83f149d4210"}, + {file = "chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49da700a6656fed8753f68d44b8cc8ae46efc99fc8a22a6d970dc1697f49b403"}, + {file = "chroma_hnswlib-0.7.3-cp310-cp310-win_amd64.whl", hash = "sha256:108bc4c293d819b56476d8f7865803cb03afd6ca128a2a04d678fffc139af029"}, + {file = "chroma_hnswlib-0.7.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:11e7ca93fb8192214ac2b9c0943641ac0daf8f9d4591bb7b73be808a83835667"}, + {file = "chroma_hnswlib-0.7.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6f552e4d23edc06cdeb553cdc757d2fe190cdeb10d43093d6a3319f8d4bf1c6b"}, + {file = "chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:f96f4d5699e486eb1fb95849fe35ab79ab0901265805be7e60f4eaa83ce263ec"}, + {file = "chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:368e57fe9ebae05ee5844840fa588028a023d1182b0cfdb1d13f607c9ea05756"}, + {file = "chroma_hnswlib-0.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:b7dca27b8896b494456db0fd705b689ac6b73af78e186eb6a42fea2de4f71c6f"}, + {file = "chroma_hnswlib-0.7.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:70f897dc6218afa1d99f43a9ad5eb82f392df31f57ff514ccf4eeadecd62f544"}, + {file = "chroma_hnswlib-0.7.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5aef10b4952708f5a1381c124a29aead0c356f8d7d6e0b520b778aaa62a356f4"}, + {file = "chroma_hnswlib-0.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee2d8d1529fca3898d512079144ec3e28a81d9c17e15e0ea4665697a7923253"}, + {file = "chroma_hnswlib-0.7.3-cp37-cp37m-win_amd64.whl", hash = "sha256:a4021a70e898783cd6f26e00008b494c6249a7babe8774e90ce4766dd288c8ba"}, + {file = "chroma_hnswlib-0.7.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a8f61fa1d417fda848e3ba06c07671f14806a2585272b175ba47501b066fe6b1"}, + {file = "chroma_hnswlib-0.7.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d7563be58bc98e8f0866907368e22ae218d6060601b79c42f59af4eccbbd2e0a"}, + {file = "chroma_hnswlib-0.7.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51b8d411486ee70d7b66ec08cc8b9b6620116b650df9c19076d2d8b6ce2ae914"}, + {file = "chroma_hnswlib-0.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d706782b628e4f43f1b8a81e9120ac486837fbd9bcb8ced70fe0d9b95c72d77"}, + {file = "chroma_hnswlib-0.7.3-cp38-cp38-win_amd64.whl", hash = "sha256:54f053dedc0e3ba657f05fec6e73dd541bc5db5b09aa8bc146466ffb734bdc86"}, + {file = "chroma_hnswlib-0.7.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e607c5a71c610a73167a517062d302c0827ccdd6e259af6e4869a5c1306ffb5d"}, + {file = 
"chroma_hnswlib-0.7.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c2358a795870156af6761890f9eb5ca8cade57eb10c5f046fe94dae1faa04b9e"}, + {file = "chroma_hnswlib-0.7.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cea425df2e6b8a5e201fff0d922a1cc1d165b3cfe762b1408075723c8892218"}, + {file = "chroma_hnswlib-0.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:454df3dd3e97aa784fba7cf888ad191e0087eef0fd8c70daf28b753b3b591170"}, + {file = "chroma_hnswlib-0.7.3-cp39-cp39-win_amd64.whl", hash = "sha256:df587d15007ca701c6de0ee7d5585dd5e976b7edd2b30ac72bc376b3c3f85882"}, ] [package.dependencies] @@ -433,21 +456,21 @@ numpy = "*" [[package]] name = "chromadb" -version = "0.4.7" +version = "0.4.12" description = "Chroma." optional = false python-versions = ">=3.7" files = [ - {file = "chromadb-0.4.7-py3-none-any.whl", hash = "sha256:e928406410efdd1e5550cb456a3f4c40774aec1efcd95011389483fa0ae3c472"}, - {file = "chromadb-0.4.7.tar.gz", hash = "sha256:7282aab8fd7cf81f0bf55f5a056bdc3aca15bf56a37b711ec53fab1440b5e6f7"}, + {file = "chromadb-0.4.12-py3-none-any.whl", hash = "sha256:2a9d99945c25049ce8b8d2896ef296909f42ba2f5dca983a496adae0a0deb64a"}, + {file = "chromadb-0.4.12.tar.gz", hash = "sha256:430585725e1f2f43f51ef3d0d7a41d99d0cdc4635264e75aaf1e303ab48ae616"}, ] [package.dependencies] bcrypt = ">=4.0.1" -chroma-hnswlib = "0.7.2" +chroma-hnswlib = "0.7.3" fastapi = ">=0.95.2,<0.100.0" importlib-resources = "*" -numpy = ">=1.21.6" +numpy = {version = ">=1.22.5", markers = "python_version >= \"3.8\""} onnxruntime = ">=1.14.1" overrides = ">=7.3.1" posthog = ">=2.4.0" @@ -457,6 +480,7 @@ pypika = ">=0.48.9" requests = ">=2.28" tokenizers = ">=0.13.2" tqdm = ">=4.65.0" +typer = ">=0.9.0" typing-extensions = ">=4.5.0" uvicorn = {version = ">=0.18.3", extras = ["standard"]} @@ -3038,6 +3062,27 @@ torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", video = ["av (==9.2.0)", "decord (==0.6.0)"] 
vision = ["Pillow (<10.0.0)"] +[[package]] +name = "typer" +version = "0.9.0" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +optional = false +python-versions = ">=3.6" +files = [ + {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"}, + {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"}, +] + +[package.dependencies] +click = ">=7.1.1,<9.0.0" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] +doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] +test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] + [[package]] name = "typing-extensions" version = "4.7.1" @@ -3455,4 +3500,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "9772f4040d3a2152ec06db1ec709509e0f05815c2ddc3cba9ed974ce183e2691" +content-hash = "111b08c8b4a98f2efb0ad223dab9777c171cea626211aa7efef03a4e4605bc08" diff --git a/pyproject.toml b/pyproject.toml index 3c5ef303..f14cb2dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" python = "^3.10" langchain = "0.0.274" gpt4all = "1.0.8" -chromadb = "0.4.7" +chromadb = "0.4.12" llama-cpp-python = "0.1.81" urllib3 = "2.0.4" PyMuPDF = "1.23.1" diff --git a/requirements.txt b/requirements.txt index 1e3e71e6..f5906cf7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 
@@ langchain==0.0.274 gpt4all==1.0.8 -chromadb==0.4.7 +chromadb==0.4.12 llama-cpp-python==0.1.81 urllib3==2.0.4 PyMuPDF==1.23.1