End-to-end working version

2025-09-09 11:11:05 +00:00 · 2023-05-02 19:35:40 +02:00
parent 51dae80058
commit 55338b8f6e
6 changed files with 943 additions and 0 deletions
--- a/ingest.py
+++ b/ingest.py
@@ -0,0 +1,21 @@
+from langchain.document_loaders import TextLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings import LlamaCppEmbeddings
+
+def main():
+    # Load document and split in chunks
+    loader = TextLoader('./source_documents/state_of_the_union.txt', encoding='utf8')
+    documents = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    texts = text_splitter.split_documents(documents)
+    # Create embeddings
+    llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")
+    # Create and store locally vectorstore
+    persist_directory = 'db'
+    db = Chroma.from_documents(texts, llama, persist_directory=persist_directory)
+    db.persist()
+    db = None
+
+if __name__ == "__main__":  # script entry point; skipped when imported
+    main()