From 6070287b34b79264c3bff8ef2e3c4bfe10eabea6 Mon Sep 17 00:00:00 2001 From: ChosenQC Date: Wed, 23 Jul 2025 04:26:43 +0000 Subject: [PATCH 1/3] add pdf-rag example --- examples/cloud/pdf-rag/HNSW_retrieve.py | 101 ++++++++++++++++++++++++ examples/cloud/pdf-rag/README.md | 56 +++++++++++++ examples/cloud/pdf-rag/embedding.py | 78 ++++++++++++++++++ examples/cloud/pdf-rag/parse.py | 56 +++++++++++++ examples/cloud/pdf-rag/requirements.txt | 7 ++ 5 files changed, 298 insertions(+) create mode 100644 examples/cloud/pdf-rag/HNSW_retrieve.py create mode 100644 examples/cloud/pdf-rag/README.md create mode 100644 examples/cloud/pdf-rag/embedding.py create mode 100644 examples/cloud/pdf-rag/parse.py create mode 100644 examples/cloud/pdf-rag/requirements.txt diff --git a/examples/cloud/pdf-rag/HNSW_retrieve.py b/examples/cloud/pdf-rag/HNSW_retrieve.py new file mode 100644 index 000000000..626c28603 --- /dev/null +++ b/examples/cloud/pdf-rag/HNSW_retrieve.py @@ -0,0 +1,101 @@ +import numpy as np +import json +from FlagEmbedding import FlagAutoModel +import time +from rank_bm25 import BM25Okapi +import hnswlib + +def get_list_shape(lst): + shape = [] + current = lst + while isinstance(current, list) and len(current) > 0: + shape.append(len(current)) + current = current[0] + return tuple(shape) + +def load_model(): + return FlagAutoModel.from_finetuned( + 'BAAI/bge-base-en-v1.5', + query_instruction_for_retrieval="Represent this sentence for searching relevant passages:", + # devices='cpu', # Uncomment this line if you want to use GPU. + use_fp16=True + ) + +def encode_query(model, query): + query_vectors = [np.array(model.encode(query)).tolist()] + print('query_vectors_shape', get_list_shape(query_vectors)) + return query_vectors + +def load_data(vectors_path, docs_path): + vectors = np.load(vectors_path).tolist() + with open(docs_path, 'r', encoding='utf-8') as file: + docs = json.load(file) + return vectors, docs + +def build_hnsw_index(vectors): + # start_time = time.time() + num_elements = len(vectors) + p = hnswlib.Index(space='cosine', dim=768) + p.init_index(max_elements=num_elements, ef_construction=200, M=16) + # M defines the maximum number of outgoing connections in the graph. Higher M leads to higher accuracy/run_time at fixed ef/efConstruction. + # ef_construction controls index search speed/build speed tradeoff. Increasing the efConstruction parameter may enhance index quality, but it also tends to lengthen the indexing time. + p.add_items(np.array(vectors), np.arange(num_elements)) + # HNSW_time = time.time() + #print('HNSW build time:', HNSW_time - start_time) + p.set_ef(32) + # ef controlling query time/accuracy trade-off. Higher ef leads to more accurate but slower search. + return p + +def search_hnsw(index, query_vectors, docs): + # HNSW_time = time.time() + labels, distances = index.knn_query(np.array(query_vectors), k=10) + results = [docs[i]['content'] for i in labels[0]] + # end_HNSW_time = time.time() + # print('HNSW search time:', end_HNSW_time - HNSW_time) + return results + +def build_bm25(docs): + corpus = [doc['content'] for doc in docs] + tokenized_corpus = [list(text.split()) for text in corpus] + # bm25_build_start = time.time() + bm25 = BM25Okapi(tokenized_corpus) + # bm25_build_end = time.time() + # print('BM25 build time:', bm25_build_end - bm25_build_start) + return bm25, corpus + +def search_bm25(bm25, corpus, query): + # bm25_search_start = time.time() + tokenized_query = list(query.split()) + bm25_scores = bm25.get_scores(tokenized_query) + bm25_top_n = np.argsort(bm25_scores)[::-1][:10] + bm25_results = [corpus[i] for i in bm25_top_n] + # bm25_search_end = time.time() + # print('BM25 search time:', bm25_search_end - bm25_search_start) + return bm25_results + +def merge_results(results, bm25_results): + merged_results = [] + for i in range(len(results)): + merged_results.append(results[i]) + for i in range(len(bm25_results)): + merged_results.append(bm25_results[i]) + merged_results = list(set(merged_results)) + return merged_results + +def main(): + model = load_model() + query = "This is a test query to find relevant documents." + query_vectors = encode_query(model, query) + vectors, docs = load_data('PATH_TO_YOUR_EMBEDDING.npy', 'PATH_TO_YOUR_JSON.json') + + hnsw_index = build_hnsw_index(vectors) + hnsw_results = search_hnsw(hnsw_index, query_vectors, docs) + + bm25, corpus = build_bm25(docs) + bm25_results = search_bm25(bm25, corpus, query) + + merged_results = merge_results(hnsw_results, bm25_results) + + return merged_results +if __name__ == "__main__": + retrieved_data=main() diff --git a/examples/cloud/pdf-rag/README.md b/examples/cloud/pdf-rag/README.md new file mode 100644 index 000000000..a7da19314 --- /dev/null +++ b/examples/cloud/pdf-rag/README.md @@ -0,0 +1,56 @@ +# PDF_RAG Workflow Demo + +This project demonstrates a document retrieval and vectorization workflow based on Haystack, FlagEmbedding, HNSWLib, and BM25. + +## Directory Structure + +- `RAG_workflow/parse.py`: Parses and splits PDF documents, and generates the content in JSON format. +- `RAG_workflow/embedding.py`:Vectorizes the document content and produces the embedding vector base. +- `RAG_workflow/HNSW_retrieve.py`:Performs hybrid retrieval and recall using HNSW and BM25. + +## Environment Setup + +Install dependencies with: + +```bash +pip install -r requirements.txt +``` + +## Workflow Steps + +1. **PDF Parsing** + - Modify `PATH_TO_YOUR_PDF_DIRECTORY` in `parse.py` to your PDF folder path. + - Update the output JSON path(`PATH_TO_YOUR_JSON`)。 + - Run: + ```bash + python PDF-RAG/parse.py + ``` + - The generated JSON file will be used in the next embedding step. + +2. **JSON Vectorization** + - In `embedding.py`, update the input JSON path (`PATH_TO_YOUR_JSON.json`) and the output embedding file path (`PATH_TO_YOUR_EMBEDDING.npy`). + - Run: + ```bash + python PDF-RAG/embedding.py + ``` + +3. **Retrieval and Recall** + - In `HNSW_retrieve.py`, update the embedding and JSON paths (`PATH_TO_YOUR_JSON.json`). + - Run: + ```bash + python PDF-RAG/HNSW_retrieve.py + ``` + - The script will output the construction and retrieval times for both HNSW and BM25, along with the merged retrieval results. + - Adjust HNSW and BM25 parameters according to the descriptions to get desired results. + - In `hnswlib.Index()`, use `space='l2'` for Squared L2, `'ip'` for Inner Product, and `'cosine'` for Cosine Similarity. + +## Dependencies + +- numpy +- scikit-learn +- hnswlib +- rank_bm25 +- FlagEmbedding +- haystack +- haystack-integrations + diff --git a/examples/cloud/pdf-rag/embedding.py b/examples/cloud/pdf-rag/embedding.py new file mode 100644 index 000000000..30d7d747e --- /dev/null +++ b/examples/cloud/pdf-rag/embedding.py @@ -0,0 +1,78 @@ +import json +import numpy as np +from FlagEmbedding import FlagAutoModel +import time +from sklearn.metrics.pairwise import cosine_similarity +import os + +def load_model(model_name="BAAI/bge-base-en-v1.5", use_fp16=True): + return FlagAutoModel.from_finetuned( + model_name, + query_instruction_for_retrieval="Represent this sentence for searching relevant passages:", + # device='cpu', # Uncomment this line if you want to use GPU. + use_fp16=use_fp16 + ) + +def load_data(file_path): + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + print("Error loading data from", file_path) + return [] + +def extract_texts(data): + return [doc.get("content", '').strip() for doc in data] + +def generate_embeddings(model, texts): + return np.array(model.encode(texts)) + +def save_embeddings(embeddings, output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) + np.save(output_path, embeddings) + +def load_embeddings(file_path): + try: + return np.load(file_path) + except FileNotFoundError: + print("Error loading embeddings from", file_path) + return None + + +def main(): + config = { + "model_name": "BAAI/bge-base-en-v1.5", + "json_path": #PATH_TO_YOUR_JSON.json#, + "embedding_path": #PATH_TO_YOUR_EMBEDDING.npy#, + "use_fp16": True, + "use_precomputed_embeddings": False + } + + model = load_model( + model_name=config["model_name"], + use_fp16=config["use_fp16"] + ) + + if config["use_precomputed_embeddings"]: + embeddings = load_embeddings(config["embedding_path"]) + if embeddings is None: + return + else: + data = load_data(config["json_path"]) + if not data: + return + + texts = extract_texts(data) + embeddings = generate_embeddings(model, texts) + save_embeddings(embeddings, config["embedding_path"]) + +##### Test demo with simple KNN cosine_similarity + # query='This is a test query to find relevant documents.' + # query_embedding=np.array(model.encode(query)) + # similarity_scores = cosine_similarity([query_embedding], embeddings) + # indices = np.argsort(-similarity_scores) + + return embeddings + +if __name__ == '__main__': + main() diff --git a/examples/cloud/pdf-rag/parse.py b/examples/cloud/pdf-rag/parse.py new file mode 100644 index 000000000..9a5f1ef1e --- /dev/null +++ b/examples/cloud/pdf-rag/parse.py @@ -0,0 +1,56 @@ +from pathlib import Path +import time +import json +from haystack import Pipeline +from haystack.components.converters import PyPDFToDocument +from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter +from haystack.components.writers import DocumentWriter +from haystack.document_stores.types import DuplicatePolicy +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack import Document + +def create_indexing_pipeline(): + document_store = InMemoryDocumentStore() + converter = PyPDFToDocument() + cleaner = DocumentCleaner() + splitter = DocumentSplitter(split_by="sentence", split_length=1) + writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) + + indexing_pipeline = Pipeline() + indexing_pipeline.add_component("converter", converter) + indexing_pipeline.add_component("cleaner", cleaner) + indexing_pipeline.add_component("splitter", splitter) + indexing_pipeline.add_component("writer", writer) + + indexing_pipeline.connect("converter", "cleaner") + indexing_pipeline.connect("cleaner", "splitter") + indexing_pipeline.connect("splitter", "writer") + + return indexing_pipeline, document_store + +def process_pdfs(pdf_directory, indexing_pipeline): + papers_dir = Path(pdf_directory) + pdf_files = list(papers_dir.glob("*.pdf")) + for pdf_file in pdf_files: + try: + indexing_pipeline.run({"converter": {"sources": [pdf_file]}}) + except: + pass + +def save_to_json(document_store, output_path): + all_documents = document_store.filter_documents() + docs_list = [doc.to_dict() for doc in all_documents] + with open(output_path, "w", encoding="utf-8") as f: + json.dump(docs_list, f, ensure_ascii=False, indent=2) + +def main(): + PDF_DIRECTORY = #PATH_TO_YOUR_PDF_DIRECTORY# + OUTPUT_JSON = #PATH_TO_YOUR_JSON# + + start_time = time.time() + indexing_pipeline, document_store = create_indexing_pipeline() + process_pdfs(PDF_DIRECTORY, indexing_pipeline) + save_to_json(document_store, OUTPUT_JSON) + +if __name__ == "__main__": + main() diff --git a/examples/cloud/pdf-rag/requirements.txt b/examples/cloud/pdf-rag/requirements.txt new file mode 100644 index 000000000..84d85fecc --- /dev/null +++ b/examples/cloud/pdf-rag/requirements.txt @@ -0,0 +1,7 @@ +numpy +scikit-learn +hnswlib +rank_bm25 +FlagEmbedding +haystack +haystack-integrations \ No newline at end of file From eadcad8749a0a767fbde8c9f5f7e2a9ad8992caa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Jul 2025 05:59:43 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/cloud/pdf-rag/HNSW_retrieve.py | 48 +++++++++++++++---------- examples/cloud/pdf-rag/README.md | 1 - examples/cloud/pdf-rag/embedding.py | 20 ++++++----- examples/cloud/pdf-rag/parse.py | 23 ++++++------ examples/cloud/pdf-rag/requirements.txt | 2 +- 5 files changed, 54 insertions(+), 40 deletions(-) diff --git a/examples/cloud/pdf-rag/HNSW_retrieve.py b/examples/cloud/pdf-rag/HNSW_retrieve.py index 626c28603..f02298544 100644 --- a/examples/cloud/pdf-rag/HNSW_retrieve.py +++ b/examples/cloud/pdf-rag/HNSW_retrieve.py @@ -1,9 +1,10 @@ -import numpy as np import json -from FlagEmbedding import FlagAutoModel -import time -from rank_bm25 import BM25Okapi + import hnswlib +import numpy as np +from FlagEmbedding import FlagAutoModel +from rank_bm25 import BM25Okapi + def get_list_shape(lst): shape = [] @@ -13,49 +14,55 @@ def get_list_shape(lst): current = current[0] return tuple(shape) + def load_model(): return FlagAutoModel.from_finetuned( - 'BAAI/bge-base-en-v1.5', + "BAAI/bge-base-en-v1.5", query_instruction_for_retrieval="Represent this sentence for searching relevant passages:", # devices='cpu', # Uncomment this line if you want to use GPU. - use_fp16=True + use_fp16=True, ) + def encode_query(model, query): query_vectors = [np.array(model.encode(query)).tolist()] - print('query_vectors_shape', get_list_shape(query_vectors)) + print("query_vectors_shape", get_list_shape(query_vectors)) return query_vectors + def load_data(vectors_path, docs_path): vectors = np.load(vectors_path).tolist() - with open(docs_path, 'r', encoding='utf-8') as file: + with open(docs_path, "r", encoding="utf-8") as file: docs = json.load(file) return vectors, docs + def build_hnsw_index(vectors): # start_time = time.time() num_elements = len(vectors) - p = hnswlib.Index(space='cosine', dim=768) + p = hnswlib.Index(space="cosine", dim=768) p.init_index(max_elements=num_elements, ef_construction=200, M=16) # M defines the maximum number of outgoing connections in the graph. Higher M leads to higher accuracy/run_time at fixed ef/efConstruction. # ef_construction controls index search speed/build speed tradeoff. Increasing the efConstruction parameter may enhance index quality, but it also tends to lengthen the indexing time. p.add_items(np.array(vectors), np.arange(num_elements)) # HNSW_time = time.time() - #print('HNSW build time:', HNSW_time - start_time) + # print('HNSW build time:', HNSW_time - start_time) p.set_ef(32) # ef controlling query time/accuracy trade-off. Higher ef leads to more accurate but slower search. return p + def search_hnsw(index, query_vectors, docs): # HNSW_time = time.time() labels, distances = index.knn_query(np.array(query_vectors), k=10) - results = [docs[i]['content'] for i in labels[0]] + results = [docs[i]["content"] for i in labels[0]] # end_HNSW_time = time.time() # print('HNSW search time:', end_HNSW_time - HNSW_time) return results + def build_bm25(docs): - corpus = [doc['content'] for doc in docs] + corpus = [doc["content"] for doc in docs] tokenized_corpus = [list(text.split()) for text in corpus] # bm25_build_start = time.time() bm25 = BM25Okapi(tokenized_corpus) @@ -63,6 +70,7 @@ def build_bm25(docs): # print('BM25 build time:', bm25_build_end - bm25_build_start) return bm25, corpus + def search_bm25(bm25, corpus, query): # bm25_search_start = time.time() tokenized_query = list(query.split()) @@ -73,6 +81,7 @@ def search_bm25(bm25, corpus, query): # print('BM25 search time:', bm25_search_end - bm25_search_start) return bm25_results + def merge_results(results, bm25_results): merged_results = [] for i in range(len(results)): @@ -82,20 +91,23 @@ def merge_results(results, bm25_results): merged_results = list(set(merged_results)) return merged_results + def main(): model = load_model() query = "This is a test query to find relevant documents." query_vectors = encode_query(model, query) - vectors, docs = load_data('PATH_TO_YOUR_EMBEDDING.npy', 'PATH_TO_YOUR_JSON.json') - - hnsw_index = build_hnsw_index(vectors) + vectors, docs = load_data("#PATH_TO_YOUR_EMBEDDING.npy#", "#PATH_TO_YOUR_JSON.json#") + + hnsw_index = build_hnsw_index(vectors) hnsw_results = search_hnsw(hnsw_index, query_vectors, docs) - + bm25, corpus = build_bm25(docs) bm25_results = search_bm25(bm25, corpus, query) - + merged_results = merge_results(hnsw_results, bm25_results) return merged_results + + if __name__ == "__main__": - retrieved_data=main() + retrieved_data = main() diff --git a/examples/cloud/pdf-rag/README.md b/examples/cloud/pdf-rag/README.md index a7da19314..8c458e14d 100644 --- a/examples/cloud/pdf-rag/README.md +++ b/examples/cloud/pdf-rag/README.md @@ -53,4 +53,3 @@ pip install -r requirements.txt - FlagEmbedding - haystack - haystack-integrations - diff --git a/examples/cloud/pdf-rag/embedding.py b/examples/cloud/pdf-rag/embedding.py index 30d7d747e..39966efa5 100644 --- a/examples/cloud/pdf-rag/embedding.py +++ b/examples/cloud/pdf-rag/embedding.py @@ -1,9 +1,11 @@ import json +import os +import time + import numpy as np from FlagEmbedding import FlagAutoModel -import time from sklearn.metrics.pairwise import cosine_similarity -import os + def load_model(model_name="BAAI/bge-base-en-v1.5", use_fp16=True): return FlagAutoModel.from_finetuned( @@ -42,17 +44,17 @@ def load_embeddings(file_path): def main(): config = { "model_name": "BAAI/bge-base-en-v1.5", - "json_path": #PATH_TO_YOUR_JSON.json#, - "embedding_path": #PATH_TO_YOUR_EMBEDDING.npy#, + "json_path": "#PATH_TO_YOUR_JSON.json#", + "embedding_path": "#PATH_TO_YOUR_EMBEDDING.npy#", "use_fp16": True, "use_precomputed_embeddings": False } - + model = load_model( model_name=config["model_name"], use_fp16=config["use_fp16"] ) - + if config["use_precomputed_embeddings"]: embeddings = load_embeddings(config["embedding_path"]) if embeddings is None: @@ -61,17 +63,17 @@ def main(): data = load_data(config["json_path"]) if not data: return - + texts = extract_texts(data) embeddings = generate_embeddings(model, texts) save_embeddings(embeddings, config["embedding_path"]) - + ##### Test demo with simple KNN cosine_similarity # query='This is a test query to find relevant documents.' # query_embedding=np.array(model.encode(query)) # similarity_scores = cosine_similarity([query_embedding], embeddings) # indices = np.argsort(-similarity_scores) - + return embeddings if __name__ == '__main__': diff --git a/examples/cloud/pdf-rag/parse.py b/examples/cloud/pdf-rag/parse.py index 9a5f1ef1e..eb6e62bf3 100644 --- a/examples/cloud/pdf-rag/parse.py +++ b/examples/cloud/pdf-rag/parse.py @@ -1,13 +1,14 @@ -from pathlib import Path -import time import json -from haystack import Pipeline +import time +from pathlib import Path + +from haystack import Document, Pipeline from haystack.components.converters import PyPDFToDocument from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter from haystack.components.writers import DocumentWriter -from haystack.document_stores.types import DuplicatePolicy from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack import Document +from haystack.document_stores.types import DuplicatePolicy + def create_indexing_pipeline(): document_store = InMemoryDocumentStore() @@ -15,23 +16,23 @@ def create_indexing_pipeline(): cleaner = DocumentCleaner() splitter = DocumentSplitter(split_by="sentence", split_length=1) writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) - + indexing_pipeline = Pipeline() indexing_pipeline.add_component("converter", converter) indexing_pipeline.add_component("cleaner", cleaner) indexing_pipeline.add_component("splitter", splitter) indexing_pipeline.add_component("writer", writer) - + indexing_pipeline.connect("converter", "cleaner") indexing_pipeline.connect("cleaner", "splitter") indexing_pipeline.connect("splitter", "writer") - + return indexing_pipeline, document_store def process_pdfs(pdf_directory, indexing_pipeline): papers_dir = Path(pdf_directory) pdf_files = list(papers_dir.glob("*.pdf")) - for pdf_file in pdf_files: + for pdf_file in pdf_files: try: indexing_pipeline.run({"converter": {"sources": [pdf_file]}}) except: @@ -44,8 +45,8 @@ def save_to_json(document_store, output_path): json.dump(docs_list, f, ensure_ascii=False, indent=2) def main(): - PDF_DIRECTORY = #PATH_TO_YOUR_PDF_DIRECTORY# - OUTPUT_JSON = #PATH_TO_YOUR_JSON# + PDF_DIRECTORY = "#PATH_TO_YOUR_PDF_DIRECTORY#" + OUTPUT_JSON = "#PATH_TO_YOUR_JSON#" start_time = time.time() indexing_pipeline, document_store = create_indexing_pipeline() diff --git a/examples/cloud/pdf-rag/requirements.txt b/examples/cloud/pdf-rag/requirements.txt index 84d85fecc..f4553ccd1 100644 --- a/examples/cloud/pdf-rag/requirements.txt +++ b/examples/cloud/pdf-rag/requirements.txt @@ -4,4 +4,4 @@ hnswlib rank_bm25 FlagEmbedding haystack -haystack-integrations \ No newline at end of file +haystack-integrations From 3fdd4e7733e09e056c190b535da6f2f561941b78 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Jul 2025 06:21:29 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/cloud/pdf-rag/embedding.py | 25 +++++++++++++------------ examples/cloud/pdf-rag/parse.py | 10 +++++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/examples/cloud/pdf-rag/embedding.py b/examples/cloud/pdf-rag/embedding.py index 39966efa5..d6231462a 100644 --- a/examples/cloud/pdf-rag/embedding.py +++ b/examples/cloud/pdf-rag/embedding.py @@ -1,10 +1,8 @@ import json import os -import time import numpy as np from FlagEmbedding import FlagAutoModel -from sklearn.metrics.pairwise import cosine_similarity def load_model(model_name="BAAI/bge-base-en-v1.5", use_fp16=True): @@ -12,27 +10,32 @@ def load_model(model_name="BAAI/bge-base-en-v1.5", use_fp16=True): model_name, query_instruction_for_retrieval="Represent this sentence for searching relevant passages:", # device='cpu', # Uncomment this line if you want to use GPU. - use_fp16=use_fp16 + use_fp16=use_fp16, ) + def load_data(file_path): try: - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, "r", encoding="utf-8") as f: return json.load(f) except (FileNotFoundError, json.JSONDecodeError): print("Error loading data from", file_path) return [] + def extract_texts(data): - return [doc.get("content", '').strip() for doc in data] + return [doc.get("content", "").strip() for doc in data] + def generate_embeddings(model, texts): return np.array(model.encode(texts)) + def save_embeddings(embeddings, output_path): os.makedirs(os.path.dirname(output_path), exist_ok=True) np.save(output_path, embeddings) + def load_embeddings(file_path): try: return np.load(file_path) @@ -47,13 +50,10 @@ def main(): "json_path": "#PATH_TO_YOUR_JSON.json#", "embedding_path": "#PATH_TO_YOUR_EMBEDDING.npy#", "use_fp16": True, - "use_precomputed_embeddings": False + "use_precomputed_embeddings": False, } - model = load_model( - model_name=config["model_name"], - use_fp16=config["use_fp16"] - ) + model = load_model(model_name=config["model_name"], use_fp16=config["use_fp16"]) if config["use_precomputed_embeddings"]: embeddings = load_embeddings(config["embedding_path"]) @@ -68,7 +68,7 @@ def main(): embeddings = generate_embeddings(model, texts) save_embeddings(embeddings, config["embedding_path"]) -##### Test demo with simple KNN cosine_similarity + ##### Test demo with simple KNN cosine_similarity # query='This is a test query to find relevant documents.' # query_embedding=np.array(model.encode(query)) # similarity_scores = cosine_similarity([query_embedding], embeddings) @@ -76,5 +76,6 @@ def main(): return embeddings -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/examples/cloud/pdf-rag/parse.py b/examples/cloud/pdf-rag/parse.py index eb6e62bf3..a6462b628 100644 --- a/examples/cloud/pdf-rag/parse.py +++ b/examples/cloud/pdf-rag/parse.py @@ -2,7 +2,7 @@ import json import time from pathlib import Path -from haystack import Document, Pipeline +from haystack import Pipeline from haystack.components.converters import PyPDFToDocument from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter from haystack.components.writers import DocumentWriter @@ -29,6 +29,7 @@ def create_indexing_pipeline(): return indexing_pipeline, document_store + def process_pdfs(pdf_directory, indexing_pipeline): papers_dir = Path(pdf_directory) pdf_files = list(papers_dir.glob("*.pdf")) @@ -38,20 +39,23 @@ def process_pdfs(pdf_directory, indexing_pipeline): except: pass + def save_to_json(document_store, output_path): all_documents = document_store.filter_documents() docs_list = [doc.to_dict() for doc in all_documents] with open(output_path, "w", encoding="utf-8") as f: json.dump(docs_list, f, ensure_ascii=False, indent=2) + def main(): PDF_DIRECTORY = "#PATH_TO_YOUR_PDF_DIRECTORY#" OUTPUT_JSON = "#PATH_TO_YOUR_JSON#" - - start_time = time.time() + + time.time() indexing_pipeline, document_store = create_indexing_pipeline() process_pdfs(PDF_DIRECTORY, indexing_pipeline) save_to_json(document_store, OUTPUT_JSON) + if __name__ == "__main__": main()