[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
2025-08-02 16:28:58 +00:00 · 2025-07-23 05:59:43 +00:00 · 2025-07-23 05:59:43 +00:00 · eadcad8749
commit eadcad8749
parent 6070287b34
5 changed files with 54 additions and 40 deletions
--- a/examples/cloud/pdf-rag/HNSW_retrieve.py
+++ b/examples/cloud/pdf-rag/HNSW_retrieve.py
@ -1,9 +1,10 @@
-import numpy as np
 import json
-from FlagEmbedding import FlagAutoModel
-import time
-from rank_bm25 import BM25Okapi
+
 import hnswlib
+import numpy as np
+from FlagEmbedding import FlagAutoModel
+from rank_bm25 import BM25Okapi
+

 def get_list_shape(lst):
    shape = []
@ -13,29 +14,33 @@ def get_list_shape(lst):
        current = current[0]
    return tuple(shape)

+
 def load_model():
    return FlagAutoModel.from_finetuned(
-        'BAAI/bge-base-en-v1.5',
+        "BAAI/bge-base-en-v1.5",
        query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
        #  devices='cpu', # Uncomment this line if you want to use GPU.
-        use_fp16=True
+        use_fp16=True,
    )

+
 def encode_query(model, query):
    query_vectors = [np.array(model.encode(query)).tolist()]
-    print('query_vectors_shape', get_list_shape(query_vectors))
+    print("query_vectors_shape", get_list_shape(query_vectors))
    return query_vectors

+
 def load_data(vectors_path, docs_path):
    vectors = np.load(vectors_path).tolist()
-    with open(docs_path, 'r', encoding='utf-8') as file:
+    with open(docs_path, "r", encoding="utf-8") as file:
        docs = json.load(file)
    return vectors, docs

+
 def build_hnsw_index(vectors):
    # start_time = time.time()
    num_elements = len(vectors)
-    p = hnswlib.Index(space='cosine', dim=768)
+    p = hnswlib.Index(space="cosine", dim=768)
    p.init_index(max_elements=num_elements, ef_construction=200, M=16)
    # M defines the maximum number of outgoing connections in the graph. Higher M leads to higher accuracy/run_time at fixed ef/efConstruction.
    # ef_construction controls index search speed/build speed tradeoff. Increasing the efConstruction parameter may enhance index quality, but it also tends to lengthen the indexing time.
@ -46,16 +51,18 @@ def build_hnsw_index(vectors):
    # ef controlling query time/accuracy trade-off. Higher ef leads to more accurate but slower search.
    return p

+
 def search_hnsw(index, query_vectors, docs):
    # HNSW_time = time.time()
    labels, distances = index.knn_query(np.array(query_vectors), k=10)
-    results = [docs[i]['content'] for i in labels[0]]
+    results = [docs[i]["content"] for i in labels[0]]
    # end_HNSW_time = time.time()
    # print('HNSW search time:', end_HNSW_time - HNSW_time)
    return results

+
 def build_bm25(docs):
-    corpus = [doc['content'] for doc in docs]
+    corpus = [doc["content"] for doc in docs]
    tokenized_corpus = [list(text.split()) for text in corpus]
    # bm25_build_start = time.time()
    bm25 = BM25Okapi(tokenized_corpus)
@ -63,6 +70,7 @@ def build_bm25(docs):
    # print('BM25 build time:', bm25_build_end - bm25_build_start)
    return bm25, corpus

+
 def search_bm25(bm25, corpus, query):
    # bm25_search_start = time.time()
    tokenized_query = list(query.split())
@ -73,6 +81,7 @@ def search_bm25(bm25, corpus, query):
    # print('BM25 search time:', bm25_search_end - bm25_search_start)
    return bm25_results

+
 def merge_results(results, bm25_results):
    merged_results = []
    for i in range(len(results)):
@ -82,11 +91,12 @@ def merge_results(results, bm25_results):
    merged_results = list(set(merged_results))
    return merged_results

+
 def main():
    model = load_model()
    query = "This is a test query to find relevant documents."
    query_vectors = encode_query(model, query)
-    vectors, docs = load_data('PATH_TO_YOUR_EMBEDDING.npy', 'PATH_TO_YOUR_JSON.json')
+    vectors, docs = load_data("#PATH_TO_YOUR_EMBEDDING.npy#", "#PATH_TO_YOUR_JSON.json#")

    hnsw_index = build_hnsw_index(vectors)
    hnsw_results = search_hnsw(hnsw_index, query_vectors, docs)
@ -97,5 +107,7 @@ def main():
    merged_results = merge_results(hnsw_results, bm25_results)

    return merged_results
+
+
 if __name__ == "__main__":
    retrieved_data = main()
--- a/examples/cloud/pdf-rag/README.md
+++ b/examples/cloud/pdf-rag/README.md
@ -53,4 +53,3 @@ pip install -r requirements.txt
 - FlagEmbedding
 - haystack
 - haystack-integrations
-
--- a/examples/cloud/pdf-rag/embedding.py
+++ b/examples/cloud/pdf-rag/embedding.py
@ -1,9 +1,11 @@
 import json
+import os
+import time
+
 import numpy as np
 from FlagEmbedding import FlagAutoModel
-import time
 from sklearn.metrics.pairwise import cosine_similarity
-import os
+

 def load_model(model_name="BAAI/bge-base-en-v1.5", use_fp16=True):
    return FlagAutoModel.from_finetuned(
@ -42,8 +44,8 @@ def load_embeddings(file_path):
 def main():
    config = {
        "model_name": "BAAI/bge-base-en-v1.5",
-        "json_path": #PATH_TO_YOUR_JSON.json#,
-        "embedding_path": #PATH_TO_YOUR_EMBEDDING.npy#,
+        "json_path": "#PATH_TO_YOUR_JSON.json#",
+        "embedding_path": "#PATH_TO_YOUR_EMBEDDING.npy#",
        "use_fp16": True,
        "use_precomputed_embeddings": False
    }
--- a/examples/cloud/pdf-rag/parse.py
+++ b/examples/cloud/pdf-rag/parse.py
@ -1,13 +1,14 @@
-from pathlib import Path
-import time
 import json
-from haystack import Pipeline
+import time
+from pathlib import Path
+
+from haystack import Document, Pipeline
 from haystack.components.converters import PyPDFToDocument
 from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
 from haystack.components.writers import DocumentWriter
-from haystack.document_stores.types import DuplicatePolicy
 from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack import Document
+from haystack.document_stores.types import DuplicatePolicy
+

 def create_indexing_pipeline():
    document_store = InMemoryDocumentStore()
@ -44,8 +45,8 @@ def save_to_json(document_store, output_path):
        json.dump(docs_list, f, ensure_ascii=False, indent=2)

 def main():
-    PDF_DIRECTORY = #PATH_TO_YOUR_PDF_DIRECTORY#
-    OUTPUT_JSON = #PATH_TO_YOUR_JSON#
+    PDF_DIRECTORY = "#PATH_TO_YOUR_PDF_DIRECTORY#"
+    OUTPUT_JSON = "#PATH_TO_YOUR_JSON#"
    
    start_time = time.time()
    indexing_pipeline, document_store = create_indexing_pipeline()