From 8d2c34e655fa0932aac709ecf88d1951fe09a304 Mon Sep 17 00:00:00 2001 From: "k.muto" <63308909+eycjur@users.noreply.github.com> Date: Sun, 17 Mar 2024 07:28:56 +0900 Subject: [PATCH] community: Fix all page numbers being the same for _BaseGoogleVertexAISearchRetriever (#19175) - Description: - This pull request fixes a bug where page numbers were not set correctly. In the current code, all chunks share the same metadata object doc_metadata, so the page number is set to the same value for all documents. To fix this, I changed the code to use a separate metadata object for each chunk. - Issue: - None - Dependencies: - No additional dependencies are required for this change. - Twitter handle: - @eycjur - Test - Even without this bug, there are cases where every chunk ends up with the same page number, so it is very difficult for me to write integration tests. --- .../retrievers/google_vertex_ai_search.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/retrievers/google_vertex_ai_search.py b/libs/community/langchain_community/retrievers/google_vertex_ai_search.py index b4d13203ef8..78595a00cb2 100644 --- a/libs/community/langchain_community/retrievers/google_vertex_ai_search.py +++ b/libs/community/langchain_community/retrievers/google_vertex_ai_search.py @@ -137,14 +137,15 @@ class _BaseGoogleVertexAISearchRetriever(BaseModel): continue for chunk in derived_struct_data[chunk_type]: - doc_metadata["source"] = derived_struct_data.get("link", "") + chunk_metadata = doc_metadata.copy() + chunk_metadata["source"] = derived_struct_data.get("link", "") if chunk_type == "extractive_answers": - doc_metadata["source"] += f":{chunk.get('pageNumber', '')}" + chunk_metadata["source"] += f":{chunk.get('pageNumber', '')}" documents.append( Document( - page_content=chunk.get("content", ""), metadata=doc_metadata + page_content=chunk.get("content", ""), metadata=chunk_metadata ) )