mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 23:54:14 +00:00
community: Fix page numbers being identical for all chunks in _BaseGoogleVertexAISearchRetriever (#19175)
- Description: This pull request fixes a bug where page numbers were not set correctly. In the current code, all chunks share the same metadata object `doc_metadata`, so the same page number is set for all documents. To fix this, each chunk now uses its own separate metadata object (a copy of `doc_metadata`). - Issue: None. - Dependencies: No additional dependencies are required for this change. - Twitter handle: @eycjur - Test: Even when the bug is absent, there are cases where every chunk legitimately ends up with the same page number, so it is very difficult for me to write integration tests for this.
This commit is contained in:
parent
160a7077b0
commit
8d2c34e655
@ -137,14 +137,15 @@ class _BaseGoogleVertexAISearchRetriever(BaseModel):
|
||||
continue
|
||||
|
||||
for chunk in derived_struct_data[chunk_type]:
|
||||
doc_metadata["source"] = derived_struct_data.get("link", "")
|
||||
chunk_metadata = doc_metadata.copy()
|
||||
chunk_metadata["source"] = derived_struct_data.get("link", "")
|
||||
|
||||
if chunk_type == "extractive_answers":
|
||||
doc_metadata["source"] += f":{chunk.get('pageNumber', '')}"
|
||||
chunk_metadata["source"] += f":{chunk.get('pageNumber', '')}"
|
||||
|
||||
documents.append(
|
||||
Document(
|
||||
page_content=chunk.get("content", ""), metadata=doc_metadata
|
||||
page_content=chunk.get("content", ""), metadata=chunk_metadata
|
||||
)
|
||||
)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user