From 567dc1e4228ec4baa6d04655df006e4428234d49 Mon Sep 17 00:00:00 2001 From: Renzo-vS Date: Wed, 20 Nov 2024 21:49:03 +0100 Subject: [PATCH] community: fix duplicate content (#28003) Thank you for reading my first PR! **Description:** Deduplicate content in AzureSearch vectorstore. Currently, by default, the content of the retrieval is placed both in metadata and page_content of a Document. This PR removes the content from metadata, and leaves it in page_content. **Issue:** Previously, the content was popped from result before metadata was populated. In #25828, the order was changed, which leads to a response with duplicated content. This was not the intention of that PR and seems undesirable. Looking forward to seeing my contribution in the next version! Cheers, Renzo --- .../langchain_community/vectorstores/azuresearch.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azuresearch.py b/libs/community/langchain_community/vectorstores/azuresearch.py index 193f6fc680e..2a715b846f8 100644 --- a/libs/community/langchain_community/vectorstores/azuresearch.py +++ b/libs/community/langchain_community/vectorstores/azuresearch.py @@ -1798,7 +1798,9 @@ def _result_to_document(result: Dict) -> Document: fields_metadata = json.loads(result[FIELDS_METADATA]) else: fields_metadata = { - key: value for key, value in result.items() if key != FIELDS_CONTENT_VECTOR + key: value + for key, value in result.items() + if key not in [FIELDS_CONTENT_VECTOR, FIELDS_CONTENT] } # IDs if FIELDS_ID in result: @@ -1806,7 +1808,7 @@ def _result_to_document(result: Dict) -> Document: else: fields_id = {} return Document( - page_content=result.pop(FIELDS_CONTENT), + page_content=result[FIELDS_CONTENT], metadata={ **fields_id, **fields_metadata,