From f723a8456ebab0243dded27971cb6a6698540baf Mon Sep 17 00:00:00 2001 From: binhnd102 <51689237+binhnd102@users.noreply.github.com> Date: Wed, 18 Dec 2024 22:21:28 +0700 Subject: [PATCH] Fixes: community: fix LanceDB return no metadata (#27024) - [x] Fix the case where LanceDB returns a table without a metadata column - **Description:** Check the table schema; if it has no `metadata` column, initialize each Document with its metadata argument set to an empty dict - **Issue:** https://github.com/langchain-ai/langchain/issues/27005 - [x] **Add tests and docs** --------- Co-authored-by: ccurme --- .../vectorstores/lancedb.py | 8 +++- .../vectorstores/test_lancedb.py | 38 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/lancedb.py b/libs/community/langchain_community/vectorstores/lancedb.py index 7253d885b11..f08e4380481 100644 --- a/libs/community/langchain_community/vectorstores/lancedb.py +++ b/libs/community/langchain_community/vectorstores/lancedb.py @@ -151,12 +151,14 @@ class LanceDB(VectorStore): score_col = "_relevance_score" else: score_col = None + # Check if 'metadata' is in the columns + has_metadata = "metadata" in columns if score_col is None or not score: return [ Document( page_content=results[self._text_key][idx].as_py(), - metadata=results["metadata"][idx].as_py(), + metadata=results["metadata"][idx].as_py() if has_metadata else {}, ) for idx in range(len(results)) ] @@ -165,7 +167,9 @@ class LanceDB(VectorStore): ( Document( page_content=results[self._text_key][idx].as_py(), - metadata=results["metadata"][idx].as_py(), + metadata=results["metadata"][idx].as_py() + if has_metadata + else {}, ), results[score_col][idx].as_py(), ) diff --git a/libs/community/tests/integration_tests/vectorstores/test_lancedb.py b/libs/community/tests/integration_tests/vectorstores/test_lancedb.py index 7ba3a004663..615b310629d 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_lancedb.py +++ 
b/libs/community/tests/integration_tests/vectorstores/test_lancedb.py @@ -114,3 +114,41 @@ def test_lancedb_all_searches() -> None: ) assert len(result_3[0]) == 2 # type: ignore assert "text 1" in result_3[0][0].page_content # type: ignore + + +@pytest.mark.requires("lancedb") +def test_lancedb_no_metadata() -> None: + lancedb = import_lancedb() + embeddings = FakeEmbeddings() + # Connect to a temporary LanceDB instance + db = lancedb.connect("/tmp/lancedb_no_metadata_test") + # Create data without the 'metadata' field + texts = ["text 1", "text 2", "item 3"] + data = [] + for idx, text in enumerate(texts): + embedding = embeddings.embed_documents([text])[0] + data.append( + { + "vector": embedding, + "id": str(idx), + "text": text, + # Note: We're deliberately not including 'metadata' here + } + ) + # Create the table without 'metadata' column + db.create_table("vectorstore_no_metadata", data=data) + # Initialize LanceDB with the existing connection and table name + store = LanceDB( + connection=db, + embedding=embeddings, + table_name="vectorstore_no_metadata", + ) + # Perform a similarity search + result = store.similarity_search("text 1") + # Verify that the metadata in the Document objects is an empty dictionary + for doc in result: + assert ( + doc.metadata == {} + ), "Expected empty metadata when 'metadata' column is missing" + # Clean up by deleting the table (optional) + db.drop_table("vectorstore_no_metadata")