Fixes: community: fix LanceDB return no metadata (#27024)

- [x] Fix the case where LanceDB returns a table without a metadata column
- **Description:** Check the table schema; if it has no metadata column,
initialize the Document with the metadata argument set to an empty dict.
    - **Issue:** https://github.com/langchain-ai/langchain/issues/27005

- [x] **Add tests and docs**

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
binhnd102 2024-12-18 22:21:28 +07:00 committed by GitHub
parent 91d28ef453
commit f723a8456e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 44 additions and 2 deletions

View File

@ -151,12 +151,14 @@ class LanceDB(VectorStore):
score_col = "_relevance_score" score_col = "_relevance_score"
else: else:
score_col = None score_col = None
# Check if 'metadata' is in the columns
has_metadata = "metadata" in columns
if score_col is None or not score: if score_col is None or not score:
return [ return [
Document( Document(
page_content=results[self._text_key][idx].as_py(), page_content=results[self._text_key][idx].as_py(),
metadata=results["metadata"][idx].as_py(), metadata=results["metadata"][idx].as_py() if has_metadata else {},
) )
for idx in range(len(results)) for idx in range(len(results))
] ]
@ -165,7 +167,9 @@ class LanceDB(VectorStore):
( (
Document( Document(
page_content=results[self._text_key][idx].as_py(), page_content=results[self._text_key][idx].as_py(),
metadata=results["metadata"][idx].as_py(), metadata=results["metadata"][idx].as_py()
if has_metadata
else {},
), ),
results[score_col][idx].as_py(), results[score_col][idx].as_py(),
) )

View File

@ -114,3 +114,41 @@ def test_lancedb_all_searches() -> None:
) )
assert len(result_3[0]) == 2 # type: ignore assert len(result_3[0]) == 2 # type: ignore
assert "text 1" in result_3[0][0].page_content # type: ignore assert "text 1" in result_3[0][0].page_content # type: ignore
@pytest.mark.requires("lancedb")
def test_lancedb_no_metadata() -> None:
    """Check that documents get empty metadata when the table has no such column.

    A table is created directly through the LanceDB connection with only
    ``vector``, ``id`` and ``text`` fields, then wrapped in a ``LanceDB``
    vector store. Every document returned by ``similarity_search`` must carry
    ``metadata == {}``.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import section.
    import tempfile

    lancedb = import_lancedb()
    embeddings = FakeEmbeddings()
    table_name = "vectorstore_no_metadata"

    # Build rows without a 'metadata' field. Each text is embedded on its own,
    # exactly one call per row.
    texts = ["text 1", "text 2", "item 3"]
    data = [
        {
            "vector": embeddings.embed_documents([text])[0],
            "id": str(idx),
            "text": text,
            # Note: 'metadata' is deliberately omitted here.
        }
        for idx, text in enumerate(texts)
    ]

    # A fresh temporary directory isolates this run from earlier ones (a
    # leftover table from a previously failed run would otherwise collide in
    # create_table) and is removed even when an assertion below fails.
    with tempfile.TemporaryDirectory() as db_dir:
        db = lancedb.connect(db_dir)

        # Create the table without a 'metadata' column.
        db.create_table(table_name, data=data)

        # Wrap the existing connection and table in the vector store.
        store = LanceDB(
            connection=db,
            embedding=embeddings,
            table_name=table_name,
        )

        result = store.similarity_search("text 1")

        # Guard against a vacuous pass: the loop below checks nothing if the
        # search comes back empty.
        assert result, "Expected at least one document from similarity_search"
        for doc in result:
            assert (
                doc.metadata == {}
            ), "Expected empty metadata when 'metadata' column is missing"