community[patch]: LanceDB integration improvements/fixes (#16173)

Hi, I'm from the LanceDB team. This PR improves the LanceDB integration by making it easier to use: you are no longer required to create tables manually and pass them to the constructor, although that is still supported for backward compatibility. Bug fix: pandas was being used even though it is not a dependency of LanceDB or LangChain. PS: this issue was raised a few months ago but lost traction. It is a feature improvement for our users; kindly review this. Thanks!
2025-08-15 15:46:47 +00:00 · 2024-02-19 13:22:02 -05:00 · 2024-02-19 13:22:02 -05:00 · 6c18f73ca5
commit 6c18f73ca5
parent e92e96193f
4 changed files with 226 additions and 74 deletions
--- a/docs/docs/integrations/vectorstores/lancedb.ipynb
+++ b/docs/docs/integrations/vectorstores/lancedb.ipynb
--- a/docs/docs/modules/data_connection/vectorstores/index.mdx
+++ b/docs/docs/modules/data_connection/vectorstores/index.mdx
@ -131,7 +131,7 @@ table = db.create_table(
 raw_documents = TextLoader('../../../state_of_the_union.txt').load()
 text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
 documents = text_splitter.split_documents(raw_documents)
-db = LanceDB.from_documents(documents, OpenAIEmbeddings(), connection=table)
+db = LanceDB.from_documents(documents, OpenAIEmbeddings())
 ```
  </TabItem>
--- a/libs/community/langchain_community/vectorstores/lancedb.py
+++ b/libs/community/langchain_community/vectorstores/lancedb.py
@ -12,6 +12,18 @@ class LanceDB(VectorStore):
    """`LanceDB` vector store.
    To use, you should have ``lancedb`` python package installed.
    You can install it with ``pip install lancedb``.
    Args:
        connection: LanceDB connection to use. If not provided, a new connection
                    will be created.
        embedding: Embedding to use for the vectorstore.
        vector_key: Key to use for the vector in the database. Defaults to ``vector``.
        id_key: Key to use for the id in the database. Defaults to ``id``.
        text_key: Key to use for the text in the database. Defaults to ``text``.
        table_name: Name of the table to use. Defaults to ``vectorstore``.
    Example:
        .. code-block:: python
@ -25,13 +37,14 @@ class LanceDB(VectorStore):
    def __init__(
        self,
-        connection: Any,
+        connection: Optional[Any] = None,
-        embedding: Embeddings,
+        embedding: Optional[Embeddings] = None,
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        table_name: Optional[str] = "vectorstore",
    ):
-        """Initialize with Lance DB connection"""
+        """Initialize with Lance DB vectorstore"""
        try:
            import lancedb
        except ImportError:
@ -39,19 +52,28 @@ class LanceDB(VectorStore):
                "Could not import lancedb python package. "
                "Please install it with `pip install lancedb`."
            )
-        if not isinstance(connection, lancedb.db.LanceTable):
+        self.lancedb = lancedb
            raise ValueError(
                "connection should be an instance of lancedb.db.LanceTable, ",
                f"got {type(connection)}",
            )
        self._connection = connection
        self._embedding = embedding
        self._vector_key = vector_key
        self._id_key = id_key
        self._text_key = text_key
        self._table_name = table_name
        if self._embedding is None:
            raise ValueError("embedding should be provided")
        if connection is not None:
            if not isinstance(connection, lancedb.db.LanceTable):
                raise ValueError(
                    "connection should be an instance of lancedb.db.LanceTable, ",
                    f"got {type(connection)}",
                )
            self._connection = connection
        else:
            self._connection = self._init_table()
    @property
-    def embeddings(self) -> Embeddings:
+    def embeddings(self) -> Optional[Embeddings]:
        return self._embedding
    def add_texts(
@ -74,7 +96,7 @@ class LanceDB(VectorStore):
        # Embed texts and create documents
        docs = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
-        embeddings = self._embedding.embed_documents(list(texts))
+        embeddings = self._embedding.embed_documents(list(texts))  # type: ignore
        for idx, text in enumerate(texts):
            embedding = embeddings[idx]
            metadata = metadatas[idx] if metadatas else {}
@ -86,7 +108,6 @@ class LanceDB(VectorStore):
                    **metadata,
                }
            )
        self._connection.add(docs)
        return ids
@ -102,14 +123,23 @@ class LanceDB(VectorStore):
        Returns:
            List of documents most similar to the query.
        """
-        embedding = self._embedding.embed_query(query)
+        embedding = self._embedding.embed_query(query)  # type: ignore
-        docs = self._connection.search(embedding).limit(k).to_df()
+        docs = (
            self._connection.search(embedding, vector_column_name=self._vector_key)
            .limit(k)
            .to_arrow()
        )
        columns = docs.schema.names
        return [
            Document(
-                page_content=row[self._text_key],
+                page_content=docs[self._text_key][idx].as_py(),
-                metadata=row[docs.columns != self._text_key],
+                metadata={
                    col: docs[col][idx].as_py()
                    for col in columns
                    if col != self._text_key
                },
            )
-            for _, row in docs.iterrows()
+            for idx in range(len(docs))
        ]
    @classmethod
@ -134,3 +164,23 @@ class LanceDB(VectorStore):
        instance.add_texts(texts, metadatas=metadatas, **kwargs)
        return instance
    def _init_table(self) -> Any:
        """Create the backing LanceDB table used when no connection is supplied.

        Builds a three-column pyarrow schema (vector, id, text) named after
        ``self._vector_key``, ``self._id_key`` and ``self._text_key``, then
        creates a table called ``self._table_name`` with that schema.

        Returns:
            The table object returned by ``db.create_table``.
        """
        # Imported locally so pyarrow is only needed when a table must be created.
        import pyarrow as pa
        schema = pa.schema(
            [
                pa.field(
                    self._vector_key,
                    pa.list_(
                        pa.float32(),
                        # The list size is taken from the length of the embedding
                        # of a probe string, which costs one embed_query call.
                        len(self.embeddings.embed_query("test")),  # type: ignore
                    ),
                ),
                pa.field(self._id_key, pa.string()),
                pa.field(self._text_key, pa.string()),
            ]
        )
        # NOTE(review): the database location is hard-coded to "/tmp/lancedb";
        # presumably this is not portable to platforms without /tmp - confirm.
        db = self.lancedb.connect("/tmp/lancedb")
        # NOTE(review): mode="overwrite" presumably replaces any existing table
        # with the same name - verify against the LanceDB create_table docs.
        tbl = db.create_table(self._table_name, schema=schema, mode="overwrite")
        return tbl
--- a/libs/community/tests/integration_tests/vectorstores/test_lancedb.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_lancedb.py
@ -1,8 +1,11 @@
 import pytest
 from langchain_community.vectorstores import LanceDB
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
-def test_lancedb() -> None:
+@pytest.mark.requires("lancedb")
 def test_lancedb_with_connection() -> None:
    import lancedb
    embeddings = FakeEmbeddings()
@ -23,22 +26,23 @@ def test_lancedb() -> None:
    assert "text 1" in result_texts
-def test_lancedb_add_texts() -> None:
+@pytest.mark.requires("lancedb")
-    import lancedb
+def test_lancedb_without_connection() -> None:
    embeddings = FakeEmbeddings()
-    db = lancedb.connect("/tmp/lancedb")
+    texts = ["text 1", "text 2", "item 3"]
-    texts = ["text 1"]
+
-    vectors = embeddings.embed_documents(texts)
+    store = LanceDB(embedding=embeddings)
-    table = db.create_table(
+    store.add_texts(texts)
-        "my_table",
+    result = store.similarity_search("text 1")
-        data=[
+    result_texts = [doc.page_content for doc in result]
-            {"vector": vectors[idx], "id": text, "text": text}
+    assert "text 1" in result_texts
-            for idx, text in enumerate(texts)
+
-        ],
+
-        mode="overwrite",
+@pytest.mark.requires("lancedb")
-    )
+def test_lancedb_add_texts() -> None:
-    store = LanceDB(table, embeddings)
+    embeddings = FakeEmbeddings()
    store = LanceDB(embedding=embeddings)
    store.add_texts(["text 2"])
    result = store.similarity_search("text 2")
    result_texts = [doc.page_content for doc in result]