Fixed issue with metadata in query (#9500)

- Description: Changed metadata retrieval so that it combines Vectara doc-level and part-level metadata
  - Tag maintainer: @rlancemartin
  - Twitter handle: @ofermend
This commit is contained in:
Ofer Mendelevitch 2023-08-20 16:00:14 -07:00 committed by GitHub
parent 103094286e
commit a758496236
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 65 additions and 38 deletions

View File

@@ -202,12 +202,12 @@ class Vectara(VectorStore):
doc_metadata: optional metadata for the document doc_metadata: optional metadata for the document
This function indexes all the input text strings in the Vectara corpus as a This function indexes all the input text strings in the Vectara corpus as a
single Vectara document, where each input text is considered a "part" and the single Vectara document, where each input text is considered a "section" and the
metadata are associated with each part. metadata are associated with each section.
if 'doc_metadata' is provided, it is associated with the Vectara document. if 'doc_metadata' is provided, it is associated with the Vectara document.
Returns: Returns:
List of ids from adding the texts into the vectorstore. document ID of the document added
""" """
doc_hash = md5() doc_hash = md5()
@@ -307,21 +307,27 @@ class Vectara(VectorStore):
result = response.json() result = response.json()
responses = result["responseSet"][0]["response"] responses = result["responseSet"][0]["response"]
vectara_default_metadata = ["lang", "len", "offset"] documents = result["responseSet"][0]["document"]
metadatas = []
for x in responses:
md = {m["name"]: m["value"] for m in x["metadata"]}
doc_num = x["documentIndex"]
doc_md = {m["name"]: m["value"] for m in documents[doc_num]["metadata"]}
md.update(doc_md)
metadatas.append(md)
docs = [ docs = [
( (
Document( Document(
page_content=x["text"], page_content=x["text"],
metadata={ metadata=md,
m["name"]: m["value"]
for m in x["metadata"]
if m["name"] not in vectara_default_metadata
},
), ),
x["score"], x["score"],
) )
for x in responses for x, md in zip(responses, metadatas)
] ]
return docs return docs
def similarity_search( def similarity_search(

View File

@@ -5,11 +5,14 @@ from langchain.docstore.document import Document
from langchain.vectorstores.vectara import Vectara from langchain.vectorstores.vectara import Vectara
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
# For this test to run properly, please setup as follows #
# 1. Create a corpus in Vectara, with a filter attribute called "test_num". # For this test to run properly, please setup as follows:
# 2. Create an API_KEY for this corpus with permissions for query and indexing # 1. Create a Vectara account: sign up at https://console.vectara.com/signup
# 3. Setup environment variables: # 2. Create a corpus in your Vectara account, with a filter attribute called "test_num".
# 3. Create an API_KEY for this corpus with permissions for query and indexing
# 4. Setup environment variables:
# VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID # VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
#
def get_abbr(s: str) -> str: def get_abbr(s: str) -> str:
@@ -21,37 +24,52 @@ def get_abbr(s: str) -> str:
def test_vectara_add_documents() -> None: def test_vectara_add_documents() -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
# start with some initial texts # create a new Vectara instance
texts = ["grounded generation", "retrieval augmented generation", "data privacy"] docsearch: Vectara = Vectara()
docsearch: Vectara = Vectara.from_texts(
texts, # start with some initial texts, added with add_texts
embedding=FakeEmbeddings(), texts1 = ["grounded generation", "retrieval augmented generation", "data privacy"]
metadatas=[ md = [{"abbr": get_abbr(t)} for t in texts1]
{"abbr": "gg", "test_num": "1"}, doc_id1 = docsearch.add_texts(
{"abbr": "rag", "test_num": "1"}, texts1,
{"abbr": "dp", "test_num": "1"}, metadatas=md,
],
doc_metadata={"test_num": "1"}, doc_metadata={"test_num": "1"},
) )
# then add some additional documents # then add some additional documents, now with add_documents
new_texts = ["large language model", "information retrieval", "question answering"] texts2 = ["large language model", "information retrieval", "question answering"]
docsearch.add_documents( doc_id2 = docsearch.add_documents(
[Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts], [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in texts2],
doc_metadata={"test_num": "1"}, doc_metadata={"test_num": "2"},
) )
doc_ids = doc_id1 + doc_id2
# finally do a similarity search to see if all works okay # test without filter
output = docsearch.similarity_search( output1 = docsearch.similarity_search(
"large language model", "large language model",
k=2, k=2,
n_sentence_context=0, n_sentence_context=0,
)
assert len(output1) == 2
assert output1[0].page_content == "large language model"
assert output1[0].metadata["abbr"] == "llm"
assert output1[1].page_content == "information retrieval"
assert output1[1].metadata["abbr"] == "ir"
# test with metadata filter (doc level)
# since the query does not match test_num=1 directly we get "RAG" as the result
output2 = docsearch.similarity_search(
"large language model",
k=1,
n_sentence_context=0,
filter="doc.test_num = 1", filter="doc.test_num = 1",
) )
assert output[0].page_content == "large language model" assert len(output2) == 1
assert output[0].metadata == {"abbr": "llm"} assert output2[0].page_content == "retrieval augmented generation"
assert output[1].page_content == "information retrieval" assert output2[0].metadata["abbr"] == "rag"
assert output[1].metadata == {"abbr": "ir"}
for doc_id in doc_ids:
docsearch._delete_doc(doc_id)
def test_vectara_from_files() -> None: def test_vectara_from_files() -> None:
@@ -73,8 +91,9 @@ def test_vectara_from_files() -> None:
urllib.request.urlretrieve(url, name) urllib.request.urlretrieve(url, name)
files_list.append(name) files_list.append(name)
docsearch: Vectara = Vectara.from_files( docsearch: Vectara = Vectara()
files=files_list, doc_ids = docsearch.add_files(
files_list=files_list,
embedding=FakeEmbeddings(), embedding=FakeEmbeddings(),
metadatas=[{"url": url, "test_num": "2"} for url in urls], metadatas=[{"url": url, "test_num": "2"} for url in urls],
) )
@@ -101,7 +120,6 @@ def test_vectara_from_files() -> None:
n_sentence_context=1, n_sentence_context=1,
filter="doc.test_num = 2", filter="doc.test_num = 2",
) )
print(output[0].page_content)
assert output[0].page_content == ( assert output[0].page_content == (
"""\ """\
Note the use of hybrid in 3) above is different from that used sometimes in the literature, \ Note the use of hybrid in 3) above is different from that used sometimes in the literature, \
@@ -114,3 +132,6 @@ This classification scheme, however, misses a key insight gained in deep learnin
models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\ models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\
""" # noqa: E501 """ # noqa: E501
) )
for doc_id in doc_ids:
docsearch._delete_doc(doc_id)