From a75849623618dbae243e65871c9db3db9b397da4 Mon Sep 17 00:00:00 2001
From: Ofer Mendelevitch <ofer@vectara.com>
Date: Sun, 20 Aug 2023 16:00:14 -0700
Subject: [PATCH] Fixed issue with metadata in query (#9500)

- Description: Changed metadata retrieval so that query results combine
Vectara document-level and part-level metadata
  - Tag maintainer: @rlancemartin
  - Twitter handle: @ofermend
---
 .../langchain/vectorstores/vectara.py         | 26 ++++---
 .../vectorstores/test_vectara.py              | 77 ++++++++++++-------
 2 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/libs/langchain/langchain/vectorstores/vectara.py b/libs/langchain/langchain/vectorstores/vectara.py
index f263ebeac84..cd8ee9c9fad 100644
--- a/libs/langchain/langchain/vectorstores/vectara.py
+++ b/libs/langchain/langchain/vectorstores/vectara.py
@@ -202,12 +202,12 @@ class Vectara(VectorStore):
             doc_metadata: optional metadata for the document
 
         This function indexes all the input text strings in the Vectara corpus as a
-        single Vectara document, where each input text is considered a "part" and the
-        metadata are associated with each part.
+        single Vectara document, where each input text is considered a "section" and the
+        metadata are associated with each section.
         if 'doc_metadata' is provided, it is associated with the Vectara document.
 
         Returns:
-            List of ids from adding the texts into the vectorstore.
+            List containing the ID of the single Vectara document that was added.
 
         """
         doc_hash = md5()
@@ -307,21 +307,27 @@ class Vectara(VectorStore):
         result = response.json()
 
         responses = result["responseSet"][0]["response"]
-        vectara_default_metadata = ["lang", "len", "offset"]
+        documents = result["responseSet"][0]["document"]
+
+        metadatas = []
+        for x in responses:
+            md = {m["name"]: m["value"] for m in x["metadata"]}
+            doc_num = x["documentIndex"]
+            doc_md = {m["name"]: m["value"] for m in documents[doc_num]["metadata"]}
+            md.update(doc_md)
+            metadatas.append(md)
+
         docs = [
             (
                 Document(
                     page_content=x["text"],
-                    metadata={
-                        m["name"]: m["value"]
-                        for m in x["metadata"]
-                        if m["name"] not in vectara_default_metadata
-                    },
+                    metadata=md,
                 ),
                 x["score"],
             )
-            for x in responses
+            for x, md in zip(responses, metadatas)
         ]
+
         return docs
 
     def similarity_search(
diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py b/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py
index 57338e7f994..8fa3cd7f40d 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py
@@ -5,11 +5,14 @@ from langchain.docstore.document import Document
 from langchain.vectorstores.vectara import Vectara
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
 
-# For this test to run properly, please setup as follows
-# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
-# 2. Create an API_KEY for this corpus with permissions for query and indexing
-# 3. Setup environment variables:
+#
+# For this test to run properly, please set up as follows:
+# 1. Create a Vectara account: sign up at https://console.vectara.com/signup
+# 2. Create a corpus in your Vectara account, with a filter attribute called "test_num".
+# 3. Create an API_KEY for this corpus with permissions for query and indexing
+# 4. Set up environment variables:
 #    VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
+#
 
 
 def get_abbr(s: str) -> str:
@@ -21,37 +24,52 @@ def get_abbr(s: str) -> str:
 def test_vectara_add_documents() -> None:
     """Test end to end construction and search."""
 
-    # start with some initial texts
-    texts = ["grounded generation", "retrieval augmented generation", "data privacy"]
-    docsearch: Vectara = Vectara.from_texts(
-        texts,
-        embedding=FakeEmbeddings(),
-        metadatas=[
-            {"abbr": "gg", "test_num": "1"},
-            {"abbr": "rag", "test_num": "1"},
-            {"abbr": "dp", "test_num": "1"},
-        ],
+    # create a new Vectara instance
+    docsearch: Vectara = Vectara()
+
+    # start with some initial texts, added with add_texts
+    texts1 = ["grounded generation", "retrieval augmented generation", "data privacy"]
+    md = [{"abbr": get_abbr(t)} for t in texts1]
+    doc_id1 = docsearch.add_texts(
+        texts1,
+        metadatas=md,
         doc_metadata={"test_num": "1"},
     )
 
-    # then add some additional documents
-    new_texts = ["large language model", "information retrieval", "question answering"]
-    docsearch.add_documents(
-        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts],
-        doc_metadata={"test_num": "1"},
+    # then add some additional documents, now with add_documents
+    texts2 = ["large language model", "information retrieval", "question answering"]
+    doc_id2 = docsearch.add_documents(
+        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in texts2],
+        doc_metadata={"test_num": "2"},
     )
+    doc_ids = doc_id1 + doc_id2
 
-    # finally do a similarity search to see if all works okay
-    output = docsearch.similarity_search(
+    # test without filter
+    output1 = docsearch.similarity_search(
         "large language model",
         k=2,
         n_sentence_context=0,
+    )
+    assert len(output1) == 2
+    assert output1[0].page_content == "large language model"
+    assert output1[0].metadata["abbr"] == "llm"
+    assert output1[1].page_content == "information retrieval"
+    assert output1[1].metadata["abbr"] == "ir"
+
+    # test with metadata filter (doc level)
+    # the best match has test_num=2, so filtering on test_num=1 yields "RAG" instead
+    output2 = docsearch.similarity_search(
+        "large language model",
+        k=1,
+        n_sentence_context=0,
         filter="doc.test_num = 1",
     )
-    assert output[0].page_content == "large language model"
-    assert output[0].metadata == {"abbr": "llm"}
-    assert output[1].page_content == "information retrieval"
-    assert output[1].metadata == {"abbr": "ir"}
+    assert len(output2) == 1
+    assert output2[0].page_content == "retrieval augmented generation"
+    assert output2[0].metadata["abbr"] == "rag"
+
+    for doc_id in doc_ids:
+        docsearch._delete_doc(doc_id)
 
 
 def test_vectara_from_files() -> None:
@@ -73,8 +91,9 @@ def test_vectara_from_files() -> None:
         urllib.request.urlretrieve(url, name)
         files_list.append(name)
 
-    docsearch: Vectara = Vectara.from_files(
-        files=files_list,
+    docsearch: Vectara = Vectara()
+    doc_ids = docsearch.add_files(
+        files_list=files_list,
         embedding=FakeEmbeddings(),
         metadatas=[{"url": url, "test_num": "2"} for url in urls],
     )
@@ -101,7 +120,6 @@ def test_vectara_from_files() -> None:
         n_sentence_context=1,
         filter="doc.test_num = 2",
     )
-    print(output[0].page_content)
     assert output[0].page_content == (
         """\
 Note the use of “hybrid” in 3) above is different from that used sometimes in the literature, \
@@ -114,3 +132,6 @@ This classification scheme, however, misses a key insight gained in deep learnin
 models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\
 """  # noqa: E501
     )
+
+    for doc_id in doc_ids:
+        docsearch._delete_doc(doc_id)