Vectara upd2 (#6506)
Update to Vectara integration:
- By user request, added "add_files" to take advantage of Vectara's capability to process files on the backend, without the need for separate document loading and chunking in the chain.
- Updated the vectara.ipynb example notebook to be broader, and added testing of add_files().

@hwchase17 - project lead

Co-authored-by: rlm <pexpresss31@gmail.com>
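The new file-upload path can be sketched as follows, mirroring the from_files call exercised in the updated integration test below; the file path, query, and metadata values here are placeholders, not part of the commit:

```python
from langchain.vectorstores.vectara import Vectara
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

# Placeholder file path; per the commit message, Vectara processes and chunks
# the uploaded file on its backend, so no separate document loader or text
# splitter is needed in the chain.
vectara = Vectara.from_files(
    files=["/path/to/local/file.pdf"],
    embedding=FakeEmbeddings(),  # stand-in embedding, as used in the test below
    metadatas=[{"source": "example"}],  # hypothetical metadata
)
output = vectara.similarity_search("example query", k=2)
```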
committed by GitHub
parent 1feac83323
commit 153b56d19b
@@ -1,7 +1,16 @@
+import tempfile
+import urllib.request
+
 from langchain.docstore.document import Document
 from langchain.vectorstores.vectara import Vectara
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
 
+# For this test to run properly, please setup as follows
+# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
+# 2. Create an API_KEY for this corpus with permissions for query and indexing
+# 3. Setup environment variables:
+#    VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
+
 
 def get_abbr(s: str) -> str:
     words = s.split(" ")  # Split the string into words
@@ -12,25 +21,76 @@ def get_abbr(s: str) -> str:
 def test_vectara_add_documents() -> None:
     """Test end to end construction and search."""
 
-    # start with some initial documents
+    # start with some initial texts
     texts = ["grounded generation", "retrieval augmented generation", "data privacy"]
     docsearch: Vectara = Vectara.from_texts(
         texts,
         embedding=FakeEmbeddings(),
-        metadatas=[{"abbr": "gg"}, {"abbr": "rag"}, {"abbr": "dp"}],
+        metadatas=[
+            {"abbr": "gg", "test_num": "1"},
+            {"abbr": "rag", "test_num": "1"},
+            {"abbr": "dp", "test_num": "1"},
+        ],
+        doc_metadata={"test_num": "1"},
     )
 
     # then add some additional documents
     new_texts = ["large language model", "information retrieval", "question answering"]
     docsearch.add_documents(
-        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts]
+        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts],
+        doc_metadata={"test_num": "1"},
    )
 
     # finally do a similarity search to see if all works okay
     output = docsearch.similarity_search(
-        "large language model", k=2, n_sentence_context=0
+        "large language model",
+        k=2,
+        n_sentence_context=0,
+        filter="doc.test_num = 1",
     )
     assert output[0].page_content == "large language model"
     assert output[0].metadata == {"abbr": "llm"}
     assert output[1].page_content == "information retrieval"
     assert output[1].metadata == {"abbr": "ir"}
+
+
+def test_vectara_from_files() -> None:
+    """Test end to end construction and search."""
+
+    # download documents to local storage and then upload as files
+    # attention paper and deep learning book
+    urls = [
+        ("https://arxiv.org/pdf/1706.03762.pdf"),
+        (
+            "https://www.microsoft.com/en-us/research/wp-content/uploads/"
+            "2016/02/Final-DengYu-NOW-Book-DeepLearn2013-ForLecturesJuly2.docx"
+        ),
+    ]
+
+    files_list = []
+    for url in urls:
+        name = tempfile.NamedTemporaryFile().name
+        urllib.request.urlretrieve(url, name)
+        files_list.append(name)
+
+    docsearch: Vectara = Vectara.from_files(
+        files=files_list,
+        embedding=FakeEmbeddings(),
+        metadatas=[{"url": url, "test_num": "2"} for url in urls],
+    )
+
+    # finally do a similarity search to see if all works okay
+    output = docsearch.similarity_search(
+        "By the commonly adopted machine learning tradition",
+        k=1,
+        n_sentence_context=0,
+        filter="doc.test_num = 2",
+    )
+    print(output)
+    assert output[0].page_content == (
+        "By the commonly adopted machine learning tradition "
+        "(e.g., Chapter 28 in Murphy, 2012; Deng and Li, 2013), it may be natural "
+        "to just classify deep learning techniques into deep discriminative models "
+        "(e.g., DNNs) and deep probabilistic generative models (e.g., DBN, Deep "
+        "Boltzmann Machine (DBM))."
+    )
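As the setup comments in the test file note, running these integration tests requires a Vectara corpus with a "test_num" filter attribute and three environment variables. A minimal sketch of supplying them from Python before invoking the tests; the values are placeholders for your own credentials:

```python
import os

# Placeholder values; substitute your own Vectara credentials.
os.environ["VECTARA_CUSTOMER_ID"] = "<your-customer-id>"
os.environ["VECTARA_CORPUS_ID"] = "<your-corpus-id>"
os.environ["VECTARA_API_KEY"] = "<your-api-key>"
```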