Vectara upd2 (#6506)
Update to Vectara integration:
- By user request, added "add_files" to take advantage of Vectara's capability to process files on the backend, without the need for separate document loading and chunking in the chain.
- Updated the vectara.ipynb example notebook to be broader, and added testing of add_files().

@hwchase17 - project lead

Co-authored-by: rlm <pexpresss31@gmail.com>
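The new file-upload path can be sketched as follows, mirroring the from_files call exercised in the updated integration test below; the file path, query, and metadata values here are placeholders, not part of the commit:

```python
from langchain.vectorstores.vectara import Vectara
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

# Placeholder file path; per the commit message, Vectara processes and chunks
# the uploaded file on its backend, so no separate document loader or text
# splitter is needed in the chain.
vectara = Vectara.from_files(
    files=["/path/to/local/file.pdf"],
    embedding=FakeEmbeddings(),  # stand-in embedding, as used in the test below
    metadatas=[{"source": "example"}],  # hypothetical metadata
)
output = vectara.similarity_search("example query", k=2)
```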
committed by GitHub
parent 1feac83323
commit 153b56d19b
@@ -1,7 +1,16 @@
+import tempfile
+import urllib.request
+
 from langchain.docstore.document import Document
 from langchain.vectorstores.vectara import Vectara
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
 
+# For this test to run properly, please setup as follows
+# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
+# 2. Create an API_KEY for this corpus with permissions for query and indexing
+# 3. Setup environment variables:
+#    VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
+
 
 def get_abbr(s: str) -> str:
     words = s.split(" ")  # Split the string into words
@@ -12,25 +21,76 @@ def get_abbr(s: str) -> str:
 def test_vectara_add_documents() -> None:
     """Test end to end construction and search."""
 
-    # start with some initial documents
+    # start with some initial texts
     texts = ["grounded generation", "retrieval augmented generation", "data privacy"]
     docsearch: Vectara = Vectara.from_texts(
         texts,
         embedding=FakeEmbeddings(),
-        metadatas=[{"abbr": "gg"}, {"abbr": "rag"}, {"abbr": "dp"}],
+        metadatas=[
+            {"abbr": "gg", "test_num": "1"},
+            {"abbr": "rag", "test_num": "1"},
+            {"abbr": "dp", "test_num": "1"},
+        ],
+        doc_metadata={"test_num": "1"},
     )
 
     # then add some additional documents
     new_texts = ["large language model", "information retrieval", "question answering"]
     docsearch.add_documents(
-        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts]
+        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts],
+        doc_metadata={"test_num": "1"},
    )
 
     # finally do a similarity search to see if all works okay
     output = docsearch.similarity_search(
-        "large language model", k=2, n_sentence_context=0
+        "large language model",
+        k=2,
+        n_sentence_context=0,
+        filter="doc.test_num = 1",
     )
     assert output[0].page_content == "large language model"
     assert output[0].metadata == {"abbr": "llm"}
     assert output[1].page_content == "information retrieval"
     assert output[1].metadata == {"abbr": "ir"}
+
+
+def test_vectara_from_files() -> None:
+    """Test end to end construction and search."""
+
+    # download documents to local storage and then upload as files
+    # attention paper and deep learning book
+    urls = [
+        ("https://arxiv.org/pdf/1706.03762.pdf"),
+        (
+            "https://www.microsoft.com/en-us/research/wp-content/uploads/"
+            "2016/02/Final-DengYu-NOW-Book-DeepLearn2013-ForLecturesJuly2.docx"
+        ),
+    ]
+
+    files_list = []
+    for url in urls:
+        name = tempfile.NamedTemporaryFile().name
+        urllib.request.urlretrieve(url, name)
+        files_list.append(name)
+
+    docsearch: Vectara = Vectara.from_files(
+        files=files_list,
+        embedding=FakeEmbeddings(),
+        metadatas=[{"url": url, "test_num": "2"} for url in urls],
+    )
+
+    # finally do a similarity search to see if all works okay
+    output = docsearch.similarity_search(
+        "By the commonly adopted machine learning tradition",
+        k=1,
+        n_sentence_context=0,
+        filter="doc.test_num = 2",
+    )
+    print(output)
+    assert output[0].page_content == (
+        "By the commonly adopted machine learning tradition "
+        "(e.g., Chapter 28 in Murphy, 2012; Deng and Li, 2013), it may be natural "
+        "to just classify deep learning techniques into deep discriminative models "
+        "(e.g., DNNs) and deep probabilistic generative models (e.g., DBN, Deep "
+        "Boltzmann Machine (DBM))."
+    )
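As the setup comments in the test file note, running these integration tests requires a Vectara corpus with a "test_num" filter attribute and three environment variables. A minimal sketch of supplying them from Python before invoking the tests; the values are placeholders for your own credentials:

```python
import os

# Placeholder values; substitute your own Vectara credentials.
os.environ["VECTARA_CUSTOMER_ID"] = "<your-customer-id>"
os.environ["VECTARA_CORPUS_ID"] = "<your-corpus-id>"
os.environ["VECTARA_API_KEY"] = "<your-api-key>"
```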