From a75849623618dbae243e65871c9db3db9b397da4 Mon Sep 17 00:00:00 2001 From: Ofer Mendelevitch Date: Sun, 20 Aug 2023 16:00:14 -0700 Subject: [PATCH] Fixed issue with metadata in query (#9500) - Description: Changed metadata retrieval so that it combines Vectara doc level and part level metadata - Tag maintainer: @rlancemartin - Twitter handle: @ofermend --- .../langchain/vectorstores/vectara.py | 26 ++++--- .../vectorstores/test_vectara.py | 77 ++++++++++++------- 2 files changed, 65 insertions(+), 38 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/vectara.py b/libs/langchain/langchain/vectorstores/vectara.py index f263ebeac84..cd8ee9c9fad 100644 --- a/libs/langchain/langchain/vectorstores/vectara.py +++ b/libs/langchain/langchain/vectorstores/vectara.py @@ -202,12 +202,12 @@ class Vectara(VectorStore): doc_metadata: optional metadata for the document This function indexes all the input text strings in the Vectara corpus as a - single Vectara document, where each input text is considered a "part" and the - metadata are associated with each part. + single Vectara document, where each input text is considered a "section" and the + metadata are associated with each section. if 'doc_metadata' is provided, it is associated with the Vectara document. Returns: - List of ids from adding the texts into the vectorstore. + document ID of the document added """ doc_hash = md5() @@ -307,21 +307,27 @@ class Vectara(VectorStore): result = response.json() responses = result["responseSet"][0]["response"] - vectara_default_metadata = ["lang", "len", "offset"] + documents = result["responseSet"][0]["document"] + + metadatas = [] + for x in responses: + md = {m["name"]: m["value"] for m in x["metadata"]} + doc_num = x["documentIndex"] + doc_md = {m["name"]: m["value"] for m in documents[doc_num]["metadata"]} + md.update(doc_md) + metadatas.append(md) + docs = [ ( Document( page_content=x["text"], - metadata={ - m["name"]: m["value"] - for m in x["metadata"] - if m["name"] not in vectara_default_metadata - }, + metadata=md, ), x["score"], ) - for x in responses + for x, md in zip(responses, metadatas) ] + return docs def similarity_search( diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py b/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py index 57338e7f994..8fa3cd7f40d 100644 --- a/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py +++ b/libs/langchain/tests/integration_tests/vectorstores/test_vectara.py @@ -5,11 +5,14 @@ from langchain.docstore.document import Document from langchain.vectorstores.vectara import Vectara from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -# For this test to run properly, please setup as follows -# 1. Create a corpus in Vectara, with a filter attribute called "test_num". -# 2. Create an API_KEY for this corpus with permissions for query and indexing -# 3. Setup environment variables: +# +# For this test to run properly, please setup as follows: +# 1. Create a Vectara account: sign up at https://console.vectara.com/signup +# 2. Create a corpus in your Vectara account, with a filter attribute called "test_num". +# 3. Create an API_KEY for this corpus with permissions for query and indexing +# 4. Setup environment variables: # VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID +# def get_abbr(s: str) -> str: @@ -21,37 +24,52 @@ def get_abbr(s: str) -> str: def test_vectara_add_documents() -> None: """Test end to end construction and search.""" - # start with some initial texts - texts = ["grounded generation", "retrieval augmented generation", "data privacy"] - docsearch: Vectara = Vectara.from_texts( - texts, - embedding=FakeEmbeddings(), - metadatas=[ - {"abbr": "gg", "test_num": "1"}, - {"abbr": "rag", "test_num": "1"}, - {"abbr": "dp", "test_num": "1"}, - ], + # create a new Vectara instance + docsearch: Vectara = Vectara() + + # start with some initial texts, added with add_texts + texts1 = ["grounded generation", "retrieval augmented generation", "data privacy"] + md = [{"abbr": get_abbr(t)} for t in texts1] + doc_id1 = docsearch.add_texts( + texts1, + metadatas=md, doc_metadata={"test_num": "1"}, ) - # then add some additional documents - new_texts = ["large language model", "information retrieval", "question answering"] - docsearch.add_documents( - [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts], - doc_metadata={"test_num": "1"}, + # then add some additional documents, now with add_documents + texts2 = ["large language model", "information retrieval", "question answering"] + doc_id2 = docsearch.add_documents( + [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in texts2], + doc_metadata={"test_num": "2"}, ) + doc_ids = doc_id1 + doc_id2 - # finally do a similarity search to see if all works okay - output = docsearch.similarity_search( + # test without filter + output1 = docsearch.similarity_search( "large language model", k=2, n_sentence_context=0, + ) + assert len(output1) == 2 + assert output1[0].page_content == "large language model" + assert output1[0].metadata["abbr"] == "llm" + assert output1[1].page_content == "information retrieval" + assert output1[1].metadata["abbr"] == "ir" + + # test with metadata filter (doc level) + # since the query does not match test_num=1 directly we get "RAG" as the result + output2 = docsearch.similarity_search( + "large language model", + k=1, + n_sentence_context=0, filter="doc.test_num = 1", ) - assert output[0].page_content == "large language model" - assert output[0].metadata == {"abbr": "llm"} - assert output[1].page_content == "information retrieval" - assert output[1].metadata == {"abbr": "ir"} + assert len(output2) == 1 + assert output2[0].page_content == "retrieval augmented generation" + assert output2[0].metadata["abbr"] == "rag" + + for doc_id in doc_ids: + docsearch._delete_doc(doc_id) def test_vectara_from_files() -> None: @@ -73,8 +91,9 @@ def test_vectara_from_files() -> None: urllib.request.urlretrieve(url, name) files_list.append(name) - docsearch: Vectara = Vectara.from_files( - files=files_list, + docsearch: Vectara = Vectara() + doc_ids = docsearch.add_files( + files_list=files_list, embedding=FakeEmbeddings(), metadatas=[{"url": url, "test_num": "2"} for url in urls], ) @@ -101,7 +120,6 @@ def test_vectara_from_files() -> None: n_sentence_context=1, filter="doc.test_num = 2", ) - print(output[0].page_content) assert output[0].page_content == ( """\ Note the use of “hybrid” in 3) above is different from that used sometimes in the literature, \ @@ -114,3 +132,6 @@ This classification scheme, however, misses a key insight gained in deep learnin models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\ """ # noqa: E501 ) + + for doc_id in doc_ids: + docsearch._delete_doc(doc_id)