Fixed issue with metadata in query (#9500)

- Description: Changed metadata retrieval so that it combines Vectara doc-level and part-level metadata
  - Tag maintainer: @rlancemartin
  - Twitter handle: @ofermend
This commit is contained in:
Ofer Mendelevitch 2023-08-20 16:00:14 -07:00 committed by GitHub
parent 103094286e
commit a758496236
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 65 additions and 38 deletions

View File

@@ -202,12 +202,12 @@ class Vectara(VectorStore):
doc_metadata: optional metadata for the document doc_metadata: optional metadata for the document
This function indexes all the input text strings in the Vectara corpus as a This function indexes all the input text strings in the Vectara corpus as a
single Vectara document, where each input text is considered a "part" and the single Vectara document, where each input text is considered a "section" and the
metadata are associated with each part. metadata are associated with each section.
if 'doc_metadata' is provided, it is associated with the Vectara document. if 'doc_metadata' is provided, it is associated with the Vectara document.
Returns: Returns:
List of ids from adding the texts into the vectorstore. document ID of the document added
""" """
doc_hash = md5() doc_hash = md5()
@@ -307,21 +307,27 @@ class Vectara(VectorStore):
result = response.json() result = response.json()
responses = result["responseSet"][0]["response"] responses = result["responseSet"][0]["response"]
vectara_default_metadata = ["lang", "len", "offset"] documents = result["responseSet"][0]["document"]
metadatas = []
for x in responses:
md = {m["name"]: m["value"] for m in x["metadata"]}
doc_num = x["documentIndex"]
doc_md = {m["name"]: m["value"] for m in documents[doc_num]["metadata"]}
md.update(doc_md)
metadatas.append(md)
docs = [ docs = [
( (
Document( Document(
page_content=x["text"], page_content=x["text"],
metadata={ metadata=md,
m["name"]: m["value"]
for m in x["metadata"]
if m["name"] not in vectara_default_metadata
},
), ),
x["score"], x["score"],
) )
for x in responses for x, md in zip(responses, metadatas)
] ]
return docs return docs
def similarity_search( def similarity_search(

View File

@@ -5,11 +5,14 @@ from langchain.docstore.document import Document
from langchain.vectorstores.vectara import Vectara from langchain.vectorstores.vectara import Vectara
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
# For this test to run properly, please setup as follows #
# 1. Create a corpus in Vectara, with a filter attribute called "test_num". # For this test to run properly, please setup as follows:
# 2. Create an API_KEY for this corpus with permissions for query and indexing # 1. Create a Vectara account: sign up at https://console.vectara.com/signup
# 3. Setup environment variables: # 2. Create a corpus in your Vectara account, with a filter attribute called "test_num".
# 3. Create an API_KEY for this corpus with permissions for query and indexing
# 4. Setup environment variables:
# VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID # VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
#
def get_abbr(s: str) -> str: def get_abbr(s: str) -> str:
@@ -21,37 +24,52 @@ def get_abbr(s: str) -> str:
def test_vectara_add_documents() -> None: def test_vectara_add_documents() -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
# start with some initial texts # create a new Vectara instance
texts = ["grounded generation", "retrieval augmented generation", "data privacy"] docsearch: Vectara = Vectara()
docsearch: Vectara = Vectara.from_texts(
texts, # start with some initial texts, added with add_texts
embedding=FakeEmbeddings(), texts1 = ["grounded generation", "retrieval augmented generation", "data privacy"]
metadatas=[ md = [{"abbr": get_abbr(t)} for t in texts1]
{"abbr": "gg", "test_num": "1"}, doc_id1 = docsearch.add_texts(
{"abbr": "rag", "test_num": "1"}, texts1,
{"abbr": "dp", "test_num": "1"}, metadatas=md,
],
doc_metadata={"test_num": "1"}, doc_metadata={"test_num": "1"},
) )
# then add some additional documents # then add some additional documents, now with add_documents
new_texts = ["large language model", "information retrieval", "question answering"] texts2 = ["large language model", "information retrieval", "question answering"]
docsearch.add_documents( doc_id2 = docsearch.add_documents(
[Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts], [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in texts2],
doc_metadata={"test_num": "1"}, doc_metadata={"test_num": "2"},
) )
doc_ids = doc_id1 + doc_id2
# finally do a similarity search to see if all works okay # test without filter
output = docsearch.similarity_search( output1 = docsearch.similarity_search(
"large language model", "large language model",
k=2, k=2,
n_sentence_context=0, n_sentence_context=0,
)
assert len(output1) == 2
assert output1[0].page_content == "large language model"
assert output1[0].metadata["abbr"] == "llm"
assert output1[1].page_content == "information retrieval"
assert output1[1].metadata["abbr"] == "ir"
# test with metadata filter (doc level)
# since the query does not match test_num=1 directly we get "RAG" as the result
output2 = docsearch.similarity_search(
"large language model",
k=1,
n_sentence_context=0,
filter="doc.test_num = 1", filter="doc.test_num = 1",
) )
assert output[0].page_content == "large language model" assert len(output2) == 1
assert output[0].metadata == {"abbr": "llm"} assert output2[0].page_content == "retrieval augmented generation"
assert output[1].page_content == "information retrieval" assert output2[0].metadata["abbr"] == "rag"
assert output[1].metadata == {"abbr": "ir"}
for doc_id in doc_ids:
docsearch._delete_doc(doc_id)
def test_vectara_from_files() -> None: def test_vectara_from_files() -> None:
@@ -73,8 +91,9 @@ def test_vectara_from_files() -> None:
urllib.request.urlretrieve(url, name) urllib.request.urlretrieve(url, name)
files_list.append(name) files_list.append(name)
docsearch: Vectara = Vectara.from_files( docsearch: Vectara = Vectara()
files=files_list, doc_ids = docsearch.add_files(
files_list=files_list,
embedding=FakeEmbeddings(), embedding=FakeEmbeddings(),
metadatas=[{"url": url, "test_num": "2"} for url in urls], metadatas=[{"url": url, "test_num": "2"} for url in urls],
) )
@@ -101,7 +120,6 @@ def test_vectara_from_files() -> None:
n_sentence_context=1, n_sentence_context=1,
filter="doc.test_num = 2", filter="doc.test_num = 2",
) )
print(output[0].page_content)
assert output[0].page_content == ( assert output[0].page_content == (
"""\ """\
Note the use of hybrid in 3) above is different from that used sometimes in the literature, \ Note the use of hybrid in 3) above is different from that used sometimes in the literature, \
@@ -114,3 +132,6 @@ This classification scheme, however, misses a key insight gained in deep learnin
models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\ models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\
""" # noqa: E501 """ # noqa: E501
) )
for doc_id in doc_ids:
docsearch._delete_doc(doc_id)