Upgrade the AwaDB from 0.3.5 to 0.3.6 (#7363)

ljeagle 2023-07-08 11:41:17 +08:00 committed by GitHub
parent c5edbea34a
commit fb6e63dc36
2 changed files with 814 additions and 72 deletions


@@ -3,11 +3,14 @@ from __future__ import annotations
 import logging
 import uuid
-from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type
+
+import numpy as np
 
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.utils import maximal_marginal_relevance
 
 # from pydantic import BaseModel, Field, root_validator
@@ -30,9 +33,19 @@ class AwaDB(VectorStore):
         embedding: Optional[Embeddings] = None,
         log_and_data_dir: Optional[str] = None,
         client: Optional[awadb.Client] = None,
+        **kwargs: Any,
     ) -> None:
-        """Initialize with AwaDB client."""
+        """Initialize with AwaDB client.
+
+        Args:
+            table_name: Name of the table to create or reuse.
+            embedding: Optional Embeddings used to embed the texts.
+            log_and_data_dir: Optional directory for logs and persisted data.
+            client: Optional AwaDB client.
+            kwargs: Reserved for possible future extension parameters.
+
+        Returns:
+            None.
+        """
         try:
             import awadb
         except ImportError:
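The constructor and add_texts signatures touched above can be exercised as in the following sketch. FakeEmbeddings, the table name, and the data directory are illustrative stand-ins and are not part of this commit; any Embeddings implementation and writable directory would do.

from langchain.embeddings import FakeEmbeddings  # stand-in embedding, for illustration only
from langchain.vectorstores import AwaDB

embedding = FakeEmbeddings(size=128)
store = AwaDB(
    table_name="langchain_demo",         # assumed table name
    embedding=embedding,
    log_and_data_dir="/tmp/awadb_demo",  # assumed persistence directory
)
ids = store.add_texts(
    ["AwaDB is an embedded vector store.", "LangChain wraps it as a VectorStore."],
    metadatas=[{"source": "doc-1"}, {"source": "doc-2"}],
)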
@@ -71,7 +84,7 @@ class AwaDB(VectorStore):
             texts: Iterable of strings to add to the vectorstore.
             metadatas: Optional list of metadatas associated with the texts.
             is_duplicate_texts: Optional whether to duplicate texts.
-            kwargs: vectorstore specific parameters.
+            kwargs: Reserved for possible future extension parameters.
 
         Returns:
             List of ids from adding the texts into the vectorstore.
@@ -99,6 +112,16 @@ class AwaDB(VectorStore):
         table_name: str,
         **kwargs: Any,
     ) -> bool:
+        """Load the locally persisted table with the specified name.
+
+        Args:
+            table_name: Table name.
+            kwargs: Reserved for possible future extension parameters.
+
+        Returns:
+            Success or failure of loading the specified local table.
+        """
         if self.awadb_client is None:
             raise ValueError("AwaDB client is None!!!")
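A minimal sketch of the load_local path documented above, assuming a table named "langchain_demo" was previously persisted under the same log_and_data_dir (both names are hypothetical, carried over from the earlier sketch):

from langchain.vectorstores import AwaDB

store = AwaDB(log_and_data_dir="/tmp/awadb_demo")  # re-open the persisted data directory
if not store.load_local("langchain_demo"):         # returns True on success, False otherwise
    raise RuntimeError("local table could not be loaded")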
@@ -110,7 +133,17 @@ class AwaDB(VectorStore):
         k: int = DEFAULT_TOPN,
         **kwargs: Any,
     ) -> List[Document]:
-        """Return docs most similar to query."""
+        """Return docs most similar to query.
+
+        Args:
+            query: Text query.
+            k: The maximum number of documents to return.
+            kwargs: Reserved for possible future extension parameters.
+
+        Returns:
+            The k most similar documents to the specified text query.
+        """
         if self.awadb_client is None:
             raise ValueError("AwaDB client is None!!!")
@@ -123,7 +156,10 @@ class AwaDB(VectorStore):
             llm = llm_embedding.LLMEmbedding()
             embedding = llm.Embedding(query)
 
-        return self.similarity_search_by_vector(embedding, k)
+        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
+        return self.similarity_search_by_vector(
+            embedding, k, not_include_fields_in_metadata=not_include_fields
+        )
 
     def similarity_search_with_score(
         self,
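With the change above, the metadata of each returned Document no longer carries the raw "text_embedding", "_id", or "score" fields. A usage sketch, reusing the hypothetical store from the earlier add_texts example:

docs = store.similarity_search("What is AwaDB?", k=2)
for doc in docs:
    print(doc.page_content, doc.metadata)  # metadata without text_embedding/_id/score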
@@ -131,9 +167,16 @@ class AwaDB(VectorStore):
         k: int = DEFAULT_TOPN,
         **kwargs: Any,
     ) -> List[Tuple[Document, float]]:
-        """Return docs and relevance scores, normalized on a scale from 0 to 1.
-
-        0 is dissimilar, 1 is most similar.
+        """Return the k most similar documents to the specified query, with scores.
+
+        Args:
+            query: Text query.
+            k: The number of most similar documents to return.
+            kwargs: Reserved for possible future extension parameters.
+
+        Returns:
+            The k most similar documents to the specified text query, each paired
+            with its score. 0 is dissimilar, 1 is the most similar.
         """
 
         if self.awadb_client is None:
@@ -150,17 +193,18 @@ class AwaDB(VectorStore):
         results: List[Tuple[Document, float]] = []
 
-        scores: List[float] = []
-        retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)
-
-        L2_Norm = 0.0
-        for score in scores:
-            L2_Norm = L2_Norm + score * score
-
-        L2_Norm = pow(L2_Norm, 0.5)
+        dists: List[float] = []
+        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
+        retrieval_docs = self.similarity_search_by_vector(
+            embedding,
+            k,
+            scores=dists,
+            not_include_fields_in_metadata=not_include_fields,
+        )
+
         doc_no = 0
         for doc in retrieval_docs:
-            doc_tuple = (doc, 1 - (scores[doc_no] / L2_Norm))
+            doc_tuple = (doc, dists[doc_no])
             results.append(doc_tuple)
             doc_no = doc_no + 1
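Before this hunk, similarity_search_with_score rescaled the backend scores by the L2 norm of the whole score list; afterwards the values reported by AwaDB are passed through unchanged. The removed arithmetic can be reproduced standalone as below; the raw values are hypothetical:

import math

raw = [0.12, 0.34, 0.50]                      # hypothetical backend scores for k=3 hits
l2_norm = math.sqrt(sum(s * s for s in raw))  # the norm the removed code computed
old_style = [1 - s / l2_norm for s in raw]    # what the 0.3.5-era wrapper returned
new_style = raw                               # what the wrapper returns after this change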
@@ -172,9 +216,17 @@ class AwaDB(VectorStore):
         k: int = DEFAULT_TOPN,
         **kwargs: Any,
     ) -> List[Tuple[Document, float]]:
-        """Return docs and relevance scores, normalized on a scale from 0 to 1.
-
-        0 is dissimilar, 1 is most similar.
+        """Return docs and relevance scores, which denote the InnerProduct
+        distance and range from 0 to 1.
+
+        Args:
+            query: Text query.
+            k: Number of the most similar documents to return. Defaults to 4.
+
+        Returns:
+            List of (Document, relevance_score) tuples similar to the text query.
+            Note that relevance_score ranges from 0 to 1;
+            0 is dissimilar, 1 is the most similar.
         """
 
         if self.awadb_client is None:
@@ -191,17 +243,18 @@ class AwaDB(VectorStore):
         if show_results.__len__() == 0:
             return results
 
-        scores: List[float] = []
-        retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)
-
-        L2_Norm = 0.0
-        for score in scores:
-            L2_Norm = L2_Norm + score * score
-
-        L2_Norm = pow(L2_Norm, 0.5)
+        dists: List[float] = []
+        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
+        retrieval_docs = self.similarity_search_by_vector(
+            embedding,
+            k,
+            scores=dists,
+            not_include_fields_in_metadata=not_include_fields,
+        )
+
         doc_no = 0
         for doc in retrieval_docs:
-            doc_tuple = (doc, 1 - scores[doc_no] / L2_Norm)
+            doc_tuple = (doc, dists[doc_no])
             results.append(doc_tuple)
             doc_no = doc_no + 1
@@ -212,6 +265,7 @@ class AwaDB(VectorStore):
         embedding: Optional[List[float]] = None,
         k: int = DEFAULT_TOPN,
         scores: Optional[list] = None,
+        not_include_fields_in_metadata: Optional[Set[str]] = None,
         **kwargs: Any,
     ) -> List[Document]:
         """Return docs most similar to embedding vector.
@@ -219,9 +273,11 @@ class AwaDB(VectorStore):
         Args:
             embedding: Embedding to look up documents similar to.
             k: Number of Documents to return. Defaults to 4.
+            scores: Optional output list that collects the score of each retrieved doc.
+            not_include_fields_in_metadata: Metadata fields to exclude from each
+                returned document.
 
         Returns:
-            List of Documents most similar to the query vector.
+            List of Documents which are the most similar to the query vector.
         """
 
         if self.awadb_client is None:
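An illustrative call using the extended signature documented above; store and embedding are the hypothetical objects from the first sketch, and the exclusion set mirrors the one used internally by similarity_search:

query_vec = embedding.embed_query("What is AwaDB?")
dists: list = []
docs = store.similarity_search_by_vector(
    query_vec,
    k=2,
    scores=dists,  # filled with one score per returned document
    not_include_fields_in_metadata={"text_embedding", "_id", "score"},
)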
@@ -232,7 +288,9 @@ class AwaDB(VectorStore):
         if embedding is None:
             return results
 
-        show_results = self.awadb_client.Search(embedding, k)
+        show_results = self.awadb_client.Search(
+            embedding, k, not_include_fields=not_include_fields_in_metadata
+        )
 
         if show_results.__len__() == 0:
             return results
@@ -241,26 +299,200 @@ class AwaDB(VectorStore):
             content = ""
             meta_data = {}
             for item_key in item_detail:
-                if (
-                    item_key == "Field@0"
-                    and self.using_table_name in self.table2embeddings
-                ):  # text for the document
-                    content = item_detail[item_key]
-                elif item_key == "embedding_text":
-                    content = item_detail[item_key]
-                elif (
-                    item_key == "Field@1" or item_key == "text_embedding"
-                ):  # embedding field for the document
-                    continue
-                elif item_key == "score":  # L2 distance
-                    if scores is not None:
-                        score = item_detail[item_key]
-                        scores.append(score)
-                else:
-                    meta_data[item_key] = item_detail[item_key]
+                if item_key == "embedding_text":
+                    content = item_detail[item_key]
+                    continue
+                elif item_key == "score":
+                    if scores is not None:
+                        scores.append(item_detail[item_key])
+                    continue
+                elif not_include_fields_in_metadata is not None:
+                    if item_key in not_include_fields_in_metadata:
+                        continue
+                meta_data[item_key] = item_detail[item_key]
             results.append(Document(page_content=content, metadata=meta_data))
         return results
+
+    def max_marginal_relevance_search(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to the query AND
+        diversity among the selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to the MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                of diversity among the results, with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        embedding: List[float] = []
+        if self.using_table_name in self.table2embeddings:
+            embedding = self.table2embeddings[self.using_table_name].embed_query(query)
+        else:
+            from awadb import llm_embedding
+
+            llm = llm_embedding.LLMEmbedding()
+            embedding = llm.Embedding(query)
+
+        if embedding.__len__() == 0:
+            return []
+
+        results = self.max_marginal_relevance_search_by_vector(
+            embedding, k, fetch_k, lambda_mult=lambda_mult
+        )
+        return results
+
+    def max_marginal_relevance_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to the query AND
+        diversity among the selected documents.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to the MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                of diversity among the results, with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        results: List[Document] = []
+        if embedding is None:
+            return results
+
+        # Keep "text_embedding" in the metadata so MMR can re-rank on it below.
+        not_include_fields: set = {"_id", "score"}
+        retrieved_docs = self.similarity_search_by_vector(
+            embedding, fetch_k, not_include_fields_in_metadata=not_include_fields
+        )
+
+        top_embeddings = []
+        for doc in retrieved_docs:
+            top_embeddings.append(doc.metadata["text_embedding"])
+
+        selected_docs = maximal_marginal_relevance(
+            np.array(embedding, dtype=np.float32),
+            embedding_list=top_embeddings,
+            lambda_mult=lambda_mult,
+            k=k,
+        )
+
+        for s_id in selected_docs:
+            if "text_embedding" in retrieved_docs[s_id].metadata:
+                del retrieved_docs[s_id].metadata["text_embedding"]
+            results.append(retrieved_docs[s_id])
+        return results
+
+    def get(
+        self,
+        ids: List[str],
+        not_include_fields: Optional[Set[str]] = None,
+        **kwargs: Any,
+    ) -> Dict[str, Document]:
+        """Return the docs with the specified ids.
+
+        Args:
+            ids: The ids of the embedding vectors.
+            not_include_fields: Fields excluded from the metadata of the
+                returned documents.
+
+        Returns:
+            The documents which have the specified ids, keyed by id.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields)
+
+        results: Dict[str, Document] = {}
+        for doc_detail in docs_detail:
+            content = ""
+            meta_info = {}
+            for field in doc_detail:
+                if field == "embedding_text":
+                    content = doc_detail[field]
+                    continue
+                elif field == "text_embedding" or field == "_id":
+                    continue
+                meta_info[field] = doc_detail[field]
+            doc = Document(page_content=content, metadata=meta_info)
+            results[doc_detail["_id"]] = doc
+        return results
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> Optional[bool]:
+        """Delete the documents which have the specified ids.
+
+        Args:
+            ids: The ids of the embedding vectors.
+            **kwargs: Other keyword arguments that subclasses might use.
+
+        Returns:
+            Optional[bool]: True if deletion is successful,
+            False otherwise, None if not implemented.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        ret: Optional[bool] = None
+        if ids is None or ids.__len__() == 0:
+            return ret
+        ret = self.awadb_client.Delete(ids)
+        return ret
+
+    def update(
+        self,
+        ids: List[str],
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Update the documents which have the specified ids.
+
+        Args:
+            ids: The ids of the embedding vectors to update.
+            texts: The new texts of the documents being updated.
+            metadatas: The new metadatas of the documents being updated.
+
+        Returns:
+            The ids of the updated documents.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        return self.awadb_client.UpdateTexts(
+            ids=ids, text_field_name="embedding_text", texts=texts, metadatas=metadatas
+        )
 
     def create_table(
         self,
         table_name: str,
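The methods added in this hunk can be driven as in the sketch below, continuing the earlier hypothetical store and the ids returned by add_texts; the parameter values are arbitrary:

diverse_docs = store.max_marginal_relevance_search(
    "What is AwaDB?", k=2, fetch_k=8, lambda_mult=0.5
)

fetched = store.get(ids)                  # Dict[str, Document] keyed by document id
store.update(
    ids=[ids[0]],
    texts=["AwaDB is an embedded vector database."],
    metadatas=[{"source": "doc-1-rev2"}],
)
store.delete([ids[1]])                    # True on success, per the docstring above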
@@ -364,7 +596,8 @@ class AwaDB(VectorStore):
             embedding (Optional[Embeddings]): Embedding function. Defaults to None.
             table_name (str): Name of the table to create.
             log_and_data_dir (Optional[str]): Directory to persist the table.
-            client (Optional[awadb.Client]): AwaDB client
+            client (Optional[awadb.Client]): AwaDB client.
+            kwargs: Reserved for possible future extension parameters.
 
         Returns:
             AwaDB: AwaDB vectorstore.
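A one-call construction sketch for the from_texts path documented above; FakeEmbeddings and the paths are again illustrative stand-ins rather than anything mandated by the commit:

from langchain.embeddings import FakeEmbeddings
from langchain.vectorstores import AwaDB

store2 = AwaDB.from_texts(
    texts=["alpha", "beta"],
    embedding=FakeEmbeddings(size=128),
    table_name="from_texts_demo",
    log_and_data_dir="/tmp/awadb_from_texts",
)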

poetry.lock (generated, 569 changed lines): diff suppressed because it is too large.