mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 07:35:18 +00:00
Upgrade the AwaDB from 0.3.5 to 0.3.6 (#7363)
This commit is contained in:
parent
c5edbea34a
commit
fb6e63dc36
@ -3,11 +3,14 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
|
||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type
|
||||
|
||||
import numpy as np
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
# from pydantic import BaseModel, Field, root_validator
|
||||
|
||||
@ -30,9 +33,19 @@ class AwaDB(VectorStore):
|
||||
embedding: Optional[Embeddings] = None,
|
||||
log_and_data_dir: Optional[str] = None,
|
||||
client: Optional[awadb.Client] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize with AwaDB client."""
|
||||
"""Initialize with AwaDB client.
|
||||
Args:
|
||||
table_name: Name of the table to create or use.
|
||||
embedding: Optional embedding function used to embed the texts.
|
||||
log_and_data_dir: Optional directory in which to persist the logs and table data.
|
||||
client: Optional AwaDB client.
|
||||
kwargs: any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
None.
|
||||
"""
|
||||
try:
|
||||
import awadb
|
||||
except ImportError:
|
||||
@ -71,7 +84,7 @@ class AwaDB(VectorStore):
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
is_duplicate_texts: Optional whether to duplicate texts.
|
||||
kwargs: vectorstore specific parameters.
|
||||
kwargs: any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
@ -99,6 +112,16 @@ class AwaDB(VectorStore):
|
||||
table_name: str,
|
||||
**kwargs: Any,
|
||||
) -> bool:
|
||||
"""Load the local specified table.
|
||||
|
||||
Args:
|
||||
table_name: Table name
|
||||
kwargs: Any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
Success or failure of loading the local specified table
|
||||
"""
|
||||
|
||||
if self.awadb_client is None:
|
||||
raise ValueError("AwaDB client is None!!!")
|
||||
|
||||
@ -110,7 +133,17 @@ class AwaDB(VectorStore):
|
||||
k: int = DEFAULT_TOPN,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to query."""
|
||||
"""Return docs most similar to query.
|
||||
|
||||
Args:
|
||||
query: Text query.
|
||||
k: The maximum number of documents to return.
|
||||
kwargs: Any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
Returns the k most similar documents to the specified text query.
|
||||
"""
|
||||
|
||||
if self.awadb_client is None:
|
||||
raise ValueError("AwaDB client is None!!!")
|
||||
|
||||
@ -123,7 +156,10 @@ class AwaDB(VectorStore):
|
||||
llm = llm_embedding.LLMEmbedding()
|
||||
embedding = llm.Embedding(query)
|
||||
|
||||
return self.similarity_search_by_vector(embedding, k)
|
||||
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
|
||||
return self.similarity_search_by_vector(
|
||||
embedding, k, not_include_fields_in_metadata=not_include_fields
|
||||
)
|
||||
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
@ -131,9 +167,16 @@ class AwaDB(VectorStore):
|
||||
k: int = DEFAULT_TOPN,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs and relevance scores, normalized on a scale from 0 to 1.
|
||||
"""The most k similar documents and scores of the specified query.
|
||||
|
||||
0 is dissimilar, 1 is most similar.
|
||||
Args:
|
||||
query: Text query.
|
||||
k: The maximum number of most similar documents to return.
|
||||
kwargs: Any possible extend parameters in the future.
|
||||
|
||||
Returns:
|
||||
The k most similar documents to the specified text query.
|
||||
0 is dissimilar, 1 is the most similar.
|
||||
"""
|
||||
|
||||
if self.awadb_client is None:
|
||||
@ -150,17 +193,18 @@ class AwaDB(VectorStore):
|
||||
|
||||
results: List[Tuple[Document, float]] = []
|
||||
|
||||
scores: List[float] = []
|
||||
retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)
|
||||
dists: List[float] = []
|
||||
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
|
||||
retrieval_docs = self.similarity_search_by_vector(
|
||||
embedding,
|
||||
k,
|
||||
scores=dists,
|
||||
not_include_fields_in_metadata=not_include_fields,
|
||||
)
|
||||
|
||||
L2_Norm = 0.0
|
||||
for score in scores:
|
||||
L2_Norm = L2_Norm + score * score
|
||||
|
||||
L2_Norm = pow(L2_Norm, 0.5)
|
||||
doc_no = 0
|
||||
for doc in retrieval_docs:
|
||||
doc_tuple = (doc, 1 - (scores[doc_no] / L2_Norm))
|
||||
doc_tuple = (doc, dists[doc_no])
|
||||
results.append(doc_tuple)
|
||||
doc_no = doc_no + 1
|
||||
|
||||
@ -172,9 +216,17 @@ class AwaDB(VectorStore):
|
||||
k: int = DEFAULT_TOPN,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs and relevance scores, normalized on a scale from 0 to 1.
|
||||
"""Return docs and relevance scores
|
||||
which denote the InnerProduct distance, range from 0 to 1.
|
||||
|
||||
0 is dissimilar, 1 is most similar.
|
||||
Args:
|
||||
query: Text query.
|
||||
k: Number of the most similar documents to return. Defaults to 4.
|
||||
|
||||
Returns:
|
||||
List of (Document, relevance_score) tuples similar to the text query.
|
||||
Note that relevance_score ranged from 0 to 1.
|
||||
0 is dissimilar, 1 is the most similar.
|
||||
"""
|
||||
|
||||
if self.awadb_client is None:
|
||||
@ -191,17 +243,18 @@ class AwaDB(VectorStore):
|
||||
if show_results.__len__() == 0:
|
||||
return results
|
||||
|
||||
scores: List[float] = []
|
||||
retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)
|
||||
dists: List[float] = []
|
||||
not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
|
||||
retrieval_docs = self.similarity_search_by_vector(
|
||||
embedding,
|
||||
k,
|
||||
scores=dists,
|
||||
not_include_fields_in_metadata=not_include_fields,
|
||||
)
|
||||
|
||||
L2_Norm = 0.0
|
||||
for score in scores:
|
||||
L2_Norm = L2_Norm + score * score
|
||||
|
||||
L2_Norm = pow(L2_Norm, 0.5)
|
||||
doc_no = 0
|
||||
for doc in retrieval_docs:
|
||||
doc_tuple = (doc, 1 - scores[doc_no] / L2_Norm)
|
||||
doc_tuple = (doc, dists[doc_no])
|
||||
results.append(doc_tuple)
|
||||
doc_no = doc_no + 1
|
||||
|
||||
@ -212,6 +265,7 @@ class AwaDB(VectorStore):
|
||||
embedding: Optional[List[float]] = None,
|
||||
k: int = DEFAULT_TOPN,
|
||||
scores: Optional[list] = None,
|
||||
not_include_fields_in_metadata: Optional[Set[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to embedding vector.
|
||||
@ -219,9 +273,11 @@ class AwaDB(VectorStore):
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
scores: Scores for retrieved docs.
|
||||
not_include_fields_in_metadata: Metadata fields to leave out of each document.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query vector.
|
||||
List of Documents which are the most similar to the query vector.
|
||||
"""
|
||||
|
||||
if self.awadb_client is None:
|
||||
@ -232,7 +288,9 @@ class AwaDB(VectorStore):
|
||||
if embedding is None:
|
||||
return results
|
||||
|
||||
show_results = self.awadb_client.Search(embedding, k)
|
||||
show_results = self.awadb_client.Search(
|
||||
embedding, k, not_include_fields=not_include_fields_in_metadata
|
||||
)
|
||||
|
||||
if show_results.__len__() == 0:
|
||||
return results
|
||||
@ -241,26 +299,200 @@ class AwaDB(VectorStore):
|
||||
content = ""
|
||||
meta_data = {}
|
||||
for item_key in item_detail:
|
||||
if (
|
||||
item_key == "Field@0"
|
||||
and self.using_table_name in self.table2embeddings
|
||||
): # text for the document
|
||||
if item_key == "embedding_text":
|
||||
content = item_detail[item_key]
|
||||
elif item_key == "embedding_text":
|
||||
content = item_detail[item_key]
|
||||
elif (
|
||||
item_key == "Field@1" or item_key == "text_embedding"
|
||||
): # embedding field for the document
|
||||
continue
|
||||
elif item_key == "score": # L2 distance
|
||||
elif item_key == "score":
|
||||
if scores is not None:
|
||||
score = item_detail[item_key]
|
||||
scores.append(score)
|
||||
else:
|
||||
meta_data[item_key] = item_detail[item_key]
|
||||
scores.append(item_detail[item_key])
|
||||
continue
|
||||
elif not_include_fields_in_metadata is not None:
|
||||
if item_key in not_include_fields_in_metadata:
|
||||
continue
|
||||
meta_data[item_key] = item_detail[item_key]
|
||||
results.append(Document(page_content=content, metadata=meta_data))
|
||||
return results
|
||||
|
||||
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of Documents selected by maximal marginal relevance, or an
        empty list when the query yields an empty embedding.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    if self.using_table_name in self.table2embeddings:
        embedding = self.table2embeddings[self.using_table_name].embed_query(query)
    else:
        # No embedding model is registered for the table in use, so fall
        # back to the embedding helper shipped in the awadb package.
        from awadb import llm_embedding

        embedding = llm_embedding.LLMEmbedding().Embedding(query)

    # len() rather than truthiness: the embedding may be an array-like
    # object whose truth value is ambiguous.
    if len(embedding) == 0:
        return []

    return self.max_marginal_relevance_search_by_vector(
        embedding, k, fetch_k, lambda_mult=lambda_mult
    )
|
||||
|
||||
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of Documents selected by maximal marginal relevance. The
        "text_embedding" entry is removed from each returned document's
        metadata.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    if embedding is None:
        return []

    # "text_embedding" is deliberately NOT excluded here: the MMR step
    # needs every candidate's vector, which is read from the metadata.
    not_include_fields: Set[str] = {"_id", "score"}
    retrieved_docs = self.similarity_search_by_vector(
        embedding, fetch_k, not_include_fields_in_metadata=not_include_fields
    )
    if not retrieved_docs:
        # Nothing to rank; skip the array conversion and the MMR call.
        return []

    top_embeddings = [doc.metadata["text_embedding"] for doc in retrieved_docs]

    # Forward k and lambda_mult; otherwise the helper silently falls back
    # to its own defaults and the caller's values are ignored.
    selected_indices = maximal_marginal_relevance(
        np.array(embedding, dtype=np.float32),
        embedding_list=top_embeddings,
        lambda_mult=lambda_mult,
        k=k,
    )

    results: List[Document] = []
    for index in selected_indices:
        doc = retrieved_docs[index]
        # The raw vector was only needed for ranking; do not expose it.
        doc.metadata.pop("text_embedding", None)
        results.append(doc)
    return results
|
||||
|
||||
def get(
    self,
    ids: List[str],
    not_include_fields: Optional[Set[str]] = None,
    **kwargs: Any,
) -> Dict[str, Document]:
    """Return the documents that have the given ids.

    Args:
        ids: The ids of the embedding vectors.
        not_include_fields: Field names forwarded to the AwaDB client's
            ``Get`` call as ``not_include_fields``.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        Mapping from each returned record's "_id" to the Document built
        from that record. The "embedding_text" field becomes the page
        content; "text_embedding" and "_id" are left out of the metadata.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields)

    results: Dict[str, Document] = {}
    for doc_detail in docs_detail:
        content = ""
        meta_info = {}
        for field in doc_detail:
            if field == "embedding_text":
                # Same text field name the search and update paths use.
                content = doc_detail[field]
            elif field not in ("text_embedding", "_id"):
                meta_info[field] = doc_detail[field]

        # NOTE(review): this lookup needs "_id" in every record; presumably
        # the client always returns it - confirm for the case where the
        # caller lists "_id" in not_include_fields.
        results[doc_detail["_id"]] = Document(
            page_content=content, metadata=meta_info
        )
    return results
|
||||
|
||||
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Optional[bool]:
    """Delete the documents which have the specified ids.

    Args:
        ids: The ids of the embedding vectors.
        **kwargs: Other keyword arguments that subclasses might use.

    Returns:
        Optional[bool]: The result of the client's ``Delete`` call, or
        None when ``ids`` is None or empty (the client is not called).

    Raises:
        ValueError: If the AwaDB client is None.
    """
    if self.awadb_client is None:
        raise ValueError("AwaDB client is None!!!")

    # Nothing to delete: report "not performed" without calling the client.
    if not ids:
        return None

    return self.awadb_client.Delete(ids)
|
||||
|
||||
def update(
    self,
    ids: List[str],
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Update the documents which have the specified ids.

    Args:
        ids: The id list of the updating embedding vector.
        texts: The texts of the updating documents.
        metadatas: The metadatas of the updating documents.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The value returned by the client's ``UpdateTexts`` call, i.e. the
        ids of the updated documents.

    Raises:
        ValueError: If the AwaDB client is None.
    """
    client = self.awadb_client
    if client is None:
        raise ValueError("AwaDB client is None!!!")

    # The texts are written to the same field that the search path reads
    # back as the page content.
    updated_ids = client.UpdateTexts(
        ids=ids,
        text_field_name="embedding_text",
        texts=texts,
        metadatas=metadatas,
    )
    return updated_ids
|
||||
|
||||
def create_table(
|
||||
self,
|
||||
table_name: str,
|
||||
@ -364,7 +596,8 @@ class AwaDB(VectorStore):
|
||||
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
|
||||
table_name (str): Name of the table to create.
|
||||
log_and_data_dir (Optional[str]): Directory to persist the table.
|
||||
client (Optional[awadb.Client]): AwaDB client
|
||||
client (Optional[awadb.Client]): AwaDB client.
|
||||
kwargs (Any): Any possible parameters in the future.
|
||||
|
||||
Returns:
|
||||
AwaDB: AwaDB vectorstore.
|
||||
|
569
poetry.lock
generated
569
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user