mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-10 15:33:11 +00:00
Harrison/relevancy score (#3907)
Co-authored-by: Ryan Grippeling <R.Grippeling@hotmail.com> Co-authored-by: Ryan <ryan@webgrip.nl> Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,3 +1,4 @@
|
|||||||
|
.vs/
|
||||||
.vscode/
|
.vscode/
|
||||||
.idea/
|
.idea/
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
|
@@ -74,9 +74,10 @@ class TimeWeightedVectorStoreRetriever(BaseRetriever, BaseModel):
|
|||||||
)
|
)
|
||||||
results = {}
|
results = {}
|
||||||
for fetched_doc, relevance in docs_and_scores:
|
for fetched_doc, relevance in docs_and_scores:
|
||||||
buffer_idx = fetched_doc.metadata["buffer_idx"]
|
if "buffer_idx" in fetched_doc.metadata:
|
||||||
doc = self.memory_stream[buffer_idx]
|
buffer_idx = fetched_doc.metadata["buffer_idx"]
|
||||||
results[buffer_idx] = (doc, relevance)
|
doc = self.memory_stream[buffer_idx]
|
||||||
|
results[buffer_idx] = (doc, relevance)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def get_relevant_documents(self, query: str) -> List[Document]:
|
def get_relevant_documents(self, query: str) -> List[Document]:
|
||||||
|
@@ -81,6 +81,10 @@ def _redis_prefix(index_name: str) -> str:
|
|||||||
return f"doc:{index_name}"
|
return f"doc:{index_name}"
|
||||||
|
|
||||||
|
|
||||||
|
def _default_relevance_score(val: float) -> float:
    """Convert a raw score into a relevance score by subtracting it from one."""
    relevance = 1 - val
    return relevance
|
||||||
|
|
||||||
|
|
||||||
class Redis(VectorStore):
|
class Redis(VectorStore):
|
||||||
"""Wrapper around Redis vector database.
|
"""Wrapper around Redis vector database.
|
||||||
|
|
||||||
@@ -108,6 +112,9 @@ class Redis(VectorStore):
|
|||||||
content_key: str = "content",
|
content_key: str = "content",
|
||||||
metadata_key: str = "metadata",
|
metadata_key: str = "metadata",
|
||||||
vector_key: str = "content_vector",
|
vector_key: str = "content_vector",
|
||||||
|
relevance_score_fn: Optional[
|
||||||
|
Callable[[float], float]
|
||||||
|
] = _default_relevance_score,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
):
|
||||||
"""Initialize with necessary components."""
|
"""Initialize with necessary components."""
|
||||||
@@ -133,6 +140,7 @@ class Redis(VectorStore):
|
|||||||
self.content_key = content_key
|
self.content_key = content_key
|
||||||
self.metadata_key = metadata_key
|
self.metadata_key = metadata_key
|
||||||
self.vector_key = vector_key
|
self.vector_key = vector_key
|
||||||
|
self.relevance_score_fn = relevance_score_fn
|
||||||
|
|
||||||
def _create_index(self, dim: int = 1536) -> None:
|
def _create_index(self, dim: int = 1536) -> None:
|
||||||
try:
|
try:
|
||||||
@@ -328,6 +336,24 @@ class Redis(VectorStore):
|
|||||||
|
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return docs and relevance scores, normalized on a scale from 0 to 1.

    0 is dissimilar, 1 is most similar.

    Args:
        query: Text to search for.
        k: Number of results requested from ``similarity_search_with_score``.
        **kwargs: Accepted for interface compatibility; not forwarded.

    Returns:
        ``(document, relevance)`` pairs where the relevance is the raw score
        passed through ``self.relevance_score_fn``.

    Raises:
        ValueError: If no ``relevance_score_fn`` was configured.
    """
    if self.relevance_score_fn is None:
        # This is the Redis store, so the message must name the Redis
        # constructor (it previously said "Weaviate").
        raise ValueError(
            "relevance_score_fn must be provided to"
            " Redis constructor to normalize scores"
        )
    docs_and_scores = self.similarity_search_with_score(query, k=k)
    return [(doc, self.relevance_score_fn(score)) for doc, score in docs_and_scores]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_texts(
|
def from_texts(
|
||||||
cls: Type[Redis],
|
cls: Type[Redis],
|
||||||
|
@@ -1,7 +1,8 @@
|
|||||||
"""Wrapper around weaviate vector database."""
|
"""Wrapper around weaviate vector database."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Type
|
import datetime
|
||||||
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -58,6 +59,10 @@ def _create_weaviate_client(**kwargs: Any) -> Any:
|
|||||||
return client
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
def _default_score_normalizer(val: float) -> float:
    """Squash a raw score into the 0-1 range with a logistic curve."""
    denominator = 1 + np.exp(val)
    return 1 - 1 / denominator
|
||||||
|
|
||||||
|
|
||||||
class Weaviate(VectorStore):
|
class Weaviate(VectorStore):
|
||||||
"""Wrapper around Weaviate vector database.
|
"""Wrapper around Weaviate vector database.
|
||||||
|
|
||||||
@@ -80,6 +85,9 @@ class Weaviate(VectorStore):
|
|||||||
text_key: str,
|
text_key: str,
|
||||||
embedding: Optional[Embeddings] = None,
|
embedding: Optional[Embeddings] = None,
|
||||||
attributes: Optional[List[str]] = None,
|
attributes: Optional[List[str]] = None,
|
||||||
|
relevance_score_fn: Optional[
|
||||||
|
Callable[[float], float]
|
||||||
|
] = _default_score_normalizer,
|
||||||
):
|
):
|
||||||
"""Initialize with Weaviate client."""
|
"""Initialize with Weaviate client."""
|
||||||
try:
|
try:
|
||||||
@@ -98,6 +106,7 @@ class Weaviate(VectorStore):
|
|||||||
self._embedding = embedding
|
self._embedding = embedding
|
||||||
self._text_key = text_key
|
self._text_key = text_key
|
||||||
self._query_attrs = [self._text_key]
|
self._query_attrs = [self._text_key]
|
||||||
|
self._relevance_score_fn = relevance_score_fn
|
||||||
if attributes is not None:
|
if attributes is not None:
|
||||||
self._query_attrs.extend(attributes)
|
self._query_attrs.extend(attributes)
|
||||||
|
|
||||||
@@ -110,6 +119,11 @@ class Weaviate(VectorStore):
|
|||||||
"""Upload texts with metadata (properties) to Weaviate."""
|
"""Upload texts with metadata (properties) to Weaviate."""
|
||||||
from weaviate.util import get_valid_uuid
|
from weaviate.util import get_valid_uuid
|
||||||
|
|
||||||
|
def json_serializable(value: Any) -> Any:
    """Return ``value`` as ISO-format text if it is a datetime, else unchanged."""
    return value.isoformat() if isinstance(value, datetime.datetime) else value
|
||||||
|
|
||||||
with self._client.batch as batch:
|
with self._client.batch as batch:
|
||||||
ids = []
|
ids = []
|
||||||
for i, doc in enumerate(texts):
|
for i, doc in enumerate(texts):
|
||||||
@@ -118,7 +132,7 @@ class Weaviate(VectorStore):
|
|||||||
}
|
}
|
||||||
if metadatas is not None:
|
if metadatas is not None:
|
||||||
for key in metadatas[i].keys():
|
for key in metadatas[i].keys():
|
||||||
data_properties[key] = metadatas[i][key]
|
data_properties[key] = json_serializable(metadatas[i][key])
|
||||||
|
|
||||||
_id = get_valid_uuid(uuid4())
|
_id = get_valid_uuid(uuid4())
|
||||||
|
|
||||||
@@ -267,9 +281,57 @@ class Weaviate(VectorStore):
|
|||||||
payload[idx].pop("_additional")
|
payload[idx].pop("_additional")
|
||||||
meta = payload[idx]
|
meta = payload[idx]
|
||||||
docs.append(Document(page_content=text, metadata=meta))
|
docs.append(Document(page_content=text, metadata=meta))
|
||||||
|
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
def similarity_search_with_score(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
    """Run a near-text query and score each hit against the query embedding.

    Args:
        query: Text used both as the near-text concept and as the input to
            ``self._embedding.embed_query``.
        k: Limit passed to the Weaviate query.
        **kwargs: If a truthy ``search_distance`` is given, it is sent as the
            ``certainty`` of the near-text query.

    Returns:
        ``(document, score)`` pairs, where the score is the dot product of
        the hit's stored vector and the query embedding.

    Raises:
        ValueError: If no embedding is configured, or if the query result
            contains an ``errors`` entry.
    """
    # Fail fast: without an embedding no score can be computed, so there is
    # no point in issuing the remote query first.
    if self._embedding is None:
        raise ValueError(
            "_embedding cannot be None for similarity_search_with_score"
        )

    content: Dict[str, Any] = {"concepts": [query]}
    search_distance = kwargs.get("search_distance")
    if search_distance:
        content["certainty"] = search_distance
    query_obj = self._client.query.get(self._index_name, self._query_attrs)
    result = (
        query_obj.with_near_text(content)
        .with_limit(k)
        .with_additional("vector")
        .do()
    )
    if "errors" in result:
        raise ValueError(f"Error during query: {result['errors']}")

    hits = result["data"]["Get"][self._index_name]
    # The query embedding is the same for every hit: compute it once instead
    # of once per result, and skip it entirely when there is nothing to score.
    query_vector = self._embedding.embed_query(query) if hits else None

    docs_and_scores = []
    for res in hits:
        text = res.pop(self._text_key)
        score = np.dot(res["_additional"]["vector"], query_vector)
        docs_and_scores.append((Document(page_content=text, metadata=res), score))
    return docs_and_scores
|
||||||
|
|
||||||
|
def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Search for ``query`` and map each raw score through the relevance function.

    Scores are intended to be normalized to a 0-1 scale, where 0 is
    dissimilar and 1 is most similar. Extra keyword arguments are accepted
    but not forwarded to ``similarity_search_with_score``.

    Raises:
        ValueError: If no relevance score function was configured.
    """
    if self._relevance_score_fn is None:
        raise ValueError(
            "relevance_score_fn must be provided to"
            " Weaviate constructor to normalize scores"
        )
    scored = []
    for doc, raw_score in self.similarity_search_with_score(query, k=k):
        scored.append((doc, self._relevance_score_fn(raw_score)))
    return scored
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_texts(
|
def from_texts(
|
||||||
cls: Type[Weaviate],
|
cls: Type[Weaviate],
|
||||||
|
Reference in New Issue
Block a user