mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-13 22:59:05 +00:00
community[patch]: Added missing from_documents method to KNNRetriever. (#18411)
- Description: Added the missing `from_documents` method to `KNNRetriever`, making it possible to supply metadata to LangChain `Document`s and giving it parity with the other retrievers, which already have `from_documents`.
- Issue: None
- Dependencies: None
- Twitter handle: None

Co-authored-by: Victor Adan <vadan@netroadshow.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
dfc4177b50
commit
afa2d85405
@ -5,7 +5,7 @@ https://github.com/karpathy/randomfun/blob/master/knn_vs_svm.ipynb"""
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
from typing import Any, List, Optional
|
from typing import Any, Iterable, List, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
||||||
@ -38,6 +38,8 @@ class KNNRetriever(BaseRetriever):
|
|||||||
"""Index of embeddings."""
|
"""Index of embeddings."""
|
||||||
texts: List[str]
|
texts: List[str]
|
||||||
"""List of texts to index."""
|
"""List of texts to index."""
|
||||||
|
metadatas: Optional[List[dict]] = None
|
||||||
|
"""List of metadatas corresponding with each text."""
|
||||||
k: int = 4
|
k: int = 4
|
||||||
"""Number of results to return."""
|
"""Number of results to return."""
|
||||||
relevancy_threshold: Optional[float] = None
|
relevancy_threshold: Optional[float] = None
|
||||||
@ -51,10 +53,32 @@ class KNNRetriever(BaseRetriever):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_texts(
|
def from_texts(
|
||||||
cls, texts: List[str], embeddings: Embeddings, **kwargs: Any
|
cls,
|
||||||
|
texts: List[str],
|
||||||
|
embeddings: Embeddings,
|
||||||
|
metadatas: Optional[List[dict]] = None,
|
||||||
|
**kwargs: Any,
|
||||||
) -> KNNRetriever:
|
) -> KNNRetriever:
|
||||||
index = create_index(texts, embeddings)
|
index = create_index(texts, embeddings)
|
||||||
return cls(embeddings=embeddings, index=index, texts=texts, **kwargs)
|
return cls(
|
||||||
|
embeddings=embeddings,
|
||||||
|
index=index,
|
||||||
|
texts=texts,
|
||||||
|
metadatas=metadatas,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_documents(
|
||||||
|
cls,
|
||||||
|
documents: Iterable[Document],
|
||||||
|
embeddings: Embeddings,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> KNNRetriever:
|
||||||
|
texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
|
||||||
|
return cls.from_texts(
|
||||||
|
texts=texts, embeddings=embeddings, metadatas=metadatas, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
def _get_relevant_documents(
|
def _get_relevant_documents(
|
||||||
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
|
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
|
||||||
@ -71,7 +95,10 @@ class KNNRetriever(BaseRetriever):
|
|||||||
normalized_similarities = (similarities - np.min(similarities)) / denominator
|
normalized_similarities = (similarities - np.min(similarities)) / denominator
|
||||||
|
|
||||||
top_k_results = [
|
top_k_results = [
|
||||||
Document(page_content=self.texts[row])
|
Document(
|
||||||
|
page_content=self.texts[row],
|
||||||
|
metadata=self.metadatas[row] if self.metadatas else {},
|
||||||
|
)
|
||||||
for row in sorted_ix[0 : self.k]
|
for row in sorted_ix[0 : self.k]
|
||||||
if (
|
if (
|
||||||
self.relevancy_threshold is None
|
self.relevancy_threshold is None
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from langchain_community.embeddings import FakeEmbeddings
|
from langchain_community.embeddings import FakeEmbeddings
|
||||||
from langchain_community.retrievers.knn import KNNRetriever
|
from langchain_community.retrievers.knn import KNNRetriever
|
||||||
|
|
||||||
@ -9,3 +11,19 @@ class TestKNNRetriever:
|
|||||||
texts=input_texts, embeddings=FakeEmbeddings(size=100)
|
texts=input_texts, embeddings=FakeEmbeddings(size=100)
|
||||||
)
|
)
|
||||||
assert len(knn_retriever.texts) == 3
|
assert len(knn_retriever.texts) == 3
|
||||||
|
|
||||||
|
def test_from_documents(self) -> None:
|
||||||
|
input_docs = [
|
||||||
|
Document(page_content="I have a pen.", metadata={"page": 1}),
|
||||||
|
Document(page_content="Do you have a pen?", metadata={"page": 2}),
|
||||||
|
Document(page_content="I have a bag.", metadata={"page": 3}),
|
||||||
|
]
|
||||||
|
knn_retriever = KNNRetriever.from_documents(
|
||||||
|
documents=input_docs, embeddings=FakeEmbeddings(size=100)
|
||||||
|
)
|
||||||
|
assert knn_retriever.texts == [
|
||||||
|
"I have a pen.",
|
||||||
|
"Do you have a pen?",
|
||||||
|
"I have a bag.",
|
||||||
|
]
|
||||||
|
assert knn_retriever.metadatas == [{"page": 1}, {"page": 2}, {"page": 3}]
|
||||||
|
Loading…
Reference in New Issue
Block a user