mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-13 06:40:04 +00:00
community[patch]: Added missing from_documents method to KNNRetriever. (#18411)
- Description: Added the missing `from_documents` method to `KNNRetriever`, which makes it possible to supply metadata to LangChain `Document`s and gives it parity with the other retrievers, which already have `from_documents`. - Issue: None - Dependencies: None - Twitter handle: None Co-authored-by: Victor Adan <vadan@netroadshow.com> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
dfc4177b50
commit
afa2d85405
@ -5,7 +5,7 @@ https://github.com/karpathy/randomfun/blob/master/knn_vs_svm.ipynb"""
|
||||
from __future__ import annotations
|
||||
|
||||
import concurrent.futures
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Iterable, List, Optional
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
||||
@ -38,6 +38,8 @@ class KNNRetriever(BaseRetriever):
|
||||
"""Index of embeddings."""
|
||||
texts: List[str]
|
||||
"""List of texts to index."""
|
||||
metadatas: Optional[List[dict]] = None
|
||||
"""List of metadatas corresponding with each text."""
|
||||
k: int = 4
|
||||
"""Number of results to return."""
|
||||
relevancy_threshold: Optional[float] = None
|
||||
@ -51,10 +53,32 @@ class KNNRetriever(BaseRetriever):
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls, texts: List[str], embeddings: Embeddings, **kwargs: Any
|
||||
cls,
|
||||
texts: List[str],
|
||||
embeddings: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> KNNRetriever:
|
||||
index = create_index(texts, embeddings)
|
||||
return cls(embeddings=embeddings, index=index, texts=texts, **kwargs)
|
||||
return cls(
|
||||
embeddings=embeddings,
|
||||
index=index,
|
||||
texts=texts,
|
||||
metadatas=metadatas,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_documents(
|
||||
cls,
|
||||
documents: Iterable[Document],
|
||||
embeddings: Embeddings,
|
||||
**kwargs: Any,
|
||||
) -> KNNRetriever:
|
||||
texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
|
||||
return cls.from_texts(
|
||||
texts=texts, embeddings=embeddings, metadatas=metadatas, **kwargs
|
||||
)
|
||||
|
||||
def _get_relevant_documents(
|
||||
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
|
||||
@ -71,7 +95,10 @@ class KNNRetriever(BaseRetriever):
|
||||
normalized_similarities = (similarities - np.min(similarities)) / denominator
|
||||
|
||||
top_k_results = [
|
||||
Document(page_content=self.texts[row])
|
||||
Document(
|
||||
page_content=self.texts[row],
|
||||
metadata=self.metadatas[row] if self.metadatas else {},
|
||||
)
|
||||
for row in sorted_ix[0 : self.k]
|
||||
if (
|
||||
self.relevancy_threshold is None
|
||||
|
@ -1,3 +1,5 @@
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.embeddings import FakeEmbeddings
|
||||
from langchain_community.retrievers.knn import KNNRetriever
|
||||
|
||||
@ -9,3 +11,19 @@ class TestKNNRetriever:
|
||||
texts=input_texts, embeddings=FakeEmbeddings(size=100)
|
||||
)
|
||||
assert len(knn_retriever.texts) == 3
|
||||
|
||||
def test_from_documents(self) -> None:
|
||||
input_docs = [
|
||||
Document(page_content="I have a pen.", metadata={"page": 1}),
|
||||
Document(page_content="Do you have a pen?", metadata={"page": 2}),
|
||||
Document(page_content="I have a bag.", metadata={"page": 3}),
|
||||
]
|
||||
knn_retriever = KNNRetriever.from_documents(
|
||||
documents=input_docs, embeddings=FakeEmbeddings(size=100)
|
||||
)
|
||||
assert knn_retriever.texts == [
|
||||
"I have a pen.",
|
||||
"Do you have a pen?",
|
||||
"I have a bag.",
|
||||
]
|
||||
assert knn_retriever.metadatas == [{"page": 1}, {"page": 2}, {"page": 3}]
|
||||
|
Loading…
Reference in New Issue
Block a user