mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-24 03:52:10 +00:00
community: BM25Retriever preservation of document id (#27019)
Currently this retriever discards document ids; this change preserves them.

Co-authored-by: asi-cider <88270351+asi-cider@users.noreply.github.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -33,6 +33,7 @@ class BM25Retriever(BaseRetriever):
|
||||
cls,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[Iterable[dict]] = None,
|
||||
ids: Optional[Iterable[str]] = None,
|
||||
bm25_params: Optional[Dict[str, Any]] = None,
|
||||
preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
|
||||
**kwargs: Any,
|
||||
@@ -42,6 +43,7 @@ class BM25Retriever(BaseRetriever):
|
||||
Args:
|
||||
texts: A list of texts to vectorize.
|
||||
metadatas: A list of metadata dicts to associate with each text.
|
||||
ids: A list of ids to associate with each text.
|
||||
bm25_params: Parameters to pass to the BM25 vectorizer.
|
||||
preprocess_func: A function to preprocess each text before vectorization.
|
||||
**kwargs: Any other arguments to pass to the retriever.
|
||||
@@ -61,7 +63,15 @@ class BM25Retriever(BaseRetriever):
|
||||
bm25_params = bm25_params or {}
|
||||
vectorizer = BM25Okapi(texts_processed, **bm25_params)
|
||||
metadatas = metadatas or ({} for _ in texts)
|
||||
docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)]
|
||||
if ids:
|
||||
docs = [
|
||||
Document(page_content=t, metadata=m, id=i)
|
||||
for t, m, i in zip(texts, metadatas, ids)
|
||||
]
|
||||
else:
|
||||
docs = [
|
||||
Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)
|
||||
]
|
||||
return cls(
|
||||
vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs
|
||||
)
|
||||
@@ -86,11 +96,14 @@ class BM25Retriever(BaseRetriever):
|
||||
Returns:
|
||||
A BM25Retriever instance.
|
||||
"""
|
||||
texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
|
||||
texts, metadatas, ids = zip(
|
||||
*((d.page_content, d.metadata, d.id) for d in documents)
|
||||
)
|
||||
return cls.from_texts(
|
||||
texts=texts,
|
||||
bm25_params=bm25_params,
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
preprocess_func=preprocess_func,
|
||||
**kwargs,
|
||||
)
|
||||
|
Reference in New Issue
Block a user