langchain_chroma: added document.id support (#27995)

Description:
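* Added internal `Document.id` support to the Chroma VectorStore: documents
returned from search results now carry the ID stored in Chroma, and when no
explicit `ids` are passed alongside a list of documents, each document's own
`id` is used (an empty string when the document has no `id`).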

Dependencies:
* https://github.com/langchain-ai/langchain/pull/27968 should be merged
first, and this PR should then be rebased on top of those changes.

Tests:
* Modified/Added tests for `Document.id` support. All tests are passing.


Note: I am not a member of the Chroma team.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Eric Pinzur
2024-12-04 01:04:27 +01:00
committed by GitHub
parent 15e7353168
commit eff8a54756
2 changed files with 179 additions and 23 deletions

View File

@@ -44,10 +44,14 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
return [
# TODO: Chroma can do batch querying,
# we shouldn't hard code to the 1st result
(Document(page_content=result[0], metadata=result[1] or {}), result[2])
(
Document(page_content=result[0], metadata=result[1] or {}, id=result[2]),
result[3],
)
for result in zip(
results["documents"][0],
results["metadatas"][0],
results["ids"][0],
results["distances"][0],
)
]
@@ -1185,6 +1189,8 @@ class Chroma(VectorStore):
"""
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
if ids is None:
ids = [doc.id if doc.id else "" for doc in documents]
return cls.from_texts(
texts=texts,
embedding=embedding,