mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-24 12:01:54 +00:00
fix(core): track within-batch deduplication in indexing num_skipped count (#32273)
**Description:** Fixes an incorrect `num_skipped` count in the LangChain indexing API. The current implementation counts only documents that already exist in the RecordManager (cross-batch duplicates) and fails to count documents removed during within-batch deduplication via `_deduplicate_in_order()`. This PR records the original batch size before deduplication and adds the difference to `num_skipped`, ensuring that `num_added + num_skipped` equals the total number of input documents.

**Issue:** Fixes incorrect document count reporting in indexing statistics.

**Dependencies:** None

Fixes #32272

---------

Co-authored-by: Alex Feel <afilippov@spotware.com>
This commit is contained in:
committed by
GitHub
parent
12c0e9b7d8
commit
f0b6baa0ef
@@ -444,6 +444,9 @@ def index(
|
||||
scoped_full_cleanup_source_ids: set[str] = set()
|
||||
|
||||
for doc_batch in _batch(batch_size, doc_iterator):
|
||||
# Track original batch size before deduplication
|
||||
original_batch_size = len(doc_batch)
|
||||
|
||||
hashed_docs = list(
|
||||
_deduplicate_in_order(
|
||||
[
|
||||
@@ -452,6 +455,8 @@ def index(
|
||||
]
|
||||
)
|
||||
)
|
||||
# Count documents removed by within-batch deduplication
|
||||
num_skipped += original_batch_size - len(hashed_docs)
|
||||
|
||||
source_ids: Sequence[Optional[str]] = [
|
||||
source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
|
||||
@@ -784,6 +789,9 @@ async def aindex(
|
||||
scoped_full_cleanup_source_ids: set[str] = set()
|
||||
|
||||
async for doc_batch in _abatch(batch_size, async_doc_iterator):
|
||||
# Track original batch size before deduplication
|
||||
original_batch_size = len(doc_batch)
|
||||
|
||||
hashed_docs = list(
|
||||
_deduplicate_in_order(
|
||||
[
|
||||
@@ -792,6 +800,8 @@ async def aindex(
|
||||
]
|
||||
)
|
||||
)
|
||||
# Count documents removed by within-batch deduplication
|
||||
num_skipped += original_batch_size - len(hashed_docs)
|
||||
|
||||
source_ids: Sequence[Optional[str]] = [
|
||||
source_id_assigner(doc) for doc in hashed_docs
|
||||
|
Reference in New Issue
Block a user