Mirror of https://github.com/hwchase17/langchain.git
Improve indexing performance for Postgres (remote database) for refresh (#14126)
**Description:** By batching the per-document timestamp refreshes into a single call to `update()`, multiple documents can be refreshed with one SQL statement. This is important for non-local databases, where network round trips of tens of milliseconds per statement dominate the cost of document-by-document SQL: at 20 ms per round trip, refreshing 1,000 already-indexed documents one by one costs roughly 20 seconds, versus a single batched statement.
**Issue:** #11935
**Dependencies:** None
**Tag maintainer:** @eyurtsev
parent b161f302ff
commit 9b0e46dcf0
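For context before the diff, here is a minimal sketch of why one batched statement beats document-by-document statements over a remote link. This is not the actual `SQLRecordManager` implementation (which goes through SQLAlchemy); the `upsertion_record` table, the column names, and the direct psycopg2 usage are illustrative assumptions:

```python
import psycopg2  # illustrative driver choice, not what SQLRecordManager uses

def refresh_one_by_one(conn, keys: list[str], now: float) -> None:
    # N statements -> N network round trips; on a remote database the
    # per-statement latency dominates total indexing time.
    with conn.cursor() as cur:
        for key in keys:
            cur.execute(
                "UPDATE upsertion_record SET updated_at = %s WHERE key = %s",
                (now, key),
            )
    conn.commit()

def refresh_batched(conn, keys: list[str], now: float) -> None:
    # One statement -> one round trip, regardless of len(keys).
    # psycopg2 adapts a Python list to a Postgres array for ANY(%s).
    with conn.cursor() as cur:
        cur.execute(
            "UPDATE upsertion_record SET updated_at = %s WHERE key = ANY(%s)",
            (now, keys),
        )
    conn.commit()
```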
```diff
@@ -303,15 +303,19 @@ def index(
         # Filter out documents that already exist in the record store.
         uids = []
         docs_to_index = []
+        docs_to_update = []
         for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
             if doc_exists:
-                # Must be updated to refresh timestamp.
-                record_manager.update([hashed_doc.uid], time_at_least=index_start_dt)
-                num_skipped += 1
+                docs_to_update.append(hashed_doc.uid)
                 continue
             uids.append(hashed_doc.uid)
             docs_to_index.append(hashed_doc.to_document())
 
+        # Update refresh timestamp
+        if docs_to_update:
+            record_manager.update(docs_to_update, time_at_least=index_start_dt)
+            num_skipped += len(docs_to_update)
+
         # Be pessimistic and assume that all vector store write will fail.
         # First write to vector store
         if docs_to_index:
```
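The batched path above is exercised whenever `index()` is re-run over documents that have not changed: every document already exists in the record store, so the run reduces to timestamp refreshes. A minimal usage sketch under assumed settings (the connection string and the PGVector/OpenAIEmbeddings choices are illustrative):

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes import SQLRecordManager, index
from langchain.schema import Document
from langchain.vectorstores import PGVector

DB_URL = "postgresql+psycopg2://user:pass@remote-host:5432/db"  # remote database

record_manager = SQLRecordManager("pgvector/demo", db_url=DB_URL)
record_manager.create_schema()

vectorstore = PGVector(
    connection_string=DB_URL,
    collection_name="demo",
    embedding_function=OpenAIEmbeddings(),
)

docs = [Document(page_content="hello", metadata={"source": "a.txt"})]

# First run writes the documents; an identical second run finds them all in
# the record store and only refreshes their timestamps, which is now one
# batched record_manager.update() call instead of one call per document.
index(docs, record_manager, vectorstore, cleanup="incremental", source_id_key="source")
index(docs, record_manager, vectorstore, cleanup="incremental", source_id_key="source")
```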