mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-02 19:47:13 +00:00
index rename delete_mode -> cleanup (#10103)
This commit is contained in:
@@ -169,7 +169,7 @@ def index(
|
||||
vector_store: VectorStore,
|
||||
*,
|
||||
batch_size: int = 100,
|
||||
delete_mode: Literal["incremental", "full", None] = None,
|
||||
cleanup: Literal["incremental", "full", None] = None,
|
||||
source_id_key: Union[str, Callable[[Document], str], None] = None,
|
||||
) -> IndexingResult:
|
||||
"""Index data from the loader into the vector store.
|
||||
@@ -195,7 +195,7 @@ def index(
|
||||
updated.
|
||||
vector_store: Vector store to index the documents into.
|
||||
batch_size: Batch size to use when indexing.
|
||||
delete_mode: How to handle clean up of documents.
|
||||
cleanup: How to handle clean up of documents.
|
||||
- Incremental: Cleans up all documents that haven't been updated AND
|
||||
that are associated with source ids that were seen
|
||||
during indexing.
|
||||
@@ -213,14 +213,14 @@ def index(
|
||||
Indexing result which contains information about how many documents
|
||||
were added, updated, deleted, or skipped.
|
||||
"""
|
||||
if delete_mode not in {"incremental", "full", None}:
|
||||
if cleanup not in {"incremental", "full", None}:
|
||||
raise ValueError(
|
||||
f"delete_mode should be one of 'incremental', 'full' or None. "
|
||||
f"Got {delete_mode}."
|
||||
f"cleanup should be one of 'incremental', 'full' or None. "
|
||||
f"Got {cleanup}."
|
||||
)
|
||||
|
||||
if delete_mode == "incremental" and source_id_key is None:
|
||||
raise ValueError("Source id key is required when delete mode is incremental.")
|
||||
if cleanup == "incremental" and source_id_key is None:
|
||||
raise ValueError("Source id key is required when cleanup mode is incremental.")
|
||||
|
||||
# Check that the Vectorstore has required methods implemented
|
||||
methods = ["delete", "add_documents"]
|
||||
@@ -264,12 +264,12 @@ def index(
|
||||
source_id_assigner(doc) for doc in hashed_docs
|
||||
]
|
||||
|
||||
if delete_mode == "incremental":
|
||||
# If the delete mode is incremental, source ids are required.
|
||||
if cleanup == "incremental":
|
||||
# If the cleanup mode is incremental, source ids are required.
|
||||
for source_id, hashed_doc in zip(source_ids, hashed_docs):
|
||||
if source_id is None:
|
||||
raise ValueError(
|
||||
"Source ids are required when delete mode is incremental. "
|
||||
"Source ids are required when cleanup mode is incremental. "
|
||||
f"Document that starts with "
|
||||
f"content: {hashed_doc.page_content[:100]} was not assigned "
|
||||
f"as source id."
|
||||
@@ -307,7 +307,7 @@ def index(
|
||||
)
|
||||
|
||||
# If source IDs are provided, we can do the deletion incrementally!
|
||||
if delete_mode == "incremental":
|
||||
if cleanup == "incremental":
|
||||
# Get the uids of the documents that were not returned by the loader.
|
||||
|
||||
# mypy isn't good enough to determine that source ids cannot be None
|
||||
@@ -328,7 +328,7 @@ def index(
|
||||
record_manager.delete_keys(uids_to_delete)
|
||||
num_deleted += len(uids_to_delete)
|
||||
|
||||
if delete_mode == "full":
|
||||
if cleanup == "full":
|
||||
uids_to_delete = record_manager.list_keys(before=index_start_dt)
|
||||
|
||||
if uids_to_delete:
|
||||
|
@@ -158,7 +158,7 @@ def test_index_simple_delete_full(
|
||||
with patch.object(
|
||||
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
|
||||
):
|
||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
||||
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||
"num_added": 2,
|
||||
"num_deleted": 0,
|
||||
"num_skipped": 0,
|
||||
@@ -168,7 +168,7 @@ def test_index_simple_delete_full(
|
||||
with patch.object(
|
||||
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
|
||||
):
|
||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
||||
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||
"num_added": 0,
|
||||
"num_deleted": 0,
|
||||
"num_skipped": 2,
|
||||
@@ -189,7 +189,7 @@ def test_index_simple_delete_full(
|
||||
with patch.object(
|
||||
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
|
||||
):
|
||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
||||
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||
"num_added": 1,
|
||||
"num_deleted": 1,
|
||||
"num_skipped": 1,
|
||||
@@ -207,7 +207,7 @@ def test_index_simple_delete_full(
|
||||
with patch.object(
|
||||
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
|
||||
):
|
||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
||||
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||
"num_added": 0,
|
||||
"num_deleted": 0,
|
||||
"num_skipped": 2,
|
||||
@@ -238,7 +238,7 @@ def test_incremental_fails_with_bad_source_ids(
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
# Should raise an error because no source id function was specified
|
||||
index(loader, record_manager, vector_store, delete_mode="incremental")
|
||||
index(loader, record_manager, vector_store, cleanup="incremental")
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
# Should raise an error because a document is missing a source id (source_id_key is specified here)
|
||||
@@ -246,7 +246,7 @@ def test_incremental_fails_with_bad_source_ids(
|
||||
loader,
|
||||
record_manager,
|
||||
vector_store,
|
||||
delete_mode="incremental",
|
||||
cleanup="incremental",
|
||||
source_id_key="source",
|
||||
)
|
||||
|
||||
@@ -275,7 +275,7 @@ def test_no_delete(
|
||||
loader,
|
||||
record_manager,
|
||||
vector_store,
|
||||
delete_mode=None,
|
||||
cleanup=None,
|
||||
source_id_key="source",
|
||||
) == {
|
||||
"num_added": 2,
|
||||
@@ -292,7 +292,7 @@ def test_no_delete(
|
||||
loader,
|
||||
record_manager,
|
||||
vector_store,
|
||||
delete_mode=None,
|
||||
cleanup=None,
|
||||
source_id_key="source",
|
||||
) == {
|
||||
"num_added": 0,
|
||||
@@ -322,7 +322,7 @@ def test_no_delete(
|
||||
loader,
|
||||
record_manager,
|
||||
vector_store,
|
||||
delete_mode=None,
|
||||
cleanup=None,
|
||||
source_id_key="source",
|
||||
) == {
|
||||
"num_added": 1,
|
||||
@@ -356,7 +356,7 @@ def test_incremental_delete(
|
||||
loader,
|
||||
record_manager,
|
||||
vector_store,
|
||||
delete_mode="incremental",
|
||||
cleanup="incremental",
|
||||
source_id_key="source",
|
||||
) == {
|
||||
"num_added": 2,
|
||||
@@ -380,7 +380,7 @@ def test_incremental_delete(
|
||||
loader,
|
||||
record_manager,
|
||||
vector_store,
|
||||
delete_mode="incremental",
|
||||
cleanup="incremental",
|
||||
source_id_key="source",
|
||||
) == {
|
||||
"num_added": 0,
|
||||
@@ -415,7 +415,7 @@ def test_incremental_delete(
|
||||
loader,
|
||||
record_manager,
|
||||
vector_store,
|
||||
delete_mode="incremental",
|
||||
cleanup="incremental",
|
||||
source_id_key="source",
|
||||
) == {
|
||||
"num_added": 2,
|
||||
@@ -442,7 +442,7 @@ def test_indexing_with_no_docs(
|
||||
"""Check edge case when loader returns no new docs."""
|
||||
loader = ToyLoader(documents=[])
|
||||
|
||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
||||
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||
"num_added": 0,
|
||||
"num_deleted": 0,
|
||||
"num_skipped": 0,
|
||||
@@ -466,7 +466,7 @@ def test_deduplication(
|
||||
]
|
||||
|
||||
# Should result in only a single document being added
|
||||
assert index(docs, record_manager, vector_store, delete_mode="full") == {
|
||||
assert index(docs, record_manager, vector_store, cleanup="full") == {
|
||||
"num_added": 1,
|
||||
"num_deleted": 0,
|
||||
"num_skipped": 0,
|
||||
|
Reference in New Issue
Block a user