Endpoint to delete documents ingested (#1163)

A file that is ingested is transformed into several documents (which
are organized into nodes).
This endpoint deletes documents (pieces of a file). These pieces can be
retrieved using the endpoint that lists all the documents.
This commit is contained in:
lopagela 2023-11-06 15:47:42 +01:00 committed by GitHub
parent 6583dc84c0
commit 0c40cfb115
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 2 deletions

View File

@ -1,3 +1,5 @@
import logging
from injector import inject, singleton from injector import inject, singleton
from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore from llama_index.storage.index_store import SimpleIndexStore
@ -5,6 +7,8 @@ from llama_index.storage.index_store.types import BaseIndexStore
from private_gpt.paths import local_data_path from private_gpt.paths import local_data_path
logger = logging.getLogger(__name__)
@singleton @singleton
class NodeStoreComponent: class NodeStoreComponent:
@ -18,6 +22,7 @@ class NodeStoreComponent:
persist_dir=str(local_data_path) persist_dir=str(local_data_path)
) )
except FileNotFoundError: except FileNotFoundError:
logger.debug("Local index store not found, creating a new one")
self.index_store = SimpleIndexStore() self.index_store = SimpleIndexStore()
try: try:
@ -25,4 +30,5 @@ class NodeStoreComponent:
persist_dir=str(local_data_path) persist_dir=str(local_data_path)
) )
except FileNotFoundError: except FileNotFoundError:
logger.debug("Local document store not found, creating a new one")
self.doc_store = SimpleDocumentStore() self.doc_store = SimpleDocumentStore()

View File

@ -47,3 +47,14 @@ def list_ingested() -> IngestResponse:
service = root_injector.get(IngestService) service = root_injector.get(IngestService)
ingested_documents = service.list_ingested() ingested_documents = service.list_ingested()
return IngestResponse(object="list", model="private-gpt", data=ingested_documents) return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"])
def delete_ingested(doc_id: str) -> None:
    """Delete the specified ingested Document.
    The `doc_id` can be obtained from the `GET /ingest/list` endpoint.
    The document will be effectively deleted from your storage context.
    """
    # Resolve the ingestion service from the injector and hand the deletion
    # over to it; nothing is returned to the caller.
    root_injector.get(IngestService).delete(doc_id)

View File

@ -1,3 +1,4 @@
import logging
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, AnyStr from typing import TYPE_CHECKING, Any, AnyStr
@ -9,6 +10,7 @@ from llama_index import (
StorageContext, StorageContext,
StringIterableReader, StringIterableReader,
VectorStoreIndex, VectorStoreIndex,
load_index_from_storage,
) )
from llama_index.node_parser import SentenceWindowNodeParser from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
@ -25,6 +27,8 @@ from private_gpt.paths import local_data_path
if TYPE_CHECKING: if TYPE_CHECKING:
from llama_index.readers.base import BaseReader from llama_index.readers.base import BaseReader
logger = logging.getLogger(__name__)
class IngestedDoc(BaseModel): class IngestedDoc(BaseModel):
object: str = Field(enum=["ingest.document"]) object: str = Field(enum=["ingest.document"])
@ -70,6 +74,7 @@ class IngestService:
) )
def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]: def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]:
logger.info("Ingesting file_name=%s", file_name)
extension = Path(file_name).suffix extension = Path(file_name).suffix
reader_cls = DEFAULT_FILE_READER_CLS.get(extension) reader_cls = DEFAULT_FILE_READER_CLS.get(extension)
documents: list[Document] documents: list[Document]
@ -100,7 +105,9 @@ class IngestService:
else: else:
path_to_tmp.write_text(str(file_data)) path_to_tmp.write_text(str(file_data))
documents = reader.load_data(path_to_tmp) documents = reader.load_data(path_to_tmp)
logger.info(
"Transformed file=%s into count=%s documents", file_name, len(documents)
)
for document in documents: for document in documents:
document.metadata["file_name"] = file_name document.metadata["file_name"] = file_name
return self._save_docs(documents) return self._save_docs(documents)
@ -153,7 +160,26 @@ class IngestService:
doc_metadata=doc_metadata, doc_metadata=doc_metadata,
) )
) )
return ingested_docs
except ValueError: except ValueError:
logger.warning("Got an exception when getting list of docs", exc_info=True)
pass pass
logger.debug("Found count=%s ingested documents", len(ingested_docs))
return ingested_docs return ingested_docs
def delete(self, doc_id: str) -> None:
    """Remove one ingested document from the index and the doc store.

    :param doc_id: identifier of the ingested document to remove
    :raises ValueError: if the document does not exist
    """
    logger.info(
        "Deleting the ingested document=%s in the doc and index store", doc_id
    )
    # store_nodes_override=True is required to be able to delete the nodes
    stored_index = load_index_from_storage(
        self.storage_context, store_nodes_override=True
    )
    # Drop the document from the index and, in the same call, from the docstore
    stored_index.delete_ref_doc(doc_id, delete_from_docstore=True)
    # Persist the storage context under local_data_path so the deletion is saved
    self.storage_context.persist(persist_dir=local_data_path)