diff --git a/private_gpt/components/node_store/node_store_component.py b/private_gpt/components/node_store/node_store_component.py index c20f98c5..c039bf50 100644 --- a/private_gpt/components/node_store/node_store_component.py +++ b/private_gpt/components/node_store/node_store_component.py @@ -1,3 +1,5 @@ +import logging + from injector import inject, singleton from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore from llama_index.storage.index_store import SimpleIndexStore @@ -5,6 +7,8 @@ from llama_index.storage.index_store.types import BaseIndexStore from private_gpt.paths import local_data_path +logger = logging.getLogger(__name__) + @singleton class NodeStoreComponent: @@ -18,6 +22,7 @@ class NodeStoreComponent: persist_dir=str(local_data_path) ) except FileNotFoundError: + logger.debug("Local index store not found, creating a new one") self.index_store = SimpleIndexStore() try: @@ -25,4 +30,5 @@ class NodeStoreComponent: persist_dir=str(local_data_path) ) except FileNotFoundError: + logger.debug("Local document store not found, creating a new one") self.doc_store = SimpleDocumentStore() diff --git a/private_gpt/server/ingest/ingest_router.py b/private_gpt/server/ingest/ingest_router.py index dd49b5a8..5c156f46 100644 --- a/private_gpt/server/ingest/ingest_router.py +++ b/private_gpt/server/ingest/ingest_router.py @@ -47,3 +47,14 @@ def list_ingested() -> IngestResponse: service = root_injector.get(IngestService) ingested_documents = service.list_ingested() return IngestResponse(object="list", model="private-gpt", data=ingested_documents) + + +@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"]) +def delete_ingested(doc_id: str) -> None: + """Delete the specified ingested Document. + + The `doc_id` can be obtained from the `GET /ingest/list` endpoint. + The document will be effectively deleted from your storage context. + """ + service = root_injector.get(IngestService) + service.delete(doc_id) diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py index 6a34e6fb..0026660c 100644 --- a/private_gpt/server/ingest/ingest_service.py +++ b/private_gpt/server/ingest/ingest_service.py @@ -1,3 +1,4 @@ +import logging import tempfile from pathlib import Path from typing import TYPE_CHECKING, Any, AnyStr @@ -9,6 +10,7 @@ from llama_index import ( StorageContext, StringIterableReader, VectorStoreIndex, + load_index_from_storage, ) from llama_index.node_parser import SentenceWindowNodeParser from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS @@ -25,6 +27,8 @@ from private_gpt.paths import local_data_path if TYPE_CHECKING: from llama_index.readers.base import BaseReader +logger = logging.getLogger(__name__) + class IngestedDoc(BaseModel): object: str = Field(enum=["ingest.document"]) @@ -70,6 +74,7 @@ class IngestService: ) def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]: + logger.info("Ingesting file_name=%s", file_name) extension = Path(file_name).suffix reader_cls = DEFAULT_FILE_READER_CLS.get(extension) documents: list[Document] @@ -100,7 +105,9 @@ class IngestService: else: path_to_tmp.write_text(str(file_data)) documents = reader.load_data(path_to_tmp) - + logger.info( + "Transformed file=%s into count=%s documents", file_name, len(documents) + ) for document in documents: document.metadata["file_name"] = file_name return self._save_docs(documents) @@ -153,7 +160,26 @@ class IngestService: doc_metadata=doc_metadata, ) ) - return ingested_docs except ValueError: + logger.warning("Got an exception when getting list of docs", exc_info=True) pass + logger.debug("Found count=%s ingested documents", len(ingested_docs)) return ingested_docs + + def delete(self, doc_id: str) -> None: + """Delete an ingested document. + + :raises ValueError: if the document does not exist + """ + logger.info( + "Deleting the ingested document=%s in the doc and index store", doc_id + ) + + # Load the index with store_nodes_override=True to be able to delete them + index = load_index_from_storage(self.storage_context, store_nodes_override=True) + + # Delete the document from the index + index.delete_ref_doc(doc_id, delete_from_docstore=True) + + # Save the index + self.storage_context.persist(persist_dir=local_data_path)