Mirror of https://github.com/imartinez/privateGPT.git (synced 2025-06-26 15:34:08 +00:00)
Endpoint to delete documents ingested (#1163)
A file that is ingested is transformed into several documents (organized into nodes). This endpoint deletes individual documents, i.e. the pieces of a file; their IDs can be retrieved through the endpoint that lists all ingested documents.
commit 0c40cfb115
parent 6583dc84c0
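For context, this is how the new endpoint fits the existing flow: list the ingested documents to obtain their IDs, then delete one by its doc_id. A minimal client-side sketch, assuming a default local privateGPT instance at http://localhost:8001 and no extra path prefix (both are assumptions, adjust to your deployment):

import requests

BASE_URL = "http://localhost:8001"  # assumed local instance; adjust host, port and prefix

# Each ingested file is split into several documents; list them all
listed = requests.get(f"{BASE_URL}/ingest/list").json()
doc_ids = [doc["doc_id"] for doc in listed["data"]]

# Delete a single document (one piece of a previously ingested file)
if doc_ids:
    response = requests.delete(f"{BASE_URL}/ingest/{doc_ids[0]}")
    response.raise_for_status()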
@@ -1,3 +1,5 @@
+import logging
+
 from injector import inject, singleton
 from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore
 from llama_index.storage.index_store import SimpleIndexStore
@@ -5,6 +7,8 @@ from llama_index.storage.index_store.types import BaseIndexStore
 from private_gpt.paths import local_data_path
 
+logger = logging.getLogger(__name__)
+
 
 @singleton
 class NodeStoreComponent:
@@ -18,6 +22,7 @@ class NodeStoreComponent:
                 persist_dir=str(local_data_path)
             )
         except FileNotFoundError:
+            logger.debug("Local index store not found, creating a new one")
             self.index_store = SimpleIndexStore()
 
         try:
@@ -25,4 +30,5 @@ class NodeStoreComponent:
                 persist_dir=str(local_data_path)
             )
         except FileNotFoundError:
+            logger.debug("Local document store not found, creating a new one")
             self.doc_store = SimpleDocumentStore()
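The two logger.debug calls added to the node store component above only show up if the log level includes DEBUG; otherwise the first start-up, when the local stores do not exist yet, stays silent. A quick way to surface them in a local run (a sketch, not the project's own logging setup):

import logging

# Emit DEBUG messages such as "Local index store not found, creating a new one"
logging.basicConfig(level=logging.DEBUG)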
@@ -47,3 +47,14 @@ def list_ingested() -> IngestResponse:
     service = root_injector.get(IngestService)
     ingested_documents = service.list_ingested()
     return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
+
+
+@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"])
+def delete_ingested(doc_id: str) -> None:
+    """Delete the specified ingested Document.
+
+    The `doc_id` can be obtained from the `GET /ingest/list` endpoint.
+    The document will be effectively deleted from your storage context.
+    """
+    service = root_injector.get(IngestService)
+    service.delete(doc_id)
@@ -1,3 +1,4 @@
+import logging
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AnyStr
@@ -9,6 +10,7 @@ from llama_index import (
     StorageContext,
     StringIterableReader,
     VectorStoreIndex,
+    load_index_from_storage,
 )
 from llama_index.node_parser import SentenceWindowNodeParser
 from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
@@ -25,6 +27,8 @@ from private_gpt.paths import local_data_path
 if TYPE_CHECKING:
     from llama_index.readers.base import BaseReader
 
+logger = logging.getLogger(__name__)
+
 
 class IngestedDoc(BaseModel):
     object: str = Field(enum=["ingest.document"])
@@ -70,6 +74,7 @@ class IngestService:
         )
 
     def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]:
+        logger.info("Ingesting file_name=%s", file_name)
         extension = Path(file_name).suffix
         reader_cls = DEFAULT_FILE_READER_CLS.get(extension)
         documents: list[Document]
@@ -100,7 +105,9 @@ class IngestService:
             else:
                 path_to_tmp.write_text(str(file_data))
             documents = reader.load_data(path_to_tmp)
+        logger.info(
+            "Transformed file=%s into count=%s documents", file_name, len(documents)
+        )
         for document in documents:
             document.metadata["file_name"] = file_name
         return self._save_docs(documents)
@@ -153,7 +160,26 @@ class IngestService:
                         doc_metadata=doc_metadata,
                     )
                 )
-            return ingested_docs
         except ValueError:
+            logger.warning("Got an exception when getting list of docs", exc_info=True)
             pass
+        logger.debug("Found count=%s ingested documents", len(ingested_docs))
         return ingested_docs
+
+    def delete(self, doc_id: str) -> None:
+        """Delete an ingested document.
+
+        :raises ValueError: if the document does not exist
+        """
+        logger.info(
+            "Deleting the ingested document=%s in the doc and index store", doc_id
+        )
+
+        # Load the index with store_nodes_override=True to be able to delete them
+        index = load_index_from_storage(self.storage_context, store_nodes_override=True)
+
+        # Delete the document from the index
+        index.delete_ref_doc(doc_id, delete_from_docstore=True)
+
+        # Save the index
+        self.storage_context.persist(persist_dir=local_data_path)
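Because a single file is split into several documents, removing a whole file means deleting every document whose metadata points back to it. Below is a sketch built on the service API added in this commit, combining list_ingested() and delete(); the import paths for root_injector and IngestService are assumptions, adjust them to the actual module layout:

from private_gpt.di import root_injector  # assumed import path
from private_gpt.server.ingest.ingest_service import IngestService  # assumed import path


def delete_ingested_file(file_name: str) -> int:
    """Delete every ingested document produced from the given file, returning the count."""
    service = root_injector.get(IngestService)
    deleted = 0
    for doc in service.list_ingested():
        # file_name is stored in each document's metadata during ingestion
        if (doc.doc_metadata or {}).get("file_name") == file_name:
            service.delete(doc.doc_id)
            deleted += 1
    return deleted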