Ingestion Speedup Multiple strategy (#1309)

This commit is contained in:
lopagela
2023-11-25 20:12:09 +01:00
committed by GitHub
parent 546ba33e6f
commit bafdd3baf1
13 changed files with 515 additions and 195 deletions

View File

@@ -3,7 +3,8 @@ from typing import Literal
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile
from pydantic import BaseModel
from private_gpt.server.ingest.ingest_service import IngestedDoc, IngestService
from private_gpt.server.ingest.ingest_service import IngestService
from private_gpt.server.ingest.model import IngestedDoc
from private_gpt.server.utils.auth import authenticated
ingest_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
@@ -35,7 +36,7 @@ def ingest(request: Request, file: UploadFile) -> IngestResponse:
service = request.state.injector.get(IngestService)
if file.filename is None:
raise HTTPException(400, "No file name provided")
ingested_documents = service.ingest(file.filename, file.file.read())
ingested_documents = service.ingest_bin_data(file.filename, file.file)
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)

View File

@@ -1,64 +1,27 @@
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Any, AnyStr, Literal
from typing import BinaryIO
from injector import inject, singleton
from llama_index import (
Document,
ServiceContext,
StorageContext,
VectorStoreIndex,
load_index_from_storage,
)
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.readers import JSONReader, StringIterableReader
from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
from pydantic import BaseModel, Field
from private_gpt.components.embedding.embedding_component import EmbeddingComponent
from private_gpt.components.ingest.ingest_component import SimpleIngestComponent
from private_gpt.components.llm.llm_component import LLMComponent
from private_gpt.components.node_store.node_store_component import NodeStoreComponent
from private_gpt.components.vector_store.vector_store_component import (
VectorStoreComponent,
)
from private_gpt.paths import local_data_path
if TYPE_CHECKING:
from llama_index.readers.base import BaseReader
# Patching the default file reader to support other file types
FILE_READER_CLS = DEFAULT_FILE_READER_CLS.copy()
FILE_READER_CLS.update(
{
".json": JSONReader,
}
)
from private_gpt.server.ingest.model import IngestedDoc
logger = logging.getLogger(__name__)
class IngestedDoc(BaseModel):
object: Literal["ingest.document"]
doc_id: str = Field(examples=["c202d5e6-7b69-4869-81cc-dd574ee8ee11"])
doc_metadata: dict[str, Any] | None = Field(
examples=[
{
"page_label": "2",
"file_name": "Sales Report Q3 2023.pdf",
}
]
)
@staticmethod
def curate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Remove unwanted metadata keys."""
metadata.pop("doc_id", None)
metadata.pop("window", None)
metadata.pop("original_text", None)
return metadata
@singleton
class IngestService:
@inject
@@ -75,99 +38,50 @@ class IngestService:
docstore=node_store_component.doc_store,
index_store=node_store_component.index_store,
)
node_parser = SentenceWindowNodeParser.from_defaults()
self.ingest_service_context = ServiceContext.from_defaults(
llm=self.llm_service.llm,
embed_model=embedding_component.embedding_model,
node_parser=SentenceWindowNodeParser.from_defaults(),
node_parser=node_parser,
# Embeddings done early in the pipeline of node transformations, right
# after the node parsing
transformations=[node_parser, embedding_component.embedding_model],
)
def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]:
self.ingest_component = SimpleIngestComponent(
self.storage_context, self.ingest_service_context
)
def ingest(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
logger.info("Ingesting file_name=%s", file_name)
extension = Path(file_name).suffix
reader_cls = FILE_READER_CLS.get(extension)
documents: list[Document]
if reader_cls is None:
logger.debug(
"No reader found for extension=%s, using default string reader",
extension,
)
# Read as a plain text
string_reader = StringIterableReader()
if isinstance(file_data, Path):
text = file_data.read_text()
documents = string_reader.load_data([text])
elif isinstance(file_data, bytes):
documents = string_reader.load_data([file_data.decode("utf-8")])
elif isinstance(file_data, str):
documents = string_reader.load_data([file_data])
else:
raise ValueError(f"Unsupported data type {type(file_data)}")
else:
logger.debug("Specific reader found for extension=%s", extension)
reader: BaseReader = reader_cls()
if isinstance(file_data, Path):
# Already a path, nothing to do
documents = reader.load_data(file_data)
else:
# llama-index mainly supports reading from files, so
# we have to create a tmp file to read for it to work
# delete=False to avoid a Windows 11 permission error.
with tempfile.NamedTemporaryFile(delete=False) as tmp:
try:
path_to_tmp = Path(tmp.name)
if isinstance(file_data, bytes):
path_to_tmp.write_bytes(file_data)
else:
path_to_tmp.write_text(str(file_data))
documents = reader.load_data(path_to_tmp)
finally:
tmp.close()
path_to_tmp.unlink()
logger.info(
"Transformed file=%s into count=%s documents", file_name, len(documents)
)
for document in documents:
document.metadata["file_name"] = file_name
return self._save_docs(documents)
documents = self.ingest_component.ingest(file_name, file_data)
return [IngestedDoc.from_document(document) for document in documents]
def _save_docs(self, documents: list[Document]) -> list[IngestedDoc]:
for document in documents:
document.metadata["doc_id"] = document.doc_id
# We don't want the Embeddings search to receive this metadata
document.excluded_embed_metadata_keys = ["doc_id"]
# We don't want the LLM to receive these metadata in the context
document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
def ingest_bin_data(
    self, file_name: str, raw_file_data: BinaryIO
) -> list[IngestedDoc]:
    """Ingest the content of an open binary stream.

    The stream is read in full, spilled to a temporary file, and that
    file's path is handed to ``self.ingest`` together with ``file_name``.
    The temporary file is closed and removed afterwards, whether or not
    ingestion succeeded.

    Args:
        file_name: Name forwarded to ``self.ingest`` and used in log output.
        raw_file_data: Readable stream holding the file content.

    Returns:
        Whatever ``self.ingest`` returns for the temporary file.
    """
    logger.debug("Ingesting binary data with file_name=%s", file_name)
    payload = raw_file_data.read()
    logger.debug("Got file data of size=%s to ingest", len(payload))
    # llama-index mostly reads from files on disk, hence the detour through
    # a temporary file. delete=False works around a Windows 11 permission
    # error; the file is removed by hand in the ``finally`` clause.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_handle:
        try:
            tmp_path = Path(tmp_handle.name)
            if not isinstance(payload, bytes):
                # Fallback for streams that do not yield bytes.
                tmp_path.write_text(str(payload))
            else:
                tmp_path.write_bytes(payload)
            ingested = self.ingest(file_name, tmp_path)
            return ingested
        finally:
            tmp_handle.close()
            tmp_path.unlink()
try:
# Load the index from storage and insert new documents,
index = load_index_from_storage(
storage_context=self.storage_context,
service_context=self.ingest_service_context,
store_nodes_override=True, # Force store nodes in index and document stores
show_progress=True,
)
for doc in documents:
index.insert(doc)
except ValueError:
# Or create a new one if there is none
VectorStoreIndex.from_documents(
documents,
storage_context=self.storage_context,
service_context=self.ingest_service_context,
store_nodes_override=True, # Force store nodes in index and document stores
show_progress=True,
)
# persist the index and nodes
self.storage_context.persist(persist_dir=local_data_path)
return [
IngestedDoc(
object="ingest.document",
doc_id=document.doc_id,
doc_metadata=IngestedDoc.curate_metadata(document.metadata),
)
for document in documents
]
def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[IngestedDoc]:
    """Ingest several files in a single call.

    Args:
        files: ``(file_name, path)`` pairs, passed unchanged to
            ``self.ingest_component.bulk_ingest``.

    Returns:
        One ``IngestedDoc`` per document returned by the ingest component.
    """
    file_names = [entry[0] for entry in files]
    logger.info("Ingesting file_names=%s", file_names)
    ingested = self.ingest_component.bulk_ingest(files)
    return list(map(IngestedDoc.from_document, ingested))
def list_ingested(self) -> list[IngestedDoc]:
ingested_docs = []
@@ -205,17 +119,4 @@ class IngestService:
logger.info(
"Deleting the ingested document=%s in the doc and index store", doc_id
)
# Load the index with store_nodes_override=True to be able to delete them
index = load_index_from_storage(
storage_context=self.storage_context,
service_context=self.ingest_service_context,
store_nodes_override=True, # Force store nodes in index and document stores
show_progress=True,
)
# Delete the document from the index
index.delete_ref_doc(doc_id, delete_from_docstore=True)
# Save the index
self.storage_context.persist(persist_dir=local_data_path)
self.ingest_component.delete(doc_id)

View File

@@ -0,0 +1,32 @@
from typing import Any, Literal
from llama_index import Document
from pydantic import BaseModel, Field
class IngestedDoc(BaseModel):
    """Description of one ingested document: its id plus curated metadata."""

    object: Literal["ingest.document"]
    doc_id: str = Field(examples=["c202d5e6-7b69-4869-81cc-dd574ee8ee11"])
    doc_metadata: dict[str, Any] | None = Field(
        examples=[
            {
                "page_label": "2",
                "file_name": "Sales Report Q3 2023.pdf",
            }
        ]
    )

    @staticmethod
    def curate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
        """Return a copy of ``metadata`` without the unwanted keys.

        The keys ``doc_id``, ``window`` and ``original_text`` are dropped.
        The input mapping is left untouched, so passing a document's own
        metadata dict (as ``from_document`` does) no longer strips those
        keys from the document itself.

        Args:
            metadata: Metadata mapping to filter.

        Returns:
            A new dict holding every other key/value pair of ``metadata``.
        """
        unwanted = ("doc_id", "window", "original_text")
        return {
            key: value for key, value in metadata.items() if key not in unwanted
        }

    @staticmethod
    def from_document(document: Document) -> "IngestedDoc":
        """Build an ``IngestedDoc`` from a llama-index ``Document``.

        Args:
            document: Source document; its ``doc_id`` and ``metadata`` are read.

        Returns:
            An ``IngestedDoc`` carrying the document id and curated metadata.
        """
        return IngestedDoc(
            object="ingest.document",
            doc_id=document.doc_id,
            doc_metadata=IngestedDoc.curate_metadata(document.metadata),
        )