feat(API): Ingest plain text (#1417)

* Add ingest/text route to ingest plain text

* Add new ingest text test and adapt ingest/file ones

* Include new API in docs

* Remove duplicated logic
This commit is contained in:
Iván Martínez
2023-12-18 21:47:05 +01:00
committed by GitHub
parent 059f35840a
commit 6eeb95ec7f
6 changed files with 198 additions and 17 deletions

View File

@@ -1,7 +1,7 @@
from typing import Literal
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile
from pydantic import BaseModel
from pydantic import BaseModel, Field
from private_gpt.server.ingest.ingest_service import IngestService
from private_gpt.server.ingest.model import IngestedDoc
@@ -10,14 +10,35 @@ from private_gpt.server.utils.auth import authenticated
ingest_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
class IngestTextBody(BaseModel):
    # Request payload accepted by the POST /ingest/text route.

    # Name under which the text is ingested; the route rejects an empty value
    # with HTTP 400.
    file_name: str = Field(examples=["Avatar: The Last Airbender"])
    # Plain-text content to ingest.
    text: str = Field(
        examples=[
            "Avatar is set in an Asian and Arctic-inspired world in which some "
            "people can telekinetically manipulate one of the four elements—water, "
            "earth, fire or air—through practices known as 'bending', inspired by "
            "Chinese martial arts."
        ]
    )
class IngestResponse(BaseModel):
    # Response envelope returned by the ingestion routes in this module.

    # Constant tag: always the literal "list".
    object: Literal["list"]
    # Constant tag: always the literal "private-gpt".
    model: Literal["private-gpt"]
    # The ingested documents carried by this response.
    data: list[IngestedDoc]
@ingest_router.post("/ingest", tags=["Ingestion"])
@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
def ingest(request: Request, file: UploadFile) -> IngestResponse:
    """Ingests and processes a file.

    Deprecated. Use ingest/file instead.
    """
    # Deprecated alias: forwards the request and upload unchanged to the
    # /ingest/file handler so both routes share a single implementation.
    return ingest_file(request, file)
@ingest_router.post("/ingest/file", tags=["Ingestion"])
def ingest_file(request: Request, file: UploadFile) -> IngestResponse:
"""Ingests and processes a file, storing its chunks to be used as context.
The context obtained from files is later used in
@@ -40,6 +61,26 @@ def ingest(request: Request, file: UploadFile) -> IngestResponse:
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
@ingest_router.post("/ingest/text", tags=["Ingestion"])
def ingest_text(request: Request, body: IngestTextBody) -> IngestResponse:
    """Ingests and processes a text, storing its chunks to be used as context.

    The context obtained from the text is later used in
    `/chat/completions`, `/completions`, and `/chunks` APIs.

    A Document will be generated with the given text. The Document
    ID is returned in the response, together with the
    extracted Metadata (which is later used to improve context retrieval). That ID
    can be used to filter the context used to create responses in
    `/chat/completions`, `/completions`, and `/chunks` APIs.
    """
    ingest_service = request.state.injector.get(IngestService)

    # An empty file name is rejected as a client error (HTTP 400).
    if not body.file_name:
        raise HTTPException(status_code=400, detail="No file name provided")

    docs = ingest_service.ingest_text(body.file_name, body.text)
    return IngestResponse(object="list", model="private-gpt", data=docs)
@ingest_router.get("/ingest/list", tags=["Ingestion"])
def list_ingested(request: Request) -> IngestResponse:
"""Lists already ingested Documents including their Document ID and metadata.

View File

@@ -1,7 +1,7 @@
import logging
import tempfile
from pathlib import Path
from typing import BinaryIO
from typing import AnyStr, BinaryIO
from injector import inject, singleton
from llama_index import (
@@ -53,16 +53,7 @@ class IngestService:
self.storage_context, self.ingest_service_context, settings=settings()
)
def ingest(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
logger.info("Ingesting file_name=%s", file_name)
documents = self.ingest_component.ingest(file_name, file_data)
return [IngestedDoc.from_document(document) for document in documents]
def ingest_bin_data(
self, file_name: str, raw_file_data: BinaryIO
) -> list[IngestedDoc]:
logger.debug("Ingesting binary data with file_name=%s", file_name)
file_data = raw_file_data.read()
def _ingest_data(self, file_name: str, file_data: AnyStr) -> list[IngestedDoc]:
logger.debug("Got file data of size=%s to ingest", len(file_data))
# llama-index mainly supports reading from files, so
# we have to create a tmp file to read for it to work
@@ -74,11 +65,27 @@ class IngestService:
path_to_tmp.write_bytes(file_data)
else:
path_to_tmp.write_text(str(file_data))
return self.ingest(file_name, path_to_tmp)
return self.ingest_file(file_name, path_to_tmp)
finally:
tmp.close()
path_to_tmp.unlink()
def ingest_file(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
    """Ingest the file at `file_data` under the name `file_name`.

    The ingest component does the actual ingestion; every document it
    returns is converted with `IngestedDoc.from_document`.
    """
    logger.info("Ingesting file_name=%s", file_name)
    ingested = self.ingest_component.ingest(file_name, file_data)
    return list(map(IngestedDoc.from_document, ingested))
def ingest_text(self, file_name: str, text: str) -> list[IngestedDoc]:
    """Ingest plain `text` under the name `file_name`.

    Thin wrapper: logs the request and hands the text to `_ingest_data`,
    returning whatever that helper returns.
    """
    logger.debug("Ingesting text data with file_name=%s", file_name)
    return self._ingest_data(file_name, text)
def ingest_bin_data(
    self, file_name: str, raw_file_data: BinaryIO
) -> list[IngestedDoc]:
    """Ingest the full content of a binary stream under the name `file_name`.

    The stream is drained with a single `read()` call and the resulting
    data is handed to `_ingest_data`.
    """
    logger.debug("Ingesting binary data with file_name=%s", file_name)
    return self._ingest_data(file_name, raw_file_data.read())
def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[IngestedDoc]:
logger.info("Ingesting file_names=%s", [f[0] for f in files])
documents = self.ingest_component.bulk_ingest(files)