Mirror of https://github.com/imartinez/privateGPT.git, synced 2025-09-19 09:41:21 +00:00
feat(API): Ingest plain text (#1417)
* Add ingest/text route to ingest plain text
* Add new ingest text test and adapt ingest/file ones
* Include new API in docs
* Remove duplicated logic
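For context, a minimal sketch of how the new route can be exercised once a PrivateGPT instance is running. The base URL and port, and the "doc_id" key read from the response, are assumptions about a default local setup rather than part of this change:

# Sketch only: the host/port assume a default local PrivateGPT server, and the
# "doc_id" key assumes the IngestedDoc schema; adjust to your deployment.
import requests

response = requests.post(
    "http://localhost:8001/v1/ingest/text",
    json={
        "file_name": "Avatar: The Last Airbender",
        "text": "Avatar is set in an Asian and Arctic-inspired world ...",
    },
    timeout=60,
)
response.raise_for_status()
ingest_response = response.json()  # {"object": "list", "model": "private-gpt", "data": [...]}
for doc in ingest_response["data"]:
    print(doc.get("doc_id"))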
--- a/private_gpt/server/ingest/ingest_router.py
+++ b/private_gpt/server/ingest/ingest_router.py
@@ -1,7 +1,7 @@
 from typing import Literal
 
 from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from private_gpt.server.ingest.ingest_service import IngestService
 from private_gpt.server.ingest.model import IngestedDoc
@@ -10,14 +10,35 @@ from private_gpt.server.utils.auth import authenticated
 ingest_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
 
 
+class IngestTextBody(BaseModel):
+    file_name: str = Field(examples=["Avatar: The Last Airbender"])
+    text: str = Field(
+        examples=[
+            "Avatar is set in an Asian and Arctic-inspired world in which some "
+            "people can telekinetically manipulate one of the four elements—water, "
+            "earth, fire or air—through practices known as 'bending', inspired by "
+            "Chinese martial arts."
+        ]
+    )
+
+
 class IngestResponse(BaseModel):
     object: Literal["list"]
     model: Literal["private-gpt"]
     data: list[IngestedDoc]
 
 
-@ingest_router.post("/ingest", tags=["Ingestion"])
+@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
 def ingest(request: Request, file: UploadFile) -> IngestResponse:
+    """Ingests and processes a file.
+
+    Deprecated. Use ingest/file instead.
+    """
+    return ingest_file(request, file)
+
+
+@ingest_router.post("/ingest/file", tags=["Ingestion"])
+def ingest_file(request: Request, file: UploadFile) -> IngestResponse:
     """Ingests and processes a file, storing its chunks to be used as context.
 
     The context obtained from files is later used in
@@ -40,6 +61,26 @@ def ingest(request: Request, file: UploadFile) -> IngestResponse:
     return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
 
 
+@ingest_router.post("/ingest/text", tags=["Ingestion"])
+def ingest_text(request: Request, body: IngestTextBody) -> IngestResponse:
+    """Ingests and processes a text, storing its chunks to be used as context.
+
+    The context obtained from files is later used in
+    `/chat/completions`, `/completions`, and `/chunks` APIs.
+
+    A Document will be generated with the given text. The Document
+    ID is returned in the response, together with the
+    extracted Metadata (which is later used to improve context retrieval). That ID
+    can be used to filter the context used to create responses in
+    `/chat/completions`, `/completions`, and `/chunks` APIs.
+    """
+    service = request.state.injector.get(IngestService)
+    if len(body.file_name) == 0:
+        raise HTTPException(400, "No file name provided")
+    ingested_documents = service.ingest_text(body.file_name, body.text)
+    return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
+
+
 @ingest_router.get("/ingest/list", tags=["Ingestion"])
 def list_ingested(request: Request) -> IngestResponse:
     """Lists already ingested Documents including their Document ID and metadata.
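The commit message also mentions a new ingest text test. The snippet below is only a sketch of what such an API-level test could look like, not the test added in this commit; the `app` import path, the absence of auth headers, and the exact assertions are assumptions.

# Hedged sketch of an API-level test for POST /v1/ingest/text.
# `private_gpt.main.app` is assumed to expose the FastAPI application; the real
# suite may build the client via fixtures and require auth configuration.
from fastapi.testclient import TestClient

from private_gpt.main import app  # assumed entry point

def test_ingest_text_returns_ingested_docs() -> None:
    client = TestClient(app)
    body = {"file_name": "Avatar: The Last Airbender", "text": "Water. Earth. Fire. Air."}
    response = client.post("/v1/ingest/text", json=body)
    assert response.status_code == 200
    ingest_response = response.json()
    assert ingest_response["object"] == "list"
    assert ingest_response["model"] == "private-gpt"
    assert len(ingest_response["data"]) >= 1

def test_ingest_text_rejects_empty_file_name() -> None:
    client = TestClient(app)
    response = client.post("/v1/ingest/text", json={"file_name": "", "text": "some text"})
    assert response.status_code == 400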
--- a/private_gpt/server/ingest/ingest_service.py
+++ b/private_gpt/server/ingest/ingest_service.py
@@ -1,7 +1,7 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import BinaryIO
+from typing import AnyStr, BinaryIO
 
 from injector import inject, singleton
 from llama_index import (
@@ -53,16 +53,7 @@ class IngestService:
             self.storage_context, self.ingest_service_context, settings=settings()
         )
 
-    def ingest(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
-        logger.info("Ingesting file_name=%s", file_name)
-        documents = self.ingest_component.ingest(file_name, file_data)
-        return [IngestedDoc.from_document(document) for document in documents]
-
-    def ingest_bin_data(
-        self, file_name: str, raw_file_data: BinaryIO
-    ) -> list[IngestedDoc]:
-        logger.debug("Ingesting binary data with file_name=%s", file_name)
-        file_data = raw_file_data.read()
+    def _ingest_data(self, file_name: str, file_data: AnyStr) -> list[IngestedDoc]:
         logger.debug("Got file data of size=%s to ingest", len(file_data))
         # llama-index mainly supports reading from files, so
         # we have to create a tmp file to read for it to work
@@ -74,11 +65,27 @@
                 path_to_tmp.write_bytes(file_data)
             else:
                 path_to_tmp.write_text(str(file_data))
-            return self.ingest(file_name, path_to_tmp)
+            return self.ingest_file(file_name, path_to_tmp)
         finally:
             tmp.close()
             path_to_tmp.unlink()
 
+    def ingest_file(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
+        logger.info("Ingesting file_name=%s", file_name)
+        documents = self.ingest_component.ingest(file_name, file_data)
+        return [IngestedDoc.from_document(document) for document in documents]
+
+    def ingest_text(self, file_name: str, text: str) -> list[IngestedDoc]:
+        logger.debug("Ingesting text data with file_name=%s", file_name)
+        return self._ingest_data(file_name, text)
+
+    def ingest_bin_data(
+        self, file_name: str, raw_file_data: BinaryIO
+    ) -> list[IngestedDoc]:
+        logger.debug("Ingesting binary data with file_name=%s", file_name)
+        file_data = raw_file_data.read()
+        return self._ingest_data(file_name, file_data)
+
     def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[IngestedDoc]:
         logger.info("Ingesting file_names=%s", [f[0] for f in files])
         documents = self.ingest_component.bulk_ingest(files)
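To summarise the service refactor: ingest_text and ingest_bin_data now share _ingest_data, which writes the payload to a temporary file so llama-index's file-based readers can process it, then delegates to ingest_file. A standalone, simplified sketch of that pattern follows; the `process_path` callback is a hypothetical stand-in for IngestService.ingest_file.

# Simplified illustration of the shared _ingest_data flow; `process_path`
# is a hypothetical stand-in for IngestService.ingest_file.
import tempfile
from pathlib import Path
from typing import AnyStr, Callable

def ingest_data_sketch(
    file_name: str,
    file_data: AnyStr,
    process_path: Callable[[str, Path], list],
) -> list:
    tmp = tempfile.NamedTemporaryFile(delete=False)
    path_to_tmp = Path(tmp.name)
    try:
        if isinstance(file_data, bytes):
            path_to_tmp.write_bytes(file_data)      # binary uploads (ingest_bin_data)
        else:
            path_to_tmp.write_text(str(file_data))  # plain text bodies (ingest_text)
        return process_path(file_name, path_to_tmp)
    finally:
        tmp.close()
        path_to_tmp.unlink()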