mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-09-26 07:12:47 +00:00
fix: specify dict type, fix bulk ingestion with metadata
This commit is contained in:
@@ -40,11 +40,18 @@ class BaseIngestComponent(abc.ABC):
|
|||||||
self.transformations = transformations
|
self.transformations = transformations
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def ingest(self, file_name: str, file_data: Path, file_metadata : dict | None = None) -> list[Document]:
|
def ingest(
|
||||||
|
self,
|
||||||
|
file_name: str,
|
||||||
|
file_data: Path,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
|
) -> list[Document]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def bulk_ingest(self, files: list[tuple[str, Path]], metadata : dict | None = None) -> list[Document]:
|
def bulk_ingest(
|
||||||
|
self, files: list[tuple[str, Path]], metadata: dict[str, str] | None = None
|
||||||
|
) -> list[Document]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
@@ -117,16 +124,25 @@ class SimpleIngestComponent(BaseIngestComponentWithIndex):
|
|||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
|
super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
|
||||||
|
|
||||||
def ingest(self, file_name: str, file_data: Path, file_metadata : dict | None = None) -> list[Document]:
|
def ingest(
|
||||||
|
self,
|
||||||
|
file_name: str,
|
||||||
|
file_data: Path,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
|
) -> list[Document]:
|
||||||
logger.info("Ingesting file_name=%s", file_name)
|
logger.info("Ingesting file_name=%s", file_name)
|
||||||
documents = IngestionHelper.transform_file_into_documents(file_name, file_data, file_metadata)
|
documents = IngestionHelper.transform_file_into_documents(
|
||||||
|
file_name, file_data, file_metadata
|
||||||
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Transformed file=%s into count=%s documents", file_name, len(documents)
|
"Transformed file=%s into count=%s documents", file_name, len(documents)
|
||||||
)
|
)
|
||||||
logger.debug("Saving the documents in the index and doc store")
|
logger.debug("Saving the documents in the index and doc store")
|
||||||
return self._save_docs(documents)
|
return self._save_docs(documents)
|
||||||
|
|
||||||
def bulk_ingest(self, files: list[tuple[str, Path]], metadata : dict | None = None) -> list[Document]:
|
def bulk_ingest(
|
||||||
|
self, files: list[tuple[str, Path]], metadata: dict[str, str] | None = None
|
||||||
|
) -> list[Document]:
|
||||||
saved_documents = []
|
saved_documents = []
|
||||||
for file_name, file_data in files:
|
for file_name, file_data in files:
|
||||||
documents = IngestionHelper.transform_file_into_documents(
|
documents = IngestionHelper.transform_file_into_documents(
|
||||||
@@ -175,20 +191,32 @@ class BatchIngestComponent(BaseIngestComponentWithIndex):
|
|||||||
processes=self.count_workers
|
processes=self.count_workers
|
||||||
)
|
)
|
||||||
|
|
||||||
def ingest(self, file_name: str, file_data: Path, file_metadata : dict | None = None) -> list[Document]:
|
def ingest(
|
||||||
|
self,
|
||||||
|
file_name: str,
|
||||||
|
file_data: Path,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
|
) -> list[Document]:
|
||||||
logger.info("Ingesting file_name=%s", file_name)
|
logger.info("Ingesting file_name=%s", file_name)
|
||||||
documents = IngestionHelper.transform_file_into_documents(file_name, file_data, file_metadata)
|
documents = IngestionHelper.transform_file_into_documents(
|
||||||
|
file_name, file_data, file_metadata
|
||||||
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Transformed file=%s into count=%s documents", file_name, len(documents)
|
"Transformed file=%s into count=%s documents", file_name, len(documents)
|
||||||
)
|
)
|
||||||
logger.debug("Saving the documents in the index and doc store")
|
logger.debug("Saving the documents in the index and doc store")
|
||||||
return self._save_docs(documents)
|
return self._save_docs(documents)
|
||||||
|
|
||||||
def bulk_ingest(self, files: list[tuple[str, Path]], metadata : dict | None = None) -> list[Document]:
|
def bulk_ingest(
|
||||||
|
self, files: list[tuple[str, Path]], metadata: dict[str, str] | None = None
|
||||||
|
) -> list[Document]:
|
||||||
|
|
||||||
|
# Pair the files with the metadata
|
||||||
|
args = [(file_tuple, metadata) for file_tuple in files]
|
||||||
documents = list(
|
documents = list(
|
||||||
itertools.chain.from_iterable(
|
itertools.chain.from_iterable(
|
||||||
self._file_to_documents_work_pool.starmap(
|
self._file_to_documents_work_pool.starmap(
|
||||||
IngestionHelper.transform_file_into_documents, files, metadata
|
IngestionHelper.transform_file_into_documents, args
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -257,12 +285,18 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
|
|||||||
processes=self.count_workers
|
processes=self.count_workers
|
||||||
)
|
)
|
||||||
|
|
||||||
def ingest(self, file_name: str, file_data: Path, file_metadata : dict | None = None) -> list[Document]:
|
def ingest(
|
||||||
|
self,
|
||||||
|
file_name: str,
|
||||||
|
file_data: Path,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
|
) -> list[Document]:
|
||||||
logger.info("Ingesting file_name=%s", file_name)
|
logger.info("Ingesting file_name=%s", file_name)
|
||||||
# Running in a single (1) process to release the current
|
# Running in a single (1) process to release the current
|
||||||
# thread, and take a dedicated CPU core for computation
|
# thread, and take a dedicated CPU core for computation
|
||||||
documents = self._file_to_documents_work_pool.apply(
|
documents = self._file_to_documents_work_pool.apply(
|
||||||
IngestionHelper.transform_file_into_documents, (file_name, file_data, file_metadata)
|
IngestionHelper.transform_file_into_documents,
|
||||||
|
(file_name, file_data, file_metadata),
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Transformed file=%s into count=%s documents", file_name, len(documents)
|
"Transformed file=%s into count=%s documents", file_name, len(documents)
|
||||||
@@ -270,13 +304,16 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
|
|||||||
logger.debug("Saving the documents in the index and doc store")
|
logger.debug("Saving the documents in the index and doc store")
|
||||||
return self._save_docs(documents)
|
return self._save_docs(documents)
|
||||||
|
|
||||||
def bulk_ingest(self, files: list[tuple[str, Path]], metadata : dict | None = None) -> list[Document]:
|
def bulk_ingest(
|
||||||
|
self, files: list[tuple[str, Path]], metadata: dict[str, str] | None = None
|
||||||
|
) -> list[Document]:
|
||||||
|
|
||||||
|
args = [(file_tuple, metadata) for file_tuple in files]
|
||||||
# Lightweight threads, used for parallelize the
|
# Lightweight threads, used for parallelize the
|
||||||
# underlying IO calls made in the ingestion
|
# underlying IO calls made in the ingestion
|
||||||
|
|
||||||
documents = list(
|
documents = list(
|
||||||
itertools.chain.from_iterable(
|
itertools.chain.from_iterable(
|
||||||
self._ingest_work_pool.starmap(self.ingest, files, metadata)
|
self._ingest_work_pool.starmap(self.ingest, args)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return documents
|
return documents
|
||||||
@@ -459,13 +496,22 @@ class PipelineIngestComponent(BaseIngestComponentWithIndex):
|
|||||||
self.node_q.put(("flush", None, None, None))
|
self.node_q.put(("flush", None, None, None))
|
||||||
self.node_q.join()
|
self.node_q.join()
|
||||||
|
|
||||||
def ingest(self, file_name: str, file_data: Path, file_metadata : dict | None = None) -> list[Document]:
|
def ingest(
|
||||||
documents = IngestionHelper.transform_file_into_documents(file_name, file_data, file_metadata)
|
self,
|
||||||
|
file_name: str,
|
||||||
|
file_data: Path,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
|
) -> list[Document]:
|
||||||
|
documents = IngestionHelper.transform_file_into_documents(
|
||||||
|
file_name, file_data, file_metadata
|
||||||
|
)
|
||||||
self.doc_q.put(("process", file_name, documents))
|
self.doc_q.put(("process", file_name, documents))
|
||||||
self._flush()
|
self._flush()
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def bulk_ingest(self, files: list[tuple[str, Path]], metadata : dict | None = None) -> list[Document]:
|
def bulk_ingest(
|
||||||
|
self, files: list[tuple[str, Path]], metadata: dict[str, str] | None = None
|
||||||
|
) -> list[Document]:
|
||||||
docs = []
|
docs = []
|
||||||
for file_name, file_data in eta(files):
|
for file_name, file_data in eta(files):
|
||||||
try:
|
try:
|
||||||
|
@@ -69,13 +69,13 @@ class IngestionHelper:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def transform_file_into_documents(
|
def transform_file_into_documents(
|
||||||
file_name: str, file_data: Path, file_metadata : dict | None = None
|
file_name: str, file_data: Path, file_metadata: dict[str, str] | None = None
|
||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
documents = IngestionHelper._load_file_to_documents(file_name, file_data)
|
documents = IngestionHelper._load_file_to_documents(file_name, file_data)
|
||||||
for document in documents:
|
for document in documents:
|
||||||
document.metadata.update(file_metadata or {})
|
document.metadata.update(file_metadata or {})
|
||||||
document.metadata["file_name"] = file_name
|
document.metadata["file_name"] = file_name
|
||||||
|
|
||||||
IngestionHelper._exclude_metadata(documents)
|
IngestionHelper._exclude_metadata(documents)
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
from typing import Literal, Dict
|
from typing import Literal
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, Form
|
from fastapi import APIRouter, Depends, Form, HTTPException, Request, UploadFile
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
from private_gpt.server.ingest.ingest_service import IngestService
|
from private_gpt.server.ingest.ingest_service import IngestService
|
||||||
@@ -20,14 +20,15 @@ class IngestTextBody(BaseModel):
|
|||||||
"Chinese martial arts."
|
"Chinese martial arts."
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
metadata: Dict = Field(None,
|
metadata: dict[str, str] = Field(
|
||||||
|
None,
|
||||||
examples=[
|
examples=[
|
||||||
{
|
{
|
||||||
"title": "Avatar: The Last Airbender",
|
"title": "Avatar: The Last Airbender",
|
||||||
"author": "Michael Dante DiMartino, Bryan Konietzko",
|
"author": "Michael Dante DiMartino, Bryan Konietzko",
|
||||||
"year": "2005",
|
"year": "2005",
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -47,12 +48,14 @@ def ingest(request: Request, file: UploadFile) -> IngestResponse:
|
|||||||
|
|
||||||
|
|
||||||
@ingest_router.post("/ingest/file", tags=["Ingestion"])
|
@ingest_router.post("/ingest/file", tags=["Ingestion"])
|
||||||
def ingest_file(request: Request, file: UploadFile, metadata: str = Form(None)) -> IngestResponse:
|
def ingest_file(
|
||||||
|
request: Request, file: UploadFile, metadata: str = Form(None)
|
||||||
|
) -> IngestResponse:
|
||||||
"""Ingests and processes a file, storing its chunks to be used as context.
|
"""Ingests and processes a file, storing its chunks to be used as context.
|
||||||
|
|
||||||
metadata: Optional metadata to be associated with the file.
|
metadata: Optional metadata to be associated with the file.
|
||||||
You do not have to specify this field if not needed.
|
You do not have to specify this field if not needed.
|
||||||
e.g. {"title": "Avatar: The Last Airbender", "author": "Michael Dante DiMartino, Bryan Konietzko", "year": "2005"}
|
e.g. {"title": "Avatar: The Last Airbender", "year": "2005"}
|
||||||
|
|
||||||
The context obtained from files is later used in
|
The context obtained from files is later used in
|
||||||
`/chat/completions`, `/completions`, and `/chunks` APIs.
|
`/chat/completions`, `/completions`, and `/chunks` APIs.
|
||||||
@@ -70,9 +73,11 @@ def ingest_file(request: Request, file: UploadFile, metadata: str = Form(None))
|
|||||||
service = request.state.injector.get(IngestService)
|
service = request.state.injector.get(IngestService)
|
||||||
if file.filename is None:
|
if file.filename is None:
|
||||||
raise HTTPException(400, "No file name provided")
|
raise HTTPException(400, "No file name provided")
|
||||||
|
|
||||||
metadata_dict = None if metadata is None else eval(metadata)
|
metadata_dict = None if metadata is None else eval(metadata)
|
||||||
ingested_documents = service.ingest_bin_data(file.filename, file.file, metadata_dict)
|
ingested_documents = service.ingest_bin_data(
|
||||||
|
file.filename, file.file, metadata_dict
|
||||||
|
)
|
||||||
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -48,7 +48,12 @@ class IngestService:
|
|||||||
settings=settings(),
|
settings=settings(),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _ingest_data(self, file_name: str, file_data: AnyStr, file_metadata : dict | None = None) -> list[IngestedDoc]:
|
def _ingest_data(
|
||||||
|
self,
|
||||||
|
file_name: str,
|
||||||
|
file_data: AnyStr,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
|
) -> list[IngestedDoc]:
|
||||||
logger.debug("Got file data of size=%s to ingest", len(file_data))
|
logger.debug("Got file data of size=%s to ingest", len(file_data))
|
||||||
# llama-index mainly supports reading from files, so
|
# llama-index mainly supports reading from files, so
|
||||||
# we have to create a tmp file to read for it to work
|
# we have to create a tmp file to read for it to work
|
||||||
@@ -65,18 +70,28 @@ class IngestService:
|
|||||||
tmp.close()
|
tmp.close()
|
||||||
path_to_tmp.unlink()
|
path_to_tmp.unlink()
|
||||||
|
|
||||||
def ingest_file(self, file_name: str, file_data: Path, file_metadata : dict | None = None) -> list[IngestedDoc]:
|
def ingest_file(
|
||||||
|
self,
|
||||||
|
file_name: str,
|
||||||
|
file_data: Path,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
|
) -> list[IngestedDoc]:
|
||||||
logger.info("Ingesting file_name=%s", file_name)
|
logger.info("Ingesting file_name=%s", file_name)
|
||||||
documents = self.ingest_component.ingest(file_name, file_data, file_metadata)
|
documents = self.ingest_component.ingest(file_name, file_data, file_metadata)
|
||||||
logger.info("Finished ingestion file_name=%s", file_name)
|
logger.info("Finished ingestion file_name=%s", file_name)
|
||||||
return [IngestedDoc.from_document(document) for document in documents]
|
return [IngestedDoc.from_document(document) for document in documents]
|
||||||
|
|
||||||
def ingest_text(self, file_name: str, text: str, metadata : dict | None = None) -> list[IngestedDoc]:
|
def ingest_text(
|
||||||
|
self, file_name: str, text: str, metadata: dict[str, str] | None = None
|
||||||
|
) -> list[IngestedDoc]:
|
||||||
logger.debug("Ingesting text data with file_name=%s", file_name)
|
logger.debug("Ingesting text data with file_name=%s", file_name)
|
||||||
return self._ingest_data(file_name, text, metadata)
|
return self._ingest_data(file_name, text, metadata)
|
||||||
|
|
||||||
def ingest_bin_data(
|
def ingest_bin_data(
|
||||||
self, file_name: str, raw_file_data: BinaryIO, file_metadata : dict | None = None
|
self,
|
||||||
|
file_name: str,
|
||||||
|
raw_file_data: BinaryIO,
|
||||||
|
file_metadata: dict[str, str] | None = None,
|
||||||
) -> list[IngestedDoc]:
|
) -> list[IngestedDoc]:
|
||||||
logger.debug("Ingesting binary data with file_name=%s", file_name)
|
logger.debug("Ingesting binary data with file_name=%s", file_name)
|
||||||
file_data = raw_file_data.read()
|
file_data = raw_file_data.read()
|
||||||
|
8
tests/fixtures/ingest_helper.py
vendored
8
tests/fixtures/ingest_helper.py
vendored
@@ -1,7 +1,7 @@
|
|||||||
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import json
|
|
||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
from private_gpt.server.ingest.ingest_router import IngestResponse
|
from private_gpt.server.ingest.ingest_router import IngestResponse
|
||||||
@@ -18,15 +18,15 @@ class IngestHelper:
|
|||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
ingest_result = IngestResponse.model_validate(response.json())
|
ingest_result = IngestResponse.model_validate(response.json())
|
||||||
return ingest_result
|
return ingest_result
|
||||||
|
|
||||||
def ingest_file_with_metadata(self, path: Path, metadata: dict) -> IngestResponse:
|
def ingest_file_with_metadata(self, path: Path, metadata: dict) -> IngestResponse:
|
||||||
files = {
|
files = {
|
||||||
"file": (path.name, path.open("rb")),
|
"file": (path.name, path.open("rb")),
|
||||||
"metadata": (None, json.dumps(metadata))
|
"metadata": (None, json.dumps(metadata)),
|
||||||
}
|
}
|
||||||
|
|
||||||
response = self.test_client.post("/v1/ingest/file", files=files)
|
response = self.test_client.post("/v1/ingest/file", files=files)
|
||||||
|
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
ingest_result = IngestResponse.model_validate(response.json())
|
ingest_result = IngestResponse.model_validate(response.json())
|
||||||
return ingest_result
|
return ingest_result
|
||||||
|
@@ -48,17 +48,21 @@ def test_ingest_plain_text(test_client: TestClient) -> None:
|
|||||||
|
|
||||||
def test_ingest_text_with_metadata(test_client: TestClient):
|
def test_ingest_text_with_metadata(test_client: TestClient):
|
||||||
response = test_client.post(
|
response = test_client.post(
|
||||||
"/v1/ingest/text", json={"file_name": "file_name", "text": "text", "metadata": {"foo": "bar"}}
|
"/v1/ingest/text",
|
||||||
|
json={"file_name": "file_name", "text": "text", "metadata": {"foo": "bar"}},
|
||||||
)
|
)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
ingest_result = IngestResponse.model_validate(response.json())
|
ingest_result = IngestResponse.model_validate(response.json())
|
||||||
assert len(ingest_result.data) == 1
|
assert len(ingest_result.data) == 1
|
||||||
|
|
||||||
assert ingest_result.data[0].doc_metadata == {"file_name" : "file_name", "foo": "bar"}
|
assert ingest_result.data[0].doc_metadata == {
|
||||||
|
"file_name": "file_name",
|
||||||
|
"foo": "bar",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_ingest_accepts_txt_files(ingest_helper: IngestHelper) -> None:
|
def test_ingest_accepts_txt_files_with_metadata(ingest_helper: IngestHelper) -> None:
|
||||||
path = Path(__file__).parents[0] / "test.txt"
|
path = Path(__file__).parents[0] / "test.txt"
|
||||||
ingest_result = ingest_helper.ingest_file_with_metadata(path, {"foo": "bar"})
|
ingest_result = ingest_helper.ingest_file_with_metadata(path, {"foo": "bar"})
|
||||||
assert len(ingest_result.data) == 1
|
assert len(ingest_result.data) == 1
|
||||||
assert ingest_result.data[0].doc_metadata == {"file_name": "test.txt", "foo": "bar"}
|
assert ingest_result.data[0].doc_metadata == {"file_name": "test.txt", "foo": "bar"}
|
||||||
|
Reference in New Issue
Block a user