fix: parse metadata as JSON instead of eval, allow non-string metadata values

This commit is contained in:
Nathan Lenas 2024-07-29 09:26:40 +02:00
parent f47c05730d
commit 9177a0ad73
4 changed files with 16 additions and 10 deletions

View File

@ -44,7 +44,7 @@ class BaseIngestComponent(abc.ABC):
self, self,
file_name: str, file_name: str,
file_data: Path, file_data: Path,
file_metadata: dict[str, str] | None = None, file_metadata: dict[str, Any] | None = None,
) -> list[Document]: ) -> list[Document]:
pass pass
@ -126,7 +126,7 @@ class SimpleIngestComponent(BaseIngestComponentWithIndex):
self, self,
file_name: str, file_name: str,
file_data: Path, file_data: Path,
file_metadata: dict[str, str] | None = None, file_metadata: dict[str, Any] | None = None,
) -> list[Document]: ) -> list[Document]:
logger.info("Ingesting file_name=%s", file_name) logger.info("Ingesting file_name=%s", file_name)
documents = IngestionHelper.transform_file_into_documents( documents = IngestionHelper.transform_file_into_documents(
@ -191,7 +191,7 @@ class BatchIngestComponent(BaseIngestComponentWithIndex):
self, self,
file_name: str, file_name: str,
file_data: Path, file_data: Path,
file_metadata: dict[str, str] | None = None, file_metadata: dict[str, Any] | None = None,
) -> list[Document]: ) -> list[Document]:
logger.info("Ingesting file_name=%s", file_name) logger.info("Ingesting file_name=%s", file_name)
documents = IngestionHelper.transform_file_into_documents( documents = IngestionHelper.transform_file_into_documents(
@ -281,7 +281,7 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
self, self,
file_name: str, file_name: str,
file_data: Path, file_data: Path,
file_metadata: dict[str, str] | None = None, file_metadata: dict[str, Any] | None = None,
) -> list[Document]: ) -> list[Document]:
logger.info("Ingesting file_name=%s", file_name) logger.info("Ingesting file_name=%s", file_name)
# Running in a single (1) process to release the current # Running in a single (1) process to release the current
@ -489,7 +489,7 @@ class PipelineIngestComponent(BaseIngestComponentWithIndex):
self, self,
file_name: str, file_name: str,
file_data: Path, file_data: Path,
file_metadata: dict[str, str] | None = None, file_metadata: dict[str, Any] | None = None,
) -> list[Document]: ) -> list[Document]:
documents = IngestionHelper.transform_file_into_documents( documents = IngestionHelper.transform_file_into_documents(
file_name, file_data, file_metadata file_name, file_data, file_metadata

View File

@ -1,5 +1,6 @@
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Any
from llama_index.core.readers import StringIterableReader from llama_index.core.readers import StringIterableReader
from llama_index.core.readers.base import BaseReader from llama_index.core.readers.base import BaseReader
@ -69,7 +70,7 @@ class IngestionHelper:
@staticmethod @staticmethod
def transform_file_into_documents( def transform_file_into_documents(
file_name: str, file_data: Path, file_metadata: dict[str, str] | None = None file_name: str, file_data: Path, file_metadata: dict[str, Any] | None = None
) -> list[Document]: ) -> list[Document]:
documents = IngestionHelper._load_file_to_documents(file_name, file_data) documents = IngestionHelper._load_file_to_documents(file_name, file_data)
for document in documents: for document in documents:

View File

@ -1,4 +1,5 @@
from typing import Literal import json
from typing import Any, Literal
from fastapi import APIRouter, Depends, Form, HTTPException, Request, UploadFile from fastapi import APIRouter, Depends, Form, HTTPException, Request, UploadFile
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@ -20,7 +21,7 @@ class IngestTextBody(BaseModel):
"Chinese martial arts." "Chinese martial arts."
] ]
) )
metadata: dict[str, str] = Field( metadata: dict[str, Any] = Field(
None, None,
examples=[ examples=[
{ {
@ -55,6 +56,7 @@ def ingest_file(
metadata: Optional metadata to be associated with the file. metadata: Optional metadata to be associated with the file.
You do not have to specify this field if not needed. You do not have to specify this field if not needed.
The metadata needs to be in JSON format.
e.g. {"title": "Avatar: The Last Airbender", "year": "2005"} e.g. {"title": "Avatar: The Last Airbender", "year": "2005"}
The context obtained from files is later used in The context obtained from files is later used in
@ -74,7 +76,7 @@ def ingest_file(
if file.filename is None: if file.filename is None:
raise HTTPException(400, "No file name provided") raise HTTPException(400, "No file name provided")
metadata_dict = None if metadata is None else eval(metadata) metadata_dict = None if metadata is None else json.loads(metadata)
ingested_documents = service.ingest_bin_data( ingested_documents = service.ingest_bin_data(
file.filename, file.file, metadata_dict file.filename, file.file, metadata_dict
) )

View File

@ -1,5 +1,6 @@
import json import json
from pathlib import Path from pathlib import Path
from typing import Any
import pytest import pytest
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
@ -19,7 +20,9 @@ class IngestHelper:
ingest_result = IngestResponse.model_validate(response.json()) ingest_result = IngestResponse.model_validate(response.json())
return ingest_result return ingest_result
def ingest_file_with_metadata(self, path: Path, metadata: dict) -> IngestResponse: def ingest_file_with_metadata(
self, path: Path, metadata: dict[str, Any]
) -> IngestResponse:
files = { files = {
"file": (path.name, path.open("rb")), "file": (path.name, path.open("rb")),
"metadata": (None, json.dumps(metadata)), "metadata": (None, json.dumps(metadata)),