From 43a9dbe21b3c64d7b46c400198d1a605b6163f43 Mon Sep 17 00:00:00 2001
From: Dmitri Tsiu
Date: Tue, 24 Sep 2024 01:34:21 +0300
Subject: [PATCH] Sanitize null bytes before ingestion

Strip NUL (U+0000) characters from the text of every document produced
by an extension-specific reader before the documents are returned for
ingestion.

---
NOTE(review): the default string-reader branch shown as context below
returns early and is therefore not covered by this sanitization; confirm
whether that is intended.

 private_gpt/components/ingest/ingest_helper.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py
index a1109070..054153e8 100644
--- a/private_gpt/components/ingest/ingest_helper.py
+++ b/private_gpt/components/ingest/ingest_helper.py
@@ -92,7 +92,13 @@ class IngestionHelper:
             return string_reader.load_data([file_data.read_text()])
 
         logger.debug("Specific reader found for extension=%s", extension)
-        return reader_cls().load_data(file_data)
+        documents = reader_cls().load_data(file_data)
+
+        # Remove NUL (U+0000) characters from each document's text before ingestion.
+        for document in documents:
+            document.text = document.text.replace("\u0000", "")
+
+        return documents
 
     @staticmethod
     def _exclude_metadata(documents: list[Document]) -> None: