fix: Sanitize null bytes before ingestion (#2090)

* Sanitize null bytes before ingestion

* Added comments
This commit is contained in:
Dmitri Qiu 2024-09-25 13:00:03 +03:00 committed by GitHub
parent fa3c30661d
commit 5fbb402477
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -92,7 +92,13 @@ class IngestionHelper:
return string_reader.load_data([file_data.read_text()])
logger.debug("Specific reader found for extension=%s", extension)
return reader_cls().load_data(file_data)
documents = reader_cls().load_data(file_data)
# Sanitize NUL bytes in text which can't be stored in Postgres
for i in range(len(documents)):
documents[i].text = documents[i].text.replace("\u0000", "")
return documents
@staticmethod
def _exclude_metadata(documents: list[Document]) -> None: