diff --git a/libs/core/langchain_core/utils/__init__.py b/libs/core/langchain_core/utils/__init__.py
index 6b95475cc1a..5f26895db83 100644
--- a/libs/core/langchain_core/utils/__init__.py
+++ b/libs/core/langchain_core/utils/__init__.py
@@ -23,7 +23,12 @@ if TYPE_CHECKING:
     from langchain_core.utils.iter import batch_iterate
     from langchain_core.utils.loading import try_load_from_hub
     from langchain_core.utils.pydantic import pre_init
-    from langchain_core.utils.strings import comma_list, stringify_dict, stringify_value
+    from langchain_core.utils.strings import (
+        comma_list,
+        sanitize_for_postgres,
+        stringify_dict,
+        stringify_value,
+    )
     from langchain_core.utils.utils import (
         build_extra_kwargs,
         check_package_version,
@@ -59,6 +64,7 @@ __all__ = (
     "pre_init",
     "print_text",
     "raise_for_status_with_text",
+    "sanitize_for_postgres",
     "secret_from_env",
     "stringify_dict",
     "stringify_value",
@@ -81,6 +87,7 @@ _dynamic_imports = {
     "try_load_from_hub": "loading",
     "pre_init": "pydantic",
     "comma_list": "strings",
+    "sanitize_for_postgres": "strings",
     "stringify_dict": "strings",
     "stringify_value": "strings",
     "build_extra_kwargs": "utils",
diff --git a/libs/core/langchain_core/utils/strings.py b/libs/core/langchain_core/utils/strings.py
index 4eeb7ed582e..a72780024e9 100644
--- a/libs/core/langchain_core/utils/strings.py
+++ b/libs/core/langchain_core/utils/strings.py
@@ -46,3 +46,26 @@ def comma_list(items: list[Any]) -> str:
         str: The comma-separated string.
     """
     return ", ".join(str(item) for item in items)
+
+
+def sanitize_for_postgres(text: str, replacement: str = "") -> str:
+    r"""Sanitize text by removing NUL bytes that are incompatible with PostgreSQL.
+
+    PostgreSQL text fields cannot contain NUL (0x00) bytes, which can cause
+    psycopg.DataError when inserting documents. This function removes or replaces
+    such characters to ensure compatibility.
+
+    Args:
+        text: The text to sanitize.
+        replacement: String to replace NUL bytes with. Defaults to empty string.
+
+    Returns:
+        str: The sanitized text with NUL bytes removed or replaced.
+
+    Example:
+        >>> sanitize_for_postgres("Hello\x00world")
+        'Helloworld'
+        >>> sanitize_for_postgres("Hello\x00world", " ")
+        'Hello world'
+    """
+    return text.replace("\x00", replacement)
diff --git a/libs/core/tests/unit_tests/utils/test_imports.py b/libs/core/tests/unit_tests/utils/test_imports.py
index 67fe97e6569..d37e3c870e7 100644
--- a/libs/core/tests/unit_tests/utils/test_imports.py
+++ b/libs/core/tests/unit_tests/utils/test_imports.py
@@ -27,6 +27,7 @@ EXPECTED_ALL = [
     "pre_init",
     "from_env",
     "secret_from_env",
+    "sanitize_for_postgres",
 ]
 
 
diff --git a/libs/core/tests/unit_tests/utils/test_strings.py b/libs/core/tests/unit_tests/utils/test_strings.py
new file mode 100644
index 00000000000..2162fb3efe8
--- /dev/null
+++ b/libs/core/tests/unit_tests/utils/test_strings.py
@@ -0,0 +1,49 @@
+"""Test string utilities."""
+
+from langchain_core.utils.strings import (
+    comma_list,
+    sanitize_for_postgres,
+    stringify_dict,
+    stringify_value,
+)
+
+
+def test_sanitize_for_postgres() -> None:
+    """Test sanitizing text for PostgreSQL compatibility."""
+    # Test with NUL bytes
+    text_with_nul = "Hello\x00world\x00test"
+    expected = "Helloworldtest"
+    assert sanitize_for_postgres(text_with_nul) == expected
+
+    # Test with replacement character
+    expected_with_replacement = "Hello world test"
+    assert sanitize_for_postgres(text_with_nul, " ") == expected_with_replacement
+
+    # Test with text without NUL bytes
+    clean_text = "Hello world"
+    assert sanitize_for_postgres(clean_text) == clean_text
+
+    # Test empty string
+    assert sanitize_for_postgres("") == ""
+
+    # Test with multiple consecutive NUL bytes
+    text_with_multiple_nuls = "Hello\x00\x00\x00world"
+    assert sanitize_for_postgres(text_with_multiple_nuls) == "Helloworld"
+    assert sanitize_for_postgres(text_with_multiple_nuls, "-") == "Hello---world"
+
+
+def test_existing_string_functions() -> None:
+    """Test existing string functions still work."""
+    # Test comma_list
+    assert comma_list([1, 2, 3]) == "1, 2, 3"
+    assert comma_list(["a", "b", "c"]) == "a, b, c"
+
+    # Test stringify_value
+    assert stringify_value("hello") == "hello"
+    assert stringify_value(42) == "42"
+
+    # Test stringify_dict
+    data = {"key": "value", "number": 123}
+    result = stringify_dict(data)
+    assert "key: value" in result
+    assert "number: 123" in result