Mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-28 05:54:55 +00:00)

Merge branch 'master' into fix/azure_deepseek_structured_output

This commit is contained in: commit b5b92a0967
.github/copilot-instructions.md (vendored, 2 lines changed)
@@ -25,7 +25,7 @@ def get_user(user_id: str, verbose: bool = False):  # Maintains stable interface
 * Prefer descriptive, **self-explanatory variable names**. Avoid overly short or cryptic identifiers.
 * Break up overly long or deeply nested functions for **readability and maintainability**.
 * Avoid unnecessary abstraction or premature optimization.
-* All generated Python code must include type hints.
+* All generated Python code must include type hints and return types.

 Bad:
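A minimal sketch of what the amended rule asks for; the function and lookup table here are illustrative, not from the repository:

```python
# Illustrative only: every parameter and the return value are annotated.
USER_NAMES: dict[str, str] = {"u1": "Ada", "u2": "Grace"}


def get_user_name(user_id: str, verbose: bool = False) -> str:
    """Return the display name for a user id, or "unknown" if absent."""
    name = USER_NAMES.get(user_id, "unknown")
    if verbose:
        print(f"Resolved {user_id} -> {name}")
    return name
```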
.github/workflows/_release.yml (vendored, 2 lines changed)
@@ -340,7 +340,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        partner: [openai]
+        partner: [openai, anthropic]
       fail-fast: false # Continue testing other partners if one fails
     env:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -1029,7 +1029,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "Chroma",
@@ -1042,7 +1042,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "Clickhouse",
@@ -1055,7 +1055,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "CouchbaseSearchVectorStore",
@@ -1081,7 +1081,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: false,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "ElasticsearchStore",
@@ -1094,7 +1094,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "FAISS",
@@ -1107,7 +1107,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "InMemoryVectorStore",
@@ -1120,7 +1120,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "Milvus",
@@ -1146,7 +1146,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "openGauss",
@@ -1172,7 +1172,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "PineconeVectorStore",
@@ -1185,7 +1185,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "QdrantVectorStore",
@@ -1211,7 +1211,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "Weaviate",
@@ -1224,7 +1224,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: true,
       local: true,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
     {
       name: "SQLServer",
@@ -1237,7 +1237,7 @@ const FEATURE_TABLES = {
       passesStandardTests: false,
       multiTenancy: false,
       local: false,
-      idsInAddDocuments: false,
+      idsInAddDocuments: true,
     },
   ],
 }
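The field these hunks flip, idsInAddDocuments, records whether a store accepts caller-supplied ids in add_documents. A minimal sketch of that usage, using InMemoryVectorStore with a fake embedding so it runs standalone (the ids shown are made up):

```python
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.vectorstores import InMemoryVectorStore

store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))
docs = [Document(page_content="hello"), Document(page_content="world")]

# Caller-supplied ids make re-ingestion idempotent: adding again with the
# same id overwrites the earlier entry instead of duplicating it.
ids = store.add_documents(docs, ids=["doc-1", "doc-2"])
print(ids)  # -> ['doc-1', 'doc-2']
```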
@@ -23,7 +23,12 @@ if TYPE_CHECKING:
     from langchain_core.utils.iter import batch_iterate
     from langchain_core.utils.loading import try_load_from_hub
     from langchain_core.utils.pydantic import pre_init
-    from langchain_core.utils.strings import comma_list, stringify_dict, stringify_value
+    from langchain_core.utils.strings import (
+        comma_list,
+        sanitize_for_postgres,
+        stringify_dict,
+        stringify_value,
+    )
     from langchain_core.utils.utils import (
         build_extra_kwargs,
         check_package_version,
@@ -59,6 +64,7 @@ __all__ = (
     "pre_init",
     "print_text",
     "raise_for_status_with_text",
+    "sanitize_for_postgres",
     "secret_from_env",
     "stringify_dict",
     "stringify_value",
@@ -81,6 +87,7 @@ _dynamic_imports = {
     "try_load_from_hub": "loading",
     "pre_init": "pydantic",
     "comma_list": "strings",
+    "sanitize_for_postgres": "strings",
     "stringify_dict": "strings",
     "stringify_value": "strings",
     "build_extra_kwargs": "utils",
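The _dynamic_imports table feeds a lazy-import hook. A simplified sketch of the usual PEP 562 pattern behind such a table (our approximation, not the repository's exact code):

```python
import importlib
from typing import Any

# Attribute name -> defining submodule (abbreviated from the table above).
_dynamic_imports = {
    "comma_list": "strings",
    "sanitize_for_postgres": "strings",
}


def __getattr__(name: str) -> Any:
    """Module-level __getattr__ (PEP 562): import submodules on first access."""
    module_name = _dynamic_imports.get(name)
    if module_name is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    module = importlib.import_module(f".{module_name}", package=__package__)
    return getattr(module, name)
```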
@@ -46,3 +46,26 @@ def comma_list(items: list[Any]) -> str:
         str: The comma-separated string.
     """
     return ", ".join(str(item) for item in items)
+
+
+def sanitize_for_postgres(text: str, replacement: str = "") -> str:
+    r"""Sanitize text by removing NUL bytes that are incompatible with PostgreSQL.
+
+    PostgreSQL text fields cannot contain NUL (0x00) bytes, which can cause
+    psycopg.DataError when inserting documents. This function removes or replaces
+    such characters to ensure compatibility.
+
+    Args:
+        text: The text to sanitize.
+        replacement: String to replace NUL bytes with. Defaults to empty string.
+
+    Returns:
+        str: The sanitized text with NUL bytes removed or replaced.
+
+    Example:
+        >>> sanitize_for_postgres("Hello\x00world")
+        'Helloworld'
+        >>> sanitize_for_postgres("Hello\x00world", " ")
+        'Hello world'
+    """
+    return text.replace("\x00", replacement)
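A hedged sketch of where the new helper sits in an ingestion path; the table name and connection handling are hypothetical, and psycopg 3 is assumed:

```python
import psycopg  # psycopg 3, assumed installed

from langchain_core.utils.strings import sanitize_for_postgres


def insert_contents(conn: psycopg.Connection, contents: list[str]) -> None:
    """Strip NUL bytes before writing so inserts don't raise psycopg.DataError."""
    with conn.cursor() as cur:
        for content in contents:
            cur.execute(
                "INSERT INTO documents (content) VALUES (%s)",  # hypothetical table
                (sanitize_for_postgres(content),),
            )
    conn.commit()
```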
@@ -27,6 +27,7 @@ EXPECTED_ALL = [
     "pre_init",
     "from_env",
     "secret_from_env",
+    "sanitize_for_postgres",
 ]
libs/core/tests/unit_tests/utils/test_strings.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+"""Test string utilities."""
+
+from langchain_core.utils.strings import (
+    comma_list,
+    sanitize_for_postgres,
+    stringify_dict,
+    stringify_value,
+)
+
+
+def test_sanitize_for_postgres() -> None:
+    """Test sanitizing text for PostgreSQL compatibility."""
+    # Test with NUL bytes
+    text_with_nul = "Hello\x00world\x00test"
+    expected = "Helloworldtest"
+    assert sanitize_for_postgres(text_with_nul) == expected
+
+    # Test with replacement character
+    expected_with_replacement = "Hello world test"
+    assert sanitize_for_postgres(text_with_nul, " ") == expected_with_replacement
+
+    # Test with text without NUL bytes
+    clean_text = "Hello world"
+    assert sanitize_for_postgres(clean_text) == clean_text
+
+    # Test empty string
+    assert sanitize_for_postgres("") == ""
+
+    # Test with multiple consecutive NUL bytes
+    text_with_multiple_nuls = "Hello\x00\x00\x00world"
+    assert sanitize_for_postgres(text_with_multiple_nuls) == "Helloworld"
+    assert sanitize_for_postgres(text_with_multiple_nuls, "-") == "Hello---world"
+
+
+def test_existing_string_functions() -> None:
+    """Test existing string functions still work."""
+    # Test comma_list
+    assert comma_list([1, 2, 3]) == "1, 2, 3"
+    assert comma_list(["a", "b", "c"]) == "a, b, c"
+
+    # Test stringify_value
+    assert stringify_value("hello") == "hello"
+    assert stringify_value(42) == "42"
+
+    # Test stringify_dict
+    data = {"key": "value", "number": 123}
+    result = stringify_dict(data)
+    assert "key: value" in result
+    assert "number: 123" in result
@@ -123,7 +123,10 @@ def create_citation_fuzzy_match_chain(llm: BaseLanguageModel) -> LLMChain:
         Chain (LLMChain) that can be used to answer questions with citations.
     """
     output_parser = PydanticOutputFunctionsParser(pydantic_schema=QuestionAnswer)
-    schema = QuestionAnswer.schema()
+    if hasattr(QuestionAnswer, "model_json_schema"):
+        schema = QuestionAnswer.model_json_schema()
+    else:
+        schema = QuestionAnswer.schema()
     function = {
         "name": schema["title"],
         "description": schema["description"],
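This and the next two hunks apply one compatibility pattern: try Pydantic v2's model_json_schema() first, then fall back to v1's schema(). A standalone sketch of that pattern (the helper name is ours; the repository inlines the check at each call site):

```python
from typing import Any


def get_json_schema(model_cls: Any) -> dict:
    """Return a JSON schema dict from a Pydantic v2 or v1 model class."""
    # v2 is checked first: v2 models also keep a deprecated .schema(),
    # so testing .schema() first would hit the deprecated path.
    if hasattr(model_cls, "model_json_schema"):
        return model_cls.model_json_schema()  # Pydantic v2
    if hasattr(model_cls, "schema"):
        return model_cls.schema()  # Pydantic v1
    msg = "Object must define model_json_schema() or schema()"
    raise TypeError(msg)
```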
@@ -70,8 +70,11 @@ class JsonSchemaEvaluator(StringEvaluator):
     def _parse_json(self, node: Any) -> Union[dict, list, None, float, bool, int, str]:
         if isinstance(node, str):
             return parse_json_markdown(node)
+        if hasattr(node, "model_json_schema") and callable(node.model_json_schema):
+            # Pydantic v2 model
+            return node.model_json_schema()
         if hasattr(node, "schema") and callable(node.schema):
-            # Pydantic model
+            # Pydantic v1 model
             return node.schema()
         return node
@@ -43,7 +43,15 @@ class YamlOutputParser(BaseOutputParser[T]):

     def get_format_instructions(self) -> str:
         # Copy schema to avoid altering original Pydantic schema.
-        schema = dict(self.pydantic_object.schema().items())
+        if hasattr(self.pydantic_object, "model_json_schema"):
+            # Pydantic v2
+            schema = dict(self.pydantic_object.model_json_schema().items())
+        elif hasattr(self.pydantic_object, "schema"):
+            # Pydantic v1
+            schema = dict(self.pydantic_object.schema().items())
+        else:
+            msg = "Pydantic object must have either model_json_schema or schema method"
+            raise ValueError(msg)

         # Remove extraneous fields.
         reduced_schema = schema
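A quick usage check for the parser fix, assuming langchain's YamlOutputParser import path and a Pydantic v2 model:

```python
from langchain.output_parsers.yaml import YamlOutputParser
from pydantic import BaseModel


class Joke(BaseModel):
    """Toy schema for illustration."""

    setup: str
    punchline: str


parser = YamlOutputParser(pydantic_object=Joke)
# With the fix, this prefers model_json_schema() on v2 models instead of
# falling through to the deprecated v1-style .schema().
print(parser.get_format_instructions())
```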
(One file's diff is suppressed because it is too large.)