community[patch]: Databricks - fix scope of dangerous deserialization error in Databricks LLM connector (#20368)

fix scope of dangerous deserialization error in Databricks LLM connector --------- Signed-off-by: dbczumar <corey.zumar@databricks.com>
2025-08-13 14:50:00 +00:00 · 2024-04-12 14:27:26 -07:00 · 2024-04-12 14:27:26 -07:00 · 3a068b26f3
commit 3a068b26f3
parent f1248f8d9a
2 changed files with 63 additions and 22 deletions
--- a/libs/community/langchain_community/llms/databricks.py
+++ b/libs/community/langchain_community/llms/databricks.py
@ -221,8 +221,21 @@ def _is_hex_string(data: str) -> bool:
    return bool(re.match(pattern, data))


-def _load_pickled_fn_from_hex_string(data: str) -> Callable:
+def _load_pickled_fn_from_hex_string(
+    data: str, allow_dangerous_deserialization: Optional[bool]
+) -> Callable:
    """Loads a pickled function from a hexadecimal string."""
+    if not allow_dangerous_deserialization:
+        raise ValueError(
+            "This code relies on the pickle module. "
+            "You will need to set allow_dangerous_deserialization=True "
+            "if you want to opt-in to allow deserialization of data using pickle."
+            "Data can be compromised by a malicious actor if "
+            "not handled properly to include "
+            "a malicious payload that when deserialized with "
+            "pickle can execute arbitrary code on your machine."
+        )
+
    try:
        import cloudpickle
    except Exception as e:
@ -443,25 +456,21 @@ class Databricks(LLM):
        return v

    def __init__(self, **data: Any):
-        if not data.get("allow_dangerous_deserialization"):
-            raise ValueError(
-                "This code relies on the pickle module. "
-                "You will need to set allow_dangerous_deserialization=True "
-                "if you want to opt-in to allow deserialization of data using pickle."
-                "Data can be compromised by a malicious actor if "
-                "not handled properly to include "
-                "a malicious payload that when deserialized with "
-                "pickle can execute arbitrary code on your machine."
-            )
        if "transform_input_fn" in data and _is_hex_string(data["transform_input_fn"]):
            data["transform_input_fn"] = _load_pickled_fn_from_hex_string(
-                data["transform_input_fn"]
+                data=data["transform_input_fn"],
+                allow_dangerous_deserialization=data.get(
+                    "allow_dangerous_deserialization"
+                ),
            )
        if "transform_output_fn" in data and _is_hex_string(
            data["transform_output_fn"]
        ):
            data["transform_output_fn"] = _load_pickled_fn_from_hex_string(
-                data["transform_output_fn"]
+                data=data["transform_output_fn"],
+                allow_dangerous_deserialization=data.get(
+                    "allow_dangerous_deserialization"
+                ),
            )

        super().__init__(**data)
--- a/libs/community/tests/unit_tests/llms/test_databricks.py
+++ b/libs/community/tests/unit_tests/llms/test_databricks.py
@ -56,7 +56,10 @@ def test_serde_transform_input_fn(monkeypatch: MonkeyPatch) -> None:
    assert params["transform_input_fn"] == pickled_string

    request = {"prompt": "What is the meaning of life?"}
-    fn = _load_pickled_fn_from_hex_string(params["transform_input_fn"])
+    fn = _load_pickled_fn_from_hex_string(
+        data=params["transform_input_fn"],
+        allow_dangerous_deserialization=True,
+    )
    assert fn(**request) == transform_input(**request)


@ -69,15 +72,44 @@ def test_saving_loading_llm(monkeypatch: MonkeyPatch, tmp_path: Path) -> None:
    monkeypatch.setenv("DATABRICKS_TOKEN", "my-default-token")

    llm = Databricks(
-        endpoint_name="chat", temperature=0.1, allow_dangerous_deserialization=True
+        endpoint_name="chat",
+        temperature=0.1,
    )
    llm.save(file_path=tmp_path / "databricks.yaml")

-    # Loading without allowing_dangerous_deserialization=True should raise an error.
-    with pytest.raises(ValueError, match="This code relies on the pickle module."):
-        load_llm(tmp_path / "databricks.yaml")
-
-    loaded_llm = load_llm(
-        tmp_path / "databricks.yaml", allow_dangerous_deserialization=True
-    )
+    loaded_llm = load_llm(tmp_path / "databricks.yaml")
    assert_llm_equality(llm, loaded_llm)
+
+
+@pytest.mark.requires("cloudpickle")
+def test_saving_loading_llm_dangerous_serde_check(
+    monkeypatch: MonkeyPatch, tmp_path: Path
+) -> None:
+    monkeypatch.setattr(
+        "langchain_community.llms.databricks._DatabricksServingEndpointClient",
+        MockDatabricksServingEndpointClient,
+    )
+    monkeypatch.setenv("DATABRICKS_HOST", "my-default-host")
+    monkeypatch.setenv("DATABRICKS_TOKEN", "my-default-token")
+
+    llm1 = Databricks(
+        endpoint_name="chat",
+        temperature=0.1,
+        transform_input_fn=lambda x, y, **kwargs: {},
+    )
+    llm1.save(file_path=tmp_path / "databricks1.yaml")
+
+    with pytest.raises(ValueError, match="This code relies on the pickle module."):
+        load_llm(tmp_path / "databricks1.yaml")
+
+    load_llm(tmp_path / "databricks1.yaml", allow_dangerous_deserialization=True)
+
+    llm2 = Databricks(
+        endpoint_name="chat", temperature=0.1, transform_output_fn=lambda x: "test"
+    )
+    llm2.save(file_path=tmp_path / "databricks2.yaml")
+
+    with pytest.raises(ValueError, match="This code relies on the pickle module."):
+        load_llm(tmp_path / "databricks2.yaml")
+
+    load_llm(tmp_path / "databricks2.yaml", allow_dangerous_deserialization=True)