mirror of
https://github.com/hwchase17/langchain.git
synced 2026-06-09 18:50:33 +00:00
`PIIMiddleware` previously scrubbed detected PII only at the state level via its `after_model` / `before_model` hooks. Consumers reading the live stream — `astream_events(version="v3")` or `run.messages` / `run.tool_calls` / `run.values` — saw the raw model text, the raw tool-call args, the raw tool outputs, and the raw state snapshots until the run finished and the canonical conversation history was written. This change registers a stream transformer ahead of `MessagesTransformer` that redacts every wire surface of an agent run. The transformer holds a sliding lookback buffer (default 128 characters) per `(run_id, content-block index)` so PII patterns that straddle delta boundaries are caught before the safe prefix is released downstream. Anything older than the lookback is run through the configured detector and emitted; the trailing tail stays buffered until a later delta extends it past the cap or the block finishes. `_finalize_block` always re-runs detection over the full block snapshot so the finalized content lands fully redacted even when the in-flight buffer never released a tail (short responses, or PII arriving in the final delta). The `block` strategy is now supported on the streaming path via a buffering mode that withholds every delta until the block resolves — clean blocks release the full text at finalize, PII-bearing blocks zero the wire and let `after_model` / `apply_to_tool_results` raise `PIIDetectionError` on the original state message. Activation is gated on `apply_to_output=True`, matching the existing post-hoc semantics. The middleware's transformer factory is cloned by `StreamMux._make_child` into every subgraph scope, so attaching `PIIMiddleware` at the outer agent also redacts streamed deltas from sub-agents invoked inside tools. 
## Tool-call and tools-channel coverage

The transformer covers every wire surface of an agent run, not just AI message text:

- **Streamed AI text deltas** (`content-block-delta` of type `text-delta`) — lookback machinery, redacted in place.
- **Streamed tool-call args** (`content-block-delta` with `tool_call_chunk` / `server_tool_call_chunk` fields) — each delta carries the full cumulative args string; detection runs on the field directly and redacts in place. Verified empirically against `_compat_bridge.py` and the consumer-side `_merge_block_delta_into_store` snapshot-replace semantics.
- **Finalized tool-call blocks** (`content-block-finish` with `tool_call` / `server_tool_call` / `invalid_tool_call`) — `args` dict walked recursively and each string leaf redacted.
- **Tool execution events on the `tools` channel** — `tool-started.input`, `tool-output-delta`, `tool-finished.output`, `tool-error.message` all run through detection. String deltas use the same lookback machinery as text-deltas keyed by `tool_call_id`; structured payloads walk recursively.
- **State snapshots on the `values` channel** — message lists are walked and each message's `.content` is redacted on a fresh copy. Graph state itself stays intact for the state-level enforcer (`apply_to_tool_results` via `before_model`) to act on independently.
- **Legacy `(BaseMessage, metadata)` payloads** on the `messages` channel (Python 3.10 path, where `langgraph`'s `ASYNCIO_ACCEPTS_CONTEXT = sys.version_info >= (3, 11)` falls back to a code path that doesn't propagate the streaming callback into the chat model) — `.content` and `AIMessage.tool_calls[*].args` are scrubbed. For `block`, the event's `data` tuple is replaced with an empty-content copy so the original message stays in state for `after_model` to raise on.

## Worth a careful look

- `_PIIStreamTransformer._mutate_text_delta` — lookback partition. Anything older than `lookback` characters is released after redaction; the tail stays buffered. Bulletproof against whitespace-permissive detectors (notably `credit_card`, whose regex matches across spaces).
- `_PIIStreamTransformer._mutate_tool_call_chunk_delta` — direct in-place redaction of the cumulative args string. No buffer; the wire shape is cumulative-snapshot, the consumer-side merge is replace-not-append.
- `_PIIStreamTransformer._mutate_legacy_payload` — the dual path: mutate-in-place for non-`block` (idempotent with `after_model`), replace-with-empty-copy for `block` (keeps original in graph state for `after_model` to raise on).
- `_PIIStreamTransformer._redact_value` — the recursive walker. `BaseMessage` branch returns a fresh `.content`-redacted copy via `model_copy(update=...)` — never mutates in place — so tool-output payloads that wrap a `ToolMessage` and message lists in state snapshots flow through cleanly.
- The new `transformers` attribute on `PIIMiddleware`: this is what makes `create_agent` pick the factory up. Multiple `PIIMiddleware` instances each register one transformer; ordering is preserved within the `before_builtins` lane.

## Compatibility

Bumps `langgraph` to `>=1.2.1` for the `before_builtins` opt-in on `StreamTransformer`.
200 lines
6.0 KiB
TOML
200 lines
6.0 KiB
TOML
# Build backend: the package is built with hatchling.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Core package metadata.
[project]
name = "langchain"
description = "Building applications with LLMs through composability"
license = { text = "MIT" }
readme = "README.md"
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Software Development :: Libraries :: Python Modules",
]

version = "1.3.1"
requires-python = ">=3.10.0,<4.0.0"
# Runtime dependencies. Each entry carries both a floor and an upper bound.
dependencies = [
    "langchain-core>=1.4.0,<2.0.0",
    # Floor is 1.2.1: required for the `before_builtins` opt-in on `StreamTransformer`.
    "langgraph>=1.2.1,<1.3.0",
    "pydantic>=2.7.4,<3.0.0",
]

# Extras: each one pulls in a single integration package.
# Only `baseten` declares a minimum version; the rest are unpinned.
[project.optional-dependencies]
community = ["langchain-community"]
anthropic = ["langchain-anthropic"]
openai = ["langchain-openai"]
azure-ai = ["langchain-azure-ai"]
#cohere = ["langchain-cohere"]
google-vertexai = ["langchain-google-vertexai"]
google-genai = ["langchain-google-genai"]
fireworks = ["langchain-fireworks"]
ollama = ["langchain-ollama"]
together = ["langchain-together"]
mistralai = ["langchain-mistralai"]
huggingface = ["langchain-huggingface"]
groq = ["langchain-groq"]
aws = ["langchain-aws"]
baseten = ["langchain-baseten>=0.2.0"]
deepseek = ["langchain-deepseek"]
xai = ["langchain-xai"]
perplexity = ["langchain-perplexity"]

# Project links published with the package metadata.
[project.urls]
Homepage = "https://docs.langchain.com/"
Documentation = "https://reference.langchain.com/python/langchain/langchain/"
Repository = "https://github.com/langchain-ai/langchain"
Issues = "https://github.com/langchain-ai/langchain/issues"
# The query string is URL-encoded: it decodes to `tag:"langchain==1"`.
Changelog = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain%3D%3D1%22"
Twitter = "https://x.com/langchain_oss"
Slack = "https://www.langchain.com/join-community"
Reddit = "https://www.reddit.com/r/LangChain/"

# Development-only dependency groups (not part of the published runtime requirements).
[dependency-groups]
# Unit-test tooling.
test = [
    "pytest>=9.0.3,<10.0.0",
    "pytest-cov>=4.0.0,<8.0.0",
    "pytest-watcher>=0.2.6,<1.0.0",
    "pytest-asyncio>=1.3.0,<2.0.0",
    "pytest-socket>=0.6.0,<1.0.0",
    "pytest-xdist<4.0.0,>=3.6.1",
    "pytest-mock",
    "pytest-benchmark>=5.1.0,<6.0.0",
    "syrupy>=5.0.0,<6.0.0",
    "toml>=0.10.2,<1.0.0",
    "blockbuster>=1.5.26,<1.6.0",
    "langchain-tests>=1.1.9,<2.0.0",
    "langchain-openai",
]
lint = [
    "ruff>=0.15.0,<0.16.0",
]
typing = [
    "mypy>=1.19.1,<1.20.0",
    "types-toml>=0.10.8.20240310,<1.0.0.0",
]

# Integration-test tooling.
test_integration = [
    "vcrpy>=8.0.0,<9.0.0",
    "wrapt>=1.15.0,<3.0.0",
    "python-dotenv>=1.0.0,<2.0.0",
    "langchainhub>=0.1.16,<1.0.0",
    "langchain-core>=1.4.0,<2.0.0",
    "langchain-text-splitters>=1.0.0,<2.0.0",
]

# uv resolver settings: minimum versions imposed on these packages when they are resolved.
[tool.uv]
constraint-dependencies = ["urllib3>=2.6.3", "pygments>=2.20.0"]

# Resolve these packages from relative local paths as editable installs.
[tool.uv.sources]
langchain-core = { path = "../core", editable = true }
langchain-tests = { path = "../standard-tests", editable = true }
langchain-text-splitters = { path = "../text-splitters", editable = true }
langchain-openai = { path = "../partners/openai", editable = true }
langchain-anthropic = { path = "../partners/anthropic", editable = true }

# Ruff settings shared by the linter and the formatter.
[tool.ruff]
line-length = 100

# mypy: strict mode, with one strict check relaxed below.
[tool.mypy]
strict = true
enable_error_code = "deprecated"
warn_unreachable = true
exclude = [
    # Exclude agents tests except middleware_typing/ which has type-checked tests
    "tests/unit_tests/agents/middleware/",
    "tests/unit_tests/agents/specifications/",
    "tests/unit_tests/agents/test_.*\\.py",
]

# TODO: activate for 'strict' checking
warn_return_any = false

# Do not report missing imports for these modules.
[[tool.mypy.overrides]]
module = ["pytest_socket.*", "vcr.*"]
ignore_missing_imports = true

# Formatter: also format code examples embedded in docstrings.
[tool.ruff.format]
docstring-code-format = true

# Linter: enable every rule, then opt out of specific ones.
[tool.ruff.lint]
select = [
    "ALL",
]
ignore = [
    "C90",      # McCabe complexity
    "COM812",   # Messes with the formatter
    "CPY",      # No copyright
    "FIX002",   # Line contains TODO
    "PERF203",  # Rarely useful
    "PLR09",    # Too many something (arg, statements, etc)
    "TD002",    # Missing author in TODO
    "TD003",    # Missing issue link in TODO

    # TODO rules
    "ANN401",   # Any in type annotations
    "BLE",      # Blind exceptions
]
unfixable = [
    "B028",  # People should intentionally tune the stacklevel
]

flake8-annotations.allow-star-arg-any = true
allowed-confusables = ["–"]

# Ban relative imports everywhere.
[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "all"

# Docstring checks follow the Google convention.
[tool.ruff.lint.pydocstyle]
convention = "google"
# Ignore missing documentation for *args and **kwargs parameters.
ignore-var-parameters = true

# Per-path rule exemptions, keyed by glob pattern.
[tool.ruff.lint.extend-per-file-ignores]
"tests/unit_tests/agents/*" = [
    "ANN",  # Annotations, needs to fix
    "ARG",  # Arguments, needs to fix
]
"tests/unit_tests/agents/test_responses_spec.py" = ["F821"]
"tests/unit_tests/agents/test_return_direct_spec.py" = ["F821"]
"tests/unit_tests/agents/test_react_agent.py" = ["ALL"]

"tests/*" = [
    "D1",       # Documentation rules
    "S101",     # Tests need assertions
    "S311",     # Standard pseudo-random generators are not suitable for cryptographic purposes
    "SLF001",   # Private member access in tests
    "PLR2004",  # Magic values are perfectly fine in unit tests (e.g. 0, 1, 2, etc.)
]

"scripts/*" = [
    "INP",   # Scripts are not in a package
    "T201",  # Scripts can print to the console
]

# Coverage: leave the test suite itself out of the measurement.
[tool.coverage.run]
omit = ["tests/*"]

# pytest configuration.
[tool.pytest.ini_options]
addopts = "--strict-markers --strict-config --durations=5 --snapshot-warn-unused -vv"
# `--strict-markers` is set in `addopts`, so custom markers are declared here.
markers = [
    "requires: mark tests as requiring a specific library",
    "scheduled: mark tests to run in scheduled testing",
    "compile: mark placeholder test used to compile integration tests without running them",
    "benchmark: mark benchmark tests",
]
asyncio_mode = "auto"
# Silence LangChain beta warnings everywhere; the two deprecation filters
# are scoped to the `tests` module.
filterwarnings = [
    "ignore::langchain_core._api.beta_decorator.LangChainBetaWarning",
    "ignore::langchain_core._api.deprecation.LangChainDeprecationWarning:tests",
    "ignore::langchain_core._api.deprecation.LangChainPendingDeprecationWarning:tests",
]