From 0bc831495ca20024373a4c83a7ce8ce444d941a2 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Fri, 6 Mar 2026 14:25:08 -0500 Subject: [PATCH] fix(langchain-classic): patch ReDoS vulnerability in MRKL and ReAct action regex (CVE-2024-58340) (#35598) The action-parsing regex in `MRKLOutputParser.parse()` and `ReActSingleInputOutputParser.parse()` used the pattern `(.*?)[\s]*Action` which causes catastrophic backtracking on crafted input where whitespace characters sit between two partial `Action` tokens. An attacker can trigger near-infinite CPU consumption with a relatively short string. The fix removes the redundant `[\s]*` quantifier between the first capture group and the literal `Action` keyword. Since `re.DOTALL` is active and the preceding `(.*?)` already matches any character (including whitespace), the `[\s]*` was unnecessary and was the source of the ambiguity that enabled backtracking. Adds regression tests for both parsers that use `SIGALRM` timeouts to assert the regex completes in bounded time on adversarial input. This fix was reviewed manually. Created with [Deep Agents CLI](https://docs.langchain.com/oss/python/deepagents/cli/overview). --- .../agents/mrkl/output_parser.py | 4 +-- .../output_parsers/react_single_input.py | 4 +-- .../output_parsers/test_react_single_input.py | 32 +++++++++++++++++++ .../agents/test_mrkl_output_parser.py | 30 +++++++++++++++++ 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain_classic/agents/mrkl/output_parser.py b/libs/langchain/langchain_classic/agents/mrkl/output_parser.py index 1c08b68338f..4323dc32d25 100644 --- a/libs/langchain/langchain_classic/agents/mrkl/output_parser.py +++ b/libs/langchain/langchain_classic/agents/mrkl/output_parser.py @@ -41,9 +41,7 @@ class MRKLOutputParser(AgentOutputParser): OutputParserException: If the output could not be parsed. """ includes_answer = FINAL_ANSWER_ACTION in text - regex = ( - r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" - ) + regex = r"Action\s*\d*\s*:[\s]*(.*?)Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" action_match = re.search(regex, text, re.DOTALL) if action_match and includes_answer: if text.find(FINAL_ANSWER_ACTION) < text.find(action_match.group(0)): diff --git a/libs/langchain/langchain_classic/agents/output_parsers/react_single_input.py b/libs/langchain/langchain_classic/agents/output_parsers/react_single_input.py index ae7634f9636..a74cc65a164 100644 --- a/libs/langchain/langchain_classic/agents/output_parsers/react_single_input.py +++ b/libs/langchain/langchain_classic/agents/output_parsers/react_single_input.py @@ -52,9 +52,7 @@ class ReActSingleInputOutputParser(AgentOutputParser): @override def parse(self, text: str) -> AgentAction | AgentFinish: includes_answer = FINAL_ANSWER_ACTION in text - regex = ( - r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" - ) + regex = r"Action\s*\d*\s*:[\s]*(.*?)Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" action_match = re.search(regex, text, re.DOTALL) if action_match: if includes_answer: diff --git a/libs/langchain/tests/unit_tests/agents/output_parsers/test_react_single_input.py b/libs/langchain/tests/unit_tests/agents/output_parsers/test_react_single_input.py index b8f3b612274..eed59a6cfa4 100644 --- a/libs/langchain/tests/unit_tests/agents/output_parsers/test_react_single_input.py +++ b/libs/langchain/tests/unit_tests/agents/output_parsers/test_react_single_input.py @@ -1,3 +1,6 @@ +import signal +import sys + import pytest from langchain_core.agents import AgentAction, AgentFinish from langchain_core.exceptions import OutputParserException @@ -43,3 +46,32 @@ Action: search Final Answer: Action Input: what is the temperature in SF?""" with pytest.raises(OutputParserException): parser.invoke(_input) + + +def _timeout_handler(_signum: int, _frame: object) -> None: + msg = "ReDoS: regex took too long" + raise TimeoutError(msg) + + +@pytest.mark.skipif( + sys.platform == "win32", reason="SIGALRM is not available on Windows" +) +def test_react_single_input_no_redos() -> None: + """Regression test for ReDoS caused by catastrophic backtracking.""" + parser = ReActSingleInputOutputParser() + malicious = "Action: " + " \t" * 1000 + "Action " + old = signal.signal(signal.SIGALRM, _timeout_handler) + signal.alarm(2) + try: + try: + parser.parse(malicious) + except OutputParserException: + pass + except TimeoutError: + pytest.fail( + "ReDoS detected: ReActSingleInputOutputParser.parse() " + "hung on crafted input" + ) + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, old) diff --git a/libs/langchain/tests/unit_tests/agents/test_mrkl_output_parser.py b/libs/langchain/tests/unit_tests/agents/test_mrkl_output_parser.py index 9112a997d84..58731439dc8 100644 --- a/libs/langchain/tests/unit_tests/agents/test_mrkl_output_parser.py +++ b/libs/langchain/tests/unit_tests/agents/test_mrkl_output_parser.py @@ -1,3 +1,6 @@ +import signal +import sys + import pytest from langchain_core.agents import AgentAction, AgentFinish from langchain_core.exceptions import OutputParserException @@ -79,3 +82,30 @@ def test_final_answer_after_parsable_action() -> None: "Parsing LLM output produced both a final answer and a parse-able action" in exception_info.value.args[0] ) + + +def _timeout_handler(_signum: int, _frame: object) -> None: + msg = "ReDoS: regex took too long" + raise TimeoutError(msg) + + +@pytest.mark.skipif( + sys.platform == "win32", reason="SIGALRM is not available on Windows" +) +def test_mrkl_output_parser_no_redos() -> None: + """Regression test for ReDoS caused by catastrophic backtracking.""" + malicious = "Action: " + " \t" * 1000 + "Action " + old = signal.signal(signal.SIGALRM, _timeout_handler) + signal.alarm(2) + try: + try: + mrkl_output_parser.parse(malicious) + except OutputParserException: + pass + except TimeoutError: + pytest.fail( + "ReDoS detected: MRKLOutputParser.parse() hung on crafted input" + ) + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, old)