diff --git a/libs/core/langchain_core/messages/utils.py b/libs/core/langchain_core/messages/utils.py index 609d8045ac2..d4659bf0bb9 100644 --- a/libs/core/langchain_core/messages/utils.py +++ b/libs/core/langchain_core/messages/utils.py @@ -28,6 +28,7 @@ from typing import ( cast, overload, ) +from xml.sax.saxutils import escape, quoteattr from pydantic import Discriminator, Field, Tag @@ -98,11 +99,199 @@ AnyMessage = Annotated[ """A type representing any defined `Message` or `MessageChunk` type.""" +def _has_base64_data(block: dict) -> bool: + """Check if a content block contains base64 encoded data. + + Args: + block: A content block dictionary. + + Returns: + Whether the block contains base64 data. + """ + # Check for explicit base64 field (standard content blocks) + if block.get("base64"): + return True + + # Check for data: URL in url field + url = block.get("url", "") + if isinstance(url, str) and url.startswith("data:"): + return True + + # Check for OpenAI-style image_url with data: URL + image_url = block.get("image_url", {}) + if isinstance(image_url, dict): + url = image_url.get("url", "") + if isinstance(url, str) and url.startswith("data:"): + return True + + return False + + +_XML_CONTENT_BLOCK_MAX_LEN = 500 + + +def _truncate(text: str, max_len: int = _XML_CONTENT_BLOCK_MAX_LEN) -> str: + """Truncate text to `max_len` characters, adding ellipsis if truncated.""" + if len(text) <= max_len: + return text + return text[:max_len] + "..." + + +def _format_content_block_xml(block: dict) -> str | None: + """Format a content block as XML. + + Args: + block: A LangChain content block. + + Returns: + XML string representation of the block, or `None` if the block should be + skipped. + + Note: + Plain text document content, server tool call arguments, and server tool + result outputs are truncated to 500 characters. + """ + block_type = block.get("type", "") + + # Skip blocks with base64 encoded data + if _has_base64_data(block): + return None + + # Text blocks + if block_type == "text": + text = block.get("text", "") + return escape(text) if text else None + + # Reasoning blocks + if block_type == "reasoning": + reasoning = block.get("reasoning", "") + if reasoning: + return f"{escape(reasoning)}" + return None + + # Image blocks (URL only, base64 already filtered) + if block_type == "image": + url = block.get("url") + file_id = block.get("file_id") + if url: + return f"" + if file_id: + return f"" + return None + + # OpenAI-style image_url blocks + if block_type == "image_url": + image_url = block.get("image_url", {}) + if isinstance(image_url, dict): + url = image_url.get("url", "") + if url and not url.startswith("data:"): + return f"" + return None + + # Audio blocks (URL only) + if block_type == "audio": + url = block.get("url") + file_id = block.get("file_id") + if url: + return f"