community[patch]: Fix MLX LLM Stream (#20575)

Closes #20561. This PR fixes the MLX LLM stream `AttributeError`. Recently, `mlx-lm` changed its token decoding logic, which affected the LC+MLX integration. Additionally, I made minor fixes, such as repairing a broken link in the docs example and enforcing pipeline arguments (max_tokens, temp, etc.) for invoke. - **Issue:** #20561 - **Twitter handle:** @Prince_Canuma
2025-08-18 09:01:03 +00:00 · 2024-05-21 02:17:08 +02:00 · 2024-05-21 02:17:08 +02:00 · 3587c60396
commit 3587c60396
parent 96bd0b0844
2 changed files with 55 additions and 12 deletions
--- a/docs/docs/integrations/chat/mlx.ipynb
+++ b/docs/docs/integrations/chat/mlx.ipynb
@ -9,7 +9,7 @@
    "This notebook shows how to get started using `MLX` LLM's as chat models.\n",
    "\n",
    "In particular, we will:\n",
-    "1. Utilize the [MLXPipeline](https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/llms/mlx_pipelines.py), \n",
+    "1. Utilize the [MLXPipeline](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/mlx_pipeline.py), \n",
    "2. Utilize the `ChatMLX` class to enable any of these LLMs to interface with LangChain's [Chat Messages](https://python.langchain.com/docs/modules/model_io/chat/#messages) abstraction.\n",
    "3. Demonstrate how to use an open-source LLM to power an `ChatAgent` pipeline\n"
   ]
--- a/libs/community/langchain_community/llms/mlx_pipeline.py
+++ b/libs/community/langchain_community/llms/mlx_pipeline.py
@ -1,7 +1,7 @@
 from __future__ import annotations

 import logging
-from typing import Any, Iterator, List, Mapping, Optional
+from typing import Any, Callable, Iterator, List, Mapping, Optional

 from langchain_core.callbacks import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM
@ -24,7 +24,7 @@ class MLXPipeline(LLM):
            from langchain_community.llms import MLXPipeline
            pipe = MLXPipeline.from_model_id(
                model_id="mlx-community/quantized-gemma-2b",
-                pipeline_kwargs={"max_tokens": 10},
+                pipeline_kwargs={"max_tokens": 10, "temp": 0.7},
            )
    Example passing model and tokenizer in directly:
        .. code-block:: python
@ -59,7 +59,21 @@ class MLXPipeline(LLM):
        when needed. Default: ``False``
    """
    pipeline_kwargs: Optional[dict] = None
-    """Keyword arguments passed to the pipeline."""
+    """
+    Keyword arguments passed to the pipeline. Defaults include:
+        - temp (float): Temperature for generation, default is 0.0.
+        - max_tokens (int): Maximum tokens to generate, default is 100.
+        - verbose (bool): Whether to output verbose logging, default is False.
+        - formatter (Optional[Callable]): A callable to format the output.
+          Default is None.
+        - repetition_penalty (Optional[float]): The penalty factor for
+          repeated sequences, default is None.
+        - repetition_context_size (Optional[int]): Size of the context
+          for applying repetition penalty, default is None.
+        - top_p (float): The cumulative probability threshold for
+          top-p filtering, default is 1.0.
+
+    """

    class Config:
        """Configuration for this pydantic object."""
@ -135,9 +149,32 @@ class MLXPipeline(LLM):
                "Please install it with `pip install mlx_lm`."
            )

-        pipeline_kwargs = kwargs.get("pipeline_kwargs", {})
+        pipeline_kwargs = kwargs.get("pipeline_kwargs", self.pipeline_kwargs)

-        return generate(self.model, self.tokenizer, prompt=prompt, **pipeline_kwargs)
+        temp: float = pipeline_kwargs.get("temp", 0.0)
+        max_tokens: int = pipeline_kwargs.get("max_tokens", 100)
+        verbose: bool = pipeline_kwargs.get("verbose", False)
+        formatter: Optional[Callable] = pipeline_kwargs.get("formatter", None)
+        repetition_penalty: Optional[float] = pipeline_kwargs.get(
+            "repetition_penalty", None
+        )
+        repetition_context_size: Optional[int] = pipeline_kwargs.get(
+            "repetition_context_size", None
+        )
+        top_p: float = pipeline_kwargs.get("top_p", 1.0)
+
+        return generate(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            prompt=prompt,
+            temp=temp,
+            max_tokens=max_tokens,
+            verbose=verbose,
+            formatter=formatter,
+            repetition_penalty=repetition_penalty,
+            repetition_context_size=repetition_context_size,
+            top_p=top_p,
+        )

    def _stream(
        self,
@ -166,26 +203,32 @@ class MLXPipeline(LLM):
        repetition_context_size: Optional[int] = pipeline_kwargs.get(
            "repetition_context_size", None
        )
+        top_p: float = pipeline_kwargs.get("top_p", 1.0)

        prompt = self.tokenizer.encode(prompt, return_tensors="np")

        prompt_tokens = mx.array(prompt[0])

        eos_token_id = self.tokenizer.eos_token_id
+        detokenizer = self.tokenizer.detokenizer
+        detokenizer.reset()

        for (token, prob), n in zip(
            generate_step(
-                prompt_tokens,
-                self.model,
-                temp,
-                repetition_penalty,
-                repetition_context_size,
+                prompt=prompt_tokens,
+                model=self.model,
+                temp=temp,
+                repetition_penalty=repetition_penalty,
+                repetition_context_size=repetition_context_size,
+                top_p=top_p,
            ),
            range(max_new_tokens),
        ):
            # identify text to yield
            text: Optional[str] = None
-            text = self.tokenizer.decode(token.item())
+            detokenizer.add_token(token)
+            detokenizer.finalize()
+            text = detokenizer.last_segment

            # yield text, if any
            if text: