ENH: Add llm_kwargs for Xinference LLMs (#10354)
- This PR adds `llm_kwargs` to the initialization of Xinference LLMs (integrated in #8171); see the sketch below.
- With this enhancement, users can provide `generate_config` options not only when calling the LLMs for generation but also during initialization. This allows users to include custom configurations when using LangChain features such as LLMChain.
- It also fixes some formatting issues in the docstrings.
parent 1eefb9052b
commit ce61840e3b
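As a quick illustration of the change described above, here is a minimal sketch (not part of the commit itself). The server URL and model UID placeholders are taken from the docstring examples below; the specific generation options (`max_tokens`, `temperature`) are assumed to be valid `generate_config` keys.

```python
from langchain.chains import LLMChain
from langchain.llms import Xinference
from langchain.prompts import PromptTemplate

# Options passed at initialization are stored in model_kwargs and merged
# into generate_config on every call, so they also apply inside LLMChain.
llm = Xinference(
    server_url="http://0.0.0.0:9997",
    model_uid="{model_uid}",  # replace with the UID returned by `xinference launch`
    max_tokens=1024,          # illustrative init-time generation options
    temperature=0.7,
)

prompt = PromptTemplate(
    input_variables=["city"],
    template="Q: where can we visit in {city}? A:",
)
chain = LLMChain(llm=llm, prompt=prompt)
print(chain.run(city="the capital of France"))
```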
@@ -93,10 +93,10 @@ llm(
### Usage

For more information and detailed examples, refer to the
[example notebook for xinference](../modules/models/llms/integrations/xinference.ipynb)
[example for xinference LLMs](/docs/integrations/llms/xinference.html)

### Embeddings

Xinference also supports embedding queries and documents. See
[example notebook for xinference embeddings](../modules/data_connection/text_embedding/integrations/xinference.ipynb)
[example for xinference embeddings](/docs/integrations/text_embedding/xinference.html)
for a more detailed demo.
@@ -8,37 +8,47 @@ class XinferenceEmbeddings(Embeddings):

"""Wrapper around xinference embedding models.
To use, you should have the xinference library installed:

.. code-block:: bash

pip install xinference

Check out: https://github.com/xorbitsai/inference
To run, you need to start a Xinference supervisor on one server and Xinference workers on the other servers.

Example:
To start a local instance of Xinference, run

.. code-block:: bash

$ xinference

You can also deploy Xinference in a distributed cluster. Here are the steps:

Starting the supervisor:

.. code-block:: bash

$ xinference-supervisor

Starting the worker:

.. code-block:: bash

$ xinference-worker

Then, launch a model using command line interface (CLI).

Example:

.. code-block:: bash

$ xinference launch -n orca -s 3 -q q4_0

It will return a model UID. Then you can use Xinference Embedding with LangChain.

Example:

.. code-block:: python

from langchain.embeddings import XinferenceEmbeddings
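For orientation (not part of this diff), a minimal usage sketch of the embeddings wrapper documented above; the server URL mirrors the docstring examples and the model UID is a placeholder.

```python
from langchain.embeddings import XinferenceEmbeddings

# Point the wrapper at a running Xinference server and a launched model.
xinference = XinferenceEmbeddings(
    server_url="http://0.0.0.0:9997",
    model_uid="{model_uid}",  # replace with the UID returned by `xinference launch`
)

# Standard LangChain Embeddings interface.
query_vector = xinference.embed_query("What is the capital of France?")
doc_vectors = xinference.embed_documents(["Paris is the capital of France."])
```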
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Any, Generator, List, Mapping, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Mapping, Optional, Union

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
@@ -11,55 +11,65 @@ if TYPE_CHECKING:
class Xinference(LLM):
"""Wrapper for accessing Xinference's large-scale model inference service.
To use, you should have the xinference library installed:

.. code-block:: bash

pip install "xinference[all]"

Check out: https://github.com/xorbitsai/inference
To run, you need to start a Xinference supervisor on one server and Xinference workers on the other servers

Example:
To start a local instance of Xinference, run

.. code-block:: bash

$ xinference

You can also deploy Xinference in a distributed cluster. Here are the steps:

Starting the supervisor:

.. code-block:: bash

$ xinference-supervisor

Starting the worker:

.. code-block:: bash

$ xinference-worker

Then, launch a model using command line interface (CLI).

Example:

.. code-block:: bash

$ xinference launch -n orca -s 3 -q q4_0

It will return a model UID. Then, you can use Xinference with LangChain.

Example:

.. code-block:: python

from langchain.llms import Xinference

llm = Xinference(
server_url="http://0.0.0.0:9997",
model_uid = {model_uid} # replace model_uid with the model UID return from launching the model
)

llm(
prompt="Q: where can we visit in the capital of France? A:",
generate_config={"max_tokens": 1024, "stream": True},
)

To view all the supported builtin models, run:

.. code-block:: bash

$ xinference list --all

""" # noqa: E501
@@ -69,9 +79,14 @@ class Xinference(LLM):
    """URL of the xinference server"""
    model_uid: Optional[str]
    """UID of the launched model"""
    model_kwargs: Dict[str, Any]
    """Key word arguments to be passed to xinference.LLM"""

    def __init__(
        self, server_url: Optional[str] = None, model_uid: Optional[str] = None
        self,
        server_url: Optional[str] = None,
        model_uid: Optional[str] = None,
        **model_kwargs: Any,
    ):
        try:
            from xinference.client import RESTfulClient
@@ -81,10 +96,13 @@ class Xinference(LLM):
                " with `pip install xinference`."
            ) from e

        model_kwargs = model_kwargs or {}

        super().__init__(
            **{
                "server_url": server_url,
                "model_uid": model_uid,
                "model_kwargs": model_kwargs,
            }
        )
@@ -107,6 +125,7 @@ class Xinference(LLM):
        return {
            **{"server_url": self.server_url},
            **{"model_uid": self.model_uid},
            **{"model_kwargs": self.model_kwargs},
        }

    def _call(
@@ -131,6 +150,8 @@ class Xinference(LLM):

        generate_config: "LlamaCppGenerateConfig" = kwargs.get("generate_config", {})

        generate_config = {**self.model_kwargs, **generate_config}

        if stop:
            generate_config["stop"] = stop
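A small sketch (not part of the diff) of how the init-time kwargs and the call-time `generate_config` merge; because the call-time dict is unpacked last, its keys take precedence.

```python
# Values captured at initialization (self.model_kwargs) ...
model_kwargs = {"max_tokens": 512, "temperature": 0.2}
# ... merged with the generate_config passed for a single call.
generate_config = {"max_tokens": 1024, "stream": True}

merged = {**model_kwargs, **generate_config}
# merged == {"max_tokens": 1024, "temperature": 0.2, "stream": True}
```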