From ce61840e3b25d48a2f5088b0887efc4e6f903439 Mon Sep 17 00:00:00 2001
From: Jiayi Ni <105399924+jiayini1119@users.noreply.github.com>
Date: Mon, 18 Sep 2023 23:36:29 +0800
Subject: [PATCH] ENH: Add `llm_kwargs` for Xinference LLMs (#10354)

- This PR adds `llm_kwargs` to the initialization of Xinference LLMs
  (integrated in #8171).
- With this enhancement, users can provide `generate_config` not only when
  calling the LLMs for generation but also during initialization. This allows
  users to include custom configurations when using LangChain features such as
  LLMChain.
- It also fixes some formatting issues in the docstrings.
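As an illustration of the intended usage (a minimal sketch: the model UID is a
placeholder for the UID returned by `xinference launch`, and `max_tokens` /
`temperature` are example generation settings):

.. code-block:: python

    from langchain.chains import LLMChain
    from langchain.llms import Xinference
    from langchain.prompts import PromptTemplate

    # Generation defaults can now be supplied at construction time; they are
    # merged into generate_config on every call.
    llm = Xinference(
        server_url="http://0.0.0.0:9997",
        model_uid="<model_uid>",  # placeholder
        max_tokens=512,
        temperature=0.7,
    )

    prompt = PromptTemplate(
        template="Q: What can we visit in the capital of {country}? A:",
        input_variables=["country"],
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    print(chain.run(country="France"))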
---
 .../integrations/providers/xinference.mdx     |  4 +-
 .../langchain/embeddings/xinference.py        | 22 +++++--
 libs/langchain/langchain/llms/xinference.py   | 57 +++++++++++++------
 3 files changed, 57 insertions(+), 26 deletions(-)

diff --git a/docs/extras/integrations/providers/xinference.mdx b/docs/extras/integrations/providers/xinference.mdx
index 3b1d57725e4..41a7f44bbc8 100644
--- a/docs/extras/integrations/providers/xinference.mdx
+++ b/docs/extras/integrations/providers/xinference.mdx
@@ -93,10 +93,10 @@ llm(
 ### Usage
 
 For more information and detailed examples, refer to the
-[example notebook for xinference](../modules/models/llms/integrations/xinference.ipynb)
+[example for xinference LLMs](/docs/integrations/llms/xinference.html)
 
 ### Embeddings
 
 Xinference also supports embedding queries and documents. See
-[example notebook for xinference embeddings](../modules/data_connection/text_embedding/integrations/xinference.ipynb)
+[example for xinference embeddings](/docs/integrations/text_embedding/xinference.html)
 for a more detailed demo.
\ No newline at end of file

diff --git a/libs/langchain/langchain/embeddings/xinference.py b/libs/langchain/langchain/embeddings/xinference.py
index 8be92fc64c4..1609f2b537d 100644
--- a/libs/langchain/langchain/embeddings/xinference.py
+++ b/libs/langchain/langchain/embeddings/xinference.py
@@ -8,37 +8,47 @@ class XinferenceEmbeddings(Embeddings):
     """Wrapper around xinference embedding models.
     To use, you should have the xinference library installed:
+
     .. code-block:: bash
         pip install xinference
     Check out: https://github.com/xorbitsai/inference
-    To run, you need to start a Xinference supervisor on one server and Xinference workers on the other servers
+    To run, you need to start a Xinference supervisor on one server and Xinference workers on the other servers.
+
     Example:
         To start a local instance of Xinference, run
-        .. code-block:: bash
-            $ xinference
+        .. code-block:: bash
+
+            $ xinference
+
         You can also deploy Xinference in a distributed cluster. Here are the steps:
+
         Starting the supervisor:
+
         .. code-block:: bash
-            $ xinference-supervisor
+            $ xinference-supervisor
+
         Starting the worker:
+
         .. code-block:: bash
-            $ xinference-worker
+            $ xinference-worker
     Then, launch a model using command line interface (CLI).
     Example:
+
     .. code-block:: bash
-        $ xinference launch -n orca -s 3 -q q4_0
+        $ xinference launch -n orca -s 3 -q q4_0
     It will return a model UID. Then you can use Xinference Embedding with LangChain.
     Example:
+
     .. code-block:: python
         from langchain.embeddings import XinferenceEmbeddings
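For context, a minimal sketch of using the embedding wrapper touched above
(illustrative only; the server URL and model UID are placeholders, and
`embed_query` / `embed_documents` come from LangChain's standard `Embeddings`
interface):

.. code-block:: python

    from langchain.embeddings import XinferenceEmbeddings

    xinference = XinferenceEmbeddings(
        server_url="http://0.0.0.0:9997",
        model_uid="<model_uid>",  # placeholder: UID returned by `xinference launch`
    )

    query_vector = xinference.embed_query("What is the capital of France?")
    doc_vectors = xinference.embed_documents(["document one", "document two"])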
diff --git a/libs/langchain/langchain/llms/xinference.py b/libs/langchain/langchain/llms/xinference.py
index faf0ef0258e..3b16838da1a 100644
--- a/libs/langchain/langchain/llms/xinference.py
+++ b/libs/langchain/langchain/llms/xinference.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Generator, List, Mapping, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Mapping, Optional, Union
 
 from langchain.callbacks.manager import CallbackManagerForLLMRun
 from langchain.llms.base import LLM
@@ -11,55 +11,65 @@ if TYPE_CHECKING:
 class Xinference(LLM):
     """Wrapper for accessing Xinference's large-scale model inference service.
     To use, you should have the xinference library installed:
+
     .. code-block:: bash
-        pip install "xinference[all]"
+        pip install "xinference[all]"
     Check out: https://github.com/xorbitsai/inference
     To run, you need to start a Xinference supervisor on one server and Xinference workers on the other servers
+
     Example:
         To start a local instance of Xinference, run
-        .. code-block:: bash
-            $ xinference
+        .. code-block:: bash
+
+            $ xinference
         You can also deploy Xinference in a distributed cluster. Here are the steps:
+
         Starting the supervisor:
+
         .. code-block:: bash
-            $ xinference-supervisor
+            $ xinference-supervisor
         Starting the worker:
+
         .. code-block:: bash
-            $ xinference-worker
+            $ xinference-worker
     Then, launch a model using command line interface (CLI).
     Example:
+
     .. code-block:: bash
-        $ xinference launch -n orca -s 3 -q q4_0
+        $ xinference launch -n orca -s 3 -q q4_0
     It will return a model UID. Then, you can use Xinference with LangChain.
     Example:
-    .. code-block:: python
-        from langchain.llms import Xinference
+    .. code-block:: python
 
-        llm = Xinference(
-            server_url="http://0.0.0.0:9997",
-            model_uid = {model_uid} # replace model_uid with the model UID return from launching the model
-        )
+        from langchain.llms import Xinference
 
-        llm(
-            prompt="Q: where can we visit in the capital of France? A:",
-            generate_config={"max_tokens": 1024, "stream": True},
-        )
+        llm = Xinference(
+            server_url="http://0.0.0.0:9997",
+            model_uid = {model_uid}  # replace model_uid with the model UID returned from launching the model
+        )
+
+        llm(
+            prompt="Q: where can we visit in the capital of France? A:",
+            generate_config={"max_tokens": 1024, "stream": True},
+        )
 
     To view all the supported builtin models, run:
+
     .. code-block:: bash
+
         $ xinference list --all
     """  # noqa: E501
@@ -69,9 +79,14 @@ class Xinference(LLM):
     """URL of the xinference server"""
     model_uid: Optional[str]
     """UID of the launched model"""
+    model_kwargs: Dict[str, Any]
+    """Keyword arguments to be passed to xinference.LLM"""
 
     def __init__(
-        self, server_url: Optional[str] = None, model_uid: Optional[str] = None
+        self,
+        server_url: Optional[str] = None,
+        model_uid: Optional[str] = None,
+        **model_kwargs: Any,
     ):
         try:
             from xinference.client import RESTfulClient
@@ -81,10 +96,13 @@ class Xinference(LLM):
                 " with `pip install xinference`."
             ) from e
 
+        model_kwargs = model_kwargs or {}
+
         super().__init__(
             **{
                 "server_url": server_url,
                 "model_uid": model_uid,
+                "model_kwargs": model_kwargs,
             }
         )
 
@@ -107,6 +125,7 @@ class Xinference(LLM):
         return {
             **{"server_url": self.server_url},
             **{"model_uid": self.model_uid},
+            **{"model_kwargs": self.model_kwargs},
         }
 
     def _call(
@@ -131,6 +150,8 @@ class Xinference(LLM):
         generate_config: "LlamaCppGenerateConfig" = kwargs.get("generate_config", {})
 
+        generate_config = {**self.model_kwargs, **generate_config}
+
         if stop:
             generate_config["stop"] = stop
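Note on the merge above (illustrative sketch; the model UID is a placeholder):
because the call-time `generate_config` is unpacked last, its keys override any
defaults supplied at construction time.

.. code-block:: python

    from langchain.llms import Xinference

    llm = Xinference(
        server_url="http://0.0.0.0:9997",
        model_uid="<model_uid>",  # placeholder
        max_tokens=256,           # constructor-level default, stored in model_kwargs
    )

    # model_kwargs is merged first, then the call-time generate_config,
    # so this call runs with max_tokens=1024.
    llm(
        prompt="Q: where can we visit in the capital of France? A:",
        generate_config={"max_tokens": 1024},
    )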