feat(llms): add support for vLLM (#8806)

Hello langchain maintainers, this PR integrates [vllm](https://vllm.readthedocs.io/en/latest/#) into langchain and closes #8729. The feature depends on the `vllm` package, but I've seen that other models supported here also depend on packages that are not listed in pyproject.toml (e.g. `gpt4all`, `text-generation`), so I assumed `vllm` should be left out of it as well. @hwchase17, @baskaryan --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2025-09-01 11:02:37 +00:00 · 2023-08-07 16:32:02 +02:00
parent 100d9ce4c7
commit a616e19975
3 changed files with 322 additions and 0 deletions
--- a/libs/langchain/langchain/llms/__init__.py
+++ b/libs/langchain/langchain/llms/__init__.py
@@ -76,6 +76,7 @@ from langchain.llms.stochasticai import StochasticAI
 from langchain.llms.textgen import TextGen
 from langchain.llms.tongyi import Tongyi
 from langchain.llms.vertexai import VertexAI
+from langchain.llms.vllm import VLLM
 from langchain.llms.writer import Writer
 from langchain.llms.xinference import Xinference

@@ -139,6 +140,7 @@ __all__ = [
    "StochasticAI",
    "Tongyi",
    "VertexAI",
+    "VLLM",
    "Writer",
    "OctoAIEndpoint",
    "Xinference",
@@ -198,6 +200,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
    "vertexai": VertexAI,
    "openllm": OpenLLM,
    "openllm_client": OpenLLM,
+    "vllm": VLLM,
    "writer": Writer,
    "xinference": Xinference,
 }
--- a/libs/langchain/langchain/llms/vllm.py
+++ b/libs/langchain/langchain/llms/vllm.py
@@ -0,0 +1,123 @@
+from typing import Any, Dict, List, Optional
+
+from pydantic import root_validator
+
+from langchain.callbacks.manager import CallbackManagerForLLMRun
+from langchain.llms.base import BaseLLM
+from langchain.schema.output import Generation, LLMResult
+
+
+class VLLM(BaseLLM):
+    """Wrapper around the `vllm` package's `LLM` class and `SamplingParams`."""
+
+    model: str = ""
+    """The name or path of a HuggingFace Transformers model."""
+
+    tensor_parallel_size: Optional[int] = 1
+    """The number of GPUs to use for distributed execution with tensor parallelism."""
+
+    trust_remote_code: Optional[bool] = False
+    """Trust remote code (e.g., from HuggingFace) when downloading the model
+    and tokenizer."""
+
+    n: int = 1
+    """Number of output sequences to return for the given prompt."""
+
+    best_of: Optional[int] = None
+    """Number of output sequences that are generated from the prompt."""
+
+    presence_penalty: float = 0.0
+    """Float that penalizes new tokens based on whether they appear in the
+    generated text so far"""
+
+    frequency_penalty: float = 0.0
+    """Float that penalizes new tokens based on their frequency in the
+    generated text so far"""
+
+    temperature: float = 1.0
+    """Float that controls the randomness of the sampling."""
+
+    top_p: float = 1.0
+    """Float that controls the cumulative probability of the top tokens to consider."""
+
+    top_k: int = -1
+    """Integer that controls the number of top tokens to consider."""
+
+    use_beam_search: bool = False
+    """Whether to use beam search instead of sampling."""
+
+    stop: Optional[List[str]] = None
+    """List of strings that stop the generation when they are generated."""
+
+    ignore_eos: bool = False
+    """Whether to ignore the EOS token and continue generating tokens after
+    the EOS token is generated."""
+
+    max_new_tokens: int = 512
+    """Maximum number of tokens to generate per output sequence."""
+
+    client: Any  #: :meta private:
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Import `vllm` and store an `LLM` built from the model fields as `client`.
+
+        Raises ImportError if the `vllm` package cannot be imported."""
+        try:
+            from vllm import LLM as VLLModel
+        except ImportError as exc:
+            raise ImportError(
+                "Could not import vllm python package. "
+                "Please install it with `pip install vllm`."
+            ) from exc
+        values["client"] = VLLModel(
+            model=values["model"],
+            tensor_parallel_size=values["tensor_parallel_size"],
+            trust_remote_code=values["trust_remote_code"],
+        )
+        return values
+
+    @property
+    def _default_params(self) -> Dict[str, Any]:
+        """Instance-level sampling settings keyed by `SamplingParams` argument name."""
+        return {
+            "n": self.n,
+            "best_of": self.best_of,
+            "max_tokens": self.max_new_tokens,
+            "top_k": self.top_k,
+            "top_p": self.top_p,
+            "temperature": self.temperature,
+            "presence_penalty": self.presence_penalty,
+            "frequency_penalty": self.frequency_penalty,
+            "stop": self.stop,
+            "ignore_eos": self.ignore_eos,
+            "use_beam_search": self.use_beam_search,
+        }
+
+    def _generate(
+        self,
+        prompts: List[str],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> LLMResult:
+        """Generate text for every prompt in one `client.generate` call.
+
+        A non-None `stop` replaces the instance's `stop`; `kwargs` override the
+        other sampling defaults; `run_manager` is unused. Returns one list per
+        prompt with a `Generation` for each output sequence."""
+        from vllm import SamplingParams
+
+        params = {**self._default_params, **kwargs}
+        if stop is not None:  # an omitted stop must not erase self.stop
+            params["stop"] = stop
+        outputs = self.client.generate(prompts, SamplingParams(**params))
+        generations = [
+            [Generation(text=seq.text) for seq in out.outputs] for out in outputs
+        ]
+        return LLMResult(generations=generations)
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "vllm"