New LLM integration: CTranslate2 (#10400)

## Description:

I've integrated CTranslate2 with LangChain. CTranslate2 is an increasingly
popular library for efficient inference with Transformer models that
compares favorably to alternatives such as HF Text Generation Inference
and vLLM in
[benchmarks](https://hamel.dev/notes/llm/inference/03_inference.html).
Commit 675d57df50 by eryk-dsai (committed via GitHub), 2023-09-09 22:19:00 +02:00, parent ddd07001f3.
3 changed files with 371 additions and 0 deletions.
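For context, here's a minimal usage sketch of the new integration. The model directory, tokenizer name, and prompt are hypothetical, and it assumes the weights were already converted to the CTranslate2 format:

```python
from langchain.llms import CTranslate2

# Hypothetical paths/names for illustration. Assumes the Hugging Face model
# was first converted to the CTranslate2 format, e.g. with:
#   ct2-transformers-converter --model meta-llama/Llama-2-7b-hf \
#       --quantization bfloat16 --output_dir ./llama-2-7b-ct2
llm = CTranslate2(
    model_path="./llama-2-7b-ct2",
    tokenizer_name="meta-llama/Llama-2-7b-hf",
    device="cuda",
    compute_type="bfloat16",
    max_length=128,
)

print(llm("He presented me with plausible evidence for the existence of unicorns: "))
```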


`langchain/llms/__init__.py`:

```diff
@@ -37,6 +37,7 @@ from langchain.llms.chatglm import ChatGLM
 from langchain.llms.clarifai import Clarifai
 from langchain.llms.cohere import Cohere
 from langchain.llms.ctransformers import CTransformers
+from langchain.llms.ctranslate2 import CTranslate2
 from langchain.llms.databricks import Databricks
 from langchain.llms.deepinfra import DeepInfra
 from langchain.llms.deepsparse import DeepSparse
@@ -100,6 +101,7 @@ __all__ = [
     "Beam",
     "Bedrock",
     "CTransformers",
+    "CTranslate2",
     "CerebriumAI",
     "ChatGLM",
     "Clarifai",
@@ -178,6 +180,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
     "clarifai": Clarifai,
     "cohere": Cohere,
     "ctransformers": CTransformers,
+    "ctranslate2": CTranslate2,
     "databricks": Databricks,
     "deepinfra": DeepInfra,
     "deepsparse": DeepSparse,
```


`langchain/llms/ctranslate2.py` (new file, 128 lines):

```python
from typing import Any, Dict, List, Optional, Union

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import BaseLLM
from langchain.pydantic_v1 import Field, root_validator
from langchain.schema.output import Generation, LLMResult


class CTranslate2(BaseLLM):
    """CTranslate2 language model."""

    model_path: str = ""
    """Path to the CTranslate2 model directory."""

    tokenizer_name: str = ""
    """Name of the original Hugging Face model needed to load the proper tokenizer."""

    device: str = "cpu"
    """Device to use (possible values are: cpu, cuda, auto)."""

    device_index: Union[int, List[int]] = 0
    """Device ID(s) on which to place this generator."""

    compute_type: Union[str, Dict[str, str]] = "default"
    """
    Model computation type or a dictionary mapping a device name to the
    computation type (possible values are: default, auto, int8, int8_float32,
    int8_float16, int8_bfloat16, int16, float16, bfloat16, float32).
    """

    max_length: int = 512
    """Maximum generation length."""

    sampling_topk: int = 1
    """Randomly sample predictions from the top K candidates."""

    sampling_topp: float = 1
    """Keep the most probable tokens whose cumulative probability exceeds this value."""

    sampling_temperature: float = 1
    """Sampling temperature to generate more random samples."""

    client: Any  #: :meta private:

    tokenizer: Any  #: :meta private:

    ctranslate2_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """
    Holds any model parameters valid for the `ctranslate2.Generator` call not
    explicitly specified.
    """

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the required python packages exist in the environment."""
        try:
            import ctranslate2
        except ImportError:
            raise ImportError(
                "Could not import ctranslate2 python package. "
                "Please install it with `pip install ctranslate2`."
            )
        try:
            import transformers
        except ImportError:
            raise ImportError(
                "Could not import transformers python package. "
                "Please install it with `pip install transformers`."
            )

        values["client"] = ctranslate2.Generator(
            model_path=values["model_path"],
            device=values["device"],
            device_index=values["device_index"],
            compute_type=values["compute_type"],
            **values["ctranslate2_kwargs"],
        )

        values["tokenizer"] = transformers.AutoTokenizer.from_pretrained(
            values["tokenizer_name"]
        )

        return values

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters."""
        return {
            "max_length": self.max_length,
            "sampling_topk": self.sampling_topk,
            "sampling_topp": self.sampling_topp,
            "sampling_temperature": self.sampling_temperature,
        }

    def _generate(
        self,
        prompts: List[str],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> LLMResult:
        # Build sampling parameters: call-time kwargs override the defaults.
        params = {**self._default_params, **kwargs}

        # Tokenize the prompts and convert token IDs to token strings, as
        # expected by `ctranslate2.Generator.generate_batch`.
        encoded_prompts = self.tokenizer(prompts)["input_ids"]
        tokenized_prompts = [
            self.tokenizer.convert_ids_to_tokens(encoded_prompt)
            for encoded_prompt in encoded_prompts
        ]

        # Generate for the whole batch, then decode the best sequence of each
        # result back into text. Note: `stop` sequences are accepted for
        # interface compatibility but are not applied here.
        results = self.client.generate_batch(tokenized_prompts, **params)

        sequences = [result.sequences_ids[0] for result in results]
        decoded_sequences = [self.tokenizer.decode(seq) for seq in sequences]

        generations = []
        for text in decoded_sequences:
            generations.append([Generation(text=text)])

        return LLMResult(generations=generations)

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "ctranslate2"
```