Mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-15 23:57:21 +00:00)
enable the device_map parameter in huggingface pipeline (#12731)
### Enabling `device_map` in HuggingFacePipeline

For multi-GPU settings with large models, the [accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate) library provides the `device_map` parameter to automatically distribute the model across GPUs / disk. The Transformers pipeline (`src/transformers/pipelines/__init__.py`, L543 at commit `3520e37e86`) lets users specify either `device` or `device_map`, and handles the case (with warnings) where both are given. However, LangChain's `HuggingFacePipeline` only supports specifying `device` when calling transformers, which limits large-model and multi-GPU use cases. Additionally, the default value of `device` (`libs/langchain/langchain/llms/huggingface_pipeline.py`, L72 at commit `8bd3ce59cd`) is `-1`, which is incompatible with the transformers pipeline when `device_map` is specified.

This PR adds `device_map` as a parameter and resolves the incompatibility of `device = -1` when `device_map` is also specified. A test has been added for the new feature. Some existing tests also had to be updated, because:

1. `max_new_tokens` has to be specified under `pipeline_kwargs`, not `model_kwargs`.
2. The GPT2 tokenizer raises `ValueError: Pipeline with tokenizer without pad_token cannot do batching`, since `tokenizer.pad_token` is `None` ([related issue](https://github.com/huggingface/transformers/issues/19853) on the transformers repo).

This PR fixes those tests as well.

Co-authored-by: Praveen Venkateswaran <praveen.venkateswaran@ibm.com>
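As a quick illustration of the resulting API, here is a minimal usage sketch mirroring the integration test added in this commit. It assumes the `transformers` and `accelerate` packages are installed; `gpt2` is only a small stand-in for the large models `device_map` is actually aimed at.

```python
# Minimal sketch, not part of the diff: load a model through the new
# device_map parameter and generate a short completion.
# Assumes `transformers` and `accelerate` are installed; gpt2 is a stand-in.
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    device_map="auto",                       # let accelerate place weights across GPUs / CPU / disk
    pipeline_kwargs={"max_new_tokens": 10},  # generation options go under pipeline_kwargs
)
print(llm("Say foo:"))
```

Note that generation options such as `max_new_tokens` belong under `pipeline_kwargs` rather than `model_kwargs`, which is also why the existing tests below were updated.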
This commit is contained in:
parent 3276aa3e17
commit 21eeba075c
libs/langchain/langchain/llms/huggingface_pipeline.py

```diff
@@ -70,6 +70,7 @@ class HuggingFacePipeline(BaseLLM):
         model_id: str,
         task: str,
         device: Optional[int] = -1,
+        device_map: Optional[str] = None,
         model_kwargs: Optional[dict] = None,
         pipeline_kwargs: Optional[dict] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
@@ -108,6 +109,9 @@ class HuggingFacePipeline(BaseLLM):
                 f"Could not load the {task} model due to missing dependencies."
             ) from e
 
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token_id = model.config.eos_token_id
+
         if (
             getattr(model, "is_loaded_in_4bit", False)
             or getattr(model, "is_loaded_in_8bit", False)
@@ -129,7 +133,9 @@ class HuggingFacePipeline(BaseLLM):
                     f"Got device=={device}, "
                     f"device is required to be within [-1, {cuda_device_count})"
                 )
-            if device < 0 and cuda_device_count > 0:
+            if device_map is not None and device < 0:
+                device = None
+            if device is not None and device < 0 and cuda_device_count > 0:
                 logger.warning(
                     "Device has %d GPUs available. "
                     "Provide device={deviceId} to `from_model_id` to use available"
@@ -147,6 +153,7 @@ class HuggingFacePipeline(BaseLLM):
             model=model,
             tokenizer=tokenizer,
             device=device,
+            device_map=device_map,
             batch_size=batch_size,
             model_kwargs=_model_kwargs,
             **_pipeline_kwargs,
```
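The core behavioral change in the hunks above is that the default `device=-1` is converted to `None` whenever a `device_map` is supplied, so the transformers pipeline does not receive a conflicting CPU device alongside the map. A rough sketch of the call that `from_model_id` effectively forwards in that case (assumptions: `transformers` and `accelerate` installed, `gpt2` as a stand-in model):

```python
# Rough sketch only, not the library's actual code path: the equivalent
# transformers call once device has been normalized to None.
# Assumes `transformers` and `accelerate` are installed; gpt2 is a stand-in.
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model="gpt2",
    device=None,        # default device=-1 is dropped so it cannot conflict with device_map
    device_map="auto",  # accelerate shards the weights across available GPUs / CPU / disk
    max_new_tokens=10,
)
print(pipe("Say foo:")[0]["generated_text"])
```

The added `tokenizer.pad_token` fallback covers tokenizers, such as GPT2's, that ship without a pad token and would otherwise break batched pipelines, as noted in the description.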
libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py (16 lines changed, Normal file → Executable file)
```diff
@@ -10,7 +10,7 @@ from tests.integration_tests.llms.utils import assert_llm_equality
 def test_huggingface_pipeline_text_generation() -> None:
     """Test valid call to HuggingFace text generation model."""
     llm = HuggingFacePipeline.from_model_id(
-        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
     )
     output = llm("Say foo:")
     assert isinstance(output, str)
@@ -25,6 +25,18 @@ def test_huggingface_pipeline_text2text_generation() -> None:
     assert isinstance(output, str)
 
 
+def test_huggingface_pipeline_device_map() -> None:
+    """Test pipelines specifying the device map parameter."""
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="gpt2",
+        task="text-generation",
+        device_map="auto",
+        pipeline_kwargs={"max_new_tokens": 10},
+    )
+    output = llm("Say foo:")
+    assert isinstance(output, str)
+
+
 def text_huggingface_pipeline_summarization() -> None:
     """Test valid call to HuggingFace summarization model."""
     llm = HuggingFacePipeline.from_model_id(
@@ -37,7 +49,7 @@ def text_huggingface_pipeline_summarization() -> None:
 def test_saving_loading_llm(tmp_path: Path) -> None:
     """Test saving/loading an HuggingFaceHub LLM."""
     llm = HuggingFacePipeline.from_model_id(
-        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
     )
     llm.save(file_path=tmp_path / "hf.yaml")
     loaded_llm = load_llm(tmp_path / "hf.yaml")
```