Mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-15 23:57:21 +00:00)
enable the device_map parameter in huggingface pipeline (#12731)
### Enabling `device_map` in HuggingFacePipeline

For multi-GPU settings with large models, the [accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate) library provides the `device_map` parameter to automatically distribute the model across GPUs / disk. The Transformers pipeline (`src/transformers/pipelines/__init__.py`, L543 at commit `3520e37e86`) lets users specify either `device` or `device_map`, and handles the case (with warnings) where both are given. However, LangChain's `HuggingFacePipeline` only supports specifying `device` when calling transformers, which limits large-model and multi-GPU use cases. Additionally, the default value of `device` (`libs/langchain/langchain/llms/huggingface_pipeline.py`, L72 at commit `8bd3ce59cd`) is `-1`, which is incompatible with the transformers pipeline when `device_map` is specified.

This PR adds `device_map` as a parameter and resolves the incompatibility of `device = -1` when `device_map` is also specified. A test has been added for the new feature. Some existing tests also had to be updated, because:

1. `max_new_tokens` has to be specified under `pipeline_kwargs`, not `model_kwargs`.
2. The GPT2 tokenizer raises `ValueError: Pipeline with tokenizer without pad_token cannot do batching`, since `tokenizer.pad_token` is `None` ([related issue](https://github.com/huggingface/transformers/issues/19853) on the transformers repo).

This PR fixes those tests as well.

Co-authored-by: Praveen Venkateswaran <praveen.venkateswaran@ibm.com>
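As a quick illustration of the resulting API, here is a minimal usage sketch mirroring the integration test added in this commit. It assumes the `transformers` and `accelerate` packages are installed; `gpt2` is only a small stand-in for the large models `device_map` is actually aimed at.

```python
# Minimal sketch, not part of the diff: load a model through the new
# device_map parameter and generate a short completion.
# Assumes `transformers` and `accelerate` are installed; gpt2 is a stand-in.
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    device_map="auto",                       # let accelerate place weights across GPUs / CPU / disk
    pipeline_kwargs={"max_new_tokens": 10},  # generation options go under pipeline_kwargs
)
print(llm("Say foo:"))
```

Note that generation options such as `max_new_tokens` belong under `pipeline_kwargs` rather than `model_kwargs`, which is also why the existing tests below were updated.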
This commit is contained in:
parent 3276aa3e17
commit 21eeba075c
libs/langchain/langchain/llms/huggingface_pipeline.py

```diff
@@ -70,6 +70,7 @@ class HuggingFacePipeline(BaseLLM):
         model_id: str,
         task: str,
         device: Optional[int] = -1,
+        device_map: Optional[str] = None,
         model_kwargs: Optional[dict] = None,
         pipeline_kwargs: Optional[dict] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
@@ -108,6 +109,9 @@ class HuggingFacePipeline(BaseLLM):
                 f"Could not load the {task} model due to missing dependencies."
             ) from e
 
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token_id = model.config.eos_token_id
+
         if (
             getattr(model, "is_loaded_in_4bit", False)
             or getattr(model, "is_loaded_in_8bit", False)
@@ -129,7 +133,9 @@ class HuggingFacePipeline(BaseLLM):
                     f"Got device=={device}, "
                     f"device is required to be within [-1, {cuda_device_count})"
                 )
-            if device < 0 and cuda_device_count > 0:
+            if device_map is not None and device < 0:
+                device = None
+            if device is not None and device < 0 and cuda_device_count > 0:
                 logger.warning(
                     "Device has %d GPUs available. "
                     "Provide device={deviceId} to `from_model_id` to use available"
@@ -147,6 +153,7 @@ class HuggingFacePipeline(BaseLLM):
             model=model,
             tokenizer=tokenizer,
             device=device,
+            device_map=device_map,
             batch_size=batch_size,
             model_kwargs=_model_kwargs,
             **_pipeline_kwargs,
```
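The core behavioral change in the hunks above is that the default `device=-1` is converted to `None` whenever a `device_map` is supplied, so the transformers pipeline does not receive a conflicting CPU device alongside the map. A rough sketch of the call that `from_model_id` effectively forwards in that case (assumptions: `transformers` and `accelerate` installed, `gpt2` as a stand-in model):

```python
# Rough sketch only, not the library's actual code path: the equivalent
# transformers call once device has been normalized to None.
# Assumes `transformers` and `accelerate` are installed; gpt2 is a stand-in.
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model="gpt2",
    device=None,        # default device=-1 is dropped so it cannot conflict with device_map
    device_map="auto",  # accelerate shards the weights across available GPUs / CPU / disk
    max_new_tokens=10,
)
print(pipe("Say foo:")[0]["generated_text"])
```

The added `tokenizer.pad_token` fallback covers tokenizers, such as GPT2's, that ship without a pad token and would otherwise break batched pipelines, as noted in the description.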
libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py (16 lines changed, Normal file → Executable file)
```diff
@@ -10,7 +10,7 @@ from tests.integration_tests.llms.utils import assert_llm_equality
 def test_huggingface_pipeline_text_generation() -> None:
     """Test valid call to HuggingFace text generation model."""
     llm = HuggingFacePipeline.from_model_id(
-        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
     )
     output = llm("Say foo:")
     assert isinstance(output, str)
@@ -25,6 +25,18 @@ def test_huggingface_pipeline_text2text_generation() -> None:
     assert isinstance(output, str)
 
 
+def test_huggingface_pipeline_device_map() -> None:
+    """Test pipelines specifying the device map parameter."""
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="gpt2",
+        task="text-generation",
+        device_map="auto",
+        pipeline_kwargs={"max_new_tokens": 10},
+    )
+    output = llm("Say foo:")
+    assert isinstance(output, str)
+
+
 def text_huggingface_pipeline_summarization() -> None:
     """Test valid call to HuggingFace summarization model."""
     llm = HuggingFacePipeline.from_model_id(
@@ -37,7 +49,7 @@ def text_huggingface_pipeline_summarization() -> None:
 def test_saving_loading_llm(tmp_path: Path) -> None:
     """Test saving/loading an HuggingFaceHub LLM."""
     llm = HuggingFacePipeline.from_model_id(
-        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
     )
     llm.save(file_path=tmp_path / "hf.yaml")
     loaded_llm = load_llm(tmp_path / "hf.yaml")
```