community: add Intel GPU support to ipex-llm llm integration (#22458)
**Description:** [IPEX-LLM](https://github.com/intel-analytics/ipex-llm) is a PyTorch library for running LLMs on Intel CPUs and GPUs (e.g., a local PC with an iGPU, or discrete GPUs such as Arc, Flex and Max) with very low latency. This PR adds Intel GPU support to the `ipex-llm` LLM integration.

**Dependencies:** `ipex-llm`

**Contribution maintainers:** @ivy-lv11 @Oscilloscope98

**Tests and docs:**
- Add: langchain/docs/docs/integrations/llms/ipex_llm_gpu.ipynb
- Update: langchain/docs/docs/integrations/llms/ipex_llm.ipynb
- Update: langchain/libs/community/tests/llms/test_ipex_llm.py

---------

Co-authored-by: ivy-lv11 <zhicunlv@gmail.com>
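For context, a minimal usage sketch of what this change enables (a sketch only: it assumes `ipex-llm` is installed and an Intel GPU is available, and the model id is a hypothetical example; the `"device"` key in `model_kwargs` is the new part):

```python
from langchain_community.llms import IpexLLM

# Load the model through ipex-llm and place it on an Intel GPU ("xpu").
# "lmsys/vicuna-7b-v1.5" is only an illustrative model id.
llm = IpexLLM.from_model_id(
    model_id="lmsys/vicuna-7b-v1.5",
    model_kwargs={"temperature": 0, "max_length": 64, "device": "xpu"},
)

print(llm.invoke("What is IPEX-LLM?"))
```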
@@ -139,6 +139,16 @@ class IpexLLM(LLM):
         kwargs = kwargs or {}

         _tokenizer_id = tokenizer_id or model_id
+        # Set "cpu" as default device
+        if "device" not in _model_kwargs:
+            _model_kwargs["device"] = "cpu"
+
+        if _model_kwargs["device"] not in ["cpu", "xpu"]:
+            raise ValueError(
+                "IpexLLM currently only supports device to be "
+                f"'cpu' or 'xpu', but you have: {_model_kwargs['device']}."
+            )
+        device = _model_kwargs.pop("device")

         try:
             tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
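The validation above is strict about device names: anything other than "cpu" or "xpu" is rejected before the model is loaded. A small sketch of the resulting behavior (hypothetical model id, assuming the check shown in this hunk):

```python
from langchain_community.llms import IpexLLM

try:
    # "cuda" is not a supported device for this integration, so this raises.
    IpexLLM.from_model_id(
        model_id="lmsys/vicuna-7b-v1.5",  # illustrative id only
        model_kwargs={"device": "cuda"},
    )
except ValueError as err:
    print(err)  # ...only supports device to be 'cpu' or 'xpu', but you have: cuda.
```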
@@ -186,6 +196,8 @@ class IpexLLM(LLM):
             model_kwargs=_model_kwargs,
         )

+        model.to(device)
+
         return cls(
             model_id=model_id,
             model=model,
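With `model.to(device)` in place, the loaded weights end up on the requested device before the instance is returned. A quick way to sanity-check placement (a sketch; it relies on the `model` field seen in the `cls(...)` call above and on the `llm` from the earlier example):

```python
# Standard PyTorch: every parameter reports the device it lives on.
param_device = next(llm.model.parameters()).device
print(param_device)  # expected "xpu:0" (or "cpu" for the default device)
```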
@@ -235,6 +247,7 @@ class IpexLLM(LLM):
             from transformers import TextStreamer

             input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
+            input_ids = input_ids.to(self.model.device)
             streamer = TextStreamer(
                 self.tokenizer, skip_prompt=True, skip_special_tokens=True
             )
@@ -261,6 +274,7 @@ class IpexLLM(LLM):
             return text
         else:
             input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
+            input_ids = input_ids.to(self.model.device)
             if stop is not None:
                 from transformers.generation.stopping_criteria import (
                     StoppingCriteriaList,
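Both the streaming and non-streaming branches now follow the standard PyTorch rule that input tensors must live on the same device as the model's weights; otherwise `generate()` fails with a device-mismatch error once the model sits on "xpu". A generic, IpexLLM-independent sketch of the pattern using plain transformers:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Small public model, used here only to illustrate device placement.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Tokenize on CPU, then move the tensor to wherever the model's weights live
# before calling generate(); mismatched devices raise a RuntimeError.
input_ids = tokenizer.encode("Hello", return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=16)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```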
@@ -13,12 +13,18 @@ skip_if_no_model_ids = pytest.mark.skipif(
     not model_ids_to_test, reason="TEST_IPEXLLM_MODEL_IDS environment variable not set."
 )
 model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")]  # type: ignore
+device = os.getenv("TEST_IPEXLLM_MODEL_DEVICE") or "cpu"


 def load_model(model_id: str) -> Any:
     llm = IpexLLM.from_model_id(
         model_id=model_id,
-        model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
+        model_kwargs={
+            "temperature": 0,
+            "max_length": 16,
+            "trust_remote_code": True,
+            "device": device,
+        },
     )
     return llm
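To exercise the GPU path of this test, both environment variables read above must be set before pytest runs; a minimal sketch (the model id is a hypothetical example):

```python
import os

# TEST_IPEXLLM_MODEL_IDS takes a comma-separated list of model ids;
# TEST_IPEXLLM_MODEL_DEVICE falls back to "cpu" when unset.
os.environ["TEST_IPEXLLM_MODEL_IDS"] = "lmsys/vicuna-7b-v1.5"
os.environ["TEST_IPEXLLM_MODEL_DEVICE"] = "xpu"

# Then run, e.g.: pytest langchain/libs/community/tests/llms/test_ipex_llm.py
```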