community[patch]: add more data types support to ipex-llm llm integration (#20833)

- **Description**:
    - **Add support for more data types**: by default, `IpexLLM` loads the
model in int4 format. This PR adds support for more data types, such as
`sym_int5`, `sym_int8`, etc. Data formats like NF3, NF4, FP4 and FP8 are
only supported on GPU and will be added in a future PR. A usage sketch
follows this list.
    - Fix a small issue in saving/loading, and update the API docs.
- **Dependencies**: `ipex-llm` library
- **Documentation**: In `docs/docs/integrations/llms/ipex_llm.ipynb`, added
instructions for saving and loading the low-bit model.
- **Tests**: added new test cases to
`libs/community/tests/integration_tests/llms/test_ipex_llm.py` and added
config params.
- **Contribution maintainer**: @shane-huang
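
A minimal usage sketch of the new options, assuming `ipex-llm` is installed per its installation guide; the model id, kwargs, and save path are the same placeholders used in the integration tests below:

```python
from langchain_community.llms import IpexLLM

# Load with an explicit low-bit data type instead of the default int4.
# `load_in_low_bit` overrides `load_in_4bit` when both are given.
llm = IpexLLM.from_model_id(
    model_id="lmsys/vicuna-7b-v1.5",
    load_in_low_bit="sym_int8",
    model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)

# Save the converted low-bit weights, then reload them later without re-converting.
saved_lowbit_path = "/tmp/saved_model"
llm.model.save_low_bit(saved_lowbit_path)

llm_lowbit = IpexLLM.from_model_id_low_bit(
    model_id=saved_lowbit_path,
    # Point tokenizer_id at the original repo so the tokenizer is loaded from there.
    tokenizer_id="lmsys/vicuna-7b-v1.5",
    model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
print(llm_lowbit.invoke("Hello!"))
```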
Commit fd1061e7bf (parent dc921f0823), authored by Shengsheng Huang on
2024-04-26 03:58:18 +08:00 and committed via GitHub.
5 changed files with 343 additions and 85 deletions.

libs/community/langchain_community/llms/bigdl_llm.py

@@ -23,6 +23,10 @@ class BigdlLLM(IpexLLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
load_in_4bit: bool = True,
load_in_low_bit: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -31,6 +35,8 @@ class BigdlLLM(IpexLLM):
Args:
model_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder.
tokenizer_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder which contains the tokenizer.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
@@ -52,12 +58,27 @@ class BigdlLLM(IpexLLM):
"Please install it with `pip install --pre --upgrade bigdl-llm[all]`."
)
if load_in_low_bit is not None:
logger.warning(
"""`load_in_low_bit` option is not supported in BigdlLLM and
is ignored. For more data types support with `load_in_low_bit`,
use IpexLLM instead."""
)
if not load_in_4bit:
raise ValueError(
"BigdlLLM only supports loading in 4-bit mode, "
"i.e. load_in_4bit = True. "
"Please install it with `pip install --pre --upgrade bigdl-llm[all]`."
)
_model_kwargs = model_kwargs or {}
_tokenizer_id = tokenizer_id or model_id
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
try:
model = AutoModelForCausalLM.from_pretrained(
@@ -86,6 +107,8 @@ class BigdlLLM(IpexLLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -94,6 +117,8 @@ class BigdlLLM(IpexLLM):
Args:
model_id: Path for the bigdl-llm transformers low-bit model folder.
tokenizer_id: Path for the huggingface repo id or local model folder
which contains the tokenizer.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
@@ -117,10 +142,12 @@ class BigdlLLM(IpexLLM):
)
_model_kwargs = model_kwargs or {}
_tokenizer_id = tokenizer_id or model_id
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
try:
model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs)
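
For `BigdlLLM`, only the default 4-bit path is supported, but the new `tokenizer_id` override works for both constructors. A brief sketch (the saved checkpoint path is an illustrative placeholder and must point at a bigdl-llm low-bit model folder):

```python
from langchain_community.llms.bigdl_llm import BigdlLLM

# The tokenizer can come from a different repo/folder than the low-bit weights.
llm = BigdlLLM.from_model_id_low_bit(
    model_id="/tmp/saved_model",          # placeholder: a bigdl-llm low-bit model folder
    tokenizer_id="lmsys/vicuna-7b-v1.5",  # repo/folder that contains the tokenizer
    model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
print(llm.invoke("Hello!"))
```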

libs/community/langchain_community/llms/ipex_llm.py

@@ -42,6 +42,10 @@ class IpexLLM(LLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
load_in_4bit: bool = True,
load_in_low_bit: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -50,52 +54,29 @@ class IpexLLM(LLM):
Args:
model_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder.
tokenizer_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder which contains the tokenizer.
load_in_4bit: "Whether to load model in 4bit.
Unused if `load_in_low_bit` is not None.
load_in_low_bit: Which low bit precisions to use when loading model.
Example values: 'sym_int4', 'asym_int4', 'fp4', 'nf4', 'fp8', etc.
Overrides `load_in_4bit` if specified.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
Returns:
An object of IpexLLM.
"""
try:
from ipex_llm.transformers import (
AutoModel,
AutoModelForCausalLM,
)
from transformers import AutoTokenizer, LlamaTokenizer
except ImportError:
raise ValueError(
"Could not import ipex-llm or transformers. "
"Please install it with `pip install --pre --upgrade ipex-llm[all]`."
)
_model_kwargs = model_kwargs or {}
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
try:
model = AutoModelForCausalLM.from_pretrained(
model_id, load_in_4bit=True, **_model_kwargs
)
except Exception:
model = AutoModel.from_pretrained(
model_id, load_in_4bit=True, **_model_kwargs
)
if "trust_remote_code" in _model_kwargs:
_model_kwargs = {
k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
}
return cls(
return cls._load_model(
model_id=model_id,
model=model,
tokenizer=tokenizer,
model_kwargs=_model_kwargs,
**kwargs,
tokenizer_id=tokenizer_id,
low_bit_model=False,
load_in_4bit=load_in_4bit,
load_in_low_bit=load_in_low_bit,
model_kwargs=model_kwargs,
kwargs=kwargs,
)
@classmethod
@@ -103,6 +84,8 @@ class IpexLLM(LLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -111,12 +94,36 @@ class IpexLLM(LLM):
Args:
model_id: Path for the ipex-llm transformers low-bit model folder.
tokenizer_id: Path for the huggingface repo id or local model folder
which contains the tokenizer.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
Returns:
An object of IpexLLM.
"""
return cls._load_model(
model_id=model_id,
tokenizer_id=tokenizer_id,
low_bit_model=True,
load_in_4bit=False, # not used for low-bit model
load_in_low_bit=None, # not used for low-bit model
model_kwargs=model_kwargs,
kwargs=kwargs,
)
@classmethod
def _load_model(
cls,
model_id: str,
tokenizer_id: Optional[str] = None,
load_in_4bit: bool = False,
load_in_low_bit: Optional[str] = None,
low_bit_model: bool = False,
model_kwargs: Optional[dict] = None,
kwargs: Optional[dict] = None,
) -> Any:
try:
from ipex_llm.transformers import (
AutoModel,
@@ -126,26 +133,62 @@ class IpexLLM(LLM):
except ImportError:
raise ValueError(
"Could not import ipex-llm or transformers. "
"Please install it with `pip install --pre --upgrade ipex-llm[all]`."
"Could not import ipex-llm. "
"Please install `ipex-llm` properly following installation guides: "
"https://github.com/intel-analytics/ipex-llm?tab=readme-ov-file#install-ipex-llm."
)
_model_kwargs = model_kwargs or {}
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
kwargs = kwargs or {}
_tokenizer_id = tokenizer_id or model_id
try:
model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
except Exception:
model = AutoModel.load_low_bit(model_id, **_model_kwargs)
tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
# restore model_kwargs
if "trust_remote_code" in _model_kwargs:
_model_kwargs = {
k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
}
# Load the model with AutoModelForCausalLM and fall back to AutoModel on failure.
load_kwargs = {
"use_cache": True,
"trust_remote_code": True,
}
if not low_bit_model:
if load_in_low_bit is not None:
load_function_name = "from_pretrained"
load_kwargs["load_in_low_bit"] = load_in_low_bit # type: ignore
else:
load_function_name = "from_pretrained"
load_kwargs["load_in_4bit"] = load_in_4bit
else:
load_function_name = "load_low_bit"
try:
# Attempt to load with AutoModelForCausalLM
model = cls._load_model_general(
AutoModelForCausalLM,
load_function_name=load_function_name,
model_id=model_id,
load_kwargs=load_kwargs,
model_kwargs=_model_kwargs,
)
except Exception:
# Fallback to AutoModel if there's an exception
model = cls._load_model_general(
AutoModel,
load_function_name=load_function_name,
model_id=model_id,
load_kwargs=load_kwargs,
model_kwargs=_model_kwargs,
)
return cls(
model_id=model_id,
model=model,
@@ -154,6 +197,24 @@ class IpexLLM(LLM):
**kwargs,
)
@staticmethod
def _load_model_general(
model_class: Any,
load_function_name: str,
model_id: str,
load_kwargs: dict,
model_kwargs: dict,
) -> Any:
"""General function to attempt to load a model."""
try:
load_function = getattr(model_class, load_function_name)
return load_function(model_id, **{**load_kwargs, **model_kwargs})
except Exception as e:
logger.error(
f"Failed to load model using "
f"{model_class.__name__}.{load_function_name}: {e}"
)
@property
def _identifying_params(self) -> Mapping[str, Any]:
"""Get the identifying parameters."""

libs/community/tests/integration_tests/llms/test_bigdl_llm.py

@@ -1,23 +1,43 @@
"""Test BigdlLLM"""
import os
import pytest
from langchain_core.outputs import LLMResult
from langchain_community.llms.bigdl_llm import BigdlLLM
model_ids_to_test = os.getenv("TEST_BIGDLLLM_MODEL_IDS") or ""
skip_if_no_model_ids = pytest.mark.skipif(
not model_ids_to_test,
reason="TEST_BIGDLLLM_MODEL_IDS environment variable not set.",
)
model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore
def test_call() -> None:
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_call(model_id: str) -> None:
"""Test valid call to bigdl-llm."""
llm = BigdlLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
output = llm.invoke("Hello!")
assert isinstance(output, str)
def test_generate() -> None:
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_generate(model_id: str) -> None:
"""Test valid call to bigdl-llm."""
llm = BigdlLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
output = llm.generate(["Hello!"])

libs/community/tests/integration_tests/llms/test_ipex_llm.py

@@ -1,25 +1,88 @@
"""Test IPEX LLM"""
import os
from typing import Any
import pytest
from langchain_core.outputs import LLMResult
from langchain_community.llms.ipex_llm import IpexLLM
from langchain_community.llms import IpexLLM
model_ids_to_test = os.getenv("TEST_IPEXLLM_MODEL_IDS") or ""
skip_if_no_model_ids = pytest.mark.skipif(
not model_ids_to_test, reason="TEST_IPEXLLM_MODEL_IDS environment variable not set."
)
model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore
def test_call() -> None:
"""Test valid call to ipex-llm."""
def load_model(model_id: str) -> Any:
llm = IpexLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
return llm
def load_model_more_types(model_id: str, load_in_low_bit: str) -> Any:
llm = IpexLLM.from_model_id(
model_id=model_id,
load_in_low_bit=load_in_low_bit,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
return llm
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_call(model_id: str) -> None:
"""Test valid call."""
llm = load_model(model_id)
output = llm.invoke("Hello!")
assert isinstance(output, str)
def test_generate() -> None:
"""Test valid call to ipex-llm."""
llm = IpexLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_asym_int4(model_id: str) -> None:
"""Test asym int4 data type."""
llm = load_model_more_types(model_id=model_id, load_in_low_bit="asym_int4")
output = llm.invoke("Hello!")
assert isinstance(output, str)
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_generate(model_id: str) -> None:
"""Test valid generate."""
llm = load_model(model_id)
output = llm.generate(["Hello!"])
assert isinstance(output, LLMResult)
assert isinstance(output.generations, list)
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_save_load_lowbit(model_id: str) -> None:
"""Test save and load lowbit model."""
saved_lowbit_path = "/tmp/saved_model"
llm = load_model(model_id)
llm.model.save_low_bit(saved_lowbit_path)
del llm
loaded_llm = IpexLLM.from_model_id_low_bit(
model_id=saved_lowbit_path,
tokenizer_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
output = loaded_llm.invoke("Hello!")
assert isinstance(output, str)
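
The new tests are gated on environment variables, so they are skipped unless model ids are provided. A rough sketch of a local run, assuming `pytest` and the integration dependencies are installed and reusing the previously hard-coded model id:

```python
import os

import pytest

# Comma-separated model ids; without this variable the tests above are skipped.
os.environ["TEST_IPEXLLM_MODEL_IDS"] = "lmsys/vicuna-7b-v1.5"

exit_code = pytest.main(
    ["libs/community/tests/integration_tests/llms/test_ipex_llm.py", "-v"]
)
```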