community[patch]: add more data types support to ipex-llm llm integration (#20833)

- **Description**:
    - **Add support for more data types**: by default, `IpexLLM` loads the
model in int4 format. This PR adds support for more data types, such as
`sym_int5`, `sym_int8`, etc. Data formats like NF3, NF4, FP4 and FP8 are
only supported on GPU and will be added in a future PR. A usage sketch
follows this list.
    - Fix a small issue in saving/loading, and update the API docs.
- **Dependencies**: `ipex-llm` library
- **Documentation**: In `docs/docs/integrations/llms/ipex_llm.ipynb`, added
instructions for saving and loading the low-bit model.
- **Tests**: added new test cases to
`libs/community/tests/integration_tests/llms/test_ipex_llm.py` and added
config params.
- **Contribution maintainer**: @shane-huang
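
A minimal usage sketch of the new options, assuming `ipex-llm` is installed per its installation guide; the model id, kwargs, and save path are the same placeholders used in the integration tests below:

```python
from langchain_community.llms import IpexLLM

# Load with an explicit low-bit data type instead of the default int4.
# `load_in_low_bit` overrides `load_in_4bit` when both are given.
llm = IpexLLM.from_model_id(
    model_id="lmsys/vicuna-7b-v1.5",
    load_in_low_bit="sym_int8",
    model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)

# Save the converted low-bit weights, then reload them later without re-converting.
saved_lowbit_path = "/tmp/saved_model"
llm.model.save_low_bit(saved_lowbit_path)

llm_lowbit = IpexLLM.from_model_id_low_bit(
    model_id=saved_lowbit_path,
    # Point tokenizer_id at the original repo so the tokenizer is loaded from there.
    tokenizer_id="lmsys/vicuna-7b-v1.5",
    model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
print(llm_lowbit.invoke("Hello!"))
```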
Commit fd1061e7bf (parent dc921f0823), authored by Shengsheng Huang on
2024-04-26 03:58:18 +08:00 and committed via GitHub.
5 changed files with 343 additions and 85 deletions.

libs/community/langchain_community/llms/bigdl_llm.py

@@ -23,6 +23,10 @@ class BigdlLLM(IpexLLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
load_in_4bit: bool = True,
load_in_low_bit: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -31,6 +35,8 @@ class BigdlLLM(IpexLLM):
Args:
model_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder.
tokenizer_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder which contains the tokenizer.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
@@ -52,12 +58,27 @@ class BigdlLLM(IpexLLM):
"Please install it with `pip install --pre --upgrade bigdl-llm[all]`."
)
if load_in_low_bit is not None:
logger.warning(
"""`load_in_low_bit` option is not supported in BigdlLLM and
is ignored. For more data types support with `load_in_low_bit`,
use IpexLLM instead."""
)
if not load_in_4bit:
raise ValueError(
"BigdlLLM only supports loading in 4-bit mode, "
"i.e. load_in_4bit = True. "
"Please install it with `pip install --pre --upgrade bigdl-llm[all]`."
)
_model_kwargs = model_kwargs or {}
_tokenizer_id = tokenizer_id or model_id
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
try:
model = AutoModelForCausalLM.from_pretrained(
@@ -86,6 +107,8 @@ class BigdlLLM(IpexLLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -94,6 +117,8 @@ class BigdlLLM(IpexLLM):
Args:
model_id: Path for the bigdl-llm transformers low-bit model folder.
tokenizer_id: Path for the huggingface repo id or local model folder
which contains the tokenizer.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
@@ -117,10 +142,12 @@ class BigdlLLM(IpexLLM):
)
_model_kwargs = model_kwargs or {}
_tokenizer_id = tokenizer_id or model_id
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
try:
model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs)
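
For `BigdlLLM`, only the default 4-bit path is supported, but the new `tokenizer_id` override works for both constructors. A brief sketch (the saved checkpoint path is an illustrative placeholder and must point at a bigdl-llm low-bit model folder):

```python
from langchain_community.llms.bigdl_llm import BigdlLLM

# The tokenizer can come from a different repo/folder than the low-bit weights.
llm = BigdlLLM.from_model_id_low_bit(
    model_id="/tmp/saved_model",          # placeholder: a bigdl-llm low-bit model folder
    tokenizer_id="lmsys/vicuna-7b-v1.5",  # repo/folder that contains the tokenizer
    model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
print(llm.invoke("Hello!"))
```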

libs/community/langchain_community/llms/ipex_llm.py

@@ -42,6 +42,10 @@ class IpexLLM(LLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
load_in_4bit: bool = True,
load_in_low_bit: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -50,52 +54,29 @@ class IpexLLM(LLM):
Args:
model_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder.
tokenizer_id: Path for the huggingface repo id to be downloaded or
the huggingface checkpoint folder which contains the tokenizer.
load_in_4bit: "Whether to load model in 4bit.
Unused if `load_in_low_bit` is not None.
load_in_low_bit: Which low bit precisions to use when loading model.
Example values: 'sym_int4', 'asym_int4', 'fp4', 'nf4', 'fp8', etc.
Overrides `load_in_4bit` if specified.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
Returns:
An object of IpexLLM.
"""
try:
from ipex_llm.transformers import (
AutoModel,
AutoModelForCausalLM,
)
from transformers import AutoTokenizer, LlamaTokenizer
except ImportError:
raise ValueError(
"Could not import ipex-llm or transformers. "
"Please install it with `pip install --pre --upgrade ipex-llm[all]`."
)
_model_kwargs = model_kwargs or {}
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
try:
model = AutoModelForCausalLM.from_pretrained(
model_id, load_in_4bit=True, **_model_kwargs
)
except Exception:
model = AutoModel.from_pretrained(
model_id, load_in_4bit=True, **_model_kwargs
)
if "trust_remote_code" in _model_kwargs:
_model_kwargs = {
k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
}
return cls(
return cls._load_model(
model_id=model_id,
model=model,
tokenizer=tokenizer,
model_kwargs=_model_kwargs,
**kwargs,
tokenizer_id=tokenizer_id,
low_bit_model=False,
load_in_4bit=load_in_4bit,
load_in_low_bit=load_in_low_bit,
model_kwargs=model_kwargs,
kwargs=kwargs,
)
@classmethod
@@ -103,6 +84,8 @@ class IpexLLM(LLM):
cls,
model_id: str,
model_kwargs: Optional[dict] = None,
*,
tokenizer_id: Optional[str] = None,
**kwargs: Any,
) -> LLM:
"""
@@ -111,12 +94,36 @@ class IpexLLM(LLM):
Args:
model_id: Path for the ipex-llm transformers low-bit model folder.
tokenizer_id: Path for the huggingface repo id or local model folder
which contains the tokenizer.
model_kwargs: Keyword arguments to pass to the model and tokenizer.
kwargs: Extra arguments to pass to the model and tokenizer.
Returns:
An object of IpexLLM.
"""
return cls._load_model(
model_id=model_id,
tokenizer_id=tokenizer_id,
low_bit_model=True,
load_in_4bit=False, # not used for low-bit model
load_in_low_bit=None, # not used for low-bit model
model_kwargs=model_kwargs,
kwargs=kwargs,
)
@classmethod
def _load_model(
cls,
model_id: str,
tokenizer_id: Optional[str] = None,
load_in_4bit: bool = False,
load_in_low_bit: Optional[str] = None,
low_bit_model: bool = False,
model_kwargs: Optional[dict] = None,
kwargs: Optional[dict] = None,
) -> Any:
try:
from ipex_llm.transformers import (
AutoModel,
@@ -126,26 +133,62 @@ class IpexLLM(LLM):
except ImportError:
raise ValueError(
"Could not import ipex-llm or transformers. "
"Please install it with `pip install --pre --upgrade ipex-llm[all]`."
"Could not import ipex-llm. "
"Please install `ipex-llm` properly following installation guides: "
"https://github.com/intel-analytics/ipex-llm?tab=readme-ov-file#install-ipex-llm."
)
_model_kwargs = model_kwargs or {}
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
except Exception:
tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
kwargs = kwargs or {}
_tokenizer_id = tokenizer_id or model_id
try:
model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
except Exception:
model = AutoModel.load_low_bit(model_id, **_model_kwargs)
tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
# restore model_kwargs
if "trust_remote_code" in _model_kwargs:
_model_kwargs = {
k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
}
# Load the model with AutoModelForCausalLM and fall back to AutoModel on failure.
load_kwargs = {
"use_cache": True,
"trust_remote_code": True,
}
if not low_bit_model:
if load_in_low_bit is not None:
load_function_name = "from_pretrained"
load_kwargs["load_in_low_bit"] = load_in_low_bit # type: ignore
else:
load_function_name = "from_pretrained"
load_kwargs["load_in_4bit"] = load_in_4bit
else:
load_function_name = "load_low_bit"
try:
# Attempt to load with AutoModelForCausalLM
model = cls._load_model_general(
AutoModelForCausalLM,
load_function_name=load_function_name,
model_id=model_id,
load_kwargs=load_kwargs,
model_kwargs=_model_kwargs,
)
except Exception:
# Fallback to AutoModel if there's an exception
model = cls._load_model_general(
AutoModel,
load_function_name=load_function_name,
model_id=model_id,
load_kwargs=load_kwargs,
model_kwargs=_model_kwargs,
)
return cls(
model_id=model_id,
model=model,
@@ -154,6 +197,24 @@ class IpexLLM(LLM):
**kwargs,
)
@staticmethod
def _load_model_general(
model_class: Any,
load_function_name: str,
model_id: str,
load_kwargs: dict,
model_kwargs: dict,
) -> Any:
"""General function to attempt to load a model."""
try:
load_function = getattr(model_class, load_function_name)
return load_function(model_id, **{**load_kwargs, **model_kwargs})
except Exception as e:
logger.error(
f"Failed to load model using "
f"{model_class.__name__}.{load_function_name}: {e}"
)
@property
def _identifying_params(self) -> Mapping[str, Any]:
"""Get the identifying parameters."""

libs/community/tests/integration_tests/llms/test_bigdl_llm.py

@@ -1,23 +1,43 @@
"""Test BigdlLLM"""
import os
import pytest
from langchain_core.outputs import LLMResult
from langchain_community.llms.bigdl_llm import BigdlLLM
model_ids_to_test = os.getenv("TEST_BIGDLLLM_MODEL_IDS") or ""
skip_if_no_model_ids = pytest.mark.skipif(
not model_ids_to_test,
reason="TEST_BIGDLLLM_MODEL_IDS environment variable not set.",
)
model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore
def test_call() -> None:
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_call(model_id: str) -> None:
"""Test valid call to bigdl-llm."""
llm = BigdlLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
output = llm.invoke("Hello!")
assert isinstance(output, str)
def test_generate() -> None:
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_generate(model_id: str) -> None:
"""Test valid call to bigdl-llm."""
llm = BigdlLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
output = llm.generate(["Hello!"])

libs/community/tests/integration_tests/llms/test_ipex_llm.py

@@ -1,25 +1,88 @@
"""Test IPEX LLM"""
import os
from typing import Any
import pytest
from langchain_core.outputs import LLMResult
from langchain_community.llms.ipex_llm import IpexLLM
from langchain_community.llms import IpexLLM
model_ids_to_test = os.getenv("TEST_IPEXLLM_MODEL_IDS") or ""
skip_if_no_model_ids = pytest.mark.skipif(
not model_ids_to_test, reason="TEST_IPEXLLM_MODEL_IDS environment variable not set."
)
model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore
def test_call() -> None:
"""Test valid call to ipex-llm."""
def load_model(model_id: str) -> Any:
llm = IpexLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
return llm
def load_model_more_types(model_id: str, load_in_low_bit: str) -> Any:
llm = IpexLLM.from_model_id(
model_id=model_id,
load_in_low_bit=load_in_low_bit,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
return llm
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_call(model_id: str) -> None:
"""Test valid call."""
llm = load_model(model_id)
output = llm.invoke("Hello!")
assert isinstance(output, str)
def test_generate() -> None:
"""Test valid call to ipex-llm."""
llm = IpexLLM.from_model_id(
model_id="lmsys/vicuna-7b-v1.5",
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_asym_int4(model_id: str) -> None:
"""Test asym int4 data type."""
llm = load_model_more_types(model_id=model_id, load_in_low_bit="asym_int4")
output = llm.invoke("Hello!")
assert isinstance(output, str)
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_generate(model_id: str) -> None:
"""Test valid generate."""
llm = load_model(model_id)
output = llm.generate(["Hello!"])
assert isinstance(output, LLMResult)
assert isinstance(output.generations, list)
@skip_if_no_model_ids
@pytest.mark.parametrize(
"model_id",
model_ids_to_test,
)
def test_save_load_lowbit(model_id: str) -> None:
"""Test save and load lowbit model."""
saved_lowbit_path = "/tmp/saved_model"
llm = load_model(model_id)
llm.model.save_low_bit(saved_lowbit_path)
del llm
loaded_llm = IpexLLM.from_model_id_low_bit(
model_id=saved_lowbit_path,
tokenizer_id=model_id,
model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
)
output = loaded_llm.invoke("Hello!")
assert isinstance(output, str)
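
The new tests are gated on environment variables, so they are skipped unless model ids are provided. A rough sketch of a local run, assuming `pytest` and the integration dependencies are installed and reusing the previously hard-coded model id:

```python
import os

import pytest

# Comma-separated model ids; without this variable the tests above are skipped.
os.environ["TEST_IPEXLLM_MODEL_IDS"] = "lmsys/vicuna-7b-v1.5"

exit_code = pytest.main(
    ["libs/community/tests/integration_tests/llms/test_ipex_llm.py", "-v"]
)
```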