community[minor]: weight only quantization with intel-extension-for-transformers. (#14504)

Support weight only quantization with intel-extension-for-transformers. [Intel® Extension for Transformers](https://github.com/intel/intel-extension-for-transformers) is an innovative toolkit to accelerate Transformer-based models on Intel platforms, in particular effective on 4th Intel Xeon Scalable processor [Sapphire Rapids](https://www.intel.com/content/www/us/en/products/docs/processors/xeon-accelerated/4th-gen-xeon-scalable-processors.html) (codenamed Sapphire Rapids). The toolkit provides the below key features: * Seamless user experience of model compressions on Transformer-based models by extending [Hugging Face transformers](https://github.com/huggingface/transformers) APIs and leveraging [Intel® Neural Compressor](https://github.com/intel/neural-compressor) * Advanced software optimizations and unique compression-aware runtime. * Optimized Transformer-based model packages. * [NeuralChat](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/neural_chat), a customizable chatbot framework to create your own chatbot within minutes by leveraging a rich set of plugins and SOTA optimizations. * [Inference](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/llm/runtime/graph) of Large Language Model (LLM) in pure C/C++ with weight-only quantization kernels. This PR is an integration of weight only quantization feature with intel-extension-for-transformers. Unit test is in lib/langchain/tests/integration_tests/llm/test_weight_only_quantization.py The notebook is in docs/docs/integrations/llms/weight_only_quantization.ipynb. The document is in docs/docs/integrations/providers/weight_only_quantization.mdx. --------- Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-09-11 07:50:47 +00:00 · 2024-04-04 00:21:34 +08:00
parent d6d843ec24
commit cc407e8a1b
6 changed files with 632 additions and 1 deletions
--- a/libs/community/langchain_community/llms/weight_only_quantization.py
+++ b/libs/community/langchain_community/llms/weight_only_quantization.py
@@ -0,0 +1,244 @@
+import importlib
+from typing import Any, List, Mapping, Optional
+
+from langchain_core.callbacks.manager import CallbackManagerForLLMRun
+from langchain_core.language_models.llms import LLM
+from langchain_core.pydantic_v1 import Extra
+
+from langchain_community.llms.utils import enforce_stop_tokens
+
+DEFAULT_MODEL_ID = "google/flan-t5-large"
+DEFAULT_TASK = "text2text-generation"
+VALID_TASKS = ("text2text-generation", "text-generation", "summarization")
+
+
+class WeightOnlyQuantPipeline(LLM):
+    """Weight only quantized model.
+
+    To use, you should have the `intel-extension-for-transformers` packabge and
+        `transformers` package installed.
+    intel-extension-for-transformers:
+        https://github.com/intel/intel-extension-for-transformers
+
+    Example using from_model_id:
+        .. code-block:: python
+
+            from langchain_community.llms import WeightOnlyQuantPipeline
+            from intel_extension_for_transformers.transformers import (
+                WeightOnlyQuantConfig
+            )
+            config = WeightOnlyQuantConfig
+            hf = WeightOnlyQuantPipeline.from_model_id(
+                model_id="google/flan-t5-large",
+                task="text2text-generation"
+                pipeline_kwargs={"max_new_tokens": 10},
+                quantization_config=config,
+            )
+    Example passing pipeline in directly:
+        .. code-block:: python
+
+            from langchain_community.llms import WeightOnlyQuantPipeline
+            from intel_extension_for_transformers.transformers import (
+                AutoModelForSeq2SeqLM
+            )
+            from intel_extension_for_transformers.transformers import (
+                WeightOnlyQuantConfig
+            )
+            from transformers import AutoTokenizer, pipeline
+
+            model_id = "google/flan-t5-large"
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            config = WeightOnlyQuantConfig
+            model = AutoModelForSeq2SeqLM.from_pretrained(
+                model_id,
+                quantization_config=config,
+            )
+            pipe = pipeline(
+                "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                max_new_tokens=10,
+            )
+            hf = WeightOnlyQuantPipeline(pipeline=pipe)
+    """
+
+    pipeline: Any  #: :meta private:
+    model_id: str = DEFAULT_MODEL_ID
+    """Model name or local path to use."""
+
+    model_kwargs: Optional[dict] = None
+    """Key word arguments passed to the model."""
+
+    pipeline_kwargs: Optional[dict] = None
+    """Key word arguments passed to the pipeline."""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.allow
+
+    @classmethod
+    def from_model_id(
+        cls,
+        model_id: str,
+        task: str,
+        device: Optional[int] = -1,
+        device_map: Optional[str] = None,
+        model_kwargs: Optional[dict] = None,
+        pipeline_kwargs: Optional[dict] = None,
+        load_in_4bit: Optional[bool] = False,
+        load_in_8bit: Optional[bool] = False,
+        quantization_config: Optional[Any] = None,
+        **kwargs: Any,
+    ) -> LLM:
+        """Construct the pipeline object from model_id and task."""
+        if device_map is not None and (isinstance(device, int) and device > -1):
+            raise ValueError("`Device` and `device_map` cannot be set simultaneously!")
+        if importlib.util.find_spec("torch") is None:
+            raise ValueError(
+                "Weight only quantization pipeline only support PyTorch now!"
+            )
+
+        try:
+            from intel_extension_for_transformers.transformers import (
+                AutoModelForCausalLM,
+                AutoModelForSeq2SeqLM,
+            )
+            from intel_extension_for_transformers.utils.utils import is_ipex_available
+            from transformers import AutoTokenizer
+            from transformers import pipeline as hf_pipeline
+        except ImportError:
+            raise ValueError(
+                "Could not import transformers python package. "
+                "Please install it with `pip install transformers` "
+                "and `pip install intel-extension-for-transformers`."
+            )
+        if isinstance(device, int) and device >= 0:
+            if not is_ipex_available():
+                raise ValueError("Don't find out Intel GPU on this machine!")
+            device_map = "xpu:" + str(device)
+        elif isinstance(device, int) and device < 0:
+            device = None
+
+        if device is None:
+            if device_map is None:
+                device_map = "cpu"
+
+        _model_kwargs = model_kwargs or {}
+        tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
+
+        try:
+            if task == "text-generation":
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_id,
+                    load_in_4bit=load_in_4bit,
+                    load_in_8bit=load_in_8bit,
+                    quantization_config=quantization_config,
+                    use_llm_runtime=False,
+                    device_map=device_map,
+                    **_model_kwargs,
+                )
+            elif task in ("text2text-generation", "summarization"):
+                model = AutoModelForSeq2SeqLM.from_pretrained(
+                    model_id,
+                    load_in_4bit=load_in_4bit,
+                    load_in_8bit=load_in_8bit,
+                    quantization_config=quantization_config,
+                    use_llm_runtime=False,
+                    device_map=device_map,
+                    **_model_kwargs,
+                )
+            else:
+                raise ValueError(
+                    f"Got invalid task {task}, "
+                    f"currently only {VALID_TASKS} are supported"
+                )
+        except ImportError as e:
+            raise ValueError(
+                f"Could not load the {task} model due to missing dependencies."
+            ) from e
+
+        if "trust_remote_code" in _model_kwargs:
+            _model_kwargs = {
+                k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
+            }
+        _pipeline_kwargs = pipeline_kwargs or {}
+        pipeline = hf_pipeline(
+            task=task,
+            model=model,
+            tokenizer=tokenizer,
+            device=device,
+            model_kwargs=_model_kwargs,
+            **_pipeline_kwargs,
+        )
+        if pipeline.task not in VALID_TASKS:
+            raise ValueError(
+                f"Got invalid task {pipeline.task}, "
+                f"currently only {VALID_TASKS} are supported"
+            )
+        return cls(
+            pipeline=pipeline,
+            model_id=model_id,
+            model_kwargs=_model_kwargs,
+            pipeline_kwargs=_pipeline_kwargs,
+            **kwargs,
+        )
+
+    @property
+    def _identifying_params(self) -> Mapping[str, Any]:
+        """Get the identifying parameters."""
+        return {
+            "model_id": self.model_id,
+            "model_kwargs": self.model_kwargs,
+            "pipeline_kwargs": self.pipeline_kwargs,
+        }
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "weight_only_quantization"
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        """Call the HuggingFace model and return the output.
+
+        Args:
+            prompt: The prompt to use for generation.
+            stop: A list of strings to stop generation when encountered.
+
+        Returns:
+            The generated text.
+
+        Example:
+            .. code-block:: python
+
+                from langchain_community.llms import WeightOnlyQuantPipeline
+                llm = WeightOnlyQuantPipeline.from_model_id(
+                    model_id="google/flan-t5-large",
+                    task="text2text-generation",
+                )
+                llm("This is a prompt.")
+        """
+        response = self.pipeline(prompt)
+        if self.pipeline.task == "text-generation":
+            # Text generation return includes the starter text.
+            text = response[0]["generated_text"][len(prompt) :]
+        elif self.pipeline.task == "text2text-generation":
+            text = response[0]["generated_text"]
+        elif self.pipeline.task == "summarization":
+            text = response[0]["summary_text"]
+        else:
+            raise ValueError(
+                f"Got invalid task {self.pipeline.task}, "
+                f"currently only {VALID_TASKS} are supported"
+            )
+        if stop:
+            # This is a bit hacky, but I can't figure out a better way to enforce
+            # stop tokens when making calls to huggingface_hub.
+            text = enforce_stop_tokens(text, stop)
+        return text