From ae471a7dcbd4b485ca3c029324dcf1153a981f2a Mon Sep 17 00:00:00 2001
From: Shengsheng Huang
Date: Sat, 2 Mar 2024 02:04:53 +0800
Subject: [PATCH] community[minor]: add BigDL-LLM integrations (#17953)

- **Description**: [`bigdl-llm`](https://github.com/intel-analytics/BigDL) is a library for running LLMs on Intel XPU (from laptop to GPU to cloud) using INT4/FP4/INT8/FP8 with very low latency (for any PyTorch model). This PR adds bigdl-llm integrations to LangChain.
- **Issue**: NA
- **Dependencies**: `bigdl-llm` library
- **Contribution maintainer**: @shane-huang

Examples added:
- docs/docs/integrations/llms/bigdl.ipynb
---
 docs/docs/integrations/llms/bigdl.ipynb    | 182 ++++++++++++++
 .../langchain_community/llms/bigdl.py      | 222 ++++++++++++++++++
 .../integration_tests/llms/test_bigdl.py   |  25 ++
 3 files changed, 429 insertions(+)
 create mode 100644 docs/docs/integrations/llms/bigdl.ipynb
 create mode 100644 libs/community/langchain_community/llms/bigdl.py
 create mode 100644 libs/community/tests/integration_tests/llms/test_bigdl.py

diff --git a/docs/docs/integrations/llms/bigdl.ipynb b/docs/docs/integrations/llms/bigdl.ipynb
new file mode 100644
index 00000000000..60684898d7e
--- /dev/null
+++ b/docs/docs/integrations/llms/bigdl.ipynb
@@ -0,0 +1,182 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# BigDL-LLM\n",
+    "\n",
+    "> [BigDL-LLM](https://github.com/intel-analytics/BigDL/) is a low-bit LLM optimization library on Intel XPU (Xeon/Core/Flex/Arc/Max). It can make LLMs run extremely fast and consume much less memory on Intel platforms. It is open sourced under Apache 2.0 License.\n",
+    "\n",
+    "This example goes over how to use LangChain to interact with BigDL-LLM for text generation.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Update LangChain\n",
+    "\n",
+    "%pip install -qU langchain langchain-community"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install BigDL-LLM for running LLMs locally on Intel CPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install BigDL-LLM\n",
+    "%pip install --pre --upgrade bigdl-llm[all]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains import LLMChain\n",
+    "from langchain_community.llms.bigdl import BigdlLLM\n",
+    "from langchain_core.prompts import PromptTemplate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template = \"USER: {question}\\nASSISTANT:\"\n",
+    "prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load Model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "69e018750ffb4de1af22ce49cd6957f4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-02-23 17:44:12,156 - INFO - Converting the current model to sym_int4 format......\n",
+      "2024-02-23 17:44:12,156 - INFO - Converting the current model to sym_int4 format......\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_community.llms.bigdl import BigdlLLM\n",
+    "\n",
+    "llm = BigdlLLM.from_model_id(\n",
+    "    model_id=\"lmsys/vicuna-7b-v1.5\",\n",
+    "    model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use it in Chains:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "AI stands for \"Artificial Intelligence.\" It refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation. AI systems use algorithms and data to learn and improve their performance over time, and they can be applied in a wide range of fields, including healthcare, finance, transportation, and entertainment.\n"
+     ]
+    }
+   ],
+   "source": [
+    "llm_chain = LLMChain(prompt=prompt, llm=llm)\n",
+    "\n",
+    "question = \"What is AI?\"\n",
+    "output = llm_chain.run(question)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "3f7e6312f61e27de3268c52a2a2c9de257d3c2c3a52e4989858b576ac0c25d61"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/libs/community/langchain_community/llms/bigdl.py b/libs/community/langchain_community/llms/bigdl.py
new file mode 100644
index 00000000000..9e876d25011
--- /dev/null
+++ b/libs/community/langchain_community/llms/bigdl.py
@@ -0,0 +1,222 @@
+import logging
+from typing import Any, List, Mapping, Optional
+
+from langchain_core.callbacks import CallbackManagerForLLMRun
+from langchain_core.language_models.llms import LLM
+from langchain_core.pydantic_v1 import Extra
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_MODEL_ID = "THUDM/chatglm-6b"  # default model id, matching the docstring example
+
+
+class BigdlLLM(LLM):
+    """Wrapper around the BigDL-LLM model.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_community.llms.bigdl import BigdlLLM
+            llm = BigdlLLM.from_model_id(model_id="THUDM/chatglm-6b")
+    """
+
+    model_id: str = DEFAULT_MODEL_ID
+    """Model name or model path to use."""
+    model_kwargs: Optional[dict] = None
+    """Keyword arguments passed to the model."""
+    model: Any  #: :meta private:
+    """BigDL-LLM Transformers-INT4 model."""
+    tokenizer: Any  #: :meta private:
+    """Huggingface tokenizer model."""
+    streaming: bool = True
+    """Whether to stream the results, token by token."""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    @classmethod
+    def from_model_id(
+        cls,
+        model_id: str,
+        model_kwargs: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> LLM:
+        """
+        Construct an object from model_id.
+
+        Args:
+            model_id: Path for the huggingface repo id to be downloaded or
+                the huggingface checkpoint folder.
+            model_kwargs: Keyword arguments to pass to the model and tokenizer.
+            kwargs: Extra arguments to pass to the model and tokenizer.
+
+        Returns:
+            An object of BigdlLLM.
+        """
+        try:
+            from bigdl.llm.transformers import (
+                AutoModel,
+                AutoModelForCausalLM,
+            )
+            from transformers import AutoTokenizer, LlamaTokenizer
+
+        except ImportError:
+            raise ValueError(
+                "Could not import bigdl-llm or transformers. "
+                "Please install it with `pip install --pre --upgrade bigdl-llm[all]`."
+            )
+
+        _model_kwargs = model_kwargs or {}
+
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
+        except Exception:
+            tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
+
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id, load_in_4bit=True, **_model_kwargs
+            )
+        except Exception:
+            model = AutoModel.from_pretrained(
+                model_id, load_in_4bit=True, **_model_kwargs
+            )
+
+        if "trust_remote_code" in _model_kwargs:
+            _model_kwargs = {
+                k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
+            }
+
+        return cls(
+            model_id=model_id,
+            model=model,
+            tokenizer=tokenizer,
+            model_kwargs=_model_kwargs,
+            **kwargs,
+        )
+
+    @classmethod
+    def from_model_id_low_bit(
+        cls,
+        model_id: str,
+        model_kwargs: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> LLM:
+        """
+        Construct a low-bit object from model_id.
+
+        Args:
+            model_id: Path for the bigdl transformers low-bit model checkpoint folder.
+            model_kwargs: Keyword arguments to pass to the model and tokenizer.
+            kwargs: Extra arguments to pass to the model and tokenizer.
+
+        Returns:
+            An object of BigdlLLM.
+        """
+        try:
+            from bigdl.llm.transformers import (
+                AutoModel,
+                AutoModelForCausalLM,
+            )
+            from transformers import AutoTokenizer, LlamaTokenizer
+
+        except ImportError:
+            raise ValueError(
+                "Could not import bigdl-llm or transformers. "
" + "Please install it with `pip install --pre --upgrade bigdl-llm[all]`" + ) + + _model_kwargs = model_kwargs or {} + try: + tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs) + except Exception: + tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs) + + try: + model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs) + except Exception: + model = AutoModel.load_low_bit(model_id, **_model_kwargs) + + if "trust_remote_code" in _model_kwargs: + _model_kwargs = { + k: v for k, v in _model_kwargs.items() if k != "trust_remote_code" + } + + return cls( + model_id=model_id, + model=model, + tokenizer=tokenizer, + model_kwargs=_model_kwargs, + **kwargs, + ) + + @property + def _identifying_params(self) -> Mapping[str, Any]: + """Get the identifying parameters.""" + return { + "model_id": self.model_id, + "model_kwargs": self.model_kwargs, + } + + @property + def _llm_type(self) -> str: + return "BigDL-llm" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> str: + if self.streaming: + from transformers import TextStreamer + + input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + streamer = TextStreamer( + self.tokenizer, skip_prompt=True, skip_special_tokens=True + ) + if stop is not None: + from transformers.generation.stopping_criteria import ( + StoppingCriteriaList, + ) + from transformers.tools.agents import StopSequenceCriteria + + # stop generation when stop words are encountered + # TODO: stop generation when the following one is stop word + stopping_criteria = StoppingCriteriaList( + [StopSequenceCriteria(stop, self.tokenizer)] + ) + else: + stopping_criteria = None + output = self.model.generate( + input_ids, + streamer=streamer, + stopping_criteria=stopping_criteria, + **kwargs, + ) + text = self.tokenizer.decode(output[0], skip_special_tokens=True) + return text + else: + input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + if stop is not None: + from transformers.generation.stopping_criteria import ( + StoppingCriteriaList, + ) + from transformers.tools.agents import StopSequenceCriteria + + stopping_criteria = StoppingCriteriaList( + [StopSequenceCriteria(stop, self.tokenizer)] + ) + else: + stopping_criteria = None + output = self.model.generate( + input_ids, stopping_criteria=stopping_criteria, **kwargs + ) + text = self.tokenizer.decode(output[0], skip_special_tokens=True)[ + len(prompt) : + ] + return text diff --git a/libs/community/tests/integration_tests/llms/test_bigdl.py b/libs/community/tests/integration_tests/llms/test_bigdl.py new file mode 100644 index 00000000000..905a373c483 --- /dev/null +++ b/libs/community/tests/integration_tests/llms/test_bigdl.py @@ -0,0 +1,25 @@ +"""Test BigDL LLM""" +from langchain_core.outputs import LLMResult + +from langchain_community.llms.bigdl import BigdlLLM + + +def test_call() -> None: + """Test valid call to baichuan.""" + llm = BigdlLLM.from_model_id( + model_id="lmsys/vicuna-7b-v1.5", + model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True}, + ) + output = llm("Hello!") + assert isinstance(output, str) + + +def test_generate() -> None: + """Test valid call to baichuan.""" + llm = BigdlLLM.from_model_id( + model_id="lmsys/vicuna-7b-v1.5", + model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True}, + ) + output = llm.generate(["Hello!"]) + assert isinstance(output, LLMResult) + assert isinstance(output.generations, list)