From fa6397d76a97d08588a6f8fe9bfd6dbcfefb6edf Mon Sep 17 00:00:00 2001
From: Ethan Yang
Date: Mon, 25 Mar 2024 14:57:30 +0800
Subject: [PATCH] docs: Add OpenVINO llms docs (#19489)

Add OpenVINO pipeline instructions to the docs. OpenVINO users can find
more details on this page.
---
 .../llms/huggingface_pipelines.ipynb       |   8 +-
 docs/docs/integrations/llms/openvino.ipynb | 249 ++++++++++++++++++
 2 files changed, 253 insertions(+), 4 deletions(-)
 create mode 100644 docs/docs/integrations/llms/openvino.ipynb

diff --git a/docs/docs/integrations/llms/huggingface_pipelines.ipynb b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
index 6f48849b571..a377d9d0084 100644
--- a/docs/docs/integrations/llms/huggingface_pipelines.ipynb
+++ b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
@@ -256,7 +256,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!optimum-cli export openvino --model gpt2 ov_model"
+    "!optimum-cli export openvino --model gpt2 ov_model_dir"
    ]
   },
   {
@@ -274,9 +274,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model # for 8-bit quantization\n",
+    "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model_dir # for 8-bit quantization\n",
     "\n",
-    "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model # for 4-bit quantization"
+    "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model_dir # for 4-bit quantization"
    ]
   },
   {
@@ -287,7 +287,7 @@
    "outputs": [],
    "source": [
     "ov_llm = HuggingFacePipeline.from_model_id(\n",
-    "    model_id=\"ov_model\",\n",
+    "    model_id=\"ov_model_dir\",\n",
     "    task=\"text-generation\",\n",
     "    backend=\"openvino\",\n",
     "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
diff --git a/docs/docs/integrations/llms/openvino.ipynb b/docs/docs/integrations/llms/openvino.ipynb
new file mode 100644
index 00000000000..1309d65b90b
--- /dev/null
+++ b/docs/docs/integrations/llms/openvino.ipynb
@@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "959300d4",
+   "metadata": {},
+   "source": [
+    "# OpenVINO Local Pipelines\n",
+    "\n",
+    "[OpenVINO™](https://github.com/openvinotoolkit/openvino) is an open-source toolkit for optimizing and deploying AI inference. The OpenVINO™ Runtime can infer models on different hardware [devices](https://github.com/openvinotoolkit/openvino?tab=readme-ov-file#supported-hardware-matrix). It can help boost deep learning performance in computer vision, automatic speech recognition, natural language processing and other common tasks.\n",
+    "\n",
+    "OpenVINO models can be run locally through the `HuggingFacePipeline` [class](https://python.langchain.com/docs/integrations/llms/huggingface_pipeline). To deploy a model with OpenVINO, you can specify the `backend=\"openvino\"` parameter to use OpenVINO as the backend inference framework."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c1b8450-5eaf-4d34-8341-2d785448a1ff",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "To use, you should have the `optimum-intel` Python [package installed](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#installation) with OpenVINO accelerator support."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d772b637-de00-4663-bd77-9bc96d798db2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%pip install --upgrade-strategy eager \"optimum[openvino,nncf]\" --quiet"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91ad075f-71d5-4bc8-ab91-cc0ad5ef16bb",
+   "metadata": {},
+   "source": [
+    "### Model Loading\n",
+    "\n",
+    "Models can be loaded by specifying the model parameters using the `from_model_id` method.\n",
+    "\n",
+    "If you have an Intel GPU, you can specify `model_kwargs={\"device\": \"GPU\"}` to run inference on it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "165ae236-962a-4763-8052-c4836d78a5d2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline\n",
+    "\n",
+    "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\", \"NUM_STREAMS\": \"1\", \"CACHE_DIR\": \"\"}\n",
+    "\n",
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"gpt2\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "00104b27-0c15-4a97-b198-4512337ee211",
+   "metadata": {},
+   "source": [
+    "Models can also be loaded by passing in an existing `optimum-intel` pipeline directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f426a4f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from optimum.intel.openvino import OVModelForCausalLM\n",
+    "from transformers import AutoTokenizer, pipeline\n",
+    "\n",
+    "model_id = \"gpt2\"\n",
+    "device = \"CPU\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "ov_model = OVModelForCausalLM.from_pretrained(\n",
+    "    model_id, device=device, ov_config=ov_config\n",
+    ")\n",
+    "ov_pipe = pipeline(\n",
+    "    \"text-generation\", model=ov_model, tokenizer=tokenizer, max_new_tokens=10\n",
+    ")\n",
+    "hf = HuggingFacePipeline(pipeline=ov_pipe)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "60e7ba8d",
+   "metadata": {},
+   "source": [
+    "### Create Chain\n",
+    "\n",
+    "With the model loaded into memory, you can compose it with a prompt to\n",
+    "form a chain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3acf0069",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts import PromptTemplate\n",
+    "\n",
+    "template = \"\"\"Question: {question}\n",
+    "\n",
+    "Answer: Let's think step by step.\"\"\"\n",
+    "prompt = PromptTemplate.from_template(template)\n",
+    "\n",
+    "chain = prompt | ov_llm\n",
+    "\n",
+    "question = \"What is electroencephalography?\"\n",
+    "\n",
+    "print(chain.invoke({\"question\": question}))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12524837-e9ab-455a-86be-66b95f4f893a",
+   "metadata": {},
+   "source": [
+    "### Inference with local OpenVINO model\n",
+    "\n",
+    "It is possible to [export your model](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#export) to the OpenVINO IR format with the CLI, and load the model from a local folder.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d1104a2-79c7-43a6-aa1c-8076a5ad7747",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!optimum-cli export openvino --model gpt2 ov_model_dir"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f7a6d21",
+   "metadata": {},
+   "source": [
+    "It is recommended to apply 8-bit or 4-bit weight quantization with `--weight-format` to reduce inference latency and model footprint:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97088ea0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model_dir # for 8-bit quantization\n",
+    "\n",
+    "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model_dir # for 4-bit quantization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac71e60d-5595-454e-8602-03ebb0248205",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"ov_model_dir\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")\n",
+    "\n",
+    "ov_chain = prompt | ov_llm\n",
+    "\n",
+    "question = \"What is electroencephalography?\"\n",
+    "\n",
+    "print(ov_chain.invoke({\"question\": question}))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2c5726c",
+   "metadata": {},
+   "source": [
+    "You can get an additional inference speed improvement with Dynamic Quantization of activations and KV-cache quantization. These options can be enabled with `ov_config` as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1f9c2c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ov_config = {\n",
+    "    \"KV_CACHE_PRECISION\": \"u8\",\n",
+    "    \"DYNAMIC_QUANTIZATION_GROUP_SIZE\": \"32\",\n",
+    "    \"PERFORMANCE_HINT\": \"LATENCY\",\n",
+    "    \"NUM_STREAMS\": \"1\",\n",
+    "    \"CACHE_DIR\": \"\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da9a9239",
+   "metadata": {},
+   "source": [
+    "For more information, refer to the [OpenVINO LLM guide](https://docs.openvino.ai/2024/openvino-workflow/generative-ai-models-guide.html)."
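+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3f1c2d4",
+   "metadata": {},
+   "source": [
+    "As a final, minimal sketch, the updated `ov_config` can be applied by reloading the local `ov_model_dir` export created above (the cell ids here are arbitrary, and the parameters simply reuse the values shown earlier on this page):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7e9a0f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reload the local OpenVINO model with the updated ov_config so the\n",
+    "# dynamic quantization and KV-cache settings above take effect.\n",
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"ov_model_dir\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")\n",
+    "\n",
+    "print(ov_llm.invoke(\"What is electroencephalography?\"))"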
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}