From fa6397d76a97d08588a6f8fe9bfd6dbcfefb6edf Mon Sep 17 00:00:00 2001
From: Ethan Yang
Date: Mon, 25 Mar 2024 14:57:30 +0800
Subject: [PATCH] docs: Add OpenVINO llms docs (#19489)

Add OpenVINO pipeline instructions to the docs. OpenVINO users can find
more details on this page.
---
 .../llms/huggingface_pipelines.ipynb       |   8 +-
 docs/docs/integrations/llms/openvino.ipynb | 249 ++++++++++++++++++
 2 files changed, 253 insertions(+), 4 deletions(-)
 create mode 100644 docs/docs/integrations/llms/openvino.ipynb

diff --git a/docs/docs/integrations/llms/huggingface_pipelines.ipynb b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
index 6f48849b571..a377d9d0084 100644
--- a/docs/docs/integrations/llms/huggingface_pipelines.ipynb
+++ b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
@@ -256,7 +256,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!optimum-cli export openvino --model gpt2 ov_model"
+    "!optimum-cli export openvino --model gpt2 ov_model_dir"
    ]
   },
   {
@@ -274,9 +274,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model # for 8-bit quantization\n",
+    "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model_dir # for 8-bit quantization\n",
     "\n",
-    "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model # for 4-bit quantization"
+    "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model_dir # for 4-bit quantization"
    ]
   },
   {
@@ -287,7 +287,7 @@
    "outputs": [],
    "source": [
     "ov_llm = HuggingFacePipeline.from_model_id(\n",
-    "    model_id=\"ov_model\",\n",
+    "    model_id=\"ov_model_dir\",\n",
     "    task=\"text-generation\",\n",
     "    backend=\"openvino\",\n",
     "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
diff --git a/docs/docs/integrations/llms/openvino.ipynb b/docs/docs/integrations/llms/openvino.ipynb
new file mode 100644
index 00000000000..1309d65b90b
--- /dev/null
+++ b/docs/docs/integrations/llms/openvino.ipynb
@@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "959300d4",
+   "metadata": {},
+   "source": [
+    "# OpenVINO Local Pipelines\n",
+    "\n",
+    "[OpenVINO™](https://github.com/openvinotoolkit/openvino) is an open-source toolkit for optimizing and deploying AI inference. The OpenVINO™ Runtime can infer models on different hardware [devices](https://github.com/openvinotoolkit/openvino?tab=readme-ov-file#supported-hardware-matrix). It can help boost deep learning performance in computer vision, automatic speech recognition, natural language processing and other common tasks.\n",
+    "\n",
+    "OpenVINO models can be run locally through the `HuggingFacePipeline` [class](https://python.langchain.com/docs/integrations/llms/huggingface_pipeline). To deploy a model with OpenVINO, you can specify the `backend=\"openvino\"` parameter to use OpenVINO as the backend inference framework."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c1b8450-5eaf-4d34-8341-2d785448a1ff",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "To use, you should have the `optimum-intel` Python [package installed](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#installation) with OpenVINO accelerator support."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d772b637-de00-4663-bd77-9bc96d798db2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%pip install --upgrade-strategy eager \"optimum[openvino,nncf]\" --quiet"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91ad075f-71d5-4bc8-ab91-cc0ad5ef16bb",
+   "metadata": {},
+   "source": [
+    "### Model Loading\n",
+    "\n",
+    "Models can be loaded by specifying the model parameters using the `from_model_id` method.\n",
+    "\n",
+    "If you have an Intel GPU, you can specify `model_kwargs={\"device\": \"GPU\"}` to run inference on it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "165ae236-962a-4763-8052-c4836d78a5d2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline\n",
+    "\n",
+    "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\", \"NUM_STREAMS\": \"1\", \"CACHE_DIR\": \"\"}\n",
+    "\n",
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"gpt2\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "00104b27-0c15-4a97-b198-4512337ee211",
+   "metadata": {},
+   "source": [
+    "Models can also be loaded by passing in an existing `optimum-intel` pipeline directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f426a4f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from optimum.intel.openvino import OVModelForCausalLM\n",
+    "from transformers import AutoTokenizer, pipeline\n",
+    "\n",
+    "model_id = \"gpt2\"\n",
+    "device = \"CPU\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "ov_model = OVModelForCausalLM.from_pretrained(\n",
+    "    model_id, device=device, ov_config=ov_config\n",
+    ")\n",
+    "ov_pipe = pipeline(\n",
+    "    \"text-generation\", model=ov_model, tokenizer=tokenizer, max_new_tokens=10\n",
+    ")\n",
+    "hf = HuggingFacePipeline(pipeline=ov_pipe)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "60e7ba8d",
+   "metadata": {},
+   "source": [
+    "### Create Chain\n",
+    "\n",
+    "With the model loaded into memory, you can compose it with a prompt to\n",
+    "form a chain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3acf0069",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts import PromptTemplate\n",
+    "\n",
+    "template = \"\"\"Question: {question}\n",
+    "\n",
+    "Answer: Let's think step by step.\"\"\"\n",
+    "prompt = PromptTemplate.from_template(template)\n",
+    "\n",
+    "chain = prompt | ov_llm\n",
+    "\n",
+    "question = \"What is electroencephalography?\"\n",
+    "\n",
+    "print(chain.invoke({\"question\": question}))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12524837-e9ab-455a-86be-66b95f4f893a",
+   "metadata": {},
+   "source": [
+    "### Inference with local OpenVINO model\n",
+    "\n",
+    "It is possible to [export your model](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#export) to the OpenVINO IR format with the CLI, and load the model from a local folder.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d1104a2-79c7-43a6-aa1c-8076a5ad7747",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!optimum-cli export openvino --model gpt2 ov_model_dir"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f7a6d21",
+   "metadata": {},
+   "source": [
+    "It is recommended to apply 8-bit or 4-bit weight quantization with `--weight-format` to reduce inference latency and model footprint:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97088ea0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model_dir # for 8-bit quantization\n",
+    "\n",
+    "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model_dir # for 4-bit quantization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac71e60d-5595-454e-8602-03ebb0248205",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"ov_model_dir\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")\n",
+    "\n",
+    "ov_chain = prompt | ov_llm\n",
+    "\n",
+    "question = \"What is electroencephalography?\"\n",
+    "\n",
+    "print(ov_chain.invoke({\"question\": question}))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2c5726c",
+   "metadata": {},
+   "source": [
+    "You can get an additional inference speed improvement with Dynamic Quantization of activations and KV-cache quantization. These options can be enabled with `ov_config` as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1f9c2c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ov_config = {\n",
+    "    \"KV_CACHE_PRECISION\": \"u8\",\n",
+    "    \"DYNAMIC_QUANTIZATION_GROUP_SIZE\": \"32\",\n",
+    "    \"PERFORMANCE_HINT\": \"LATENCY\",\n",
+    "    \"NUM_STREAMS\": \"1\",\n",
+    "    \"CACHE_DIR\": \"\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da9a9239",
+   "metadata": {},
+   "source": [
+    "For more information, refer to the [OpenVINO LLM guide](https://docs.openvino.ai/2024/openvino-workflow/generative-ai-models-guide.html)."
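+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3f1c2d4",
+   "metadata": {},
+   "source": [
+    "As a final, minimal sketch, the updated `ov_config` can be applied by reloading the local `ov_model_dir` export created above (the cell ids here are arbitrary, and the parameters simply reuse the values shown earlier on this page):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7e9a0f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reload the local OpenVINO model with the updated ov_config so the\n",
+    "# dynamic quantization and KV-cache settings above take effect.\n",
+    "ov_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"ov_model_dir\",\n",
+    "    task=\"text-generation\",\n",
+    "    backend=\"openvino\",\n",
+    "    model_kwargs={\"device\": \"CPU\", \"ov_config\": ov_config},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")\n",
+    "\n",
+    "print(ov_llm.invoke(\"What is electroencephalography?\"))"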
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}