mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-22 04:24:39 +00:00
docs: add quantization to vllm and update API (#16950)
- **Description:** Update vLLM docs to include instructions on how to use quantized models, as well as to replace the deprecated methods.
This commit is contained in:
parent
2a510c71a0
commit
71f9ea33b6
@ -82,7 +82,7 @@
|
|||||||
" temperature=0.8,\n",
|
" temperature=0.8,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(llm(\"What is the capital of France ?\"))"
|
"print(llm.invoke(\"What is the capital of France ?\"))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -117,8 +117,7 @@
|
|||||||
"1. The first Pokemon game was released in 1996.\n",
|
"1. The first Pokemon game was released in 1996.\n",
|
||||||
"2. The president was Bill Clinton.\n",
|
"2. The president was Bill Clinton.\n",
|
||||||
"3. Clinton was president from 1993 to 2001.\n",
|
"3. Clinton was president from 1993 to 2001.\n",
|
||||||
"4. The answer is Clinton.\n",
|
"4. The answer is Clinton.\n"
|
||||||
"\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -142,7 +141,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"question = \"Who was the US president in the year the first Pokemon game was released?\"\n",
|
"question = \"Who was the US president in the year the first Pokemon game was released?\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(llm_chain.run(question))"
|
"print(llm_chain.invoke(question))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -172,7 +171,36 @@
|
|||||||
" trust_remote_code=True, # mandatory for hf models\n",
|
" trust_remote_code=True, # mandatory for hf models\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"llm(\"What is the future of AI?\")"
|
"llm.invoke(\"What is the future of AI?\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d6ca8fd911d25faa",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"## Quantization\n",
|
||||||
|
"\n",
|
||||||
|
"vLLM supports `awq` quantization. To enable it, pass `quantization` to `vllm_kwargs`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2cada3174c46a0ea",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"llm_q = VLLM(\n",
|
||||||
|
" model=\"TheBloke/Llama-2-7b-Chat-AWQ\",\n",
|
||||||
|
" trust_remote_code=True,\n",
|
||||||
|
" max_new_tokens=512,\n",
|
||||||
|
" vllm_kwargs={\"quantization\": \"awq\"},\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -216,7 +244,7 @@
|
|||||||
" model_name=\"tiiuae/falcon-7b\",\n",
|
" model_name=\"tiiuae/falcon-7b\",\n",
|
||||||
" model_kwargs={\"stop\": [\".\"]},\n",
|
" model_kwargs={\"stop\": [\".\"]},\n",
|
||||||
")\n",
|
")\n",
|
||||||
"print(llm(\"Rome is\"))"
|
"print(llm.invoke(\"Rome is\"))"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
Loading…
Reference in New Issue
Block a user