From b9f3c7a0c9739372952dc2c8996714f7feea28d1 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 6 Mar 2024 12:35:45 -0500 Subject: [PATCH] Use Case: Extraction set temperature to 0, qualify a statement (#18672) Minor changes: 1) Set temperature to 0 (important) 2) Better qualify one of the statements with confidence --- .../extraction/how_to/examples.ipynb | 10 +--- .../extraction/how_to/handle_long_text.ipynb | 10 +--- .../use_cases/extraction/how_to/parse.ipynb | 22 +++---- docs/docs/use_cases/extraction/index.ipynb | 12 +--- .../use_cases/extraction/quickstart.ipynb | 57 ++++++++++--------- 5 files changed, 46 insertions(+), 65 deletions(-) diff --git a/docs/docs/use_cases/extraction/how_to/examples.ipynb b/docs/docs/use_cases/extraction/how_to/examples.ipynb index e37511da19a..7447efb9144 100644 --- a/docs/docs/use_cases/extraction/how_to/examples.ipynb +++ b/docs/docs/use_cases/extraction/how_to/examples.ipynb @@ -430,14 +430,6 @@ " }\n", ")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d18bb013", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -456,7 +448,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.11.2" } }, "nbformat": 4, diff --git a/docs/docs/use_cases/extraction/how_to/handle_long_text.ipynb b/docs/docs/use_cases/extraction/how_to/handle_long_text.ipynb index 76ba7d16616..6c1b6d01f37 100644 --- a/docs/docs/use_cases/extraction/how_to/handle_long_text.ipynb +++ b/docs/docs/use_cases/extraction/how_to/handle_long_text.ipynb @@ -394,14 +394,6 @@ "* Large chunk overlap may cause the same information to be extracted twice, so be prepared to de-duplicate!\n", "* LLMs can make up data. If looking for a single fact across a large text and using a brute force approach, you may end up getting more made up data." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5f9685f-9d68-4155-a78c-0cb50821e21f", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -420,7 +412,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.11.2" } }, "nbformat": 4, diff --git a/docs/docs/use_cases/extraction/how_to/parse.ipynb b/docs/docs/use_cases/extraction/how_to/parse.ipynb index 7bdf679a73a..efb6858905a 100644 --- a/docs/docs/use_cases/extraction/how_to/parse.ipynb +++ b/docs/docs/use_cases/extraction/how_to/parse.ipynb @@ -32,7 +32,7 @@ "source": [ "from langchain_anthropic.chat_models import ChatAnthropic\n", "\n", - "model = ChatAnthropic(model_name=\"claude-3-sonnet-20240229\")" + "model = ChatAnthropic(model_name=\"claude-3-sonnet-20240229\", temperature=0)" ] }, { @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "id": "497eb023-c043-443d-ac62-2d4ea85fe1b0", "metadata": {}, "outputs": [], @@ -111,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "id": "20b99ffb-a114-49a9-a7be-154c525f8ada", "metadata": {}, "outputs": [], @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "id": "4f3a66ce-de19-4571-9e54-67504ae3fba7", "metadata": {}, "outputs": [ @@ -149,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "id": "3a46b5fd-9242-4b8c-a4e2-3f04fc19b3a4", "metadata": {}, "outputs": [ @@ -159,7 +159,7 @@ "People(people=[Person(name='Anna', height_in_meters=1.83)])" ] }, - "execution_count": 11, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -183,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "id": "b1f11912-c1bb-4a2a-a482-79bf3996961f", "metadata": {}, "outputs": [], @@ -253,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "id": 
"cda52ef5-a354-47a7-9c25-45153c2389e2", "metadata": {}, "outputs": [ @@ -261,7 +261,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "System: Answer the user query. Output your answer as JSON that matches the given schema: ```json\n", + "System: Answer the user query. Output your answer as JSON that matches the given schema: ```json\n", "{'title': 'People', 'description': 'Identifying information about all people in a text.', 'type': 'object', 'properties': {'people': {'title': 'People', 'type': 'array', 'items': {'$ref': '#/definitions/Person'}}}, 'required': ['people'], 'definitions': {'Person': {'title': 'Person', 'description': 'Information about a person.', 'type': 'object', 'properties': {'name': {'title': 'Name', 'description': 'The name of the person', 'type': 'string'}, 'height_in_meters': {'title': 'Height In Meters', 'description': 'The height of the person expressed in meters.', 'type': 'number'}}, 'required': ['name', 'height_in_meters']}}}\n", "```. Make sure to wrap the answer in ```json and ``` tags\n", "Human: Anna is 23 years old and she is 6 feet tall\n" @@ -275,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "id": "993dc61a-229d-4795-a746-0d17df86b5c0", "metadata": {}, "outputs": [ @@ -285,7 +285,7 @@ "[{'people': [{'name': 'Anna', 'height_in_meters': 1.83}]}]" ] }, - "execution_count": 18, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/docs/use_cases/extraction/index.ipynb b/docs/docs/use_cases/extraction/index.ipynb index 5e6fc6b87ad..48f141e1440 100644 --- a/docs/docs/use_cases/extraction/index.ipynb +++ b/docs/docs/use_cases/extraction/index.ipynb @@ -34,7 +34,7 @@ "\n", "- **Tool/Function Calling** Mode: Some LLMs support a *tool or function calling* mode. These LLMs can structure output according to a given **schema**. 
Generally, this approach is the easiest to work with and is expected to yield good results.\n", "\n", - "- **JSON Mode**: Some LLMs are can be forced to output valid JSON. This is similar to **tool/function Calling** approach, except that the schema is provided as part of the prompt. Generally, our intuition is that this performs worse than a **tool/function calling** approach.\n", + "- **JSON Mode**: Some LLMs can be forced to output valid JSON. This is similar to **tool/function Calling** approach, except that the schema is provided as part of the prompt. Generally, our intuition is that this performs worse than a **tool/function calling** approach, but don't trust us and verify for your own use case!\n", "\n", "- **Prompting Based**: LLMs that can follow instructions well can be instructed to generate text in a desired format. The generated text can be parsed downstream using existing [Output Parsers](/docs/modules/model_io/output_parsers/) or using [custom parsers](/docs/modules/model_io/output_parsers/custom) into a structured format like JSON. This approach can be used with LLMs that **do not support** JSON mode or tool/function calling modes. This approach is more broadly applicable, though may yield worse results than models that have been fine-tuned for extraction or function calling.\n", "\n", @@ -71,14 +71,6 @@ "* [OpenAI's function and tool calling](https://platform.openai.com/docs/guides/function-calling)\n", "* For example, see [OpenAI's JSON mode](https://platform.openai.com/docs/guides/text-generation/json-mode)."
] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e171cab", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -97,7 +89,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.11.2" } }, "nbformat": 4, diff --git a/docs/docs/use_cases/extraction/quickstart.ipynb b/docs/docs/use_cases/extraction/quickstart.ipynb index 051bdcef2fb..b69974d7b5a 100644 --- a/docs/docs/use_cases/extraction/quickstart.ipynb +++ b/docs/docs/use_cases/extraction/quickstart.ipynb @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "id": "c141084c-fb94-4093-8d6a-81175d688e40", "metadata": {}, "outputs": [], @@ -91,9 +91,11 @@ " # Having a good description can help improve extraction results.\n", " name: Optional[str] = Field(..., description=\"The name of the person\")\n", " hair_color: Optional[str] = Field(\n", - " ..., description=\"The color of the peron's eyes if known\"\n", + " ..., description=\"The color of the person's hair if known\"\n", " )\n", - " height_in_meters: Optional[str] = Field(..., description=\"Height in METERs\")" + " height_in_meters: Optional[str] = Field(\n", + " ..., description=\"Height measured in meters\"\n", + " )" ] }, { @@ -117,14 +119,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "id": "a5e490f6-35ad-455e-8ae4-2bae021583ff", "metadata": {}, "outputs": [], "source": [ "from typing import Optional\n", "\n", - "from langchain.chains import create_structured_output_runnable\n", "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "from langchain_core.pydantic_v1 import BaseModel, Field\n", "from langchain_openai import ChatOpenAI\n", @@ -162,14 +163,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 31, "id": "04d846a6-d5cb-4009-ac19-61e3aac0177e", "metadata": {}, "outputs": [], "source": [ "from langchain_mistralai import 
ChatMistralAI\n", "\n", - "llm = ChatMistralAI(model=\"mistral-large-latest\")\n", + "llm = ChatMistralAI(model=\"mistral-large-latest\", temperature=0)\n", "\n", "runnable = prompt | llm.with_structured_output(schema=Person)" ] }, @@ -184,17 +185,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 32, "id": "13165ac8-a1dc-44ce-a6ed-f52b577473e4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Person(name='Alan Smith', hair_color='blond', height_in_meters='1.83')" + "Person(name='Alan Smith', hair_color='blond', height_in_meters='1.8288')" ] }, - "execution_count": 4, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -232,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 33, "id": "591a0c16-7a17-4883-91ee-0d6d2fdb265c", "metadata": {}, "outputs": [], @@ -255,9 +256,11 @@ " # Having a good description can help improve extraction results.\n", " name: Optional[str] = Field(..., description=\"The name of the person\")\n", " hair_color: Optional[str] = Field(\n", - " ..., description=\"The color of the peron's eyes if known\"\n", + " ..., description=\"The color of the person's hair if known\"\n", + " )\n", + " height_in_meters: Optional[str] = Field(\n", + " ..., description=\"Height measured in meters\"\n", " )\n", - " height_in_meters: Optional[str] = Field(..., description=\"Height in meters\")\n", "\n", "\n", "class Data(BaseModel):\n", @@ -267,26 +270,36 @@ " people: List[Person]" ] }, + { + "cell_type": "markdown", + "id": "5f5cda33-fd7b-481e-956a-703f45e40e1d", + "metadata": {}, + "source": [ + ":::{.callout-important}\n", + "Extraction might not be perfect here. 
Please continue to see how to use **Reference Examples** to improve the quality of extraction, and see the **guidelines** section!\n", + ":::" + ] + }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 34, "id": "cf7062cc-1d1d-4a37-9122-509d1b87f0a6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Data(people=[Person(name='Jeff', hair_color='black', height_in_meters='2'), Person(name='Anna', hair_color=None, height_in_meters=None)])" + "Data(people=[Person(name='Jeff', hair_color=None, height_in_meters=None), Person(name='Anna', hair_color=None, height_in_meters=None)])" ] }, - "execution_count": 12, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "runnable = prompt | llm.with_structured_output(schema=Data)\n", - "text = \"My name is Jeff and I am 2 meters. I have black hair. Anna has the same color hair as me.\"\n", + "text = \"My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me.\"\n", "runnable.invoke({\"text\": text})" ] }, @@ -318,14 +331,6 @@ "- [Use a Parsing Approach](/docs/use_cases/extraction/how_to/parse): Use a prompt based approach to extract with models that do not support **tool/function calling**.\n", "- [Guidelines](/docs/use_cases/extraction/guidelines): Guidelines for getting good performance on extraction tasks." ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "082fc1af", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -344,7 +349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.11.2" } }, "nbformat": 4,