From 79d8556c22956d4ce6776e5b9690e216e4fbb738 Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Wed, 26 Jun 2024 14:47:01 -0700 Subject: [PATCH] docs[patch]: Address feedback from docs users (#23550) - Updates chat few shot prompt tutorial to show off a more cohesive example - Fix async Chromium loader guide - Fix Excel loader install instructions - Reformat Html2Text page - Add install instructions to Azure OpenAI embeddings page - Add missing dep install to SQL QA tutorial @baskaryan --- docs/docs/how_to/few_shot_examples_chat.ipynb | 123 +++++++++----- .../document_loaders/async_chromium.ipynb | 74 ++++---- .../document_loaders/microsoft_excel.ipynb | 33 +++- .../document_transformers/html2text.ipynb | 160 +++++++++++------- .../text_embedding/azureopenai.ipynb | 21 ++- docs/docs/tutorials/sql_qa.ipynb | 4 +- 6 files changed, 263 insertions(+), 152 deletions(-) diff --git a/docs/docs/how_to/few_shot_examples_chat.ipynb b/docs/docs/how_to/few_shot_examples_chat.ipynb index cee98c04113..0f9f1e321f5 100644 --- a/docs/docs/how_to/few_shot_examples_chat.ipynb +++ b/docs/docs/how_to/few_shot_examples_chat.ipynb @@ -51,7 +51,7 @@ "- `examples`: A list of dictionary examples to include in the final prompt.\n", "- `example_prompt`: converts each example into 1 or more messages through its [`format_messages`](https://api.python.langchain.com/en/latest/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html?highlight=format_messages#langchain_core.prompts.chat.ChatPromptTemplate.format_messages) method. A common example would be to convert each example into one human message and one AI message response, or a human message followed by a function call message.\n", "\n", - "Below is a simple demonstration. First, define the examples you'd like to include:" + "Below is a simple demonstration. First, define the examples you'd like to include. 
Let's give the LLM an unfamiliar mathematical operator, denoted by the \"🦜\" emoji:" ] }, { @@ -59,17 +59,7 @@ "execution_count": 1, "id": "5b79e400", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 24.0 is available.\n", - "You should consider upgrading via the '/Users/jacoblee/.pyenv/versions/3.10.5/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", - "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "outputs": [], "source": [ "%pip install -qU langchain langchain-openai langchain-chroma\n", "\n", @@ -79,9 +69,50 @@ "os.environ[\"OPENAI_API_KEY\"] = getpass()" ] }, + { + "cell_type": "markdown", + "id": "30856d92", + "metadata": {}, + "source": [ + "If we try to ask the model what the result of this expression is, it will fail:" + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, + "id": "174dec5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='The expression \"2 🦜 9\" is not a standard mathematical operation or equation. It appears to be a combination of the number 2 and the parrot emoji 🦜 followed by the number 9. 
It does not have a specific mathematical meaning.', response_metadata={'token_usage': {'completion_tokens': 54, 'prompt_tokens': 17, 'total_tokens': 71}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-aad12dda-5c47-4a1e-9949-6fe94e03242a-0', usage_metadata={'input_tokens': 17, 'output_tokens': 54, 'total_tokens': 71})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "\n", + "model = ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0.0)\n", + "\n", + "model.invoke(\"What is 2 🦜 9?\")" + ] + }, + { + "cell_type": "markdown", + "id": "e6d58385", + "metadata": {}, + "source": [ + "Now let's see what happens if we give the LLM some examples to work with. We'll define some below:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "0fc5a02a-6249-4e92-95c3-30fff9671e8b", "metadata": { "tags": [] @@ -91,8 +122,8 @@ "from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate\n", "\n", "examples = [\n", - " {\"input\": \"2+2\", \"output\": \"4\"},\n", - " {\"input\": \"2+3\", \"output\": \"5\"},\n", + " {\"input\": \"2 🦜 2\", \"output\": \"4\"},\n", + " {\"input\": \"2 🦜 3\", \"output\": \"5\"},\n", "]" ] }, @@ -106,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "65e72ad1-9060-47d0-91a1-bc130c8b98ac", "metadata": { "tags": [] @@ -116,7 +147,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[HumanMessage(content='2+2'), AIMessage(content='4'), HumanMessage(content='2+3'), AIMessage(content='5')]\n" + "[HumanMessage(content='2 🦜 2'), AIMessage(content='4'), HumanMessage(content='2 🦜 3'), AIMessage(content='5')]\n" ] } ], @@ -146,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "9f86d6d9-50de-41b6-b6c7-0f9980cc0187", "metadata": { "tags": [] @@ -162,9 +193,17 
@@ ")" ] }, + { + "cell_type": "markdown", + "id": "dd8029c5", + "metadata": {}, + "source": [ + "And now let's ask the model the initial question and see how it does:" + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "97d443b1-6fae-4b36-bede-3ff7306288a3", "metadata": { "tags": [] @@ -173,10 +212,10 @@ { "data": { "text/plain": [ - "AIMessage(content='A triangle does not have a square. The square of a number is the result of multiplying the number by itself.', response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 52, 'total_tokens': 75}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None}, id='run-3456c4ef-7b4d-4adb-9e02-8079de82a47a-0')" + "AIMessage(content='11', response_metadata={'token_usage': {'completion_tokens': 1, 'prompt_tokens': 60, 'total_tokens': 61}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-5ec4e051-262f-408e-ad00-3f2ebeb561c3-0', usage_metadata={'input_tokens': 60, 'output_tokens': 1, 'total_tokens': 61})" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -184,9 +223,9 @@ "source": [ "from langchain_openai import ChatOpenAI\n", "\n", - "chain = final_prompt | ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0.0)\n", + "chain = final_prompt | model\n", "\n", - "chain.invoke({\"input\": \"What's the square of a triangle?\"})" + "chain.invoke({\"input\": \"What is 2 🦜 9?\"})" ] }, { @@ -194,6 +233,8 @@ "id": "70ab7114-f07f-46be-8874-3705a25aba5f", "metadata": {}, "source": [ + "And we can see that the model has now inferred that the parrot emoji means addition from the given few-shot examples!\n", + "\n", "## Dynamic few-shot prompting\n", "\n", "Sometimes you may want to select only a few examples from your overall set to show based on the input. 
For this, you can replace the `examples` passed into `FewShotChatMessagePromptTemplate` with an `example_selector`. The other components remain the same as above! Our dynamic few-shot prompt template would look like:\n", @@ -208,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "ad66f06a-66fd-4fcc-8166-5d0e3c801e57", "metadata": { "tags": [] @@ -220,9 +261,9 @@ "from langchain_openai import OpenAIEmbeddings\n", "\n", "examples = [\n", - " {\"input\": \"2+2\", \"output\": \"4\"},\n", - " {\"input\": \"2+3\", \"output\": \"5\"},\n", - " {\"input\": \"2+4\", \"output\": \"6\"},\n", + " {\"input\": \"2 🦜 2\", \"output\": \"4\"},\n", + " {\"input\": \"2 🦜 3\", \"output\": \"5\"},\n", + " {\"input\": \"2 🦜 4\", \"output\": \"6\"},\n", " {\"input\": \"What did the cow say to the moon?\", \"output\": \"nothing at all\"},\n", " {\n", " \"input\": \"Write me a poem about the moon\",\n", @@ -247,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "id": "7790303a-f722-452e-8921-b14bdf20bdff", "metadata": { "tags": [] @@ -257,10 +298,10 @@ "data": { "text/plain": [ "[{'input': 'What did the cow say to the moon?', 'output': 'nothing at all'},\n", - " {'input': '2+4', 'output': '6'}]" + " {'input': '2 🦜 4', 'output': '6'}]" ] }, - "execution_count": 7, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -287,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "253c255e-41d7-45f6-9d88-c7a0ced4b1bd", "metadata": { "tags": [] @@ -297,7 +338,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[HumanMessage(content='2+3'), AIMessage(content='5'), HumanMessage(content='2+2'), AIMessage(content='4')]\n" + "[HumanMessage(content='2 🦜 3'), AIMessage(content='5'), HumanMessage(content='2 🦜 4'), AIMessage(content='6')]\n" ] } ], @@ -317,7 +358,7 @@ " ),\n", ")\n", "\n", - "print(few_shot_prompt.invoke(input=\"What's 3+3?\").to_messages())" 
+ "print(few_shot_prompt.invoke(input=\"What's 3 🦜 3?\").to_messages())" ] }, { @@ -330,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "id": "e731cb45-f0ea-422c-be37-42af2a6cb2c4", "metadata": { "tags": [] @@ -340,7 +381,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "messages=[HumanMessage(content='2+3'), AIMessage(content='5'), HumanMessage(content='2+2'), AIMessage(content='4')]\n" + "messages=[HumanMessage(content='2 🦜 3'), AIMessage(content='5'), HumanMessage(content='2 🦜 4'), AIMessage(content='6')]\n" ] } ], @@ -353,7 +394,7 @@ " ]\n", ")\n", "\n", - "print(few_shot_prompt.invoke(input=\"What's 3+3?\"))" + "print(few_shot_prompt.invoke(input=\"What's 3 🦜 3?\"))" ] }, { @@ -368,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "id": "0568cbc6-5354-47f1-ab4d-dfcc616cf583", "metadata": { "tags": [] @@ -377,10 +418,10 @@ { "data": { "text/plain": [ - "AIMessage(content='6', response_metadata={'token_usage': {'completion_tokens': 1, 'prompt_tokens': 51, 'total_tokens': 52}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None}, id='run-6bcbe158-a8e3-4a85-a754-1ba274a9f147-0')" + "AIMessage(content='6', response_metadata={'token_usage': {'completion_tokens': 1, 'prompt_tokens': 60, 'total_tokens': 61}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-d1863e5e-17cd-4e9d-bf7a-b9f118747a65-0', usage_metadata={'input_tokens': 60, 'output_tokens': 1, 'total_tokens': 61})" ] }, - "execution_count": 18, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -388,7 +429,7 @@ "source": [ "chain = final_prompt | ChatOpenAI(model=\"gpt-3.5-turbo-0125\", temperature=0.0)\n", "\n", - "chain.invoke({\"input\": \"What's 3+3?\"})" + "chain.invoke({\"input\": \"What's 3 🦜 3?\"})" ] }, { @@ -428,7 +469,7 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/async_chromium.ipynb b/docs/docs/integrations/document_loaders/async_chromium.ipynb index 88cc2b84ce6..15cf2e32e84 100644 --- a/docs/docs/integrations/document_loaders/async_chromium.ipynb +++ b/docs/docs/integrations/document_loaders/async_chromium.ipynb @@ -13,7 +13,7 @@ "\n", "Headless mode means that the browser is running without a graphical user interface.\n", "\n", - "`AsyncChromiumLoader` loads the page, and then we use `Html2TextTransformer` to transform to text." + "In the below example we'll use the `AsyncChromiumLoader` to load the page, and then the [`Html2TextTransformer`](/docs/integrations/document_transformers/html2text/) to strip out the HTML tags and other semantic information." ] }, { @@ -23,48 +23,22 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade --quiet playwright beautifulsoup4\n", + "%pip install --upgrade --quiet playwright beautifulsoup4 html2text\n", "!playwright install" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "dd2cdea7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'