Mirror of https://github.com/hwchase17/langchain.git

Compare commits (28 commits)
- e51fad1488
- b7747017d7
- 2e96704d59
- e9799d6821
- c2d1d903fa
- 055a53c27f
- 231da14771
- 6ab432d62e
- 07a407d89a
- c64f98e2bb
- 5469d898a9
- 3d639d1539
- 91c6cea227
- ba54d36787
- 5f8082bdd7
- 512c523368
- e323d0cfb1
- 01fa2d8117
- 8e126bc9bd
- c71027e725
- e85c53ce68
- 3e1901e1aa
- 6a4f602156
- 6023d5be09
- a306baacd1
- 44ecec3896
- bc7e56e8df
- afc7f1b892
@@ -32,3 +32,8 @@ It implements a Question Answering app and contains instructions for deploying t…

## [Vercel](https://github.com/homanp/vercel-langchain)

A minimal example of how to run LangChain on Vercel using Flask.

## [Steamship](https://github.com/steamship-core/steamship-langchain/)

This repository contains LangChain adapters for Steamship, enabling LangChain developers to rapidly deploy their apps on Steamship.
This includes: production-ready endpoints, horizontal scaling across dependencies, persistent storage of app state, multi-tenancy support, etc.
docs/modules/agents/examples/async_agent.ipynb (new file, 423 lines)
# Async API for Agent

LangChain provides async support for Agents by leveraging the [asyncio](https://docs.python.org/3/library/asyncio.html) library.

Async methods are currently supported for the following `Tools`: [`SerpAPIWrapper`](https://github.com/hwchase17/langchain/blob/master/langchain/serpapi.py) and [`LLMMathChain`](https://github.com/hwchase17/langchain/blob/master/langchain/chains/llm_math/base.py). Async support for other agent tools is on the roadmap.

For `Tool`s that have a `coroutine` implemented (the two mentioned above), the `AgentExecutor` will `await` them directly. Otherwise, the `AgentExecutor` will call the `Tool`'s `func` via `asyncio.get_event_loop().run_in_executor` to avoid blocking the main run loop; the sketch below illustrates the split.

You can use `arun` to call an `AgentExecutor` asynchronously.
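The `func`/`coroutine` split described above is easiest to see on a custom tool. A minimal sketch, not from the notebook; the tool bodies are placeholders, and the `coroutine` field follows the description above:

```python
from langchain.agents import Tool


def search_sync(query: str) -> str:
    # Placeholder for a blocking implementation, used by agent.run().
    return f"sync result for {query!r}"


async def search_async(query: str) -> str:
    # Placeholder for a non-blocking implementation, awaited by agent.arun().
    return f"async result for {query!r}"


# If `coroutine` were omitted, arun() would fall back to running `func`
# through asyncio.get_event_loop().run_in_executor, as described above.
search_tool = Tool(
    name="Search",
    func=search_sync,
    coroutine=search_async,
    description="Answers search queries.",
)
```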
## Serial vs. Concurrent Execution

In this example, we kick off agents to answer some questions serially vs. concurrently. You can see that concurrent execution significantly speeds this up.
```python
import asyncio
import time

from langchain.agents import initialize_agent, load_tools
from langchain.llms import OpenAI
from langchain.callbacks.stdout import StdOutCallbackHandler
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.tracers import LangChainTracer
from aiohttp import ClientSession

questions = [
    "Who won the US Open men's final in 2019? What is his age raised to the 0.334 power?",
    "Who is Olivia Wilde's boyfriend? What is his current age raised to the 0.23 power?",
    "Who won the most recent formula 1 grand prix? What is their age raised to the 0.23 power?",
    "Who won the US Open women's final in 2019? What is her age raised to the 0.34 power?",
    "Who is Beyonce's husband? What is his age raised to the 0.19 power?"
]
```
```python
def generate_serially():
    for q in questions:
        llm = OpenAI(temperature=0)
        tools = load_tools(["llm-math", "serpapi"], llm=llm)
        agent = initialize_agent(
            tools, llm, agent="zero-shot-react-description", verbose=True
        )
        agent.run(q)

s = time.perf_counter()
generate_serially()
elapsed = time.perf_counter() - s
print(f"Serial executed in {elapsed:0.2f} seconds.")
```

```
> Entering new AgentExecutor chain...
 I need to find out who won the US Open men's final in 2019 and then calculate his age raised to the 0.334 power.
Action: Search
Action Input: "US Open men's final 2019 winner"
Observation: Rafael Nadal
Thought: I need to find out Rafael Nadal's age
Action: Search
Action Input: "Rafael Nadal age"
Observation: 36 years
Thought: I need to calculate 36 raised to the 0.334 power
Action: Calculator
Action Input: 36^0.334
Observation: Answer: 3.3098250249682484
Thought: I now know the final answer
Final Answer: Rafael Nadal, aged 36, won the US Open men's final in 2019 and his age raised to the 0.334 power is 3.3098250249682484.

> Finished chain.


> Entering new AgentExecutor chain...
 I need to find out who Olivia Wilde's boyfriend is and then calculate his age raised to the 0.23 power.
Action: Search
Action Input: "Olivia Wilde boyfriend"
Observation: Jason Sudeikis
Thought: I need to find out Jason Sudeikis' age
Action: Search
Action Input: "Jason Sudeikis age"
Observation: Daniel Jason Sudeikis is an American actor, comedian, writer, and producer. In the 1990s, he began his career in improv comedy and performed with ComedySportz, iO Chicago, and The Second City.
Thought: I need to find out Jason Sudeikis' exact age
Action: Search
Action Input: "Jason Sudeikis age exact"
Observation: Daniel Jason Sudeikis. (1975-09-18) September 18, 1975 (age 47). Fairfax, Virginia, U.S. · Fort Scott Community College · Actor; comedian; producer; writer · 1997– ...
Thought: I now have the information I need to calculate the age raised to the 0.23 power
Action: Calculator
Action Input: 47^0.23
Observation: Answer: 2.4242784855673896
Thought: I now know the final answer
Final Answer: Jason Sudeikis, Olivia Wilde's boyfriend, is 47 years old and his age raised to the 0.23 power is 2.4242784855673896.

> Finished chain.


> Entering new AgentExecutor chain...
 I need to find out who won the grand prix and then calculate their age raised to the 0.23 power.
Action: Search
Action Input: "Formula 1 Grand Prix Winner"
Observation: Max Emilian Verstappen is a Belgian-Dutch racing driver and the 2021 and 2022 Formula One World Champion. He competes under the Dutch flag in Formula One with Red Bull Racing. Verstappen is the son of racing drivers Jos Verstappen, who also competed in Formula One, and Sophie Kumpen.
Thought: I need to find out Max Emilian Verstappen's age.
Action: Search
Action Input: "Max Emilian Verstappen age"
Observation: 25 years
Thought: I now need to calculate 25 raised to the 0.23 power.
Action: Calculator
Action Input: 25^0.23
Observation: Answer: 2.096651272316035
Thought: I now know the final answer.
Final Answer: Max Emilian Verstappen, who is 25 years old, won the most recent Formula 1 Grand Prix and his age raised to the 0.23 power is 2.096651272316035.

> Finished chain.


> Entering new AgentExecutor chain...
 I need to find out who won the US Open women's final in 2019 and then calculate her age raised to the 0.34 power.
Action: Search
Action Input: "US Open women's final 2019 winner"
Observation: Bianca Andreescu defeated Serena Williams in the final, 6–3, 7–5 to win the women's singles tennis title at the 2019 US Open. It was her first major title, and she became the first Canadian, as well as the first player born in the 2000s, to win a major singles title.
Thought: I need to find out Bianca Andreescu's age.
Action: Search
Action Input: "Bianca Andreescu age"
Observation: Bianca Vanessa Andreescu is a Canadian-Romanian professional tennis player. She has a career-high ranking of No. 4 in the world, and is the highest-ranked Canadian in the history of the Women's Tennis Association.
Thought: I now know the age of Bianca Andreescu.
Action: Calculator
Action Input: 19^0.34
Observation: Answer: 2.7212987634680084
Thought: I now know the final answer.
Final Answer: Bianca Andreescu, aged 19, won the US Open women's final in 2019. Her age raised to the 0.34 power is 2.7212987634680084.

> Finished chain.


> Entering new AgentExecutor chain...
 I need to find out who Beyonce's husband is and then calculate his age raised to the 0.19 power.
Action: Search
Action Input: "Who is Beyonce's husband?"
Observation: Jay-Z
Thought: I need to find out Jay-Z's age
Action: Search
Action Input: "How old is Jay-Z?"
Observation: 53 years
Thought: I need to calculate 53 raised to the 0.19 power
Action: Calculator
Action Input: 53^0.19
Observation: Answer: 2.12624064206896
Thought: I now know the final answer
Final Answer: Jay-Z is Beyonce's husband and his age raised to the 0.19 power is 2.12624064206896.

> Finished chain.
Serial executed in 94.83 seconds.
```
```python
async def generate_concurrently():
    agents = []
    # To make async requests in Tools more efficient, you can pass in your own
    # aiohttp.ClientSession, but you must manually close the client session at
    # the end of your program/event loop.
    aiosession = ClientSession()
    colors = ["blue", "green", "red", "pink", "yellow"]
    for color in colors:
        # Use a custom CallbackManager to print in different colors.
        manager = CallbackManager([StdOutCallbackHandler(color=color)])
        llm = OpenAI(temperature=0, callback_manager=manager)
        async_tools = load_tools(["llm-math", "serpapi"], llm=llm, aiosession=aiosession)
        agents.append(
            initialize_agent(async_tools, llm, agent="zero-shot-react-description", verbose=True, callback_manager=manager)
        )
    tasks = [async_agent.arun(q) for async_agent, q in zip(agents, questions)]
    await asyncio.gather(*tasks)
    await aiosession.close()

s = time.perf_counter()
# If running this outside of Jupyter, use asyncio.run(generate_concurrently())
await generate_concurrently()
elapsed = time.perf_counter() - s
print(f"Concurrent executed in {elapsed:0.2f} seconds.")
```

```
> Entering new AgentExecutor chain...


> Entering new AgentExecutor chain...


> Entering new AgentExecutor chain...


> Entering new AgentExecutor chain...


> Entering new AgentExecutor chain...
 I need to find out who Beyonce's husband is and then calculate his age raised to the 0.19 power.
Action: Search
Action Input: "Who is Beyonce's husband?" I need to find out who won the grand prix and then calculate their age raised to the 0.23 power.
Action: Search
Action Input: "Formula 1 Grand Prix Winner" I need to find out who Olivia Wilde's boyfriend is and then calculate his age raised to the 0.23 power.
Action: Search
Action Input: "Olivia Wilde boyfriend" I need to find out who won the US Open women's final in 2019 and then calculate her age raised to the 0.34 power.
Action: Search
Action Input: "US Open women's final 2019 winner"
Observation: Jay-Z
Thought:
Observation: Max Emilian Verstappen is a Belgian-Dutch racing driver and the 2021 and 2022 Formula One World Champion. He competes under the Dutch flag in Formula One with Red Bull Racing. Verstappen is the son of racing drivers Jos Verstappen, who also competed in Formula One, and Sophie Kumpen.
Thought:
Observation: Jason Sudeikis
Thought:
Observation: Bianca Andreescu defeated Serena Williams in the final, 6–3, 7–5 to win the women's singles tennis title at the 2019 US Open. It was her first major title, and she became the first Canadian, as well as the first player born in the 2000s, to win a major singles title.
Thought: I need to find out Max Emilian Verstappen's age.
Action: Search
Action Input: "Max Emilian Verstappen age"
Observation: 25 years
Thought: I need to find out Bianca Andreescu's age.
Action: Search
Action Input: "Bianca Andreescu age"
Observation: Bianca Vanessa Andreescu is a Canadian-Romanian professional tennis player. She has a career-high ranking of No. 4 in the world, and is the highest-ranked Canadian in the history of the Women's Tennis Association.
Thought: I need to find out who won the US Open men's final in 2019 and then calculate his age raised to the 0.334 power.
Action: Search
Action Input: "US Open men's final 2019 winner"
Observation: Rafael Nadal
Thought: I need to find out Jason Sudeikis' age
Action: Search
Action Input: "Jason Sudeikis age"
Observation: Daniel Jason Sudeikis is an American actor, comedian, writer, and producer. In the 1990s, he began his career in improv comedy and performed with ComedySportz, iO Chicago, and The Second City.
Thought: I need to find out Jay-Z's age
Action: Search
Action Input: "How old is Jay-Z?" I need to find out Rafael Nadal's age
Action: Search
Action Input: "Rafael Nadal age"
Observation: 36 years
Thought:
Observation: 53 years
Thought: I now know the age of Bianca Andreescu.
Action: Calculator
Action Input: 19^0.34 I now need to calculate 25 raised to the 0.23 power.
Action: Calculator
Action Input: 25^0.23
Observation: Answer: 2.7212987634680084
Thought: I need to find out Jason Sudeikis' exact age
Action: Search
Action Input: "Jason Sudeikis age exact" I need to calculate 53 raised to the 0.19 power
Action: Calculator
Action Input: 53^0.19 I need to calculate 36 raised to the 0.334 power
Action: Calculator
Action Input: 36^0.334
Observation: Daniel Jason Sudeikis. (1975-09-18) September 18, 1975 (age 47). Fairfax, Virginia, U.S. · Fort Scott Community College · Actor; comedian; producer; writer · 1997– ...
Thought:
Observation: Answer: 2.096651272316035
Thought:
Observation: Answer: 2.12624064206896
Thought:
Observation: Answer: 3.3098250249682484
Thought: I now have the information I need to calculate the age raised to the 0.23 power
Action: Calculator
Action Input: 47^0.23
Observation: Answer: 2.4242784855673896
Thought: I now know the final answer.
Final Answer: Bianca Andreescu, aged 19, won the US Open women's final in 2019. Her age raised to the 0.34 power is 2.7212987634680084.

> Finished chain.
 I now know the final answer
Final Answer: Jay-Z is Beyonce's husband and his age raised to the 0.19 power is 2.12624064206896.

> Finished chain.
 I now know the final answer
Final Answer: Rafael Nadal, aged 36, won the US Open men's final in 2019 and his age raised to the 0.334 power is 3.3098250249682484.

> Finished chain.
 I now know the final answer
Final Answer: Jason Sudeikis, Olivia Wilde's boyfriend, is 47 years old and his age raised to the 0.23 power is 2.4242784855673896.

> Finished chain.
 I now know the final answer.
Final Answer: Max Emilian Verstappen, who is 25 years old, won the most recent Formula 1 Grand Prix and his age raised to the 0.23 power is 2.096651272316035.

> Finished chain.
Concurrent executed in 25.06 seconds.
```
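If you would rather not close the session by hand, the same pattern can be written with an async context manager. A sketch, not from the notebook, reusing `questions` and the imports from the first cell:

```python
async def generate_concurrently_ctx():
    # `async with` closes the ClientSession even if one of the tasks raises.
    async with ClientSession() as aiosession:
        llm = OpenAI(temperature=0)
        tools = load_tools(["llm-math", "serpapi"], llm=llm, aiosession=aiosession)
        agents = [
            initialize_agent(tools, llm, agent="zero-shot-react-description")
            for _ in questions
        ]
        await asyncio.gather(*(a.arun(q) for a, q in zip(agents, questions)))
```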
## Using Tracing with Asynchronous Agents

To use tracing with async agents, you must pass in a custom `CallbackManager` with `LangChainTracer` to each agent running asynchronously. This way, you avoid collisions while the trace is being collected.
```python
# To make async requests in Tools more efficient, you can pass in your own
# aiohttp.ClientSession, but you must manually close the client session at
# the end of your program/event loop.
aiosession = ClientSession()
tracer = LangChainTracer()
tracer.load_default_session()
manager = CallbackManager([StdOutCallbackHandler(), tracer])

# Pass the manager into the llm if you want llm calls traced.
llm = OpenAI(temperature=0, callback_manager=manager)

async_tools = load_tools(["llm-math", "serpapi"], llm=llm, aiosession=aiosession)
async_agent = initialize_agent(async_tools, llm, agent="zero-shot-react-description", verbose=True, callback_manager=manager)
await async_agent.arun(questions[0])
await aiosession.close()
```

```
> Entering new AgentExecutor chain...
 I need to find out who won the US Open men's final in 2019 and then calculate his age raised to the 0.334 power.
Action: Search
Action Input: "US Open men's final 2019 winner"
Observation: Rafael Nadal
Thought: I need to find out Rafael Nadal's age
Action: Search
Action Input: "Rafael Nadal age"
Observation: 36 years
Thought: I need to calculate 36 raised to the 0.334 power
Action: Calculator
Action Input: 36^0.334
Observation: Answer: 3.3098250249682484
Thought: I now know the final answer
Final Answer: Rafael Nadal, aged 36, won the US Open men's final in 2019 and his age raised to the 0.334 power is 3.3098250249682484.

> Finished chain.
```
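For several agents running at once, the advice above amounts to giving each one its own tracer. A sketch under that reading, not from the notebook, reusing the imports and `questions` from earlier cells:

```python
aiosession = ClientSession()
traced_agents = []
for _ in questions:
    # One CallbackManager, with its own LangChainTracer, per concurrent agent,
    # so traces being collected for different runs don't collide.
    tracer = LangChainTracer()
    tracer.load_default_session()
    manager = CallbackManager([StdOutCallbackHandler(), tracer])
    llm = OpenAI(temperature=0, callback_manager=manager)
    tools = load_tools(["llm-math", "serpapi"], llm=llm, aiosession=aiosession)
    traced_agents.append(
        initialize_agent(
            tools, llm, agent="zero-shot-react-description", callback_manager=manager
        )
    )
```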
@@ -17,6 +17,7 @@ The first category of how-to guides here covers specific parts of working with agents.

`Max Iterations <./examples/max_iterations.html>`_: How to restrict an agent to a certain number of iterations.

`Asynchronous <./examples/async_agent.html>`_: Covering asynchronous functionality.

The next set of examples are all end-to-end agents for specific applications.
In all examples there is an Agent with a particular set of tools.
@@ -33,7 +33,6 @@ def run_cmd(cmd: str, _crawler: Crawler) -> None:

if __name__ == "__main__":

    objective = "Make a reservation for 2 at 7pm at bistro vida in menlo park"
    print("\nWelcome to natbot! What is your objective?")
    i = input()
docs/modules/chains/async_chain.ipynb (new file, 132 lines)
# Async API for Chain

LangChain provides async support for Chains by leveraging the [asyncio](https://docs.python.org/3/library/asyncio.html) library.

Async methods are currently supported in `LLMChain` (through `arun`, `apredict`, `acall`) and `LLMMathChain` (through `arun` and `acall`). Async support for other chains is on the roadmap.
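A minimal sketch of the three `LLMChain` entry points named above, assuming a chain built exactly like the one in the next cell (the example prompt is reused from it):

```python
import asyncio

from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate


async def main() -> None:
    llm = OpenAI(temperature=0.9)
    prompt = PromptTemplate(
        input_variables=["product"],
        template="What is a good name for a company that makes {product}?",
    )
    chain = LLMChain(llm=llm, prompt=prompt)

    print(await chain.arun(product="toothpaste"))        # output string
    print(await chain.apredict(product="toothpaste"))    # output string
    print(await chain.acall({"product": "toothpaste"}))  # dict of outputs


asyncio.run(main())
```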
```python
import asyncio
import time

from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain


def generate_serially():
    llm = OpenAI(temperature=0.9)
    prompt = PromptTemplate(
        input_variables=["product"],
        template="What is a good name for a company that makes {product}?",
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    for _ in range(5):
        resp = chain.run(product="toothpaste")
        print(resp)


async def async_generate(chain):
    resp = await chain.arun(product="toothpaste")
    print(resp)


async def generate_concurrently():
    llm = OpenAI(temperature=0.9)
    prompt = PromptTemplate(
        input_variables=["product"],
        template="What is a good name for a company that makes {product}?",
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    tasks = [async_generate(chain) for _ in range(5)]
    await asyncio.gather(*tasks)

s = time.perf_counter()
# If running this outside of Jupyter, use asyncio.run(generate_concurrently())
await generate_concurrently()
elapsed = time.perf_counter() - s
print('\033[1m' + f"Concurrent executed in {elapsed:0.2f} seconds." + '\033[0m')

s = time.perf_counter()
generate_serially()
elapsed = time.perf_counter() - s
print('\033[1m' + f"Serial executed in {elapsed:0.2f} seconds." + '\033[0m')
```

```
BrightSmile Toothpaste Company

BrightSmile Toothpaste Co.

BrightSmile Toothpaste

Gleaming Smile Inc.

SparkleSmile Toothpaste
Concurrent executed in 1.54 seconds.

BrightSmile Toothpaste Co.

MintyFresh Toothpaste Co.

SparkleSmile Toothpaste.

Pearly Whites Toothpaste Co.

BrightSmile Toothpaste.
Serial executed in 6.38 seconds.
```
@@ -21,28 +21,83 @@

The imports in the setup cell gain a document loader:

```diff
 from langchain.vectorstores.faiss import FAISS
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.llms import OpenAI
 from langchain.chains import ChatVectorDBChain
+from langchain.document_loaders import TextLoader
```

New markdown and code cells are added:

Load in documents. You can replace this with a loader for whatever type of data you want.

```python
loader = TextLoader('../../state_of_the_union.txt')
documents = loader.load()
```

If you had multiple loaders that you wanted to combine, you would do something like:

```python
# loaders = [....]
# docs = []
# for loader in loaders:
#     docs.extend(loader.load())
```

We now split the documents, create embeddings for them, and put them in a vectorstore. This allows us to do semantic search over them.

The splitting and embedding cell switches from raw text to `Document` objects:

```diff
-with open('../../state_of_the_union.txt') as f:
-    state_of_the_union = f.read()
 text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-texts = text_splitter.split_text(state_of_the_union)
+documents = text_splitter.split_documents(documents)

 embeddings = OpenAIEmbeddings()
-vectorstore = FAISS.from_texts(texts, embeddings)
+vectorstore = FAISS.from_documents(documents, embeddings)
```

We now initialize the ChatVectorDBChain.
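Pieced together, the updated ingestion path reads end to end like this. A sketch assembled from the added lines above; the `OpenAIEmbeddings` import is assumed from the unchanged part of the notebook:

```python
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

loader = TextLoader('../../state_of_the_union.txt')
documents = loader.load()

# Split Documents (not raw text), so metadata travels with each chunk.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)
```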
@@ -9,6 +9,7 @@ They are broken up into four categories:

1. `Generic Chains <./generic_how_to.html>`_: Generic chains that are meant to help build other chains rather than serve a particular purpose.
2. `CombineDocuments Chains <./combine_docs_how_to.html>`_: Chains aimed at making it easy to work with documents (question answering, summarization, etc).
3. `Utility Chains <./utility_how_to.html>`_: Chains consisting of an LLMChain interacting with a specific util.
4. `Asynchronous <./async_chain.html>`_: Covering asynchronous functionality.

.. toctree::
   :maxdepth: 1
docs/modules/document_loaders/examples/airbyte_json.ipynb (new file, 171 lines)
# Airbyte JSON

This covers how to load any source from Airbyte into a local JSON file that can be read in as a document.

Prerequisites: have Docker Desktop installed.

Steps:

1) Clone Airbyte from GitHub - `git clone https://github.com/airbytehq/airbyte.git`

2) Switch into the Airbyte directory - `cd airbyte`

3) Start Airbyte - `docker compose up`

4) In your browser, just visit http://localhost:8000. You will be asked for a username and password. By default, that's username `airbyte` and password `password`.

5) Set up any source you wish.

6) Set the destination as Local JSON, with a specified destination path - let's say `/json_data`. Set up a manual sync.

7) Run the connection!

8) To see what files were created, you can navigate to: `file:///tmp/airbyte_local`

9) Find your data and copy its path. That path should be saved in the file variable below. It should start with `/tmp/airbyte_local`.
```python
from langchain.document_loaders import AirbyteJSONLoader
```

```python
!ls /tmp/airbyte_local/json_data/
```

```
_airbyte_raw_pokemon.jsonl
```

```python
loader = AirbyteJSONLoader('/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl')
```

```python
data = loader.load()
```

```python
print(data[0].page_content[:500])
```

```
abilities: 
ability: 
name: blaze
url: https://pokeapi.co/api/v2/ability/66/

is_hidden: False
slot: 1


ability: 
name: solar-power
url: https://pokeapi.co/api/v2/ability/94/

is_hidden: True
slot: 3

base_experience: 267
forms: 
name: charizard
url: https://pokeapi.co/api/v2/pokemon-form/6/

game_indices: 
game_index: 180
version: 
name: red
url: https://pokeapi.co/api/v2/version/1/



game_index: 180
version: 
name: blue
url: https://pokeapi.co/api/v2/version/2/



game_index: 180
version: 
n
```
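Each Airbyte record comes back as a standard `Document`, so the usual inspection steps apply. A small sketch, not part of the notebook (the metadata layout is an assumption):

```python
# `data` comes from loader.load() above.
print(len(data))                   # number of records loaded from the .jsonl file
print(data[0].metadata)            # provenance of the first record, e.g. the source path
print(data[0].page_content[:120])  # flattened record text, as printed above
```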
docs/modules/document_loaders/examples/azlyrics.ipynb (new file, 93 lines)
# AZLyrics

This covers how to load AZLyrics webpages into a document format that we can use downstream.

```python
from langchain.document_loaders import AZLyricsLoader
```

```python
loader = AZLyricsLoader("https://www.azlyrics.com/lyrics/mileycyrus/flowers.html")
```

```python
data = loader.load()
```

```python
data
```

```
[Document(page_content="Miley Cyrus - Flowers Lyrics | AZLyrics.com\n\r\nWe were good, we were gold\nKinda dream that can't be sold\nWe were right till we weren't\nBuilt a home and watched it burn\n\nI didn't wanna leave you\nI didn't wanna lie\nStarted to cry but then remembered I\n\nI can buy myself flowers\nWrite my name in the sand\nTalk to myself for hours\nSay things you don't understand\nI can take myself dancing\nAnd I can hold my own hand\nYeah, I can love me better than you can\n\nCan love me better\nI can love me better, baby\nCan love me better\nI can love me better, baby\n\nPaint my nails, cherry red\nMatch the roses that you left\nNo remorse, no regret\nI forgive every word you said\n\nI didn't wanna leave you, baby\nI didn't wanna fight\nStarted to cry but then remembered I\n\nI can buy myself flowers\nWrite my name in the sand\nTalk to myself for hours, yeah\nSay things you don't understand\nI can take myself dancing\nAnd I can hold my own hand\nYeah, I can love me better than you can\n\nCan love me better\nI can love me better, baby\nCan love me better\nI can love me better, baby\nCan love me better\nI can love me better, baby\nCan love me better\nI\n\nI didn't wanna wanna leave you\nI didn't wanna fight\nStarted to cry but then remembered I\n\nI can buy myself flowers\nWrite my name in the sand\nTalk to myself for hours (Yeah)\nSay things you don't understand\nI can take myself dancing\nAnd I can hold my own hand\nYeah, I can love me better than\nYeah, I can love me better than you can, uh\n\nCan love me better\nI can love me better, baby\nCan love me better\nI can love me better, baby (Than you can)\nCan love me better\nI can love me better, baby\nCan love me better\nI\n", lookup_str='', metadata={'source': 'https://www.azlyrics.com/lyrics/mileycyrus/flowers.html'}, lookup_index=0)]
```
File diff suppressed because one or more lines are too long
docs/modules/document_loaders/examples/everynote.ipynb (new file, 80 lines)
# EveryNote

How to load an EveryNote file from disk.

```python
# !pip install pypandoc
# import pypandoc

# pypandoc.download_pandoc()
```

```python
from langchain.document_loaders import EveryNoteLoader

loader = EveryNoteLoader("example_data/testing.enex")
loader.load()
```

```
[Document(page_content='testing this\n\nwhat happens?\n\nto the world?\n', lookup_str='', metadata={'source': 'example_data/testing.enex'}, lookup_index=0)]
```
The new 16-line `.enex` fixture referenced by the loader above:

@@ -0,0 +1,16 @@

```xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20230309T035336Z" application="Evernote" version="10.53.2">
  <note>
    <title>testing</title>
    <created>20230209T034746Z</created>
    <updated>20230209T035328Z</updated>
    <note-attributes>
      <author>Harrison Chase</author>
    </note-attributes>
    <content>
      <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
      <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>testing this</div><div>what happens?</div><div>to the world?</div></en-note> ]]>
    </content>
  </note>
</en-export>
```
docs/modules/document_loaders/examples/gcs_directory.ipynb (new file, 156 lines)
# GCS Directory

This covers how to load document objects from a Google Cloud Storage (GCS) directory.

```python
from langchain.document_loaders import GCSDirectoryLoader
```

```python
# !pip install google-cloud-storage
```

```python
loader = GCSDirectoryLoader(project_name="aist", bucket="testing-hwc")
```

```python
loader.load()
```

```
/Users/harrisonchase/workplace/langchain/.venv/lib/python3.10/site-packages/google/auth/_default.py:83: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
/Users/harrisonchase/workplace/langchain/.venv/lib/python3.10/site-packages/google/auth/_default.py:83: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
```

```
[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpz37njh7u/fake.docx'}, lookup_index=0)]
```

## Specifying a prefix

You can also specify a prefix for more fine-grained control over which files to load.

```python
loader = GCSDirectoryLoader(project_name="aist", bucket="testing-hwc", prefix="fake")
```

```python
loader.load()
```

```
/Users/harrisonchase/workplace/langchain/.venv/lib/python3.10/site-packages/google/auth/_default.py:83: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
/Users/harrisonchase/workplace/langchain/.venv/lib/python3.10/site-packages/google/auth/_default.py:83: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
```

```
[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpylg6291i/fake.docx'}, lookup_index=0)]
```
docs/modules/document_loaders/examples/gcs_file.ipynb (new file, 104 lines)
# GCS File Storage

This covers how to load document objects from a Google Cloud Storage (GCS) file object.

```python
from langchain.document_loaders import GCSFileLoader
```

```python
# !pip install google-cloud-storage
```

```python
loader = GCSFileLoader(project_name="aist", bucket="testing-hwc", blob="fake.docx")
```

```python
loader.load()
```

```
/Users/harrisonchase/workplace/langchain/.venv/lib/python3.10/site-packages/google/auth/_default.py:83: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)
```

```
[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmp3srlf8n8/fake.docx'}, lookup_index=0)]
```
docs/modules/document_loaders/examples/gutenberg.ipynb (new file, 83 lines)
# Gutenberg

This covers how to load links to Gutenberg e-books into a document format that we can use downstream.

```python
from langchain.document_loaders import GutenbergLoader
```

```python
loader = GutenbergLoader('https://www.gutenberg.org/cache/epub/69972/pg69972.txt')
```

```python
data = loader.load()
```

```python
data
```
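The `data` cell above has no captured output; a quick sanity check would look something like this (a sketch, not part of the notebook):

```python
# `data` comes from loader.load() above: the raw e-book text as Documents.
print(data[0].metadata)            # source URL of the e-book
print(data[0].page_content[:200])  # opening of the Project Gutenberg text
```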
docs/modules/document_loaders/examples/imsdb.ipynb (new file, 94 lines; diff suppressed because one or more lines are too long)
docs/modules/document_loaders/examples/online_pdf.ipynb (new file, 130 lines; diff suppressed because one or more lines are too long)
@@ -10,6 +10,133 @@
|
||||
"This covers how to load pdfs into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "743f9413",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using PyPDF\n",
|
||||
"\n",
|
||||
"Allows for tracking of page numbers as well."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c428b0c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PagedPDFSplitter\n",
|
||||
"\n",
|
||||
"loader = PagedPDFSplitter(\"example_data/layout-parser-paper.pdf\")\n",
|
||||
"pages = loader.load_and_split()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ebd895e4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An advantage of this approach is that documents can be retrieved with page numbers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "87fa7b3a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"9: 10 Z. Shen et al.\n",
|
||||
"Fig. 4: Illustration of (a) the original historical Japanese document with layout\n",
|
||||
"detection results and (b) a recreated version of the document image that achieves\n",
|
||||
"much better character recognition recall. The reorganization algorithm rearranges\n",
|
||||
"the tokens based on the their detected bounding boxes given a maximum allowed\n",
|
||||
"height.\n",
|
||||
"4LayoutParser Community Platform\n",
|
||||
"Another focus of LayoutParser is promoting the reusability of layout detection\n",
|
||||
"models and full digitization pipelines. Similar to many existing deep learning\n",
|
||||
"libraries, LayoutParser comes with a community model hub for distributing\n",
|
||||
"layout models. End-users can upload their self-trained models to the model hub,\n",
|
||||
"and these models can be loaded into a similar interface as the currently available\n",
|
||||
"LayoutParser pre-trained models. For example, the model trained on the News\n",
|
||||
"Navigator dataset [17] has been incorporated in the model hub.\n",
|
||||
"Beyond DL models, LayoutParser also promotes the sharing of entire doc-\n",
|
||||
"ument digitization pipelines. For example, sometimes the pipeline requires the\n",
|
||||
"combination of multiple DL models to achieve better accuracy. Currently, pipelines\n",
|
||||
"are mainly described in academic papers and implementations are often not pub-\n",
|
||||
"licly available. To this end, the LayoutParser community platform also enables\n",
|
||||
"the sharing of layout pipelines to promote the discussion and reuse of techniques.\n",
|
||||
"For each shared pipeline, it has a dedicated project page, with links to the source\n",
|
||||
"code, documentation, and an outline of the approaches. A discussion panel is\n",
|
||||
"provided for exchanging ideas. Combined with the core LayoutParser library,\n",
|
||||
"users can easily build reusable components based on the shared pipelines and\n",
|
||||
"apply them to solve their unique problems.\n",
|
||||
"5 Use Cases\n",
|
||||
"The core objective of LayoutParser is to make it easier to create both large-scale\n",
|
||||
"and light-weight document digitization pipelines. Large-scale document processing\n",
|
||||
"3: 4 Z. Shen et al.\n",
|
||||
"Efficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images \n",
|
||||
"T h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e\n",
|
||||
"Fig. 1: The overall architecture of LayoutParser . For an input document image,\n",
|
||||
"the core LayoutParser library provides a set of o\u000b",
|
||||
"-the-shelf tools for layout\n",
|
||||
"detection, OCR, visualization, and storage, backed by a carefully designed layout\n",
|
||||
"data structure. LayoutParser also supports high level customization via e\u000ecient\n",
|
||||
"layout annotation and model training functions. These improve model accuracy\n",
|
||||
"on the target samples. The community platform enables the easy sharing of DIA\n",
|
||||
"models and whole digitization pipelines to promote reusability and reproducibility.\n",
|
||||
"A collection of detailed documentation, tutorials and exemplar projects make\n",
|
||||
"LayoutParser easy to learn and use.\n",
|
||||
"AllenNLP [ 8] and transformers [ 34] have provided the community with complete\n",
|
||||
"DL-based support for developing and deploying models for general computer\n",
|
||||
"vision and natural language processing problems. LayoutParser , on the other\n",
|
||||
"hand, specializes speci\f",
|
||||
"cally in DIA tasks. LayoutParser is also equipped with a\n",
|
||||
"community platform inspired by established model hubs such as Torch Hub [23]\n",
|
||||
"andTensorFlow Hub [1]. It enables the sharing of pretrained models as well as\n",
|
||||
"full document processing pipelines that are unique to DIA tasks.\n",
|
||||
"There have been a variety of document data collections to facilitate the\n",
|
||||
"development of DL models. Some examples include PRImA [ 3](magazine layouts),\n",
|
||||
"PubLayNet [ 38](academic paper layouts), Table Bank [ 18](tables in academic\n",
|
||||
"papers), Newspaper Navigator Dataset [ 16,17](newspaper \f",
|
||||
"gure layouts) and\n",
|
||||
"HJDataset [31](historical Japanese document layouts). A spectrum of models\n",
|
||||
"trained on these datasets are currently available in the LayoutParser model zoo\n",
|
||||
"to support di\u000b",
|
||||
"erent use cases.\n",
|
||||
"3 The Core LayoutParser Library\n",
|
||||
"At the core of LayoutParser is an o\u000b",
|
||||
"-the-shelf toolkit that streamlines DL-\n",
|
||||
"based document image analysis. Five components support a simple interface\n",
|
||||
"with comprehensive functionalities: 1) The layout detection models enable using\n",
|
||||
"pre-trained or self-trained DL models for layout detection with just four lines\n",
|
||||
"of code. 2) The detected layout information is stored in carefully engineered\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.vectorstores import FAISS\n",
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"\n",
|
||||
"faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())\n",
|
||||
"docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n",
|
||||
"for doc in docs:\n",
|
||||
" print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "09d64998",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Unstructured"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@@ -65,7 +192,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
78
docs/modules/document_loaders/examples/roam.ipynb
Normal file
@@ -0,0 +1,78 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1dc7df1d",
"metadata": {},
"source": [
"# Roam\n",
"This notebook covers how to load documents from a Roam database. This takes a lot of inspiration from the example repo [here](https://github.com/JimmyLv/roam-qa).\n",
"\n",
"## 🧑 Instructions for ingesting your own dataset\n",
"\n",
"Export your dataset from Roam Research. You can do this by clicking on the three dots in the upper right hand corner and then clicking `Export`.\n",
"\n",
"When exporting, make sure to select the `Markdown & CSV` format option.\n",
"\n",
"This will produce a `.zip` file in your Downloads folder. Move the `.zip` file into this repository.\n",
"\n",
"Run the following command to unzip the zip file (replace the `Export...` with your own file name as needed).\n",
"\n",
"```shell\n",
"unzip Roam-Export-1675782732639.zip -d Roam_DB\n",
"```\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "007c5cbf",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import RoamLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1caec59",
"metadata": {},
"outputs": [],
"source": [
"loader = RoamLoader(\"Roam_DB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1c30ff7",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
134
docs/modules/document_loaders/examples/s3_directory.ipynb
Normal file
@@ -0,0 +1,134 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a634365e",
"metadata": {},
"source": [
"# s3 Directory\n",
"\n",
"This covers how to load document objects from an s3 directory object."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2f0cd6a5",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import S3DirectoryLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "49815096",
"metadata": {},
"outputs": [],
"source": [
"#!pip install boto3"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "321cc7f1",
"metadata": {},
"outputs": [],
"source": [
"loader = S3DirectoryLoader(\"testing-hwc\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2b11d155",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpaa9xl6ch/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loader.load()"
]
},
{
"cell_type": "markdown",
"id": "0690c40a",
"metadata": {},
"source": [
"## Specifying a prefix\n",
"You can also specify a prefix for more fine-grained control over what files to load."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "72d44781",
"metadata": {},
"outputs": [],
"source": [
"loader = S3DirectoryLoader(\"testing-hwc\", prefix=\"fake\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2d3c32db",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "885dc280",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
94
docs/modules/document_loaders/examples/s3_file.ipynb
Normal file
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "66a7777e",
"metadata": {},
"source": [
"# s3 File\n",
"\n",
"This covers how to load document objects from an s3 file object."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9ec8a3b3",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import S3FileLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "43128d8d",
"metadata": {},
"outputs": [],
"source": [
"#!pip install boto3"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "35d6809a",
"metadata": {},
"outputs": [],
"source": [
"loader = S3FileLoader(\"testing-hwc\", \"fake.docx\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "efd6be84",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93689594",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
78
docs/modules/document_loaders/examples/url.ipynb
Normal file
@@ -0,0 +1,78 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2dfc4698",
"metadata": {},
"source": [
"# URL\n",
"\n",
"This covers how to load HTML documents from a list of URLs into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "16c3699e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredURLLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "836fbac1",
"metadata": {},
"outputs": [],
"source": [
"urls = [\n",
"    \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023\",\n",
"    \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023\"\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "00f46fda",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredURLLoader(urls=urls)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b68a26b3",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
117
docs/modules/document_loaders/examples/web_base.ipynb
Normal file
File diff suppressed because one or more lines are too long
137
docs/modules/document_loaders/examples/youtube.ipynb
Normal file
@@ -0,0 +1,137 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "df770c72",
"metadata": {},
"source": [
"# YouTube\n",
"\n",
"How to load documents from YouTube transcripts."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "da4a867f",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import YoutubeLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "34a25b57",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# !pip install youtube-transcript-api"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "bc8b308a",
"metadata": {},
"outputs": [],
"source": [
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d073dd36",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'First-time host Pedro Pascal talks about filming The Last of Us and being recognized by fans.\\n\\nSaturday Night Live. Stream now on Peacock: https://pck.tv/3uQxh4q\\n\\nSubscribe to SNL: https://goo.gl/tUsXwM\\nStream Current Full Episodes: http://www.nbc.com/saturday-night-live\\n\\nWATCH PAST SNL SEASONS\\nGoogle Play - http://bit.ly/SNLGooglePlay\\niTunes - http://bit.ly/SNLiTunes\\n\\nSNL ON SOCIAL\\nSNL Instagram: http://instagram.com/nbcsnl\\nSNL Facebook: https://www.facebook.com/snl\\nSNL Twitter: https://twitter.com/nbcsnl\\nSNL TikTok: https://www.tiktok.com/@nbcsnl\\n\\nGET MORE NBC\\nLike NBC: http://Facebook.com/NBC\\nFollow NBC: http://Twitter.com/NBC\\nNBC Tumblr: http://NBCtv.tumblr.com/\\nYouTube: http://www.youtube.com/nbc\\nNBC Instagram: http://instagram.com/nbc\\n\\n#SNL #PedroPascal #SNL48 #Coldplay', 'view_count': 1175057, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/sddefault.jpg', 'publish_date': datetime.datetime(2023, 2, 4, 0, 0), 'length': 224, 'author': 'Saturday Night Live'}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loader.load()"
]
},
{
"cell_type": "markdown",
"id": "6b278a1b",
"metadata": {},
"source": [
"## Add video info"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ba28af69",
"metadata": {},
"outputs": [],
"source": [
"# ! pip install pytube"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9b8ea390",
"metadata": {},
"outputs": [],
"source": [
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "97b98e92",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'First-time host Pedro Pascal talks about filming The Last of Us and being recognized by fans.\\n\\nSaturday Night Live. Stream now on Peacock: https://pck.tv/3uQxh4q\\n\\nSubscribe to SNL: https://goo.gl/tUsXwM\\nStream Current Full Episodes: http://www.nbc.com/saturday-night-live\\n\\nWATCH PAST SNL SEASONS\\nGoogle Play - http://bit.ly/SNLGooglePlay\\niTunes - http://bit.ly/SNLiTunes\\n\\nSNL ON SOCIAL\\nSNL Instagram: http://instagram.com/nbcsnl\\nSNL Facebook: https://www.facebook.com/snl\\nSNL Twitter: https://twitter.com/nbcsnl\\nSNL TikTok: https://www.tiktok.com/@nbcsnl\\n\\nGET MORE NBC\\nLike NBC: http://Facebook.com/NBC\\nFollow NBC: http://Twitter.com/NBC\\nNBC Tumblr: http://NBCtv.tumblr.com/\\nYouTube: http://www.youtube.com/nbc\\nNBC Instagram: http://instagram.com/nbc\\n\\n#SNL #PedroPascal #SNL48 #Coldplay', 'view_count': 1175057, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/sddefault.jpg', 'publish_date': datetime.datetime(2023, 2, 4, 0, 0), 'length': 224, 'author': 'Saturday Night Live'}, lookup_index=0)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -25,6 +25,34 @@ There are a lot of different document loaders that LangChain supports. Below are

`Obsidian <./examples/obsidian.html>`_: A walkthrough of how to load data from an Obsidian file dump.

`Roam <./examples/roam.html>`_: A walkthrough of how to load data from a Roam file export.

`EverNote <./examples/everynote.html>`_: A walkthrough of how to load data from an EverNote (`.enex`) file.

`YouTube <./examples/youtube.html>`_: A walkthrough of how to load the transcript from a YouTube video.

`s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3.

`s3 Directory <./examples/s3_directory.html>`_: A walkthrough of how to load all files in a directory from s3.

`GCS File <./examples/gcs_file.html>`_: A walkthrough of how to load a file from Google Cloud Storage (GCS).

`GCS Directory <./examples/gcs_directory.html>`_: A walkthrough of how to load all files in a directory from Google Cloud Storage (GCS).

`Web Base <./examples/web_base.html>`_: A walkthrough of how to load all text data from webpages.

`IMSDb <./examples/imsdb.html>`_: A walkthrough of how to load all text data from the IMSDb webpage.

`AZLyrics <./examples/azlyrics.html>`_: A walkthrough of how to load all text data from an AZLyrics webpage.

`College Confidential <./examples/college_confidential.html>`_: A walkthrough of how to load all text data from a College Confidential webpage.

`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.

`Airbyte JSON <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.

`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.

.. toctree::
   :maxdepth: 1
   :glob:
150
docs/modules/llms/async_llm.ipynb
Normal file
@@ -0,0 +1,150 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f6574496-b360-4ffa-9523-7fd34a590164",
"metadata": {},
"source": [
"# Async API for LLM\n",
"\n",
"LangChain provides async support for LLMs by leveraging the [asyncio](https://docs.python.org/3/library/asyncio.html) library.\n",
"\n",
"Async support is particularly useful for calling multiple LLMs concurrently, as these calls are network-bound. Currently, only `OpenAI` is supported, but async support for other LLMs is on the roadmap.\n",
"\n",
"You can use the `agenerate` method to call an OpenAI LLM asynchronously."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5e49e96c-0f88-466d-b3d3-ea0966bdf19e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"I'm doing well. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"I am doing quite well. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing great, thank you! How about you?\n",
"\n",
"\n",
"I'm doing well, thanks for asking. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\u001b[1mConcurrent executed in 1.93 seconds.\u001b[0m\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing well, thank you. How about you?\n",
"\n",
"\n",
"I'm doing great, thank you. How about you?\n",
"\u001b[1mSerial executed in 10.54 seconds.\u001b[0m\n"
]
}
],
"source": [
"import time\n",
"import asyncio\n",
"\n",
"from langchain.llms import OpenAI\n",
"\n",
"def generate_serially():\n",
"    llm = OpenAI(temperature=0.9)\n",
"    for _ in range(10):\n",
"        resp = llm.generate([\"Hello, how are you?\"])\n",
"        print(resp.generations[0][0].text)\n",
"\n",
"\n",
"async def async_generate(llm):\n",
"    resp = await llm.agenerate([\"Hello, how are you?\"])\n",
"    print(resp.generations[0][0].text)\n",
"\n",
"\n",
"async def generate_concurrently():\n",
"    llm = OpenAI(temperature=0.9)\n",
"    tasks = [async_generate(llm) for _ in range(10)]\n",
"    await asyncio.gather(*tasks)\n",
"\n",
"\n",
"s = time.perf_counter()\n",
"# If running this outside of Jupyter, use asyncio.run(generate_concurrently())\n",
"await generate_concurrently() \n",
"elapsed = time.perf_counter() - s\n",
"print('\\033[1m' + f\"Concurrent executed in {elapsed:0.2f} seconds.\" + '\\033[0m')\n",
"\n",
"s = time.perf_counter()\n",
"generate_serially()\n",
"elapsed = time.perf_counter() - s\n",
"print('\\033[1m' + f\"Serial executed in {elapsed:0.2f} seconds.\" + '\\033[0m')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -7,6 +7,7 @@ They are split into two categories:

1. `Generic Functionality <./generic_how_to.html>`_: Covering generic functionality all LLMs should have.
2. `Integrations <./integrations.html>`_: Covering integrations with various LLM providers.
3. `Asynchronous <./async_llm.html>`_: Covering asynchronous functionality.

.. toctree::
   :maxdepth: 1
@@ -5,9 +5,9 @@
"id": "959300d4",
"metadata": {},
"source": [
"# HuggingFace Hub\n",
"# Hugging Face Hub\n",
"\n",
"This example showcases how to connect to the HuggingFace Hub."
"This example showcases how to connect to the Hugging Face Hub."
]
},
{
@@ -20,7 +20,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"The Seattle Seahawks won the Super Bowl in 2010. Justin Beiber was born in 2010. The\n"
"The Seattle Seahawks won the Super Bowl in 2010. Justin Beiber was born in 2010. The final answer: Seattle Seahawks.\n"
]
}
],
@@ -31,7 +31,7 @@
"\n",
"Answer: Let's think step by step.\"\"\"\n",
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n",
"llm_chain = LLMChain(prompt=prompt, llm=HuggingFaceHub(repo_id=\"google/flan-t5-xl\", model_kwargs={\"temperature\":1e-10}))\n",
"llm_chain = LLMChain(prompt=prompt, llm=HuggingFaceHub(repo_id=\"google/flan-t5-xl\", model_kwargs={\"temperature\":0, \"max_length\":64}))\n",
"\n",
"question = \"What NFL team won the Super Bowl in the year Justin Beiber was born?\"\n",
"\n",
@@ -77,7 +77,7 @@
"    memory=ConversationalBufferWindowMemory(k=2),\n",
")\n",
"\n",
"output = chatgpt_chain.predict(human_input=\"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\")\n",
"output = chatgpt_chain.predict(human_input=\"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\")\n",
"print(output)"
]
},
@@ -103,7 +103,7 @@
"\n",
"Overall, Assistant is a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.\n",
"\n",
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
"AI: \n",
"```\n",
"$ pwd\n",
@@ -148,7 +148,7 @@
"\n",
"Overall, Assistant is a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.\n",
"\n",
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
"AI: \n",
"```\n",
"$ pwd\n",
@@ -915,14 +915,14 @@
" \"response\": \"Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning (the acquisition of information and rules for using the information), reasoning (using the rules to reach approximate or definite conclusions) and self-correction. AI is used to develop computer systems that can think and act like humans.\"\n",
"}\n",
"```\n",
"Human: curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
"Human: curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
"Assistant:\u001b[0m\n",
"\n",
"\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
" \n",
"\n",
"```\n",
"$ curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
"$ curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
"\n",
"{\n",
" \"response\": \"```\\n/current/working/directory\\n```\"\n",
@@ -932,7 +932,7 @@
}
],
"source": [
"output = chatgpt_chain.predict(human_input=\"\"\"curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\"\"\")\n",
"output = chatgpt_chain.predict(human_input=\"\"\"curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\"\"\")\n",
"print(output)"
]
},
@@ -9,7 +9,7 @@
"\n",
"This notebook walks through using an agent optimized for conversation. Other agents are often optimized for using tools to figure out the best response, which is not ideal in a conversational setting where you may want the agent to be able to chat with the user as well.\n",
"\n",
"This is accomplisehd with a specific type of agent (`conversational-react-description`) which expects to be used with a memory component."
"This is accomplished with a specific type of agent (`conversational-react-description`) which expects to be used with a memory component."
]
},
{
@@ -68,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 4,
"id": "67baf32e",
"metadata": {
"pycharm": {
@@ -98,6 +98,68 @@
"print(docs[0].page_content)"
]
},
{
"cell_type": "markdown",
"id": "fb6baaf8",
"metadata": {},
"source": [
"## Add texts\n",
"You can easily add text to a vectorstore with the `add_texts` method. It will return a list of document IDs (in case you need to use them downstream)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "70758e4f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['64108bd0-4d91-485c-9743-1e18debdd59e']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docsearch.add_texts([\"Ankush went to Princeton\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4edeb88f",
"metadata": {},
"outputs": [],
"source": [
"query = \"Where did Ankush go to college?\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1cba64a2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Ankush went to Princeton', lookup_str='', metadata={}, lookup_index=0)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "markdown",
"id": "bbf5ec44",
@@ -646,7 +708,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,
@@ -6,7 +6,7 @@ These agents can be used to power the next generation of personal assistants -
systems that intelligently understand what you mean, and then can take actions to help you accomplish your goal.

Agents are a core use of LangChain - so much so that there is a whole module dedicated to them.
Therefor, we recommend that you check out that documentation for detailed instruction on how to work
Therefore, we recommend that you check out that documentation for detailed instruction on how to work
with them.

- [Agent Documentation](../modules/agents.rst)
@@ -1,6 +1,7 @@
"""Chain that takes in an input and produces an action and action input."""
from __future__ import annotations

import asyncio
import json
import logging
from abc import abstractmethod
@@ -71,6 +72,19 @@ class Agent(BaseModel):
            tool=parsed_output[0], tool_input=parsed_output[1], log=full_output
        )

    async def _aget_next_action(self, full_inputs: Dict[str, str]) -> AgentAction:
        full_output = await self.llm_chain.apredict(**full_inputs)
        parsed_output = self._extract_tool_and_input(full_output)
        while parsed_output is None:
            full_output = self._fix_text(full_output)
            full_inputs["agent_scratchpad"] += full_output
            output = await self.llm_chain.apredict(**full_inputs)
            full_output += output
            parsed_output = self._extract_tool_and_input(full_output)
        return AgentAction(
            tool=parsed_output[0], tool_input=parsed_output[1], log=full_output
        )

    def plan(
        self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
    ) -> Union[AgentAction, AgentFinish]:
@@ -84,15 +98,40 @@ class Agent(BaseModel):
        Returns:
            Action specifying what tool to use.
        """
        thoughts = self._construct_scratchpad(intermediate_steps)
        new_inputs = {"agent_scratchpad": thoughts, "stop": self._stop}
        full_inputs = {**kwargs, **new_inputs}

        full_inputs = self.get_full_inputs(intermediate_steps, **kwargs)
        action = self._get_next_action(full_inputs)
        if action.tool == self.finish_tool_name:
            return AgentFinish({"output": action.tool_input}, action.log)
        return action

    async def aplan(
        self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
    ) -> Union[AgentAction, AgentFinish]:
        """Given input, decided what to do.

        Args:
            intermediate_steps: Steps the LLM has taken to date,
                along with observations
            **kwargs: User inputs.

        Returns:
            Action specifying what tool to use.
        """
        full_inputs = self.get_full_inputs(intermediate_steps, **kwargs)
        action = await self._aget_next_action(full_inputs)
        if action.tool == self.finish_tool_name:
            return AgentFinish({"output": action.tool_input}, action.log)
        return action

    def get_full_inputs(
        self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
    ) -> Dict[str, Any]:
        """Create the full inputs for the LLMChain from intermediate steps."""
        thoughts = self._construct_scratchpad(intermediate_steps)
        new_inputs = {"agent_scratchpad": thoughts, "stop": self._stop}
        full_inputs = {**kwargs, **new_inputs}
        return full_inputs

    def prepare_for_new_call(self) -> None:
        """Prepare the agent for new call, if needed."""
        pass
@@ -338,6 +377,14 @@ class AgentExecutor(Chain, BaseModel):

    def _call(self, inputs: Dict[str, str]) -> Dict[str, Any]:
        """Run text through and get agent response."""
        # Make sure that every tool is synchronous (not a coroutine)
        for tool in self.tools:
            if asyncio.iscoroutinefunction(tool.func):
                raise ValueError(
                    "Tools cannot be asynchronous for `run` method. "
                    "Please use `arun` instead."
                )

        # Do any preparation necessary when receiving a new input.
        self.agent.prepare_for_new_call()
        # Construct a mapping of tool name to tool for easy lookup
@@ -399,3 +446,81 @@
            self.early_stopping_method, intermediate_steps, **inputs
        )
        return self._return(output, intermediate_steps)

    async def _acall(self, inputs: Dict[str, str]) -> Dict[str, str]:
        """Run text through and get agent response."""
        # Make sure that every tool is asynchronous (a coroutine)
        for tool in self.tools:
            if tool.coroutine and not asyncio.iscoroutinefunction(tool.coroutine):
                raise ValueError(
                    "The coroutine for the tool must be a coroutine function."
                )

        # Do any preparation necessary when receiving a new input.
        self.agent.prepare_for_new_call()
        # Construct a mapping of tool name to tool for easy lookup
        name_to_tool_map = {tool.name: tool for tool in self.tools}
        # We construct a mapping from each tool to a color, used for logging.
        color_mapping = get_color_mapping(
            [tool.name for tool in self.tools], excluded_colors=["green"]
        )
        intermediate_steps: List[Tuple[AgentAction, str]] = []
        # Let's start tracking the iterations the agent has gone through
        iterations = 0
        # We now enter the agent loop (until it returns something).
        while self._should_continue(iterations):
            # Call the LLM to see what to do.
            output = await self.agent.aplan(intermediate_steps, **inputs)
            # If the tool chosen is the finishing tool, then we end and return.
            if isinstance(output, AgentFinish):
                return self._return(output, intermediate_steps)

            # Otherwise we lookup the tool
            if output.tool in name_to_tool_map:
                tool = name_to_tool_map[output.tool]
                self.callback_manager.on_tool_start(
                    {"name": str(tool.func)[:60] + "..."},
                    output,
                    verbose=self.verbose,
                )
                try:
                    # We then call the tool on the tool input to get an observation
                    observation = (
                        await tool.coroutine(output.tool_input)
                        if tool.coroutine
                        # If the tool is not a coroutine, we run it in the executor
                        # to avoid blocking the event loop.
                        else await asyncio.get_event_loop().run_in_executor(
                            None, tool.func, output.tool_input
                        )
                    )
                    color = color_mapping[output.tool]
                    return_direct = tool.return_direct
                except (KeyboardInterrupt, Exception) as e:
                    self.callback_manager.on_tool_error(e, verbose=self.verbose)
                    raise e
            else:
                self.callback_manager.on_tool_start(
                    {"name": "N/A"}, output, verbose=self.verbose
                )
                observation = f"{output.tool} is not a valid tool, try another one."
                color = None
                return_direct = False
            llm_prefix = "" if return_direct else self.agent.llm_prefix
            self.callback_manager.on_tool_end(
                observation,
                color=color,
                observation_prefix=self.agent.observation_prefix,
                llm_prefix=llm_prefix,
                verbose=self.verbose,
            )
            intermediate_steps.append((output, observation))
            if return_direct:
                # Set the log to "" because we do not want to log it.
                output = AgentFinish({self.agent.return_values[0]: observation}, "")
                return self._return(output, intermediate_steps)
            iterations += 1
        output = self.agent.return_stopped_response(
            self.early_stopping_method, intermediate_steps, **inputs
        )
        return self._return(output, intermediate_steps)
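Taken together with the async notebooks earlier in this changeset, the new `_acall` above is what `arun` ends up driving. A minimal usage sketch (the question and agent type are illustrative, and it assumes OpenAI and SerpAPI keys are configured):

```python
# Hedged sketch of exercising the async executor path added above.
import asyncio

from langchain.agents import initialize_agent, load_tools
from langchain.llms import OpenAI

async def main() -> None:
    llm = OpenAI(temperature=0)
    # serpapi and llm-math now carry coroutines, so _acall awaits them
    # directly; a sync-only tool would fall back to run_in_executor.
    tools = load_tools(["serpapi", "llm-math"], llm=llm)
    agent = initialize_agent(tools, llm, agent="zero-shot-react-description")
    print(await agent.arun("What is 2 raised to the 0.123 power?"))

asyncio.run(main())  # inside Jupyter, use `await main()` instead
```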
@@ -39,6 +39,7 @@ class ConversationalAgent(Agent):
        tools: List[Tool],
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
        format_instructions: str = FORMAT_INSTRUCTIONS,
        ai_prefix: str = "AI",
        human_prefix: str = "Human",
        input_variables: Optional[List[str]] = None,
@@ -61,7 +62,7 @@ class ConversationalAgent(Agent):
            [f"> {tool.name}: {tool.description}" for tool in tools]
        )
        tool_names = ", ".join([tool.name for tool in tools])
        format_instructions = FORMAT_INSTRUCTIONS.format(
        format_instructions = format_instructions.format(
            tool_names=tool_names, ai_prefix=ai_prefix, human_prefix=human_prefix
        )
        template = "\n\n".join([prefix, tool_strings, format_instructions, suffix])
@@ -93,6 +94,7 @@ class ConversationalAgent(Agent):
        callback_manager: Optional[BaseCallbackManager] = None,
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
        format_instructions: str = FORMAT_INSTRUCTIONS,
        ai_prefix: str = "AI",
        human_prefix: str = "Human",
        input_variables: Optional[List[str]] = None,
@@ -106,6 +108,7 @@ class ConversationalAgent(Agent):
            human_prefix=human_prefix,
            prefix=prefix,
            suffix=suffix,
            format_instructions=format_instructions,
            input_variables=input_variables,
        )
        llm_chain = LLMChain(
@@ -65,9 +65,10 @@ def _get_pal_colored_objects(llm: BaseLLM) -> Tool:

def _get_llm_math(llm: BaseLLM) -> Tool:
    return Tool(
        "Calculator",
        LLMMathChain(llm=llm).run,
        "Useful for when you need to answer questions about math.",
        name="Calculator",
        description="Useful for when you need to answer questions about math.",
        func=LLMMathChain(llm=llm, callback_manager=llm.callback_manager).run,
        coroutine=LLMMathChain(llm=llm, callback_manager=llm.callback_manager).arun,
    )


@@ -132,9 +133,10 @@ def _get_google_search(**kwargs: Any) -> Tool:

def _get_serpapi(**kwargs: Any) -> Tool:
    return Tool(
        "Search",
        SerpAPIWrapper(**kwargs).run,
        "A search engine. Useful for when you need to answer questions about current events. Input should be a search query.",
        name="Search",
        description="A search engine. Useful for when you need to answer questions about current events. Input should be a search query.",
        func=SerpAPIWrapper(**kwargs).run,
        coroutine=SerpAPIWrapper(**kwargs).arun,
    )


@@ -145,7 +147,7 @@ _EXTRA_LLM_TOOLS = {
_EXTRA_OPTIONAL_TOOLS = {
    "wolfram-alpha": (_get_wolfram_alpha, ["wolfram_alpha_appid"]),
    "google-search": (_get_google_search, ["google_api_key", "google_cse_id"]),
    "serpapi": (_get_serpapi, ["serpapi_api_key"]),
    "serpapi": (_get_serpapi, ["serpapi_api_key", "aiosession"]),
}
|
||||
|
||||
|
||||
|
||||
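With the registry entries above, the Calculator and Search tools are now constructed with a `coroutine` alongside their synchronous `func`. A hedged usage sketch (assuming this revision's API and a SERPAPI_API_KEY in the environment):

```python
from langchain.agents import load_tools
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)
# Both tools below now carry a coroutine as well as a sync func,
# so an AgentExecutor can await them via `arun`.
tools = load_tools(["serpapi", "llm-math"], llm=llm)
```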
@@ -72,6 +72,7 @@ class ZeroShotAgent(Agent):
        tools: List[Tool],
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
+       format_instructions: str = FORMAT_INSTRUCTIONS,
        input_variables: Optional[List[str]] = None,
    ) -> PromptTemplate:
        """Create prompt in the style of the zero shot agent.
@@ -88,7 +89,7 @@ class ZeroShotAgent(Agent):
        """
        tool_strings = "\n".join([f"{tool.name}: {tool.description}" for tool in tools])
        tool_names = ", ".join([tool.name for tool in tools])
-       format_instructions = FORMAT_INSTRUCTIONS.format(tool_names=tool_names)
+       format_instructions = format_instructions.format(tool_names=tool_names)
        template = "\n\n".join([prefix, tool_strings, format_instructions, suffix])
        if input_variables is None:
            input_variables = ["input", "agent_scratchpad"]
@@ -102,13 +103,18 @@ class ZeroShotAgent(Agent):
        callback_manager: Optional[BaseCallbackManager] = None,
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
+       format_instructions: str = FORMAT_INSTRUCTIONS,
        input_variables: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Agent:
        """Construct an agent from an LLM and tools."""
        cls._validate_tools(tools)
        prompt = cls.create_prompt(
-           tools, prefix=prefix, suffix=suffix, input_variables=input_variables
+           tools,
+           prefix=prefix,
+           suffix=suffix,
+           format_instructions=format_instructions,
+           input_variables=input_variables,
        )
        llm_chain = LLMChain(
            llm=llm,

@@ -1,7 +1,8 @@
"""Interface for tools."""
+import asyncio
from dataclasses import dataclass
from inspect import signature
-from typing import Any, Callable, Optional, Union
+from typing import Any, Awaitable, Callable, Optional, Union


@dataclass
@@ -12,9 +13,13 @@ class Tool:
    func: Callable[[str], str]
    description: Optional[str] = None
    return_direct: bool = False
+   # If the tool has a coroutine, then we can use this to run it asynchronously
+   coroutine: Optional[Callable[[str], Awaitable[str]]] = None

    def __call__(self, *args: Any, **kwargs: Any) -> str:
        """Make tools callable by piping through to `func`."""
+       if asyncio.iscoroutinefunction(self.func):
+           raise TypeError("Coroutine cannot be called directly")
        return self.func(*args, **kwargs)

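Given the new `coroutine` field, a custom tool can carry both a sync and an async implementation. A sketch (both lookup functions are hypothetical):

```python
import asyncio
from langchain.agents import Tool

def lookup(query: str) -> str:
    """Synchronous implementation."""
    return f"looked up {query!r}"

async def alookup(query: str) -> str:
    """Async implementation; the sleep stands in for real async I/O."""
    await asyncio.sleep(0)
    return f"looked up {query!r}"

tool = Tool(
    name="Lookup",
    func=lookup,
    description="Useful for looking things up.",
    coroutine=alookup,
)
```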
@@ -4,7 +4,12 @@ from typing import Any, Dict, List, Optional, Tuple

from sqlalchemy import Column, Integer, String, create_engine, select
from sqlalchemy.engine.base import Engine
-from sqlalchemy.orm import Session, declarative_base
+from sqlalchemy.orm import Session
+
+try:
+    from sqlalchemy.orm import declarative_base
+except ImportError:
+    from sqlalchemy.ext.declarative import declarative_base

from langchain.schema import Generation

@@ -9,6 +9,10 @@ from langchain.schema import AgentAction, AgentFinish, LLMResult
class StdOutCallbackHandler(BaseCallbackHandler):
    """Callback Handler that prints to std out."""

+   def __init__(self, color: str = "green") -> None:
+       """Initialize callback handler."""
+       self.color = color
+
    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
@@ -50,7 +54,7 @@ class StdOutCallbackHandler(BaseCallbackHandler):
        **kwargs: Any,
    ) -> None:
        """Print out the log in specified color."""
-       print_text(action.log, color=color)
+       print_text(action.log, color=color if color else self.color)

    def on_tool_end(
        self,
@@ -62,7 +66,7 @@ class StdOutCallbackHandler(BaseCallbackHandler):
    ) -> None:
        """If not the final action, print out observation."""
        print_text(f"\n{observation_prefix}")
-       print_text(output, color=color)
+       print_text(output, color=color if color else self.color)
        print_text(f"\n{llm_prefix}")

    def on_tool_error(
@@ -79,10 +83,10 @@ class StdOutCallbackHandler(BaseCallbackHandler):
        **kwargs: Optional[str],
    ) -> None:
        """Run when agent ends."""
-       print_text(text, color=color, end=end)
+       print_text(text, color=color if color else self.color, end=end)

    def on_agent_finish(
        self, finish: AgentFinish, color: Optional[str] = None, **kwargs: Any
    ) -> None:
        """Run on agent end."""
-       print_text(finish.log, color=color, end="\n")
+       print_text(finish.log, color=color if color else self.color, end="\n")

@@ -111,6 +111,10 @@ class Chain(BaseModel, ABC):
    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        """Run the logic of this chain and return the output."""

+   async def _acall(self, inputs: Dict[str, str]) -> Dict[str, str]:
+       """Run the logic of this chain and return the output."""
+       raise NotImplementedError("Async call not supported for this chain type.")
+
    def __call__(
        self, inputs: Union[Dict[str, Any], Any], return_only_outputs: bool = False
    ) -> Dict[str, Any]:
@@ -125,6 +129,65 @@ class Chain(BaseModel, ABC):
            chain will be returned. Defaults to False.

        """
+       inputs = self.prep_inputs(inputs)
+       self.callback_manager.on_chain_start(
+           {"name": self.__class__.__name__},
+           inputs,
+           verbose=self.verbose,
+       )
+       try:
+           outputs = self._call(inputs)
+       except (KeyboardInterrupt, Exception) as e:
+           self.callback_manager.on_chain_error(e, verbose=self.verbose)
+           raise e
+       self.callback_manager.on_chain_end(outputs, verbose=self.verbose)
+       return self.prep_outputs(inputs, outputs, return_only_outputs)
+
+   async def acall(
+       self, inputs: Union[Dict[str, Any], Any], return_only_outputs: bool = False
+   ) -> Dict[str, Any]:
+       """Run the logic of this chain and add to output if desired.
+
+       Args:
+           inputs: Dictionary of inputs, or single input if chain expects
+               only one param.
+           return_only_outputs: boolean for whether to return only outputs in the
+               response. If True, only new keys generated by this chain will be
+               returned. If False, both input keys and new keys generated by this
+               chain will be returned. Defaults to False.
+
+       """
+       inputs = self.prep_inputs(inputs)
+       self.callback_manager.on_chain_start(
+           {"name": self.__class__.__name__},
+           inputs,
+           verbose=self.verbose,
+       )
+       try:
+           outputs = await self._acall(inputs)
+       except (KeyboardInterrupt, Exception) as e:
+           self.callback_manager.on_chain_error(e, verbose=self.verbose)
+           raise e
+       self.callback_manager.on_chain_end(outputs, verbose=self.verbose)
+       return self.prep_outputs(inputs, outputs, return_only_outputs)
+
+   def prep_outputs(
+       self,
+       inputs: Dict[str, str],
+       outputs: Dict[str, str],
+       return_only_outputs: bool = False,
+   ) -> Dict[str, str]:
+       """Validate and prep outputs."""
+       self._validate_outputs(outputs)
+       if self.memory is not None:
+           self.memory.save_context(inputs, outputs)
+       if return_only_outputs:
+           return outputs
+       else:
+           return {**inputs, **outputs}
+
    def prep_inputs(self, inputs: Union[Dict[str, Any], Any]) -> Dict[str, str]:
        """Validate and prep inputs."""
        if not isinstance(inputs, dict):
            _input_keys = set(self.input_keys)
            if self.memory is not None:
@@ -143,24 +206,7 @@ class Chain(BaseModel, ABC):
            external_context = self.memory.load_memory_variables(inputs)
            inputs = dict(inputs, **external_context)
        self._validate_inputs(inputs)
-       self.callback_manager.on_chain_start(
-           {"name": self.__class__.__name__},
-           inputs,
-           verbose=self.verbose,
-       )
-       try:
-           outputs = self._call(inputs)
-       except (KeyboardInterrupt, Exception) as e:
-           self.callback_manager.on_chain_error(e, verbose=self.verbose)
-           raise e
-       self.callback_manager.on_chain_end(outputs, verbose=self.verbose)
-       self._validate_outputs(outputs)
-       if self.memory is not None:
-           self.memory.save_context(inputs, outputs)
-       if return_only_outputs:
-           return outputs
-       else:
-           return {**inputs, **outputs}
+       return inputs

    def apply(self, input_list: List[Dict[str, Any]]) -> List[Dict[str, str]]:
        """Call the chain on all inputs in the list."""
@@ -187,6 +233,27 @@ class Chain(BaseModel, ABC):
            f" but not both. Got args: {args} and kwargs: {kwargs}."
        )

+   async def arun(self, *args: str, **kwargs: str) -> str:
+       """Run the chain as text in, text out or multiple variables, text out."""
+       if len(self.output_keys) != 1:
+           raise ValueError(
+               f"`run` not supported when there is not exactly "
+               f"one output key. Got {self.output_keys}."
+           )
+
+       if args and not kwargs:
+           if len(args) != 1:
+               raise ValueError("`run` supports only one positional argument.")
+           return (await self.acall(args[0]))[self.output_keys[0]]
+
+       if kwargs and not args:
+           return (await self.acall(kwargs))[self.output_keys[0]]
+
+       raise ValueError(
+           f"`run` supported with either positional arguments or keyword arguments"
+           f" but not both. Got args: {args} and kwargs: {kwargs}."
+       )
+
    def dict(self, **kwargs: Any) -> Dict:
        """Return dictionary representation of chain."""
        if self.memory is not None:

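`acall` and `arun` mirror the sync entry points, awaiting the chain's `_acall`. A usage sketch (assuming this revision's API and an OpenAI key in the environment):

```python
import asyncio
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["product"], template="Name a company that makes {product}."
)
chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)

async def main() -> None:
    # acall returns the full output dict; arun returns the single output string.
    outputs = await chain.acall({"product": "colorful socks"})
    print(outputs["text"])
    print(await chain.arun(product="colorful socks"))

asyncio.run(main())
```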
@@ -1,5 +1,5 @@
"""Chain that just formats a prompt and calls an LLM."""
-from typing import Any, Dict, List, Sequence, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

from pydantic import BaseModel, Extra

@@ -55,6 +55,20 @@ class LLMChain(Chain, BaseModel):

    def generate(self, input_list: List[Dict[str, Any]]) -> LLMResult:
        """Generate LLM result from inputs."""
+       prompts, stop = self.prep_prompts(input_list)
+       response = self.llm.generate(prompts, stop=stop)
+       return response
+
+   async def agenerate(self, input_list: List[Dict[str, Any]]) -> LLMResult:
+       """Generate LLM result from inputs."""
+       prompts, stop = self.prep_prompts(input_list)
+       response = await self.llm.agenerate(prompts, stop=stop)
+       return response
+
+   def prep_prompts(
+       self, input_list: List[Dict[str, Any]]
+   ) -> Tuple[List[str], Optional[List[str]]]:
+       """Prepare prompts from inputs."""
        stop = None
        if "stop" in input_list[0]:
            stop = input_list[0]["stop"]
@@ -70,12 +84,20 @@ class LLMChain(Chain, BaseModel):
                "If `stop` is present in any inputs, should be present in all."
            )
            prompts.append(prompt)
-       response = self.llm.generate(prompts, stop=stop)
-       return response
+       return prompts, stop

    def apply(self, input_list: List[Dict[str, Any]]) -> List[Dict[str, str]]:
        """Utilize the LLM generate method for speed gains."""
        response = self.generate(input_list)
        return self.create_outputs(response)

+   async def aapply(self, input_list: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+       """Utilize the LLM generate method for speed gains."""
+       response = await self.agenerate(input_list)
+       return self.create_outputs(response)
+
    def create_outputs(self, response: LLMResult) -> List[Dict[str, str]]:
        """Create outputs from response."""
        outputs = []
        for generation in response.generations:
            # Get the text of the top generated string.
@@ -86,6 +108,9 @@ class LLMChain(Chain, BaseModel):
    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        return self.apply([inputs])[0]

+   async def _acall(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+       return (await self.aapply([inputs]))[0]
+
    def predict(self, **kwargs: Any) -> str:
        """Format prompt with kwargs and pass to LLM.

@@ -102,6 +127,22 @@ class LLMChain(Chain, BaseModel):
        """
        return self(kwargs)[self.output_key]

+   async def apredict(self, **kwargs: Any) -> str:
+       """Format prompt with kwargs and pass to LLM.
+
+       Args:
+           **kwargs: Keys to pass to prompt template.
+
+       Returns:
+           Completion from LLM.
+
+       Example:
+           .. code-block:: python
+
+               completion = await llm.apredict(adjective="funny")
+       """
+       return (await self.acall(kwargs))[self.output_key]
+
    def predict_and_parse(self, **kwargs: Any) -> Union[str, List[str], Dict[str, str]]:
        """Call predict and then parse the results."""
        result = self.predict(**kwargs)

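`agenerate`, `aapply`, and `apredict` give the chain a full async surface. A sketch of batching through `aapply` (assumes this revision's API and an OpenAI key):

```python
import asyncio
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["topic"], template="State one fact about {topic}."
)
chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)

# All three inputs are prepared by prep_prompts and sent as one batched
# agenerate call rather than three sequential ones.
inputs = [{"topic": t} for t in ["asyncio", "event loops", "coroutines"]]
print(asyncio.run(chain.aapply(inputs)))
```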
@@ -50,11 +50,8 @@ class LLMMathChain(Chain, BaseModel):
        """
        return [self.output_key]

-   def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
-       llm_executor = LLMChain(prompt=self.prompt, llm=self.llm)
+   def _process_llm_result(self, t: str) -> Dict[str, str]:
        python_executor = PythonREPL()
-       self.callback_manager.on_text(inputs[self.input_key], verbose=self.verbose)
-       t = llm_executor.predict(question=inputs[self.input_key], stop=["```output"])
        self.callback_manager.on_text(t, color="green", verbose=self.verbose)
        t = t.strip()
        if t.startswith("```python"):
@@ -69,6 +66,24 @@ class LLMMathChain(Chain, BaseModel):
            raise ValueError(f"unknown format from LLM: {t}")
        return {self.output_key: answer}

+   def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
+       llm_executor = LLMChain(
+           prompt=self.prompt, llm=self.llm, callback_manager=self.callback_manager
+       )
+       self.callback_manager.on_text(inputs[self.input_key], verbose=self.verbose)
+       t = llm_executor.predict(question=inputs[self.input_key], stop=["```output"])
+       return self._process_llm_result(t)
+
+   async def _acall(self, inputs: Dict[str, str]) -> Dict[str, str]:
+       llm_executor = LLMChain(
+           prompt=self.prompt, llm=self.llm, callback_manager=self.callback_manager
+       )
+       self.callback_manager.on_text(inputs[self.input_key], verbose=self.verbose)
+       t = await llm_executor.apredict(
+           question=inputs[self.input_key], stop=["```output"]
+       )
+       return self._process_llm_result(t)
+
    @property
    def _chain_type(self) -> str:
        return "llm_math_chain"

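Because both paths funnel into `_process_llm_result`, sync and async math questions share one parser. A sketch (assumes an OpenAI key):

```python
import asyncio
from langchain.chains import LLMMathChain
from langchain.llms import OpenAI

chain = LLMMathChain(llm=OpenAI(temperature=0))
# The sync call goes through _call, the async one through _acall; both hand
# the raw LLM text to the same _process_llm_result step.
print(chain.run("What is 13 * 17?"))
print(asyncio.run(chain.arun("What is 13 * 17?")))
```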
@@ -93,7 +93,6 @@ def _load_refine_chain(
    verbose: Optional[bool] = None,
    **kwargs: Any,
) -> RefineDocumentsChain:
-
    initial_chain = LLMChain(llm=llm, prompt=question_prompt, verbose=verbose)
    _refine_llm = refine_llm or llm
    refine_chain = LLMChain(llm=_refine_llm, prompt=refine_prompt, verbose=verbose)

@@ -1,19 +1,37 @@
"""All different types of document loaders."""

from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
from langchain.document_loaders.azlyrics import AZLyricsLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.docx import UnstructuredDocxLoader
from langchain.document_loaders.email import UnstructuredEmailLoader
from langchain.document_loaders.everynote import EveryNoteLoader
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders.gcs_file import GCSFileLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader
from langchain.document_loaders.gutenberg import GutenbergLoader
from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.imsdb import IMSDbLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.online_pdf import OnlinePDFLoader
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader

__all__ = [
    "UnstructuredFileLoader",
    "UnstructuredURLLoader",
    "DirectoryLoader",
    "NotionDirectoryLoader",
    "ReadTheDocsLoader",
@@ -24,4 +42,20 @@ __all__ = [
    "ObsidianLoader",
    "UnstructuredDocxLoader",
    "UnstructuredEmailLoader",
    "RoamLoader",
    "YoutubeLoader",
    "S3FileLoader",
    "TextLoader",
    "S3DirectoryLoader",
    "GCSFileLoader",
    "GCSDirectoryLoader",
    "WebBaseLoader",
    "IMSDbLoader",
    "AZLyricsLoader",
    "CollegeConfidentialLoader",
    "GutenbergLoader",
    "PagedPDFSplitter",
    "EveryNoteLoader",
    "AirbyteJSONLoader",
    "OnlinePDFLoader",
]

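All of the loaders registered above share one small contract: construct with a source location, call `load()`, and get back a list of `Document`s whose `metadata` records the source. A sketch with the simplest of them (the file name is hypothetical):

```python
from langchain.document_loaders import TextLoader

loader = TextLoader("state_of_the_union.txt")
docs = loader.load()  # -> List[Document]
print(docs[0].metadata)  # {'source': 'state_of_the_union.txt'}
```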
langchain/document_loaders/airbyte_json.py (new file, 41 lines)
@@ -0,0 +1,41 @@
"""Loader that loads local airbyte json files."""
import json
from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


def _stringify_value(val: Any) -> str:
    if isinstance(val, str):
        return val
    elif isinstance(val, dict):
        return "\n" + _stringify_dict(val)
    elif isinstance(val, list):
        return "\n".join(_stringify_value(v) for v in val)
    else:
        return str(val)


def _stringify_dict(data: dict) -> str:
    text = ""
    for key, value in data.items():
        text += key + ": " + _stringify_value(data[key]) + "\n"
    return text


class AirbyteJSONLoader(BaseLoader):
    """Loader that loads local airbyte json files."""

    def __init__(self, file_path: str):
        """Initialize with file path. This should start with '/tmp/airbyte_local/'."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file."""
        text = ""
        for line in open(self.file_path, "r"):
            data = json.loads(line)["_airbyte_data"]
            text += _stringify_dict(data)
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
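Usage sketch for the loader above (the path is hypothetical; Airbyte's local JSON destination writes line-delimited records under /tmp/airbyte_local/):

```python
from langchain.document_loaders import AirbyteJSONLoader

loader = AirbyteJSONLoader("/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl")
docs = loader.load()  # one Document containing all flattened records
```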
langchain/document_loaders/azlyrics.py (new file, 22 lines)
@@ -0,0 +1,22 @@
"""Loader that loads AZLyrics."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader


class AZLyricsLoader(WebBaseLoader):
    """Loader that loads AZLyrics webpages."""

    def __init__(self, web_path: str):
        """Initialize with webpage path."""
        self.web_path = web_path

    def load(self) -> List[Document]:
        """Load webpage."""
        soup = self.scrape()
        title = soup.title.text
        lyrics = soup.find_all("div", {"class": ""})[2].text
        text = title + lyrics
        metadata = {"source": self.web_path}
        return [Document(page_content=text, metadata=metadata)]
langchain/document_loaders/college_confidential.py (new file, 20 lines)
@@ -0,0 +1,20 @@
"""Loader that loads College Confidential."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader


class CollegeConfidentialLoader(WebBaseLoader):
    """Loader that loads College Confidential webpages."""

    def __init__(self, web_path: str):
        """Initialize with webpage path."""
        self.web_path = web_path

    def load(self) -> List[Document]:
        """Load webpage."""
        soup = self.scrape()
        text = soup.select_one("main[class='skin-handler']").text
        metadata = {"source": self.web_path}
        return [Document(page_content=text, metadata=metadata)]
langchain/document_loaders/everynote.py (new file, 82 lines)
@@ -0,0 +1,82 @@
"""Load documents from EverNote.

https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
import hashlib
from base64 import b64decode
from time import strptime
from typing import Any, Dict, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


def _parse_content(content: str) -> str:
    from pypandoc import convert_text

    text = convert_text(content, "org", format="html")
    return text


def _parse_resource(resource: list) -> dict:
    rsc_dict: Dict[str, Any] = {}
    for elem in resource:
        if elem.tag == "data":
            # Sometimes elem.text is None
            rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
            rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
        else:
            rsc_dict[elem.tag] = elem.text

    return rsc_dict


def _parse_note(note: List) -> dict:
    note_dict: Dict[str, Any] = {}
    resources = []
    for elem in note:
        if elem.tag == "content":
            note_dict[elem.tag] = _parse_content(elem.text)
            # A copy of original content
            note_dict["content-raw"] = elem.text
        elif elem.tag == "resource":
            resources.append(_parse_resource(elem))
        elif elem.tag == "created" or elem.tag == "updated":
            note_dict[elem.tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
        else:
            note_dict[elem.tag] = elem.text

    note_dict["resource"] = resources

    return note_dict


def _parse_note_xml(xml_file: str) -> str:
    """Parse EverNote xml."""
    # Without huge_tree set to True, parser may complain about huge text node
    # Try to recover, because there may be "&nbsp;", which will cause
    # "XMLSyntaxError: Entity 'nbsp' not defined"
    from lxml import etree

    context = etree.iterparse(
        xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
    )
    result_string = ""
    for action, elem in context:
        if elem.tag == "note":
            result_string += _parse_note(elem)["content"]
    return result_string


class EveryNoteLoader(BaseLoader):
    """Loader that loads EverNote files."""

    def __init__(self, file_path: str):
        """Initialize with file path."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load document from EverNote file."""
        text = _parse_note_xml(self.file_path)
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
langchain/document_loaders/gcs_directory.py (new file, 32 lines)
@@ -0,0 +1,32 @@
"""Loading logic for loading documents from a GCS directory."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.gcs_file import GCSFileLoader


class GCSDirectoryLoader(BaseLoader):
    """Loading logic for loading documents from GCS."""

    def __init__(self, project_name: str, bucket: str, prefix: str = ""):
        """Initialize with bucket and key name."""
        self.project_name = project_name
        self.bucket = bucket
        self.prefix = prefix

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from google.cloud import storage
        except ImportError:
            raise ValueError(
                "Could not import google-cloud-storage python package. "
                "Please install it with `pip install google-cloud-storage`."
            )
        client = storage.Client(project=self.project_name)
        docs = []
        for blob in client.list_blobs(self.bucket, prefix=self.prefix):
            loader = GCSFileLoader(self.project_name, self.bucket, blob.name)
            docs.extend(loader.load())
        return docs
langchain/document_loaders/gcs_file.py (new file, 40 lines)
@@ -0,0 +1,40 @@
"""Loading logic for loading documents from a GCS file."""
import tempfile
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader


class GCSFileLoader(BaseLoader):
    """Loading logic for loading documents from GCS."""

    def __init__(self, project_name: str, bucket: str, blob: str):
        """Initialize with bucket and key name."""
        self.bucket = bucket
        self.blob = blob
        self.project_name = project_name

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from google.cloud import storage
        except ImportError:
            raise ValueError(
                "Could not import google-cloud-storage python package. "
                "Please install it with `pip install google-cloud-storage`."
            )

        # Initialise a client
        storage_client = storage.Client(self.project_name)
        # Create a bucket object for our bucket
        bucket = storage_client.get_bucket(self.bucket)
        # Create a blob object from the filepath
        blob = bucket.blob(self.blob)
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = f"{temp_dir}/{self.blob}"
            # Download the file to a destination
            blob.download_to_filename(file_path)
            loader = UnstructuredFileLoader(file_path)
            return loader.load()
langchain/document_loaders/gutenberg.py (new file, 28 lines)
@@ -0,0 +1,28 @@
"""Loader that loads .txt web files."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class GutenbergLoader(BaseLoader):
    """Loader that uses urllib to load .txt web files."""

    def __init__(self, file_path: str):
        """Initialize with file path."""
        if not file_path.startswith("https://www.gutenberg.org"):
            raise ValueError("file path must start with 'https://www.gutenberg.org'")

        if not file_path.endswith(".txt"):
            raise ValueError("file path must end with '.txt'")

        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file."""
        from urllib.request import urlopen

        elements = urlopen(self.file_path)
        text = "\n\n".join([str(el.decode("utf-8-sig")) for el in elements])
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
langchain/document_loaders/imsdb.py (new file, 20 lines)
@@ -0,0 +1,20 @@
"""Loader that loads IMSDb."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader


class IMSDbLoader(WebBaseLoader):
    """Loader that loads IMSDb webpages."""

    def __init__(self, web_path: str):
        """Initialize with webpage path."""
        self.web_path = web_path

    def load(self) -> List[Document]:
        """Load webpage."""
        soup = self.scrape()
        text = soup.select_one("td[class='scrtext']").text
        metadata = {"source": self.web_path}
        return [Document(page_content=text, metadata=metadata)]
@@ -1,4 +1,4 @@
-"""Loader that loads Notion directory dump."""
+"""Loader that loads Obsidian directory dump."""
from pathlib import Path
from typing import List

langchain/document_loaders/online_pdf.py (new file, 29 lines)
@@ -0,0 +1,29 @@
"""Loader that loads online PDF files."""

import tempfile
from typing import List

import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader


class OnlinePDFLoader(BaseLoader):
    """Loader that loads online PDFs."""

    def __init__(self, web_path: str):
        """Initialize with file path."""
        self.web_path = web_path

    def load(self) -> List[Document]:
        """Load documents."""
        r = requests.get(self.web_path)
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = f"{temp_dir}/online_file.pdf"
            file = open(file_path, "wb")
            file.write(r.content)
            file.close()
            loader = UnstructuredPDFLoader(file_path)
            return loader.load()
langchain/document_loaders/paged_pdf.py (new file, 36 lines)
@@ -0,0 +1,36 @@
"""Loads a PDF with pypdf and chunks at character level."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class PagedPDFSplitter(BaseLoader):
    """Loads a PDF with pypdf and chunks at character level.

    Loader also stores page numbers in metadatas.
    """

    def __init__(self, file_path: str):
        """Initialize with file path."""
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ValueError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Load given path as pages."""
        import pypdf

        pdf_file_obj = open(self._file_path, "rb")
        pdf_reader = pypdf.PdfReader(pdf_file_obj)
        docs = []
        for i, page in enumerate(pdf_reader.pages):
            text = page.extract_text()
            metadata = {"source": self._file_path, "page": i}
            docs.append(Document(page_content=text, metadata=metadata))
        pdf_file_obj.close()
        return docs
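Usage sketch: the splitter yields one `Document` per page and records the page number, which makes per-page citations possible downstream (the file name is hypothetical):

```python
from langchain.document_loaders import PagedPDFSplitter

loader = PagedPDFSplitter("example-paper.pdf")
pages = loader.load()
print(pages[0].metadata)  # {'source': 'example-paper.pdf', 'page': 0}
```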
@@ -19,7 +19,11 @@ class ReadTheDocsLoader(BaseLoader):

        def _clean_data(data: str) -> str:
            soup = BeautifulSoup(data)
-           text = soup.find_all("main", {"id": "main-content"})[0].get_text()
+           text = soup.find_all("main", {"id": "main-content"})
+           if len(text) != 0:
+               text = text[0].get_text()
+           else:
+               text = ""
            return "\n".join([t for t in text.split("\n") if t])

        docs = []

langchain/document_loaders/roam.py (new file, 25 lines)
@@ -0,0 +1,25 @@
"""Loader that loads Roam directory dump."""
from pathlib import Path
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class RoamLoader(BaseLoader):
    """Loader that loads Roam files from disk."""

    def __init__(self, path: str):
        """Initialize with path."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents."""
        ps = list(Path(self.file_path).glob("**/*.md"))
        docs = []
        for p in ps:
            with open(p) as f:
                text = f.read()
            metadata = {"source": str(p)}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs
langchain/document_loaders/s3_directory.py (new file, 32 lines)
@@ -0,0 +1,32 @@
"""Loading logic for loading documents from an s3 directory."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.s3_file import S3FileLoader


class S3DirectoryLoader(BaseLoader):
    """Loading logic for loading documents from s3."""

    def __init__(self, bucket: str, prefix: str = ""):
        """Initialize with bucket and key name."""
        self.bucket = bucket
        self.prefix = prefix

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            import boto3
        except ImportError:
            raise ValueError(
                "Could not import boto3 python package. "
                "Please install it with `pip install boto3`."
            )
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(self.bucket)
        docs = []
        for obj in bucket.objects.filter(Prefix=self.prefix):
            loader = S3FileLoader(self.bucket, obj.key)
            docs.extend(loader.load())
        return docs
langchain/document_loaders/s3_file.py (new file, 32 lines)
@@ -0,0 +1,32 @@
"""Loading logic for loading documents from an s3 file."""
import tempfile
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader


class S3FileLoader(BaseLoader):
    """Loading logic for loading documents from s3."""

    def __init__(self, bucket: str, key: str):
        """Initialize with bucket and key name."""
        self.bucket = bucket
        self.key = key

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            import boto3
        except ImportError:
            raise ValueError(
                "Could not import boto3 python package. "
                "Please install it with `pip install boto3`."
            )
        s3 = boto3.client("s3")
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = f"{temp_dir}/{self.key}"
            s3.download_file(self.bucket, self.key, file_path)
            loader = UnstructuredFileLoader(file_path)
            return loader.load()
langchain/document_loaders/text.py (new file, 20 lines)
@@ -0,0 +1,20 @@
"""Load text files."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class TextLoader(BaseLoader):
    """Load text files."""

    def __init__(self, file_path: str):
        """Initialize with file path."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load from file path."""
        with open(self.file_path) as f:
            text = f.read()
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
langchain/document_loaders/url.py (new file, 32 lines)
@@ -0,0 +1,32 @@
"""Loader that loads HTML files from URLs."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class UnstructuredURLLoader(BaseLoader):
    """Loader that uses unstructured to load HTML files."""

    def __init__(self, urls: List[str]):
        """Initialize with file path."""
        try:
            import unstructured  # noqa:F401
        except ImportError:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
        self.urls = urls

    def load(self) -> List[Document]:
        """Load file."""
        from unstructured.partition.html import partition_html

        docs: List[Document] = list()
        for url in self.urls:
            elements = partition_html(url=url)
            text = "\n\n".join([str(el) for el in elements])
            metadata = {"source": url}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs
langchain/document_loaders/web_base.py (new file, 29 lines)
@@ -0,0 +1,29 @@
"""Web base loader class."""
from typing import List

import requests
from bs4 import BeautifulSoup

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class WebBaseLoader(BaseLoader):
    """Loader that uses requests and beautiful soup to load webpages."""

    def __init__(self, web_path: str):
        """Initialize with webpage path."""
        self.web_path = web_path

    def scrape(self) -> BeautifulSoup:
        """Scrape data from webpage and return it in BeautifulSoup format."""
        html_doc = requests.get(self.web_path)
        soup = BeautifulSoup(html_doc.text, "html.parser")
        return soup

    def load(self) -> List[Document]:
        """Load data into document objects."""
        soup = self.scrape()
        text = soup.get_text()
        metadata = {"source": self.web_path}
        return [Document(page_content=text, metadata=metadata)]
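The site-specific loaders above (AZLyrics, College Confidential, IMSDb) are all thin subclasses of this base: reuse `scrape()` and extract the fragment of interest. A hypothetical subclass as a sketch (the class name and CSS selector are invented):

```python
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader


class ExampleSiteLoader(WebBaseLoader):
    """Hypothetical loader for a site whose content lives in div#content."""

    def load(self) -> List[Document]:
        soup = self.scrape()
        text = soup.select_one("div[id='content']").text
        metadata = {"source": self.web_path}
        return [Document(page_content=text, metadata=metadata)]
```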
langchain/document_loaders/youtube.py (new file, 76 lines)
@@ -0,0 +1,76 @@
"""Loader that loads YouTube transcript."""
from __future__ import annotations

from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class YoutubeLoader(BaseLoader):
    """Loader that loads Youtube transcripts."""

    def __init__(self, video_id: str, add_video_info: bool = False):
        """Initialize with YouTube video ID."""
        self.video_id = video_id
        self.add_video_info = add_video_info

    @classmethod
    def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
        """Parse out video id from YouTube url."""
        video_id = youtube_url.split("youtube.com/watch?v=")[-1]
        return cls(video_id, **kwargs)

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError:
            raise ImportError(
                "Could not import youtube_transcript_api python package. "
                "Please install it with `pip install youtube-transcript-api`."
            )

        metadata = {"source": self.video_id}

        if self.add_video_info:
            # Get more video meta info
            # Such as title, description, thumbnail url, publish_date
            video_info = self._get_video_info()
            metadata.update(video_info)

        transcript_pieces = YouTubeTranscriptApi.get_transcript(self.video_id)
        transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])

        return [Document(page_content=transcript, metadata=metadata)]

    def _get_video_info(self) -> dict:
        """Get important video information.

        Components are:
            - title
            - description
            - thumbnail url
            - publish_date
            - channel_author
            - and more.
        """
        try:
            from pytube import YouTube

        except ImportError:
            raise ImportError(
                "Could not import pytube python package. "
                "Please install it with `pip install pytube`."
            )
        yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
        video_info = {
            "title": yt.title,
            "description": yt.description,
            "view_count": yt.views,
            "thumbnail_url": yt.thumbnail_url,
            "publish_date": yt.publish_date,
            "length": yt.length,
            "author": yt.author,
        }
        return video_info
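Usage sketch for the loader above (the URL is hypothetical; `add_video_info=True` needs pytube installed):

```python
from langchain.document_loaders import YoutubeLoader

loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=QsYGlZkevEg", add_video_info=True
)
docs = loader.load()  # transcript text plus title/author/etc. in metadata
```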
@@ -75,20 +75,27 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        text = text.replace("\n", " ")
        return self.client.create(input=[text], engine=engine)["data"][0]["embedding"]

-   def embed_documents(self, texts: List[str]) -> List[List[float]]:
+   def embed_documents(
+       self, texts: List[str], chunk_size: int = 1000
+   ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
+           chunk_size: The maximum number of texts to send to OpenAI at once
+               (max 1000).

        Returns:
            List of embeddings, one for each text.
        """
-       responses = [
-           self._embedding_func(text, engine=self.document_model_name)
-           for text in texts
-       ]
-       return responses
+       # handle large batches of texts
+       results = []
+       for i in range(0, len(texts), chunk_size):
+           response = self.client.create(
+               input=texts[i : i + chunk_size], engine=self.document_model_name
+           )
+           results += [r["embedding"] for r in response["data"]]
+       return results

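With this change a large corpus is embedded in ceil(len(texts) / chunk_size) batched requests instead of one request per text. A sketch (assumes an OpenAI key):

```python
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
texts = [f"document {i}" for i in range(2500)]
# 2500 texts with chunk_size=1000 -> 3 batched API calls.
vectors = embeddings.embed_documents(texts, chunk_size=1000)
```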
    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.

@@ -6,6 +6,7 @@ _TEXT_COLOR_MAPPING = {
    "yellow": "33;1",
    "pink": "38;5;200",
    "green": "32;1",
+   "red": "31;1",
}

@@ -2,7 +2,7 @@
import json
from abc import ABC, abstractmethod
from pathlib import Path
-from typing import Any, Dict, List, Mapping, Optional, Union
+from typing import Any, Dict, List, Mapping, Optional, Tuple, Union

import yaml
from pydantic import BaseModel, Extra, Field, validator
@@ -17,6 +17,42 @@ def _get_verbosity() -> bool:
    return langchain.verbose


+def get_prompts(
+    params: Dict[str, Any], prompts: List[str]
+) -> Tuple[Dict[int, List], str, List[int], List[str]]:
+    """Get prompts that are already cached."""
+    llm_string = str(sorted([(k, v) for k, v in params.items()]))
+    missing_prompts = []
+    missing_prompt_idxs = []
+    existing_prompts = {}
+    for i, prompt in enumerate(prompts):
+        if langchain.llm_cache is not None:
+            cache_val = langchain.llm_cache.lookup(prompt, llm_string)
+            if isinstance(cache_val, list):
+                existing_prompts[i] = cache_val
+            else:
+                missing_prompts.append(prompt)
+                missing_prompt_idxs.append(i)
+    return existing_prompts, llm_string, missing_prompt_idxs, missing_prompts
+
+
+def update_cache(
+    existing_prompts: Dict[int, List],
+    llm_string: str,
+    missing_prompt_idxs: List[int],
+    new_results: LLMResult,
+    prompts: List[str],
+) -> Optional[dict]:
+    """Update the cache and get the LLM output."""
+    for i, result in enumerate(new_results.generations):
+        existing_prompts[missing_prompt_idxs[i]] = result
+        prompt = prompts[missing_prompt_idxs[i]]
+        if langchain.llm_cache is not None:
+            langchain.llm_cache.update(prompt, llm_string, result)
+    llm_output = new_results.llm_output
+    return llm_output

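These two helpers factor the cache bookkeeping out of `generate` so that the new `agenerate` below can reuse it verbatim. A sketch of the observable effect (assuming this revision's `InMemoryCache`):

```python
import langchain
from langchain.cache import InMemoryCache
from langchain.llms import OpenAI

langchain.llm_cache = InMemoryCache()
llm = OpenAI(temperature=0)

llm.generate(["Tell me a joke."])  # cache miss: get_prompts reports it missing
llm.generate(["Tell me a joke."])  # cache hit: answered without an API call
```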
class BaseLLM(BaseModel, ABC):
    """LLM wrapper should take in a prompt and return a string."""

@@ -58,6 +94,12 @@ class BaseLLM(BaseModel, ABC):
    ) -> LLMResult:
        """Run the LLM on the given prompts."""

+   @abstractmethod
+   async def _agenerate(
+       self, prompts: List[str], stop: Optional[List[str]] = None
+   ) -> LLMResult:
+       """Run the LLM on the given prompts."""
+
    def generate(
        self, prompts: List[str], stop: Optional[List[str]] = None
    ) -> LLMResult:
@@ -88,17 +130,12 @@ class BaseLLM(BaseModel, ABC):
            return output
        params = self.dict()
        params["stop"] = stop
-       llm_string = str(sorted([(k, v) for k, v in params.items()]))
-       missing_prompts = []
-       missing_prompt_idxs = []
-       existing_prompts = {}
-       for i, prompt in enumerate(prompts):
-           cache_val = langchain.llm_cache.lookup(prompt, llm_string)
-           if isinstance(cache_val, list):
-               existing_prompts[i] = cache_val
-           else:
-               missing_prompts.append(prompt)
-               missing_prompt_idxs.append(i)
+       (
+           existing_prompts,
+           llm_string,
+           missing_prompt_idxs,
+           missing_prompts,
+       ) = get_prompts(params, prompts)
        if len(missing_prompts) > 0:
            self.callback_manager.on_llm_start(
                {"name": self.__class__.__name__}, missing_prompts, verbose=self.verbose
@@ -109,11 +146,56 @@ class BaseLLM(BaseModel, ABC):
                self.callback_manager.on_llm_error(e, verbose=self.verbose)
                raise e
            self.callback_manager.on_llm_end(new_results, verbose=self.verbose)
-           for i, result in enumerate(new_results.generations):
-               existing_prompts[missing_prompt_idxs[i]] = result
-               prompt = prompts[missing_prompt_idxs[i]]
-               langchain.llm_cache.update(prompt, llm_string, result)
-           llm_output = new_results.llm_output
+           llm_output = update_cache(
+               existing_prompts, llm_string, missing_prompt_idxs, new_results, prompts
+           )
        else:
            llm_output = {}
        generations = [existing_prompts[i] for i in range(len(prompts))]
        return LLMResult(generations=generations, llm_output=llm_output)

+   async def agenerate(
+       self, prompts: List[str], stop: Optional[List[str]] = None
+   ) -> LLMResult:
+       """Run the LLM on the given prompt and input."""
+       disregard_cache = self.cache is not None and not self.cache
+       if langchain.llm_cache is None or disregard_cache:
+           # This happens when langchain.cache is None, but self.cache is True
+           if self.cache is not None and self.cache:
+               raise ValueError(
+                   "Asked to cache, but no cache found at `langchain.cache`."
+               )
+           self.callback_manager.on_llm_start(
+               {"name": self.__class__.__name__}, prompts, verbose=self.verbose
+           )
+           try:
+               output = await self._agenerate(prompts, stop=stop)
+           except (KeyboardInterrupt, Exception) as e:
+               self.callback_manager.on_llm_error(e, verbose=self.verbose)
+               raise e
+           self.callback_manager.on_llm_end(output, verbose=self.verbose)
+           return output
+       params = self.dict()
+       params["stop"] = stop
+       (
+           existing_prompts,
+           llm_string,
+           missing_prompt_idxs,
+           missing_prompts,
+       ) = get_prompts(params, prompts)
+       if len(missing_prompts) > 0:
+           self.callback_manager.on_llm_start(
+               {"name": self.__class__.__name__}, missing_prompts, verbose=self.verbose
+           )
+           try:
+               new_results = await self._agenerate(missing_prompts, stop=stop)
+           except (KeyboardInterrupt, Exception) as e:
+               self.callback_manager.on_llm_error(e, verbose=self.verbose)
+               raise e
+           self.callback_manager.on_llm_end(new_results, verbose=self.verbose)
+           llm_output = update_cache(
+               existing_prompts, llm_string, missing_prompt_idxs, new_results, prompts
+           )
+       else:
+           llm_output = {}
+       generations = [existing_prompts[i] for i in range(len(prompts))]
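With `agenerate` in place, independent LLM calls can be overlapped on one event loop. A sketch (assumes an OpenAI key):

```python
import asyncio
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)

async def main() -> None:
    # Both requests are in flight at the same time.
    joke, poem = await asyncio.gather(
        llm.agenerate(["Tell me a joke."]),
        llm.agenerate(["Tell me a poem."]),
    )
    print(joke.generations[0][0].text)
    print(poem.generations[0][0].text)

asyncio.run(main())
```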
@@ -219,3 +301,9 @@ class LLM(BaseLLM):
            text = self._call(prompt, stop=stop)
            generations.append([Generation(text=text)])
        return LLMResult(generations=generations)
+
+   async def _agenerate(
+       self, prompts: List[str], stop: Optional[List[str]] = None
+   ) -> LLMResult:
+       """Run the LLM on the given prompt and input."""
+       raise NotImplementedError("Async generation not implemented for this LLM.")

@@ -1,11 +1,22 @@
"""Wrapper around OpenAI APIs."""
import logging
import sys
-from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    List,
+    Mapping,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)

from pydantic import BaseModel, Extra, Field, root_validator
from tenacity import (
    after_log,
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
@@ -19,6 +30,18 @@ from langchain.utils import get_from_dict_or_env
logger = logging.getLogger(__name__)


+def update_token_usage(
+    keys: Set[str], response: Dict[str, Any], token_usage: Dict[str, Any]
+) -> None:
+    """Update token usage."""
+    _keys_to_use = keys.intersection(response["usage"])
+    for _key in _keys_to_use:
+        if _key not in token_usage:
+            token_usage[_key] = response["usage"][_key]
+        else:
+            token_usage[_key] += response["usage"][_key]
+
+
class BaseOpenAI(BaseLLM, BaseModel):
    """Wrapper around OpenAI large language models.

@@ -124,16 +147,14 @@ class BaseOpenAI(BaseLLM, BaseModel):
        }
        return {**normal_params, **self.model_kwargs}

-   def completion_with_retry(self, **kwargs: Any) -> Any:
-       """Use tenacity to retry the completion call."""
+   def _create_retry_decorator(self) -> Callable[[Any], Any]:
        import openai

        min_seconds = 4
        max_seconds = 10
        # Wait 2^x * 1 second between each retry starting with
        # 4 seconds, then up to 10 seconds, then 10 seconds afterwards
-
-       @retry(
+       return retry(
            reraise=True,
            stop=stop_after_attempt(self.max_retries),
            wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds),
@@ -143,13 +164,30 @@ class BaseOpenAI(BaseLLM, BaseModel):
                | retry_if_exception_type(openai.error.APIConnectionError)
                | retry_if_exception_type(openai.error.RateLimitError)
            ),
-           after=after_log(logger, logging.DEBUG),
+           before_sleep=before_sleep_log(logger, logging.WARNING),
        )

+   def completion_with_retry(self, **kwargs: Any) -> Any:
+       """Use tenacity to retry the completion call."""
+       retry_decorator = self._create_retry_decorator()
+
+       @retry_decorator
+       def _completion_with_retry(**kwargs: Any) -> Any:
+           return self.client.create(**kwargs)
+
+       return _completion_with_retry(**kwargs)
+
+   async def acompletion_with_retry(self, **kwargs: Any) -> Any:
+       """Use tenacity to retry the async completion call."""
+       retry_decorator = self._create_retry_decorator()
+
+       @retry_decorator
+       async def _completion_with_retry(**kwargs: Any) -> Any:
+           # Use OpenAI's async api https://github.com/openai/openai-python#async-api
+           return await self.client.acreate(**kwargs)
+
+       return await _completion_with_retry(**kwargs)

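The refactor builds the tenacity decorator once and applies it to both a sync and an async inner function, which tenacity supports. A stripped-down sketch of that shape (not the exact OpenAI wrapper):

```python
import logging
from typing import Any, Callable

from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential

logger = logging.getLogger(__name__)


def create_retry_decorator(max_retries: int) -> Callable[[Any], Any]:
    # Wait 2^x seconds between retries, bounded to the range [4, 10] seconds.
    return retry(
        reraise=True,
        stop=stop_after_attempt(max_retries),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )


retry_decorator = create_retry_decorator(max_retries=6)

@retry_decorator
def flaky_call() -> str:
    return "ok"

@retry_decorator
async def flaky_acall() -> str:
    return "ok"
```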
    def _generate(
        self, prompts: List[str], stop: Optional[List[str]] = None
    ) -> LLMResult:
@@ -169,11 +207,47 @@ class BaseOpenAI(BaseLLM, BaseModel):
        """
        # TODO: write a unit test for this
        params = self._invocation_params
        sub_prompts = self.get_sub_prompts(params, prompts, stop)
        choices = []
        token_usage: Dict[str, int] = {}
        # Get the token usage from the response.
        # Includes prompt, completion, and total tokens used.
        _keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
        for _prompts in sub_prompts:
            response = self.completion_with_retry(prompt=_prompts, **params)
            choices.extend(response["choices"])
            update_token_usage(_keys, response, token_usage)
        return self.create_llm_result(choices, prompts, token_usage)

    async def _agenerate(
        self, prompts: List[str], stop: Optional[List[str]] = None
    ) -> LLMResult:
        """Call out to OpenAI's endpoint async with k unique prompts."""
        params = self._invocation_params
        sub_prompts = self.get_sub_prompts(params, prompts, stop)
        choices = []
        token_usage: Dict[str, int] = {}
        # Get the token usage from the response.
        # Includes prompt, completion, and total tokens used.
        _keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
        for _prompts in sub_prompts:
            # Use OpenAI's async api https://github.com/openai/openai-python#async-api
            response = await self.acompletion_with_retry(prompt=_prompts, **params)
            choices.extend(response["choices"])
            update_token_usage(_keys, response, token_usage)
        return self.create_llm_result(choices, prompts, token_usage)

    def get_sub_prompts(
        self,
        params: Dict[str, Any],
        prompts: List[str],
        stop: Optional[List[str]] = None,
    ) -> List[List[str]]:
        """Get the sub prompts for llm call."""
        if stop is not None:
            if "stop" in params:
                raise ValueError("`stop` found in both the input and default params.")
            params["stop"] = stop

        if params["max_tokens"] == -1:
            if len(prompts) != 1:
                raise ValueError(
@@ -184,20 +258,12 @@ class BaseOpenAI(BaseLLM, BaseModel):
|
||||
prompts[i : i + self.batch_size]
|
||||
for i in range(0, len(prompts), self.batch_size)
|
||||
]
|
||||
choices = []
|
||||
token_usage = {}
|
||||
# Get the token usage from the response.
|
||||
# Includes prompt, completion, and total tokens used.
|
||||
_keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
|
||||
for _prompts in sub_prompts:
|
||||
response = self.completion_with_retry(prompt=_prompts, **params)
|
||||
choices.extend(response["choices"])
|
||||
_keys_to_use = _keys.intersection(response["usage"])
|
||||
for _key in _keys_to_use:
|
||||
if _key not in token_usage:
|
||||
token_usage[_key] = response["usage"][_key]
|
||||
else:
|
||||
token_usage[_key] += response["usage"][_key]
|
||||
return sub_prompts
|
||||

    def create_llm_result(
        self, choices: Any, prompts: List[str], token_usage: Dict[str, int]
    ) -> LLMResult:
        """Create the LLMResult from the choices and prompts."""
        generations = []
        for i, prompt in enumerate(prompts):
            sub_choices = choices[i * self.n : (i + 1) * self.n]

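That slice regroups the flat `choices` list returned by the API back into per-prompt lists; a small worked example with an assumed `n = 2` completions per prompt:

n = 2
prompts = ["first prompt", "second prompt"]
choices = ["p0-c0", "p0-c1", "p1-c0", "p1-c1"]
grouped = [choices[i * n : (i + 1) * n] for i in range(len(prompts))]
assert grouped == [["p0-c0", "p0-c1"], ["p1-c0", "p1-c1"]]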
@@ -4,8 +4,9 @@ Heavily borrowed from https://github.com/ofirpress/self-ask
"""
import os
import sys
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Tuple

import aiohttp
from pydantic import BaseModel, Extra, Field, root_validator

from langchain.utils import get_from_dict_or_env
@@ -34,6 +35,37 @@ def _get_default_params() -> dict:
    }

def process_response(res: dict) -> str:
    """Process response from SerpAPI."""
    if "error" in res.keys():
        raise ValueError(f"Got error from SerpAPI: {res['error']}")
    if "answer_box" in res.keys() and "answer" in res["answer_box"].keys():
        toret = res["answer_box"]["answer"]
    elif "answer_box" in res.keys() and "snippet" in res["answer_box"].keys():
        toret = res["answer_box"]["snippet"]
    elif (
        "answer_box" in res.keys()
        and "snippet_highlighted_words" in res["answer_box"].keys()
    ):
        toret = res["answer_box"]["snippet_highlighted_words"][0]
    elif (
        "sports_results" in res.keys()
        and "game_spotlight" in res["sports_results"].keys()
    ):
        toret = res["sports_results"]["game_spotlight"]
    elif (
        "knowledge_graph" in res.keys()
        and "description" in res["knowledge_graph"].keys()
    ):
        toret = res["knowledge_graph"]["description"]
    elif "snippet" in res["organic_results"][0].keys():
        toret = res["organic_results"][0]["snippet"]
    else:
        toret = "No good search result found"
    return toret
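A quick sketch of how `process_response` resolves its fallback chain, using hand-built dicts rather than real SerpAPI output:

assert process_response({"answer_box": {"answer": "42"}}) == "42"
assert (
    process_response({"organic_results": [{"snippet": "A snippet."}]})
    == "A snippet."
)
assert process_response({"organic_results": [{}]}) == "No good search result found"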

class SerpAPIWrapper(BaseModel):
    """Wrapper around SerpAPI.

@@ -51,11 +83,13 @@ class SerpAPIWrapper(BaseModel):
    search_engine: Any  #: :meta private:
    params: dict = Field(default_factory=_get_default_params)
    serpapi_api_key: Optional[str] = None
    aiosession: Optional[aiohttp.ClientSession] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
@@ -75,45 +109,47 @@ class SerpAPIWrapper(BaseModel):
        )
        return values

    async def arun(self, query: str) -> str:
        """Use aiohttp to run query through SerpAPI and parse result."""

        def construct_url_and_params() -> Tuple[str, Dict[str, str]]:
            params = self.get_params(query)
            params["source"] = "python"
            if self.serpapi_api_key:
                params["serp_api_key"] = self.serpapi_api_key
            params["output"] = "json"
            url = "https://serpapi.com/search"
            return url, params

        url, params = construct_url_and_params()
        if not self.aiosession:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, params=params) as response:
                    res = await response.json()
        else:
            async with self.aiosession.get(url, params=params) as response:
                res = await response.json()

        return process_response(res)
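A usage sketch for `arun` (assumes SERPAPI_API_KEY is set in the environment); passing `aiosession` lets callers reuse one connection pool across queries:

import asyncio
import aiohttp
from langchain.serpapi import SerpAPIWrapper

async def search_async() -> None:
    async with aiohttp.ClientSession() as session:
        search = SerpAPIWrapper(aiosession=session)
        print(await search.arun("What is the capital of France?"))

asyncio.run(search_async())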

    def run(self, query: str) -> str:
        """Run query through SerpAPI and parse result."""
        params = self.get_params(query)
        with HiddenPrints():
            search = self.search_engine(params)
            res = search.get_dict()
        return process_response(res)

    def get_params(self, query: str) -> Dict[str, str]:
        """Get parameters for SerpAPI."""
        _params = {
            "api_key": self.serpapi_api_key,
            "q": query,
        }
        params = {**self.params, **_params}
-        with HiddenPrints():
-            search = self.search_engine(params)
-            res = search.get_dict()
-        if "error" in res.keys():
-            raise ValueError(f"Got error from SerpAPI: {res['error']}")
-        if "answer_box" in res.keys() and "answer" in res["answer_box"].keys():
-            toret = res["answer_box"]["answer"]
-        elif "answer_box" in res.keys() and "snippet" in res["answer_box"].keys():
-            toret = res["answer_box"]["snippet"]
-        elif (
-            "answer_box" in res.keys()
-            and "snippet_highlighted_words" in res["answer_box"].keys()
-        ):
-            toret = res["answer_box"]["snippet_highlighted_words"][0]
-        elif (
-            "sports_results" in res.keys()
-            and "game_spotlight" in res["sports_results"].keys()
-        ):
-            toret = res["sports_results"]["game_spotlight"]
-        elif (
-            "knowledge_graph" in res.keys()
-            and "description" in res["knowledge_graph"].keys()
-        ):
-            toret = res["knowledge_graph"]["description"]
-        elif "snippet" in res["organic_results"][0].keys():
-            toret = res["organic_results"][0]["snippet"]
-        else:
-            toret = "No good search result found"
-        return toret
        return params


-# For backwards compatability
+# For backwards compatibility
SerpAPIChain = SerpAPIWrapper

@@ -17,40 +17,30 @@ class SQLDatabase:
        ignore_tables: Optional[List[str]] = None,
        include_tables: Optional[List[str]] = None,
        sample_rows_in_table_info: int = 0,
        # TODO: deprecate.
        sample_row_in_table_info: bool = False,
    ):
        """Create engine from database URI."""
        if sample_row_in_table_info and sample_rows_in_table_info > 0:
            raise ValueError(
                "Only one of `sample_row_in_table_info` "
                "and `sample_rows_in_table_info` should be set"
            )
        self._engine = engine
        self._schema = schema
        if include_tables and ignore_tables:
            raise ValueError("Cannot specify both include_tables and ignore_tables")

        self._inspector = inspect(self._engine)
-        self._all_tables = self._inspector.get_table_names(schema=schema)
-        self._include_tables = include_tables or []
+        self._all_tables = set(self._inspector.get_table_names(schema=schema))
+        self._include_tables = set(include_tables) if include_tables else set()
        if self._include_tables:
-            missing_tables = set(self._include_tables).difference(self._all_tables)
+            missing_tables = self._include_tables - self._all_tables
            if missing_tables:
                raise ValueError(
                    f"include_tables {missing_tables} not found in database"
                )
-        self._ignore_tables = ignore_tables or []
+        self._ignore_tables = set(ignore_tables) if ignore_tables else set()
        if self._ignore_tables:
-            missing_tables = set(self._ignore_tables).difference(self._all_tables)
+            missing_tables = self._ignore_tables - self._all_tables
            if missing_tables:
                raise ValueError(
                    f"ignore_tables {missing_tables} not found in database"
                )
        self._sample_rows_in_table_info = sample_rows_in_table_info
        # TODO: deprecate
        if sample_row_in_table_info:
            self._sample_rows_in_table_info = 1

    @classmethod
    def from_uri(cls, database_uri: str, **kwargs: Any) -> SQLDatabase:
@@ -66,7 +56,7 @@ class SQLDatabase:
        """Get names of tables available."""
        if self._include_tables:
            return self._include_tables
-        return set(self._all_tables) - set(self._ignore_tables)
+        return self._all_tables - self._ignore_tables

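Since `_all_tables` and `_ignore_tables` are now stored as sets, the difference no longer needs re-wrapping; a worked line or two:

all_tables = {"users", "orders", "logs"}
ignore_tables = {"logs"}
# Equivalent to the old set(all_tables) - set(ignore_tables):
assert all_tables - ignore_tables == {"users", "orders"}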
    @property
    def table_info(self) -> str:
@@ -91,7 +81,6 @@ class SQLDatabase:

        tables = []
        for table_name in all_table_names:
            columns = []
            for column in self._inspector.get_columns(table_name, schema=self._schema):
                columns.append(f"{column['name']} ({str(column['type'])})")

@@ -3,7 +3,17 @@ from __future__ import annotations

import logging
from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List, Optional
+from typing import (
+    AbstractSet,
+    Any,
+    Callable,
+    Collection,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Union,
+)

from langchain.docstore.document import Document

@@ -114,7 +124,11 @@ class TextSplitter(ABC):

    @classmethod
    def from_tiktoken_encoder(
-        cls, encoding_name: str = "gpt2", **kwargs: Any
+        cls,
+        encoding_name: str = "gpt2",
+        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        **kwargs: Any,
    ) -> TextSplitter:
        """Text splitter that uses tiktoken encoder to count length."""
        try:
@@ -125,11 +139,19 @@ class TextSplitter(ABC):
                "This is needed in order to calculate max_tokens_for_prompt. "
                "Please install it with `pip install tiktoken`."
            )

        # create a GPT-3 encoder instance
        enc = tiktoken.get_encoding(encoding_name)

-        def _tiktoken_encoder(text: str) -> int:
-            return len(enc.encode(text))
+        def _tiktoken_encoder(text: str, **kwargs: Any) -> int:
+            return len(
+                enc.encode(
+                    text,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                    **kwargs,
+                )
+            )

        return cls(length_function=_tiktoken_encoder, **kwargs)

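A usage sketch for the new parameters (assumes `pip install tiktoken`); `allowed_special` whitelists special tokens that tiktoken would otherwise reject when encoding:

from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="gpt2",
    allowed_special={"<|endoftext|>"},
    chunk_size=100,
    chunk_overlap=0,
)
chunks = splitter.split_text("foo bar baz <|endoftext|>")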
@@ -155,7 +177,13 @@ class CharacterTextSplitter(TextSplitter):
class TokenTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at tokens."""

-    def __init__(self, encoding_name: str = "gpt2", **kwargs: Any):
+    def __init__(
+        self,
+        encoding_name: str = "gpt2",
+        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        **kwargs: Any,
+    ):
        """Create a new TextSplitter."""
        super().__init__(**kwargs)
        try:
@@ -168,11 +196,17 @@ class TokenTextSplitter(TextSplitter):
            )
        # create a GPT-3 encoder instance
        self._tokenizer = tiktoken.get_encoding(encoding_name)
        self._allowed_special = allowed_special
        self._disallowed_special = disallowed_special

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        splits = []
-        input_ids = self._tokenizer.encode(text)
+        input_ids = self._tokenizer.encode(
+            text,
+            allowed_special=self._allowed_special,
+            disallowed_special=self._disallowed_special,
+        )
        start_idx = 0
        cur_idx = min(start_idx + self._chunk_size, len(input_ids))
        chunk_ids = input_ids[start_idx:cur_idx]

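The indices above set up a sliding token window; a minimal illustration with an assumed `_chunk_size` of 4 and no overlap:

input_ids = list(range(10))  # stand-in for real token ids
chunk_size = 4
start_idx = 0
cur_idx = min(start_idx + chunk_size, len(input_ids))
assert input_ids[start_idx:cur_idx] == [0, 1, 2, 3]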
poetry.lock (generated, 1283 lines changed): file diff suppressed because it is too large.
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain"
-version = "0.0.79"
+version = "0.0.83"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@@ -37,6 +37,15 @@ qdrant-client = {version = "^0.11.7", optional = true}
dataclasses-json = "^0.5.7"
tensorflow-text = {version = "^2.11.0", optional = true, python = "^3.10, <3.12"}
tenacity = "^8.1.0"
cohere = {version = "^3", optional = true}
openai = {version = "^0", optional = true}
nlpcloud = {version = "^1", optional = true}
huggingface_hub = {version = "^0", optional = true}
google-search-results = {version = "^2", optional = true}
sentence-transformers = {version = "^2", optional = true}
aiohttp = "^3.8.3"
pypdf = {version = "^3.4.0", optional = true}


[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
@@ -60,14 +69,15 @@ duckdb-engine = "^0.6.6"
pytest-watcher = "^0.2.6"
freezegun = "^1.2.2"
responses = "^0.22.0"
pytest-asyncio = "^0.20.3"

[tool.poetry.group.lint.dependencies]
flake8-docstrings = "^1.6.0"
-black = "^22.10.0"
isort = "^5.10.1"
flake8 = "^6.0.0"
types-toml = "^0.10.8.1"
types-redis = "^4.3.21.6"
+black = "^23.1.0"

[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
@@ -83,7 +93,7 @@ playwright = "^1.28.0"

[tool.poetry.extras]
llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
-all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text"]
+all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf"]

[tool.isort]
profile = "black"

@@ -8,7 +8,18 @@ def test_openai_embedding_documents() -> None:
    embedding = OpenAIEmbeddings()
    output = embedding.embed_documents(documents)
    assert len(output) == 1
-    assert len(output[0]) == 2048
+    assert len(output[0]) == 1536


def test_openai_embedding_documents_multiple() -> None:
    """Test openai embeddings."""
    documents = ["foo bar", "bar foo", "foo"]
    embedding = OpenAIEmbeddings()
    output = embedding.embed_documents(documents, chunk_size=2)
    assert len(output) == 3
    assert len(output[0]) == 1536
    assert len(output[1]) == 1536
    assert len(output[2]) == 1536


def test_openai_embedding_query() -> None:
@@ -16,4 +27,4 @@ def test_openai_embedding_query() -> None:
    document = "foo bar"
    embedding = OpenAIEmbeddings()
    output = embedding.embed_query(document)
-    assert len(output) == 2048
+    assert len(output) == 1536

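The 2048 to 1536 change tracks the embedding model's output dimensionality; a usage sketch mirroring the new test (assumes OPENAI_API_KEY is set):

from langchain.embeddings.openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()
# chunk_size=2 batches the three documents into two API calls.
vectors = embedding.embed_documents(["foo bar", "bar foo", "foo"], chunk_size=2)
assert len(vectors) == 3 and all(len(v) == 1536 for v in vectors)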
BIN tests/integration_tests/examples/hello.pdf (new file): binary file not shown.
@@ -7,6 +7,7 @@ import pytest

from langchain.llms.loading import load_llm
from langchain.llms.openai import OpenAI
+from langchain.schema import LLMResult


def test_openai_call() -> None:
@@ -74,3 +75,11 @@ def test_openai_streaming_error() -> None:
    llm = OpenAI(best_of=2)
    with pytest.raises(ValueError):
        llm.stream("I'm Pickle Rick")


@pytest.mark.asyncio
async def test_openai_async_generate() -> None:
    """Test async generation."""
    llm = OpenAI(max_tokens=10)
    output = await llm.agenerate(["Hello, how are you?"])
    assert isinstance(output, LLMResult)

tests/integration_tests/test_pdf_pagesplitter.py (new file, 19 lines)
@@ -0,0 +1,19 @@
"""Test splitting with page numbers included."""
import os

from langchain.document_loaders import PagedPDFSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS


def test_pdf_pagesplitter() -> None:
    """Test splitting with page numbers included."""
    script_dir = os.path.dirname(__file__)
    loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
    docs = loader.load()
    assert "page" in docs[0].metadata
    assert "source" in docs[0].metadata

    faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
    docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
    assert "Hello world" in docs[0].page_content
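A usage sketch for the loader exercised above (assumes `pip install pypdf` and a local PDF at the given path):

from langchain.document_loaders import PagedPDFSplitter

loader = PagedPDFSplitter("tests/integration_tests/examples/hello.pdf")
docs = loader.load()
# Each document carries its page number and source path in metadata.
print(docs[0].metadata["page"], docs[0].metadata["source"])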
@@ -33,6 +33,11 @@ class FakeLLM(BaseLLM, BaseModel):
    ) -> LLMResult:
        return LLMResult(generations=[[Generation(text="foo") for _ in range(self.n)]])

    async def _agenerate(
        self, prompts: List[str], stop: Optional[List[str]] = None
    ) -> LLMResult:
        return LLMResult(generations=[[Generation(text="foo") for _ in range(self.n)]])

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""

@@ -1,6 +1,10 @@
"""Test base LLM functionality."""
from sqlalchemy import Column, Integer, Sequence, String, create_engine
-from sqlalchemy.orm import declarative_base

+try:
+    from sqlalchemy.orm import declarative_base
+except ImportError:
+    from sqlalchemy.ext.declarative import declarative_base

import langchain
from langchain.cache import InMemoryCache, SQLAlchemyCache