mirror of
https://github.com/hwchase17/langchain.git
synced 2026-02-07 17:50:35 +00:00
Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6d44a2285c | ||
|
|
0998577dfe | ||
|
|
bbb06ca4cf | ||
|
|
0b6aa6a024 | ||
|
|
10e7297306 | ||
|
|
e51fad1488 | ||
|
|
b7747017d7 | ||
|
|
2e96704d59 | ||
|
|
e9799d6821 | ||
|
|
c2d1d903fa | ||
|
|
055a53c27f | ||
|
|
231da14771 | ||
|
|
6ab432d62e | ||
|
|
07a407d89a | ||
|
|
c64f98e2bb | ||
|
|
5469d898a9 | ||
|
|
3d639d1539 | ||
|
|
91c6cea227 | ||
|
|
ba54d36787 | ||
|
|
5f8082bdd7 | ||
|
|
512c523368 | ||
|
|
e323d0cfb1 | ||
|
|
01fa2d8117 | ||
|
|
8e126bc9bd | ||
|
|
c71027e725 | ||
|
|
e85c53ce68 | ||
|
|
3e1901e1aa | ||
|
|
6a4f602156 | ||
|
|
6023d5be09 |
@@ -32,3 +32,8 @@ It implements a Question Answering app and contains instructions for deploying t
|
||||
## [Vercel](https://github.com/homanp/vercel-langchain)
|
||||
|
||||
A minimal example on how to run LangChain on Vercel using Flask.
|
||||
|
||||
|
||||
## [SteamShip](https://github.com/steamship-core/steamship-langchain/)
|
||||
This repository contains LangChain adapters for Steamship, enabling LangChain developers to rapidly deploy their apps on Steamship.
|
||||
This includes: production ready endpoints, horizontal scaling across dependencies, persistant storage of app state, multi-tenancy support, etc.
|
||||
|
||||
@@ -166,7 +166,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -33,7 +33,6 @@ def run_cmd(cmd: str, _crawler: Crawler) -> None:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
objective = "Make a reservation for 2 at 7pm at bistro vida in menlo park"
|
||||
print("\nWelcome to natbot! What is your objective?")
|
||||
i = input()
|
||||
|
||||
@@ -21,28 +21,83 @@
|
||||
"from langchain.vectorstores.faiss import FAISS\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain.chains import ChatVectorDBChain"
|
||||
"from langchain.chains import ChatVectorDBChain\n",
|
||||
"from langchain.document_loaders import TextLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cdff94be",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Load in documents. You can replace this with a loader for whatever type of data you want"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 2,
|
||||
"id": "01c46e92",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = TextLoader('../../state_of_the_union.txt')\n",
|
||||
"documents = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e9be4779",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you had multiple loaders that you wanted to combine, you do something like:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "433363a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# loaders = [....]\n",
|
||||
"# docs = []\n",
|
||||
"# for loader in loaders:\n",
|
||||
"# docs.extend(loader.load())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "239475d2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We now split the documents, create embeddings for them, and put them in a vectorstore. This allows us to do semantic search over them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "a8930cf7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('../../state_of_the_union.txt') as f:\n",
|
||||
" state_of_the_union = f.read()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"texts = text_splitter.split_text(state_of_the_union)\n",
|
||||
"documents = text_splitter.split_documents(documents)\n",
|
||||
"\n",
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"vectorstore = FAISS.from_texts(texts, embeddings)"
|
||||
"vectorstore = FAISS.from_documents(documents, embeddings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3c96b118",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We now initialize the ChatVectorDBChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"id": "7b4110f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -157,7 +212,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
171
docs/modules/document_loaders/examples/airbyte_json.ipynb
Normal file
171
docs/modules/document_loaders/examples/airbyte_json.ipynb
Normal file
@@ -0,0 +1,171 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1f3a5ebf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Airbyte JSON\n",
|
||||
"This covers how to load any source from Airbyte into a local JSON file that can be read in as a document\n",
|
||||
"\n",
|
||||
"Prereqs:\n",
|
||||
"Have docker desktop installed\n",
|
||||
"\n",
|
||||
"Steps:\n",
|
||||
"\n",
|
||||
"1) Clone Airbyte from GitHub - `git clone https://github.com/airbytehq/airbyte.git`\n",
|
||||
"\n",
|
||||
"2) Switch into Airbyte directory - `cd airbyte`\n",
|
||||
"\n",
|
||||
"3) Start Airbyte - `docker compose up`\n",
|
||||
"\n",
|
||||
"4) In your browser, just visit http://localhost:8000. You will be asked for a username and password. By default, that's username `airbyte` and password `password`.\n",
|
||||
"\n",
|
||||
"5) Setup any source you wish.\n",
|
||||
"\n",
|
||||
"6) Set destination as Local JSON, with specified destination path - lets say `/json_data`. Set up manual sync.\n",
|
||||
"\n",
|
||||
"7) Run the connection!\n",
|
||||
"\n",
|
||||
"7) To see what files are create, you can navigate to: `file:///tmp/airbyte_local`\n",
|
||||
"\n",
|
||||
"8) Find your data and copy path. That path should be saved in the file variable below. It should start with `/tmp/airbyte_local`\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "180c8b74",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import AirbyteJSONLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4af10665",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"_airbyte_raw_pokemon.jsonl\r\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!ls /tmp/airbyte_local/json_data/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "721d9316",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = AirbyteJSONLoader('/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "9858b946",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "fca024cb",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"abilities: \n",
|
||||
"ability: \n",
|
||||
"name: blaze\n",
|
||||
"url: https://pokeapi.co/api/v2/ability/66/\n",
|
||||
"\n",
|
||||
"is_hidden: False\n",
|
||||
"slot: 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ability: \n",
|
||||
"name: solar-power\n",
|
||||
"url: https://pokeapi.co/api/v2/ability/94/\n",
|
||||
"\n",
|
||||
"is_hidden: True\n",
|
||||
"slot: 3\n",
|
||||
"\n",
|
||||
"base_experience: 267\n",
|
||||
"forms: \n",
|
||||
"name: charizard\n",
|
||||
"url: https://pokeapi.co/api/v2/pokemon-form/6/\n",
|
||||
"\n",
|
||||
"game_indices: \n",
|
||||
"game_index: 180\n",
|
||||
"version: \n",
|
||||
"name: red\n",
|
||||
"url: https://pokeapi.co/api/v2/version/1/\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"game_index: 180\n",
|
||||
"version: \n",
|
||||
"name: blue\n",
|
||||
"url: https://pokeapi.co/api/v2/version/2/\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"game_index: 180\n",
|
||||
"version: \n",
|
||||
"n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(data[0].page_content[:500])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9fa002a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
93
docs/modules/document_loaders/examples/azlyrics.ipynb
Normal file
93
docs/modules/document_loaders/examples/azlyrics.ipynb
Normal file
@@ -0,0 +1,93 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9c31caff",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# AZLyrics\n",
|
||||
"This covers how to load AZLyrics webpages into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "7e6f5726",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import AZLyricsLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a0df4c24",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = AZLyricsLoader(\"https://www.azlyrics.com/lyrics/mileycyrus/flowers.html\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8cd61b6e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "162fd286",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content=\"Miley Cyrus - Flowers Lyrics | AZLyrics.com\\n\\r\\nWe were good, we were gold\\nKinda dream that can't be sold\\nWe were right till we weren't\\nBuilt a home and watched it burn\\n\\nI didn't wanna leave you\\nI didn't wanna lie\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\n\\nPaint my nails, cherry red\\nMatch the roses that you left\\nNo remorse, no regret\\nI forgive every word you said\\n\\nI didn't wanna leave you, baby\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours, yeah\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\\nI didn't wanna wanna leave you\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours (Yeah)\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than\\nYeah, I can love me better than you can, uh\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby (Than you can)\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\", lookup_str='', metadata={'source': 'https://www.azlyrics.com/lyrics/mileycyrus/flowers.html'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6358000c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -61,10 +61,61 @@
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8bf50cba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "b9592eaf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredEmailLoader('example_data/fake-email.eml', mode=\"elements\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0b16d03f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "d7bdc5e5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='This is a test email to use for unit tests.', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4ef9a5f4",
|
||||
"id": "6a074515",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@@ -86,7 +137,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
80
docs/modules/document_loaders/examples/everynote.ipynb
Normal file
80
docs/modules/document_loaders/examples/everynote.ipynb
Normal file
@@ -0,0 +1,80 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "56ac1584",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# EveryNote\n",
|
||||
"\n",
|
||||
"How to load EveryNote file from disk."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "1a53ece0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install pypandoc\n",
|
||||
"# import pypandoc\n",
|
||||
"\n",
|
||||
"# pypandoc.download_pandoc()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "88df766f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='testing this\\n\\nwhat happens?\\n\\nto the world?\\n', lookup_str='', metadata={'source': 'example_data/testing.enex'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.document_loaders import EveryNoteLoader\n",
|
||||
"\n",
|
||||
"loader = EveryNoteLoader(\"example_data/testing.enex\")\n",
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c1329905",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
|
||||
<en-export export-date="20230309T035336Z" application="Evernote" version="10.53.2">
|
||||
<note>
|
||||
<title>testing</title>
|
||||
<created>20230209T034746Z</created>
|
||||
<updated>20230209T035328Z</updated>
|
||||
<note-attributes>
|
||||
<author>Harrison Chase</author>
|
||||
</note-attributes>
|
||||
<content>
|
||||
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>testing this</div><div>what happens?</div><div>to the world?</div></en-note> ]]>
|
||||
</content>
|
||||
</note>
|
||||
</en-export>
|
||||
83
docs/modules/document_loaders/examples/gutenberg.ipynb
Normal file
83
docs/modules/document_loaders/examples/gutenberg.ipynb
Normal file
@@ -0,0 +1,83 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bda1f3f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Gutenberg\n",
|
||||
"\n",
|
||||
"This covers how to load links to Gutenberg e-books into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "9bfd5e46",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import GutenbergLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "700e4ef2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = GutenbergLoader('https://www.gutenberg.org/cache/epub/69972/pg69972.txt')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "b6f28930",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7d436441",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3b74d755",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
94
docs/modules/document_loaders/examples/imsdb.ipynb
Normal file
94
docs/modules/document_loaders/examples/imsdb.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -61,10 +61,61 @@
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5d1472e9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "93abf60b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredDocxLoader('example_data/fake.docx', mode=\"elements\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "c35cdbcc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "fae2d730",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "61953c83",
|
||||
"id": "961a7b1d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@@ -86,7 +137,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
130
docs/modules/document_loaders/examples/online_pdf.ipynb
Normal file
130
docs/modules/document_loaders/examples/online_pdf.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -10,9 +10,136 @@
|
||||
"This covers how to load pdfs into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "743f9413",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using PyPDF\n",
|
||||
"\n",
|
||||
"Allows for tracking of page numbers as well."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c428b0c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PagedPDFSplitter\n",
|
||||
"\n",
|
||||
"loader = PagedPDFSplitter(\"example_data/layout-parser-paper.pdf\")\n",
|
||||
"pages = loader.load_and_split()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ebd895e4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An advantage of this approach is that documents can be retrieved with page numbers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "87fa7b3a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"9: 10 Z. Shen et al.\n",
|
||||
"Fig. 4: Illustration of (a) the original historical Japanese document with layout\n",
|
||||
"detection results and (b) a recreated version of the document image that achieves\n",
|
||||
"much better character recognition recall. The reorganization algorithm rearranges\n",
|
||||
"the tokens based on the their detected bounding boxes given a maximum allowed\n",
|
||||
"height.\n",
|
||||
"4LayoutParser Community Platform\n",
|
||||
"Another focus of LayoutParser is promoting the reusability of layout detection\n",
|
||||
"models and full digitization pipelines. Similar to many existing deep learning\n",
|
||||
"libraries, LayoutParser comes with a community model hub for distributing\n",
|
||||
"layout models. End-users can upload their self-trained models to the model hub,\n",
|
||||
"and these models can be loaded into a similar interface as the currently available\n",
|
||||
"LayoutParser pre-trained models. For example, the model trained on the News\n",
|
||||
"Navigator dataset [17] has been incorporated in the model hub.\n",
|
||||
"Beyond DL models, LayoutParser also promotes the sharing of entire doc-\n",
|
||||
"ument digitization pipelines. For example, sometimes the pipeline requires the\n",
|
||||
"combination of multiple DL models to achieve better accuracy. Currently, pipelines\n",
|
||||
"are mainly described in academic papers and implementations are often not pub-\n",
|
||||
"licly available. To this end, the LayoutParser community platform also enables\n",
|
||||
"the sharing of layout pipelines to promote the discussion and reuse of techniques.\n",
|
||||
"For each shared pipeline, it has a dedicated project page, with links to the source\n",
|
||||
"code, documentation, and an outline of the approaches. A discussion panel is\n",
|
||||
"provided for exchanging ideas. Combined with the core LayoutParser library,\n",
|
||||
"users can easily build reusable components based on the shared pipelines and\n",
|
||||
"apply them to solve their unique problems.\n",
|
||||
"5 Use Cases\n",
|
||||
"The core objective of LayoutParser is to make it easier to create both large-scale\n",
|
||||
"and light-weight document digitization pipelines. Large-scale document processing\n",
|
||||
"3: 4 Z. Shen et al.\n",
|
||||
"Efficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images \n",
|
||||
"T h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e\n",
|
||||
"Fig. 1: The overall architecture of LayoutParser . For an input document image,\n",
|
||||
"the core LayoutParser library provides a set of o\u000b",
|
||||
"-the-shelf tools for layout\n",
|
||||
"detection, OCR, visualization, and storage, backed by a carefully designed layout\n",
|
||||
"data structure. LayoutParser also supports high level customization via e\u000ecient\n",
|
||||
"layout annotation and model training functions. These improve model accuracy\n",
|
||||
"on the target samples. The community platform enables the easy sharing of DIA\n",
|
||||
"models and whole digitization pipelines to promote reusability and reproducibility.\n",
|
||||
"A collection of detailed documentation, tutorials and exemplar projects make\n",
|
||||
"LayoutParser easy to learn and use.\n",
|
||||
"AllenNLP [ 8] and transformers [ 34] have provided the community with complete\n",
|
||||
"DL-based support for developing and deploying models for general computer\n",
|
||||
"vision and natural language processing problems. LayoutParser , on the other\n",
|
||||
"hand, specializes speci\f",
|
||||
"cally in DIA tasks. LayoutParser is also equipped with a\n",
|
||||
"community platform inspired by established model hubs such as Torch Hub [23]\n",
|
||||
"andTensorFlow Hub [1]. It enables the sharing of pretrained models as well as\n",
|
||||
"full document processing pipelines that are unique to DIA tasks.\n",
|
||||
"There have been a variety of document data collections to facilitate the\n",
|
||||
"development of DL models. Some examples include PRImA [ 3](magazine layouts),\n",
|
||||
"PubLayNet [ 38](academic paper layouts), Table Bank [ 18](tables in academic\n",
|
||||
"papers), Newspaper Navigator Dataset [ 16,17](newspaper \f",
|
||||
"gure layouts) and\n",
|
||||
"HJDataset [31](historical Japanese document layouts). A spectrum of models\n",
|
||||
"trained on these datasets are currently available in the LayoutParser model zoo\n",
|
||||
"to support di\u000b",
|
||||
"erent use cases.\n",
|
||||
"3 The Core LayoutParser Library\n",
|
||||
"At the core of LayoutParser is an o\u000b",
|
||||
"-the-shelf toolkit that streamlines DL-\n",
|
||||
"based document image analysis. Five components support a simple interface\n",
|
||||
"with comprehensive functionalities: 1) The layout detection models enable using\n",
|
||||
"pre-trained or self-trained DL models for layout detection with just four lines\n",
|
||||
"of code. 2) The detected layout information is stored in carefully engineered\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.vectorstores import FAISS\n",
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"\n",
|
||||
"faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())\n",
|
||||
"docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n",
|
||||
"for doc in docs:\n",
|
||||
" print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "09d64998",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Unstructured"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "0cc0cd42",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -22,7 +149,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 4,
|
||||
"id": "082d557c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -32,8 +159,38 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "5c41106f",
|
||||
"execution_count": null,
|
||||
"id": "df11c953",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "09957371",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0fab833b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\", mode=\"elements\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c3e8ff1b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -43,7 +200,55 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "54fb6b62",
|
||||
"id": "43c23d2d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "21998d18",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using PDFMiner"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "2f0cc9ff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PDFMinerLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "42b531e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "010d5cdd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7301c473",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@@ -65,7 +270,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -61,10 +61,61 @@
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "525d6b67",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "064f9162",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\", mode=\"elements\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "abefbbdb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "a547c534",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='Adding a Bullet Slide', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0c55f1cf",
|
||||
"id": "381d4139",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@@ -86,7 +137,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -12,6 +12,40 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "2886982e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # Install package\n",
|
||||
"# !pip install unstructured"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "54d62efd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # Install other dependencies\n",
|
||||
"# # https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst\n",
|
||||
"# !brew install libmagic"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "af6a64f5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import nltk\n",
|
||||
"# nltk.download('punkt')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "79d3e549",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -21,7 +55,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 5,
|
||||
"id": "2593d1dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -31,7 +65,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 6,
|
||||
"id": "fe34e941",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -39,10 +73,86 @@
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "ee449788",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\\n\\nLast year COVID-19 kept us apart. This year we are finally together again.\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\\n\\nWith a duty to one another to the American people to the Constit'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0].page_content[:400]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7874d01d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "ff5b616d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\", mode=\"elements\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "feca3b6c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "fec5bbac",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
|
||||
" Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
|
||||
" Document(page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
|
||||
" Document(page_content='With a duty to one another to the American people to the Constitution.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
|
||||
" Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[:5]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "24e577e5",
|
||||
"id": "8ca8a648",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@@ -64,7 +174,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
78
docs/modules/document_loaders/examples/url.ipynb
Normal file
78
docs/modules/document_loaders/examples/url.ipynb
Normal file
@@ -0,0 +1,78 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2dfc4698",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# URL\n",
|
||||
"\n",
|
||||
"This covers how to load HTML documents from a list of URLs into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "16c3699e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
" from langchain.document_loaders import UnstructuredURLLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "836fbac1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"urls = [\n",
|
||||
" \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023\",\n",
|
||||
" \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023\"\n",
|
||||
"]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "00f46fda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredURLLoader(urls=urls)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "b68a26b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
117
docs/modules/document_loaders/examples/web_base.ipynb
Normal file
117
docs/modules/document_loaders/examples/web_base.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -39,7 +39,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\")"
|
||||
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -51,7 +51,7 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg'}, lookup_index=0)]"
|
||||
"[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'First-time host Pedro Pascal talks about filming The Last of Us and being recognized by fans.\\n\\nSaturday Night Live. Stream now on Peacock: https://pck.tv/3uQxh4q\\n\\nSubscribe to SNL: https://goo.gl/tUsXwM\\nStream Current Full Episodes: http://www.nbc.com/saturday-night-live\\n\\nWATCH PAST SNL SEASONS\\nGoogle Play - http://bit.ly/SNLGooglePlay\\niTunes - http://bit.ly/SNLiTunes\\n\\nSNL ON SOCIAL\\nSNL Instagram: http://instagram.com/nbcsnl\\nSNL Facebook: https://www.facebook.com/snl\\nSNL Twitter: https://twitter.com/nbcsnl\\nSNL TikTok: https://www.tiktok.com/@nbcsnl\\n\\nGET MORE NBC\\nLike NBC: http://Facebook.com/NBC\\nFollow NBC: http://Twitter.com/NBC\\nNBC Tumblr: http://NBCtv.tumblr.com/\\nYouTube: http://www.youtube.com/nbc\\nNBC Instagram: http://instagram.com/nbc\\n\\n#SNL #PedroPascal #SNL48 #Coldplay', 'view_count': 1175057, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/sddefault.jpg', 'publish_date': datetime.datetime(2023, 2, 4, 0, 0), 'length': 224, 'author': 'Saturday Night Live'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
@@ -62,6 +62,55 @@
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6b278a1b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Add video info"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "ba28af69",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ! pip install pytube"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "9b8ea390",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "97b98e92",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED \"THE LAST OF US\" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE \"GAME OF THRONES\" AND \"STAR WARS,\" BUT I\\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES \"THE MANDALORIAN\" AND THE NEXT THING I KNOW, I\\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS PORNY. PEOPLE WALKING BY ON THE STREET SEE ME WHISPERING TO A 6-YEAR-OLD KID. I CAN BRING YOU IN WARM, OR I CAN BRING YOU IN COLD. EVEN THOUGH I CAME TO THE U.S. WHEN I WAS LITTLE, I WAS BORN IN CHILE, AND I HAVE 34 FIRST COUSINS WHO ARE STILL THERE. THEY\\'RE VERY PROUD OF ME. I KNOW THEY\\'RE PROUD BECAUSE THEY GIVE MY PHONE NUMBER TO EVERY PERSON THEY MEET, WHICH MEANS EVERY DAY, SOMEONE IN SANTIAGO WILL TEXT ME STUFF LIKE, CAN YOU COME TO MY WEDDING, OR CAN YOU SING MY PRIEST HAPPY BIRTHDAY, OR IS BABY YODA MEAN IN REAL LIFE. SO I HAVE TO BE LIKE NO, NO, AND HIS NAME IS GROGU. BUT MY COUSINS WEREN\\'T ALWAYS SO PROUD. EARLY IN MY CAREER, I PLAYED SMALL PARTS IN EVERY CRIME SHOW. I EVEN PLAYED TWO DIFFERENT CHARACTERS ON \"LAW AND ORDER.\" TITO CABASSA WHO LOOKED LIKE THIS. AND ONE YEAR LATER, I PLAYED REGGIE LUCKMAN WHO LOOKS LIKE THIS. AND THAT, MY FRIENDS, IS CALLED RANGE. BUT IT IS AMAZING TO BE HERE, LIKE I SAID. I WAS BORN IN CHILE, AND NINE MONTHS LATER, MY PARENTS FLED AND BROUGHT ME AND MY SISTER TO THE U.S. THEY WERE SO BRAVE, AND WITHOUT THEM, I WOULDN\\'T BE HERE IN THIS WONDERFUL COUNTRY, AND I CERTAINLY WOULDN\\'T BE STANDING HERE WITH YOU ALL TONIGHT. SO TO ALL MY FAMILY WATCHING IN CHILE, I WANT TO SAY [ SPEAKING NON-ENGLISH ] WHICH MEANS, I LOVE YOU, I MISS YOU, AND STOP GIVING OUT MY PHONE NUMBER. WE\\'VE GOT AN AMAZING SHOW FOR YOU TONIGHT. COLDPLAY IS HERE, SO STICK', lookup_str='', metadata={'source': 'QsYGlZkevEg', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'First-time host Pedro Pascal talks about filming The Last of Us and being recognized by fans.\\n\\nSaturday Night Live. Stream now on Peacock: https://pck.tv/3uQxh4q\\n\\nSubscribe to SNL: https://goo.gl/tUsXwM\\nStream Current Full Episodes: http://www.nbc.com/saturday-night-live\\n\\nWATCH PAST SNL SEASONS\\nGoogle Play - http://bit.ly/SNLGooglePlay\\niTunes - http://bit.ly/SNLiTunes\\n\\nSNL ON SOCIAL\\nSNL Instagram: http://instagram.com/nbcsnl\\nSNL Facebook: https://www.facebook.com/snl\\nSNL Twitter: https://twitter.com/nbcsnl\\nSNL TikTok: https://www.tiktok.com/@nbcsnl\\n\\nGET MORE NBC\\nLike NBC: http://Facebook.com/NBC\\nFollow NBC: http://Twitter.com/NBC\\nNBC Tumblr: http://NBCtv.tumblr.com/\\nYouTube: http://www.youtube.com/nbc\\nNBC Instagram: http://instagram.com/nbc\\n\\n#SNL #PedroPascal #SNL48 #Coldplay', 'view_count': 1175057, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/sddefault.jpg', 'publish_date': datetime.datetime(2023, 2, 4, 0, 0), 'length': 224, 'author': 'Saturday Night Live'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -80,7 +129,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -27,6 +27,8 @@ There are a lot of different document loaders that LangChain supports. Below are
|
||||
|
||||
`Roam <./examples/roam.html>`_: A walkthrough of how to load data from a Roam file export.
|
||||
|
||||
`EveryNote <./examples/everynote.html>`_: A walkthrough of how to load data from a EveryNote (`.enex`) file.
|
||||
|
||||
`YouTube <./examples/youtube.html>`_: A walkthrough of how to load the transcript from a YouTube video.
|
||||
|
||||
`s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3.
|
||||
@@ -37,6 +39,19 @@ There are a lot of different document loaders that LangChain supports. Below are
|
||||
|
||||
`GCS Directory <./examples/gcs_directory.html>`_: A walkthrough of how to load all files in a directory from Google Cloud Storage (GCS).
|
||||
|
||||
`Web Base <./examples/web_base.html>`_: A walkthrough of how to load all text data from webpages.
|
||||
|
||||
`IMSDb <./examples/imsdb.html>`_: A walkthrough of how to load all text data from IMSDb webpage.
|
||||
|
||||
`AZLyrics <./examples/azlyrics.html>`_: A walkthrough of how to load all text data from AZLyrics webpage.
|
||||
|
||||
`College Confidential <./examples/college_confidential.html>`_: A walkthrough of how to load all text data from College Confidential webpage.
|
||||
|
||||
`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.
|
||||
|
||||
`Airbyte Json <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.
|
||||
|
||||
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
138
docs/modules/llms/examples/fake_llm.ipynb
Normal file
138
docs/modules/llms/examples/fake_llm.ipynb
Normal file
@@ -0,0 +1,138 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "052dfe58",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fake LLM\n",
|
||||
"We expose a fake LLM class that can be used for testing. This allows you to mock out calls to the LLM and simulate what would happen if the LLM responded in a certain way.\n",
|
||||
"\n",
|
||||
"In this notebook we go over how to use this.\n",
|
||||
"\n",
|
||||
"We start this with using the FakeLLM in an agent."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "ef97ac4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.llms.fake import FakeListLLM"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "9a0a160f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.agents import load_tools\n",
|
||||
"from langchain.agents import initialize_agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "b272258c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tools = load_tools([\"python_repl\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "94096c4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"responses=[\n",
|
||||
" \"Action: Python REPL\\nAction Input: print(2 + 2)\",\n",
|
||||
" \"Final Answer: 4\"\n",
|
||||
"]\n",
|
||||
"llm = FakeListLLM(responses=responses)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "da226d02",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"agent = initialize_agent(tools, llm, agent=\"zero-shot-react-description\", verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "44c13426",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3mAction: Python REPL\n",
|
||||
"Action Input: print(2 + 2)\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m4\n",
|
||||
"\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3mFinal Answer: 4\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'4'"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(\"whats 2 + 2\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "814c2858",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -11,6 +11,8 @@ The examples here all address certain "how-to" guides for working with LLMs.
|
||||
|
||||
`Token Usage Tracking <./examples/token_usage_tracking.html>`_: How to track the token usage of various chains/agents/LLM calls.
|
||||
|
||||
`Fake LLM <./examples/fake_llm.html>`_: How to create and use a fake LLM for testing and debugging purposes.
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
@@ -5,9 +5,9 @@
|
||||
"id": "959300d4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# HuggingFace Hub\n",
|
||||
"# Hugging Face Hub\n",
|
||||
"\n",
|
||||
"This example showcases how to connect to the HuggingFace Hub."
|
||||
"This example showcases how to connect to the Hugging Face Hub."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -20,7 +20,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The Seattle Seahawks won the Super Bowl in 2010. Justin Beiber was born in 2010. The\n"
|
||||
"The Seattle Seahawks won the Super Bowl in 2010. Justin Beiber was born in 2010. The final answer: Seattle Seahawks.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -31,7 +31,7 @@
|
||||
"\n",
|
||||
"Answer: Let's think step by step.\"\"\"\n",
|
||||
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n",
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=HuggingFaceHub(repo_id=\"google/flan-t5-xl\", model_kwargs={\"temperature\":1e-10}))\n",
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=HuggingFaceHub(repo_id=\"google/flan-t5-xl\", model_kwargs={\"temperature\":0, \"max_length\":64}))\n",
|
||||
"\n",
|
||||
"question = \"What NFL team won the Super Bowl in the year Justin Beiber was born?\"\n",
|
||||
"\n",
|
||||
|
||||
@@ -77,7 +77,7 @@
|
||||
" memory=ConversationalBufferWindowMemory(k=2),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"output = chatgpt_chain.predict(human_input=\"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\")\n",
|
||||
"output = chatgpt_chain.predict(human_input=\"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\")\n",
|
||||
"print(output)"
|
||||
]
|
||||
},
|
||||
@@ -103,7 +103,7 @@
|
||||
"\n",
|
||||
"Overall, Assistant is a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.\n",
|
||||
"\n",
|
||||
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
|
||||
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
|
||||
"AI: \n",
|
||||
"```\n",
|
||||
"$ pwd\n",
|
||||
@@ -148,7 +148,7 @@
|
||||
"\n",
|
||||
"Overall, Assistant is a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.\n",
|
||||
"\n",
|
||||
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
|
||||
"Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
|
||||
"AI: \n",
|
||||
"```\n",
|
||||
"$ pwd\n",
|
||||
@@ -915,14 +915,14 @@
|
||||
" \"response\": \"Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning (the acquisition of information and rules for using the information), reasoning (using the rules to reach approximate or definite conclusions) and self-correction. AI is used to develop computer systems that can think and act like humans.\"\n",
|
||||
"}\n",
|
||||
"```\n",
|
||||
"Human: curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
|
||||
"Human: curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
|
||||
"Assistant:\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"$ curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
|
||||
"$ curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"response\": \"```\\n/current/working/directory\\n```\"\n",
|
||||
@@ -932,7 +932,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"output = chatgpt_chain.predict(human_input=\"\"\"curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\"\"\")\n",
|
||||
"output = chatgpt_chain.predict(human_input=\"\"\"curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\"\"\")\n",
|
||||
"print(output)"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -68,7 +68,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 4,
|
||||
"id": "67baf32e",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
@@ -98,6 +98,68 @@
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fb6baaf8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Add texts\n",
|
||||
"You can easily add text to a vectorstore with the `add_texts` method. It will return a list of document IDs (in case you need to use them downstream)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "70758e4f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['64108bd0-4d91-485c-9743-1e18debdd59e']"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docsearch.add_texts([\"Ankush went to Princeton\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "4edeb88f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"Where did Ankush go to college?\"\n",
|
||||
"docs = docsearch.similarity_search(query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "1cba64a2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='Ankush went to Princeton', lookup_str='', metadata={}, lookup_index=0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bbf5ec44",
|
||||
@@ -646,7 +708,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -6,7 +6,7 @@ These agents can be used to power the next generation of personal assistants -
|
||||
systems that intelligently understand what you mean, and then can take actions to help you accomplish your goal.
|
||||
|
||||
Agents are a core use of LangChain - so much so that there is a whole module dedicated to them.
|
||||
Therefor, we recommend that you check out that documentation for detailed instruction on how to work
|
||||
Therefore, we recommend that you check out that documentation for detailed instruction on how to work
|
||||
with them.
|
||||
|
||||
- [Agent Documentation](../modules/agents.rst)
|
||||
|
||||
@@ -39,6 +39,7 @@ class ConversationalAgent(Agent):
|
||||
tools: List[Tool],
|
||||
prefix: str = PREFIX,
|
||||
suffix: str = SUFFIX,
|
||||
format_instructions: str = FORMAT_INSTRUCTIONS,
|
||||
ai_prefix: str = "AI",
|
||||
human_prefix: str = "Human",
|
||||
input_variables: Optional[List[str]] = None,
|
||||
@@ -61,7 +62,7 @@ class ConversationalAgent(Agent):
|
||||
[f"> {tool.name}: {tool.description}" for tool in tools]
|
||||
)
|
||||
tool_names = ", ".join([tool.name for tool in tools])
|
||||
format_instructions = FORMAT_INSTRUCTIONS.format(
|
||||
format_instructions = format_instructions.format(
|
||||
tool_names=tool_names, ai_prefix=ai_prefix, human_prefix=human_prefix
|
||||
)
|
||||
template = "\n\n".join([prefix, tool_strings, format_instructions, suffix])
|
||||
@@ -93,6 +94,7 @@ class ConversationalAgent(Agent):
|
||||
callback_manager: Optional[BaseCallbackManager] = None,
|
||||
prefix: str = PREFIX,
|
||||
suffix: str = SUFFIX,
|
||||
format_instructions: str = FORMAT_INSTRUCTIONS,
|
||||
ai_prefix: str = "AI",
|
||||
human_prefix: str = "Human",
|
||||
input_variables: Optional[List[str]] = None,
|
||||
@@ -106,6 +108,7 @@ class ConversationalAgent(Agent):
|
||||
human_prefix=human_prefix,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
format_instructions=format_instructions,
|
||||
input_variables=input_variables,
|
||||
)
|
||||
llm_chain = LLMChain(
|
||||
|
||||
@@ -72,6 +72,7 @@ class ZeroShotAgent(Agent):
|
||||
tools: List[Tool],
|
||||
prefix: str = PREFIX,
|
||||
suffix: str = SUFFIX,
|
||||
format_instructions: str = FORMAT_INSTRUCTIONS,
|
||||
input_variables: Optional[List[str]] = None,
|
||||
) -> PromptTemplate:
|
||||
"""Create prompt in the style of the zero shot agent.
|
||||
@@ -88,7 +89,7 @@ class ZeroShotAgent(Agent):
|
||||
"""
|
||||
tool_strings = "\n".join([f"{tool.name}: {tool.description}" for tool in tools])
|
||||
tool_names = ", ".join([tool.name for tool in tools])
|
||||
format_instructions = FORMAT_INSTRUCTIONS.format(tool_names=tool_names)
|
||||
format_instructions = format_instructions.format(tool_names=tool_names)
|
||||
template = "\n\n".join([prefix, tool_strings, format_instructions, suffix])
|
||||
if input_variables is None:
|
||||
input_variables = ["input", "agent_scratchpad"]
|
||||
@@ -102,13 +103,18 @@ class ZeroShotAgent(Agent):
|
||||
callback_manager: Optional[BaseCallbackManager] = None,
|
||||
prefix: str = PREFIX,
|
||||
suffix: str = SUFFIX,
|
||||
format_instructions: str = FORMAT_INSTRUCTIONS,
|
||||
input_variables: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Agent:
|
||||
"""Construct an agent from an LLM and tools."""
|
||||
cls._validate_tools(tools)
|
||||
prompt = cls.create_prompt(
|
||||
tools, prefix=prefix, suffix=suffix, input_variables=input_variables
|
||||
tools,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
format_instructions=format_instructions,
|
||||
input_variables=input_variables,
|
||||
)
|
||||
llm_chain = LLMChain(
|
||||
llm=llm,
|
||||
|
||||
@@ -4,7 +4,12 @@ from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from sqlalchemy import Column, Integer, String, create_engine, select
|
||||
from sqlalchemy.engine.base import Engine
|
||||
from sqlalchemy.orm import Session, declarative_base
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
try:
|
||||
from sqlalchemy.orm import declarative_base
|
||||
except ImportError:
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
from langchain.schema import Generation
|
||||
|
||||
|
||||
@@ -93,7 +93,6 @@ def _load_refine_chain(
|
||||
verbose: Optional[bool] = None,
|
||||
**kwargs: Any,
|
||||
) -> RefineDocumentsChain:
|
||||
|
||||
initial_chain = LLMChain(llm=llm, prompt=question_prompt, verbose=verbose)
|
||||
_refine_llm = refine_llm or llm
|
||||
refine_chain = LLMChain(llm=_refine_llm, prompt=refine_prompt, verbose=verbose)
|
||||
|
||||
@@ -1,25 +1,37 @@
|
||||
"""All different types of document loaders."""
|
||||
|
||||
from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
|
||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||
from langchain.document_loaders.everynote import EveryNoteLoader
|
||||
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
|
||||
from langchain.document_loaders.gcs_file import GCSFileLoader
|
||||
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
||||
from langchain.document_loaders.gutenberg import GutenbergLoader
|
||||
from langchain.document_loaders.html import UnstructuredHTMLLoader
|
||||
from langchain.document_loaders.imsdb import IMSDbLoader
|
||||
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||
from langchain.document_loaders.obsidian import ObsidianLoader
|
||||
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
||||
from langchain.document_loaders.online_pdf import OnlinePDFLoader
|
||||
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
|
||||
from langchain.document_loaders.pdf import PDFMinerLoader, UnstructuredPDFLoader
|
||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||
from langchain.document_loaders.roam import RoamLoader
|
||||
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
||||
from langchain.document_loaders.s3_file import S3FileLoader
|
||||
from langchain.document_loaders.text import TextLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain.document_loaders.url import UnstructuredURLLoader
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
from langchain.document_loaders.youtube import YoutubeLoader
|
||||
|
||||
__all__ = [
|
||||
"UnstructuredFileLoader",
|
||||
"UnstructuredURLLoader",
|
||||
"DirectoryLoader",
|
||||
"NotionDirectoryLoader",
|
||||
"ReadTheDocsLoader",
|
||||
@@ -33,7 +45,18 @@ __all__ = [
|
||||
"RoamLoader",
|
||||
"YoutubeLoader",
|
||||
"S3FileLoader",
|
||||
"TextLoader",
|
||||
"S3DirectoryLoader",
|
||||
"GCSFileLoader",
|
||||
"GCSDirectoryLoader",
|
||||
"WebBaseLoader",
|
||||
"IMSDbLoader",
|
||||
"AZLyricsLoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"GutenbergLoader",
|
||||
"PagedPDFSplitter",
|
||||
"EveryNoteLoader",
|
||||
"AirbyteJSONLoader",
|
||||
"OnlinePDFLoader",
|
||||
"PDFMinerLoader",
|
||||
]
|
||||
|
||||
41
langchain/document_loaders/airbyte_json.py
Normal file
41
langchain/document_loaders/airbyte_json.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Loader that loads local airbyte json files."""
|
||||
import json
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
def _stringify_value(val: Any) -> str:
|
||||
if isinstance(val, str):
|
||||
return val
|
||||
elif isinstance(val, dict):
|
||||
return "\n" + _stringify_dict(val)
|
||||
elif isinstance(val, list):
|
||||
return "\n".join(_stringify_value(v) for v in val)
|
||||
else:
|
||||
return str(val)
|
||||
|
||||
|
||||
def _stringify_dict(data: dict) -> str:
|
||||
text = ""
|
||||
for key, value in data.items():
|
||||
text += key + ": " + _stringify_value(data[key]) + "\n"
|
||||
return text
|
||||
|
||||
|
||||
class AirbyteJSONLoader(BaseLoader):
|
||||
"""Loader that loads local airbyte json files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path. This should start with '/tmp/airbyte_local/'."""
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
text = ""
|
||||
for line in open(self.file_path, "r"):
|
||||
data = json.loads(line)["_airbyte_data"]
|
||||
text += _stringify_dict(data)
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
22
langchain/document_loaders/azlyrics.py
Normal file
22
langchain/document_loaders/azlyrics.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""Loader that loads AZLyrics."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class AZLyricsLoader(WebBaseLoader):
|
||||
"""Loader that loads AZLyrics webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load webpage."""
|
||||
soup = self.scrape()
|
||||
title = soup.title.text
|
||||
lyrics = soup.find_all("div", {"class": ""})[2].text
|
||||
text = title + lyrics
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
20
langchain/document_loaders/college_confidential.py
Normal file
20
langchain/document_loaders/college_confidential.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Loader that loads College Confidential."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class CollegeConfidentialLoader(WebBaseLoader):
|
||||
"""Loader that loads College Confidential webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load webpage."""
|
||||
soup = self.scrape()
|
||||
text = soup.select_one("main[class='skin-handler']").text
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
@@ -1,29 +1,13 @@
|
||||
"""Loader that loads Microsoft Word files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredDocxLoader(BaseLoader):
|
||||
class UnstructuredDocxLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses unstructured to load Microsoft Word files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.docx import partition_docx
|
||||
|
||||
elements = partition_docx(filename=self.file_path)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
return partition_docx(filename=self.file_path)
|
||||
|
||||
@@ -1,29 +1,13 @@
|
||||
"""Loader that loads email files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredEmailLoader(BaseLoader):
|
||||
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses unstructured to load email files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.email import partition_email
|
||||
|
||||
elements = partition_email(filename=self.file_path)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
return partition_email(filename=self.file_path)
|
||||
|
||||
82
langchain/document_loaders/everynote.py
Normal file
82
langchain/document_loaders/everynote.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Load documents from Everynote.
|
||||
|
||||
https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
|
||||
"""
|
||||
import hashlib
|
||||
from base64 import b64decode
|
||||
from time import strptime
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
def _parse_content(content: str) -> str:
|
||||
from pypandoc import convert_text
|
||||
|
||||
text = convert_text(content, "org", format="html")
|
||||
return text
|
||||
|
||||
|
||||
def _parse_resource(resource: list) -> dict:
|
||||
rsc_dict: Dict[str, Any] = {}
|
||||
for elem in resource:
|
||||
if elem.tag == "data":
|
||||
# Some times elem.text is None
|
||||
rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
|
||||
rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
|
||||
else:
|
||||
rsc_dict[elem.tag] = elem.text
|
||||
|
||||
return rsc_dict
|
||||
|
||||
|
||||
def _parse_note(note: List) -> dict:
|
||||
note_dict: Dict[str, Any] = {}
|
||||
resources = []
|
||||
for elem in note:
|
||||
if elem.tag == "content":
|
||||
note_dict[elem.tag] = _parse_content(elem.text)
|
||||
# A copy of original content
|
||||
note_dict["content-raw"] = elem.text
|
||||
elif elem.tag == "resource":
|
||||
resources.append(_parse_resource(elem))
|
||||
elif elem.tag == "created" or elem.tag == "updated":
|
||||
note_dict[elem.tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
|
||||
else:
|
||||
note_dict[elem.tag] = elem.text
|
||||
|
||||
note_dict["resource"] = resources
|
||||
|
||||
return note_dict
|
||||
|
||||
|
||||
def _parse_note_xml(xml_file: str) -> str:
|
||||
"""Parse everynote xml."""
|
||||
# Without huge_tree set to True, parser may complain about huge text node
|
||||
# Try to recover, because there may be " ", which will cause
|
||||
# "XMLSyntaxError: Entity 'nbsp' not defined"
|
||||
from lxml import etree
|
||||
|
||||
context = etree.iterparse(
|
||||
xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
|
||||
)
|
||||
result_string = ""
|
||||
for action, elem in context:
|
||||
if elem.tag == "note":
|
||||
result_string += _parse_note(elem)["content"]
|
||||
return result_string
|
||||
|
||||
|
||||
class EveryNoteLoader(BaseLoader):
|
||||
"""Loader to load in EverNnote files.."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load document from EveryNote file."""
|
||||
text = _parse_note_xml(self.file_path)
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
28
langchain/document_loaders/gutenberg.py
Normal file
28
langchain/document_loaders/gutenberg.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""Loader that loads .txt web files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class GutenbergLoader(BaseLoader):
|
||||
"""Loader that uses urllib to load .txt web files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
if not file_path.startswith("https://www.gutenberg.org"):
|
||||
raise ValueError("file path must start with 'https://www.gutenberg.org'")
|
||||
|
||||
if not file_path.endswith(".txt"):
|
||||
raise ValueError("file path must end with '.txt'")
|
||||
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
from urllib.request import urlopen
|
||||
|
||||
elements = urlopen(self.file_path)
|
||||
text = "\n\n".join([str(el.decode("utf-8-sig")) for el in elements])
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
@@ -1,29 +1,13 @@
|
||||
"""Loader that loads PDF files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredHTMLLoader(BaseLoader):
|
||||
class UnstructuredHTMLLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses unstructured to load HTML files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
elements = partition_html(filename=self.file_path)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
return partition_html(filename=self.file_path)
|
||||
|
||||
20
langchain/document_loaders/imsdb.py
Normal file
20
langchain/document_loaders/imsdb.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Loader that loads IMSDb."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class IMSDbLoader(WebBaseLoader):
|
||||
"""Loader that loads IMSDb webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load webpage."""
|
||||
soup = self.scrape()
|
||||
text = soup.select_one("td[class='scrtext']").text
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
29
langchain/document_loaders/online_pdf.py
Normal file
29
langchain/document_loaders/online_pdf.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Loader that loads online PDF files."""
|
||||
|
||||
import tempfile
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
||||
|
||||
|
||||
class OnlinePDFLoader(BaseLoader):
|
||||
"""Loader that loads online PDFs."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with file path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
r = requests.get(self.web_path)
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
file_path = f"{temp_dir}/online_file.pdf"
|
||||
file = open(file_path, "wb")
|
||||
file.write(r.content)
|
||||
file.close()
|
||||
loader = UnstructuredPDFLoader(file_path)
|
||||
return loader.load()
|
||||
36
langchain/document_loaders/paged_pdf.py
Normal file
36
langchain/document_loaders/paged_pdf.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""Loads a PDF with pypdf and chunks at character level."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class PagedPDFSplitter(BaseLoader):
|
||||
"""Loads a PDF with pypdf and chunks at character level.
|
||||
|
||||
Loader also stores page numbers in metadatas.
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import pypdf # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
||||
)
|
||||
self._file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load given path as pages."""
|
||||
import pypdf
|
||||
|
||||
pdf_file_obj = open(self._file_path, "rb")
|
||||
pdf_reader = pypdf.PdfReader(pdf_file_obj)
|
||||
docs = []
|
||||
for i, page in enumerate(pdf_reader.pages):
|
||||
text = page.extract_text()
|
||||
metadata = {"source": self._file_path, "page": i}
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
pdf_file_obj.close()
|
||||
return docs
|
||||
@@ -3,27 +3,36 @@ from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredPDFLoader(BaseLoader):
|
||||
class UnstructuredPDFLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses unstructured to load PDF files."""
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
return partition_pdf(filename=self.file_path)
|
||||
|
||||
|
||||
class PDFMinerLoader(BaseLoader):
|
||||
"""Loader that uses PDFMiner to load PDF files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
from pdfminer.high_level import extract_text # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
"pdfminer package not found, please install it with "
|
||||
"`pip install pdfminer.six`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from pdfminer.high_level import extract_text
|
||||
|
||||
elements = partition_pdf(filename=self.file_path)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
text = extract_text(self.file_path)
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
||||
@@ -1,29 +1,13 @@
|
||||
"""Loader that loads powerpoint files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredPowerPointLoader(BaseLoader):
|
||||
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses unstructured to load powerpoint files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
elements = partition_pptx(filename=self.file_path)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
return partition_pptx(filename=self.file_path)
|
||||
|
||||
@@ -19,7 +19,11 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
|
||||
def _clean_data(data: str) -> str:
|
||||
soup = BeautifulSoup(data)
|
||||
text = soup.find_all("main", {"id": "main-content"})[0].get_text()
|
||||
text = soup.find_all("main", {"id": "main-content"})
|
||||
if len(text) != 0:
|
||||
text = text[0].get_text()
|
||||
else:
|
||||
text = ""
|
||||
return "\n".join([t for t in text.split("\n") if t])
|
||||
|
||||
docs = []
|
||||
|
||||
20
langchain/document_loaders/text.py
Normal file
20
langchain/document_loaders/text.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Load text files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class TextLoader(BaseLoader):
|
||||
"""Load text files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load from file path."""
|
||||
with open(self.file_path) as f:
|
||||
text = f.read()
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
@@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
|
||||
class UnstructuredFileLoader(BaseLoader):
|
||||
"""Loader that uses unstructured to load files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: str, mode: str = "single"):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
@@ -17,13 +17,30 @@ class UnstructuredFileLoader(BaseLoader):
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
_valid_modes = {"single", "elements"}
|
||||
if mode not in _valid_modes:
|
||||
raise ValueError(
|
||||
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
self.mode = mode
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
return partition(filename=self.file_path)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
elements = partition(filename=self.file_path)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
elements = self._get_elements()
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
if self.mode == "elements":
|
||||
docs = [
|
||||
Document(page_content=str(el), metadata=metadata) for el in elements
|
||||
]
|
||||
elif self.mode == "single":
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
docs = [Document(page_content=text, metadata=metadata)]
|
||||
else:
|
||||
raise ValueError(f"mode of {self.mode} not supported.")
|
||||
return docs
|
||||
|
||||
32
langchain/document_loaders/url.py
Normal file
32
langchain/document_loaders/url.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Loader that loads PDF files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class UnstructuredURLLoader(BaseLoader):
|
||||
"""Loader that uses unstructured to load HTML files."""
|
||||
|
||||
def __init__(self, urls: List[str]):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
self.urls = urls
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
docs: List[Document] = list()
|
||||
for url in self.urls:
|
||||
elements = partition_html(url=url)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
metadata = {"source": url}
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
return docs
|
||||
29
langchain/document_loaders/web_base.py
Normal file
29
langchain/document_loaders/web_base.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Web base loader class."""
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class WebBaseLoader(BaseLoader):
|
||||
"""Loader that uses urllib and beautiful soup to load webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def scrape(self) -> BeautifulSoup:
|
||||
"""Scrape data from webpage and return it in BeautifulSoup format."""
|
||||
html_doc = requests.get(self.web_path)
|
||||
soup = BeautifulSoup(html_doc.text, "html.parser")
|
||||
return soup
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load data into document objects."""
|
||||
soup = self.scrape()
|
||||
text = soup.get_text()
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
@@ -1,7 +1,7 @@
|
||||
"""Loader that loads YouTube transcript."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
@@ -10,26 +10,67 @@ from langchain.document_loaders.base import BaseLoader
|
||||
class YoutubeLoader(BaseLoader):
|
||||
"""Loader that loads Youtube transcripts."""
|
||||
|
||||
def __init__(self, video_id: str):
|
||||
def __init__(self, video_id: str, add_video_info: bool = False):
|
||||
"""Initialize with YouTube video ID."""
|
||||
self.video_id = video_id
|
||||
self.add_video_info = add_video_info
|
||||
|
||||
@classmethod
|
||||
def from_youtube_url(cls, youtube_url: str) -> YoutubeLoader:
|
||||
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
|
||||
"""Parse out video id from YouTube url."""
|
||||
video_id = youtube_url.split("youtube.com/watch?v=")[-1]
|
||||
return cls(video_id)
|
||||
return cls(video_id, **kwargs)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
try:
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
raise ImportError(
|
||||
"Could not import youtube_transcript_api python package. "
|
||||
"Please it install it with `pip install youtube-transcript-api`."
|
||||
)
|
||||
|
||||
metadata = {"source": self.video_id}
|
||||
|
||||
if self.add_video_info:
|
||||
# Get more video meta info
|
||||
# Such as title, description, thumbnail url, publish_date
|
||||
video_info = self._get_video_info()
|
||||
metadata.update(video_info)
|
||||
|
||||
transcript_pieces = YouTubeTranscriptApi.get_transcript(self.video_id)
|
||||
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
||||
metadata = {"source": self.video_id}
|
||||
|
||||
return [Document(page_content=transcript, metadata=metadata)]
|
||||
|
||||
def _get_video_info(self) -> dict:
|
||||
"""Get important video information.
|
||||
|
||||
Components are:
|
||||
- title
|
||||
- description
|
||||
- thumbnail url,
|
||||
- publish_date
|
||||
- channel_author
|
||||
- and more.
|
||||
"""
|
||||
try:
|
||||
from pytube import YouTube
|
||||
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Could not import pytube python package. "
|
||||
"Please it install it with `pip install pytube`."
|
||||
)
|
||||
yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
|
||||
video_info = {
|
||||
"title": yt.title,
|
||||
"description": yt.description,
|
||||
"view_count": yt.views,
|
||||
"thumbnail_url": yt.thumbnail_url,
|
||||
"publish_date": yt.publish_date,
|
||||
"length": yt.length,
|
||||
"author": yt.author,
|
||||
}
|
||||
return video_info
|
||||
|
||||
@@ -75,20 +75,27 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
|
||||
text = text.replace("\n", " ")
|
||||
return self.client.create(input=[text], engine=engine)["data"][0]["embedding"]
|
||||
|
||||
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
||||
def embed_documents(
|
||||
self, texts: List[str], chunk_size: int = 1000
|
||||
) -> List[List[float]]:
|
||||
"""Call out to OpenAI's embedding endpoint for embedding search docs.
|
||||
|
||||
Args:
|
||||
texts: The list of texts to embed.
|
||||
chunk_size: The maximum number of texts to send to OpenAI at once
|
||||
(max 1000).
|
||||
|
||||
Returns:
|
||||
List of embeddings, one for each text.
|
||||
"""
|
||||
responses = [
|
||||
self._embedding_func(text, engine=self.document_model_name)
|
||||
for text in texts
|
||||
]
|
||||
return responses
|
||||
# handle large batches of texts
|
||||
results = []
|
||||
for i in range(0, len(texts), chunk_size):
|
||||
response = self.client.create(
|
||||
input=texts[i : i + chunk_size], engine=self.document_model_name
|
||||
)
|
||||
results += [r["embedding"] for r in response["data"]]
|
||||
return results
|
||||
|
||||
def embed_query(self, text: str) -> List[float]:
|
||||
"""Call out to OpenAI's embedding endpoint for embedding query text.
|
||||
|
||||
28
langchain/llms/fake.py
Normal file
28
langchain/llms/fake.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""Fake LLM wrapper for testing purposes."""
|
||||
from typing import Any, List, Mapping, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from langchain.llms.base import LLM
|
||||
|
||||
|
||||
class FakeListLLM(LLM, BaseModel):
|
||||
"""Fake LLM wrapper for testing purposes."""
|
||||
|
||||
responses: List
|
||||
i: int = 0
|
||||
|
||||
@property
|
||||
def _llm_type(self) -> str:
|
||||
"""Return type of llm."""
|
||||
return "fake-list"
|
||||
|
||||
def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
|
||||
"""First try to lookup in queries, else return 'foo' or 'bar'."""
|
||||
response = self.responses[self.i]
|
||||
self.i += 1
|
||||
return response
|
||||
|
||||
@property
|
||||
def _identifying_params(self) -> Mapping[str, Any]:
|
||||
return {}
|
||||
@@ -17,40 +17,30 @@ class SQLDatabase:
|
||||
ignore_tables: Optional[List[str]] = None,
|
||||
include_tables: Optional[List[str]] = None,
|
||||
sample_rows_in_table_info: int = 0,
|
||||
# TODO: deprecate.
|
||||
sample_row_in_table_info: bool = False,
|
||||
):
|
||||
"""Create engine from database URI."""
|
||||
if sample_row_in_table_info and sample_rows_in_table_info > 0:
|
||||
raise ValueError(
|
||||
"Only one of `sample_row_in_table_info` "
|
||||
"and `sample_rows_in_table_info` should be set"
|
||||
)
|
||||
self._engine = engine
|
||||
self._schema = schema
|
||||
if include_tables and ignore_tables:
|
||||
raise ValueError("Cannot specify both include_tables and ignore_tables")
|
||||
|
||||
self._inspector = inspect(self._engine)
|
||||
self._all_tables = self._inspector.get_table_names(schema=schema)
|
||||
self._include_tables = include_tables or []
|
||||
self._all_tables = set(self._inspector.get_table_names(schema=schema))
|
||||
self._include_tables = set(include_tables) if include_tables else set()
|
||||
if self._include_tables:
|
||||
missing_tables = set(self._include_tables).difference(self._all_tables)
|
||||
missing_tables = self._include_tables - self._all_tables
|
||||
if missing_tables:
|
||||
raise ValueError(
|
||||
f"include_tables {missing_tables} not found in database"
|
||||
)
|
||||
self._ignore_tables = ignore_tables or []
|
||||
self._ignore_tables = set(ignore_tables) if ignore_tables else set()
|
||||
if self._ignore_tables:
|
||||
missing_tables = set(self._ignore_tables).difference(self._all_tables)
|
||||
missing_tables = self._ignore_tables - self._all_tables
|
||||
if missing_tables:
|
||||
raise ValueError(
|
||||
f"ignore_tables {missing_tables} not found in database"
|
||||
)
|
||||
self._sample_rows_in_table_info = sample_rows_in_table_info
|
||||
# TODO: deprecate
|
||||
if sample_row_in_table_info:
|
||||
self._sample_rows_in_table_info = 1
|
||||
|
||||
@classmethod
|
||||
def from_uri(cls, database_uri: str, **kwargs: Any) -> SQLDatabase:
|
||||
@@ -66,7 +56,7 @@ class SQLDatabase:
|
||||
"""Get names of tables available."""
|
||||
if self._include_tables:
|
||||
return self._include_tables
|
||||
return set(self._all_tables) - set(self._ignore_tables)
|
||||
return self._all_tables - self._ignore_tables
|
||||
|
||||
@property
|
||||
def table_info(self) -> str:
|
||||
@@ -91,7 +81,6 @@ class SQLDatabase:
|
||||
|
||||
tables = []
|
||||
for table_name in all_table_names:
|
||||
|
||||
columns = []
|
||||
for column in self._inspector.get_columns(table_name, schema=self._schema):
|
||||
columns.append(f"{column['name']} ({str(column['type'])})")
|
||||
|
||||
@@ -3,7 +3,17 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Iterable, List, Optional
|
||||
from typing import (
|
||||
AbstractSet,
|
||||
Any,
|
||||
Callable,
|
||||
Collection,
|
||||
Iterable,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
|
||||
@@ -114,7 +124,11 @@ class TextSplitter(ABC):
|
||||
|
||||
@classmethod
|
||||
def from_tiktoken_encoder(
|
||||
cls, encoding_name: str = "gpt2", **kwargs: Any
|
||||
cls,
|
||||
encoding_name: str = "gpt2",
|
||||
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
|
||||
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
|
||||
**kwargs: Any,
|
||||
) -> TextSplitter:
|
||||
"""Text splitter that uses tiktoken encoder to count length."""
|
||||
try:
|
||||
@@ -125,11 +139,19 @@ class TextSplitter(ABC):
|
||||
"This is needed in order to calculate max_tokens_for_prompt. "
|
||||
"Please it install it with `pip install tiktoken`."
|
||||
)
|
||||
|
||||
# create a GPT-3 encoder instance
|
||||
enc = tiktoken.get_encoding(encoding_name)
|
||||
|
||||
def _tiktoken_encoder(text: str) -> int:
|
||||
return len(enc.encode(text))
|
||||
def _tiktoken_encoder(text: str, **kwargs: Any) -> int:
|
||||
return len(
|
||||
enc.encode(
|
||||
text,
|
||||
allowed_special=allowed_special,
|
||||
disallowed_special=disallowed_special,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
|
||||
return cls(length_function=_tiktoken_encoder, **kwargs)
|
||||
|
||||
@@ -155,7 +177,13 @@ class CharacterTextSplitter(TextSplitter):
|
||||
class TokenTextSplitter(TextSplitter):
|
||||
"""Implementation of splitting text that looks at tokens."""
|
||||
|
||||
def __init__(self, encoding_name: str = "gpt2", **kwargs: Any):
|
||||
def __init__(
|
||||
self,
|
||||
encoding_name: str = "gpt2",
|
||||
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
|
||||
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Create a new TextSplitter."""
|
||||
super().__init__(**kwargs)
|
||||
try:
|
||||
@@ -168,11 +196,17 @@ class TokenTextSplitter(TextSplitter):
|
||||
)
|
||||
# create a GPT-3 encoder instance
|
||||
self._tokenizer = tiktoken.get_encoding(encoding_name)
|
||||
self._allowed_special = allowed_special
|
||||
self._disallowed_special = disallowed_special
|
||||
|
||||
def split_text(self, text: str) -> List[str]:
|
||||
"""Split incoming text and return chunks."""
|
||||
splits = []
|
||||
input_ids = self._tokenizer.encode(text)
|
||||
input_ids = self._tokenizer.encode(
|
||||
text,
|
||||
allowed_special=self._allowed_special,
|
||||
disallowed_special=self._disallowed_special,
|
||||
)
|
||||
start_idx = 0
|
||||
cur_idx = min(start_idx + self._chunk_size, len(input_ids))
|
||||
chunk_ids = input_ids[start_idx:cur_idx]
|
||||
|
||||
783
poetry.lock
generated
783
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "langchain"
|
||||
version = "0.0.80"
|
||||
version = "0.0.84"
|
||||
description = "Building applications with LLMs through composability"
|
||||
authors = []
|
||||
license = "MIT"
|
||||
@@ -37,7 +37,15 @@ qdrant-client = {version = "^0.11.7", optional = true}
|
||||
dataclasses-json = "^0.5.7"
|
||||
tensorflow-text = {version = "^2.11.0", optional = true, python = "^3.10, <3.12"}
|
||||
tenacity = "^8.1.0"
|
||||
cohere = {version = "^3", optional = true}
|
||||
openai = {version = "^0", optional = true}
|
||||
nlpcloud = {version = "^1", optional = true}
|
||||
huggingface_hub = {version = "^0", optional = true}
|
||||
google-search-results = {version = "^2", optional = true}
|
||||
sentence-transformers = {version = "^2", optional = true}
|
||||
aiohttp = "^3.8.3"
|
||||
pypdf = {version = "^3.4.0", optional = true}
|
||||
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
autodoc_pydantic = "^1.8.0"
|
||||
@@ -65,11 +73,11 @@ pytest-asyncio = "^0.20.3"
|
||||
|
||||
[tool.poetry.group.lint.dependencies]
|
||||
flake8-docstrings = "^1.6.0"
|
||||
black = "^22.10.0"
|
||||
isort = "^5.10.1"
|
||||
flake8 = "^6.0.0"
|
||||
types-toml = "^0.10.8.1"
|
||||
types-redis = "^4.3.21.6"
|
||||
black = "^23.1.0"
|
||||
|
||||
[tool.poetry.group.typing.dependencies]
|
||||
mypy = "^0.991"
|
||||
@@ -85,7 +93,7 @@ playwright = "^1.28.0"
|
||||
|
||||
[tool.poetry.extras]
|
||||
llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
|
||||
all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text"]
|
||||
all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf"]
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
|
||||
@@ -8,7 +8,18 @@ def test_openai_embedding_documents() -> None:
|
||||
embedding = OpenAIEmbeddings()
|
||||
output = embedding.embed_documents(documents)
|
||||
assert len(output) == 1
|
||||
assert len(output[0]) == 2048
|
||||
assert len(output[0]) == 1536
|
||||
|
||||
|
||||
def test_openai_embedding_documents_multiple() -> None:
|
||||
"""Test openai embeddings."""
|
||||
documents = ["foo bar", "bar foo", "foo"]
|
||||
embedding = OpenAIEmbeddings()
|
||||
output = embedding.embed_documents(documents, chunk_size=2)
|
||||
assert len(output) == 3
|
||||
assert len(output[0]) == 1536
|
||||
assert len(output[1]) == 1536
|
||||
assert len(output[2]) == 1536
|
||||
|
||||
|
||||
def test_openai_embedding_query() -> None:
|
||||
@@ -16,4 +27,4 @@ def test_openai_embedding_query() -> None:
|
||||
document = "foo bar"
|
||||
embedding = OpenAIEmbeddings()
|
||||
output = embedding.embed_query(document)
|
||||
assert len(output) == 2048
|
||||
assert len(output) == 1536
|
||||
|
||||
BIN
tests/integration_tests/examples/hello.pdf
Normal file
BIN
tests/integration_tests/examples/hello.pdf
Normal file
Binary file not shown.
19
tests/integration_tests/test_pdf_pagesplitter.py
Normal file
19
tests/integration_tests/test_pdf_pagesplitter.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""Test splitting with page numbers included."""
|
||||
import os
|
||||
|
||||
from langchain.document_loaders import PagedPDFSplitter
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain.vectorstores import FAISS
|
||||
|
||||
|
||||
def test_pdf_pagesplitter() -> None:
|
||||
"""Test splitting with page numbers included."""
|
||||
script_dir = os.path.dirname(__file__)
|
||||
loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
|
||||
docs = loader.load()
|
||||
assert "page" in docs[0].metadata
|
||||
assert "source" in docs[0].metadata
|
||||
|
||||
faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
|
||||
docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
|
||||
assert "Hello world" in docs[0].page_content
|
||||
@@ -1,6 +1,10 @@
|
||||
"""Test base LLM functionality."""
|
||||
from sqlalchemy import Column, Integer, Sequence, String, create_engine
|
||||
from sqlalchemy.orm import declarative_base
|
||||
|
||||
try:
|
||||
from sqlalchemy.orm import declarative_base
|
||||
except ImportError:
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
import langchain
|
||||
from langchain.cache import InMemoryCache, SQLAlchemyCache
|
||||
|
||||
Reference in New Issue
Block a user