Mirror of https://github.com/hwchase17/langchain.git (synced 2026-02-08 02:00:06 +00:00)
Compare commits (21 commits):

- 6d44a2285c
- 0998577dfe
- bbb06ca4cf
- 0b6aa6a024
- 10e7297306
- e51fad1488
- b7747017d7
- 2e96704d59
- e9799d6821
- c2d1d903fa
- 055a53c27f
- 231da14771
- 6ab432d62e
- 07a407d89a
- c64f98e2bb
- 5469d898a9
- 3d639d1539
- 91c6cea227
- ba54d36787
- 5f8082bdd7
- 512c523368
@@ -166,7 +166,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
@@ -33,7 +33,6 @@ def run_cmd(cmd: str, _crawler: Crawler) -> None:


if __name__ == "__main__":

    objective = "Make a reservation for 2 at 7pm at bistro vida in menlo park"
    print("\nWelcome to natbot! What is your objective?")
    i = input()
@@ -21,28 +21,83 @@
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.llms import OpenAI\n",
- "from langchain.chains import ChatVectorDBChain"
+ "from langchain.chains import ChatVectorDBChain\n",
+ "from langchain.document_loaders import TextLoader"
]
},
{
"cell_type": "markdown",
"id": "cdff94be",
"metadata": {},
"source": [
"Load in documents. You can replace this with a loader for whatever type of data you want"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 2,
"id": "01c46e92",
"metadata": {},
"outputs": [],
"source": [
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "e9be4779",
"metadata": {},
"source": [
"If you had multiple loaders that you wanted to combine, you do something like:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "433363a5",
"metadata": {},
"outputs": [],
"source": [
"# loaders = [....]\n",
"# docs = []\n",
"# for loader in loaders:\n",
"#     docs.extend(loader.load())"
]
},
{
"cell_type": "markdown",
"id": "239475d2",
"metadata": {},
"source": [
"We now split the documents, create embeddings for them, and put them in a vectorstore. This allows us to do semantic search over them."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a8930cf7",
"metadata": {},
"outputs": [],
"source": [
- "with open('../../state_of_the_union.txt') as f:\n",
- "    state_of_the_union = f.read()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
- "texts = text_splitter.split_text(state_of_the_union)\n",
+ "documents = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
- "vectorstore = FAISS.from_texts(texts, embeddings)"
+ "vectorstore = FAISS.from_documents(documents, embeddings)"
]
},
{
"cell_type": "markdown",
"id": "3c96b118",
"metadata": {},
"source": [
"We now initialize the ChatVectorDBChain"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 7,
"id": "7b4110f3",
"metadata": {},
"outputs": [],
@@ -157,7 +212,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
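Note on the multi-loader cell above: the notebook leaves it as commented-out pseudocode. A minimal runnable version, using only `TextLoader` and the `load()`/`extend` pattern the notebook itself introduces (the second file path is a hypothetical stand-in):

```python
from langchain.document_loaders import TextLoader

# Hypothetical second file; substitute whatever loaders/data you actually have.
loaders = [
    TextLoader('../../state_of_the_union.txt'),
    TextLoader('../../another_speech.txt'),
]

docs = []
for loader in loaders:
    # Each loader returns a list of Documents, so extend rather than append.
    docs.extend(loader.load())
```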
docs/modules/document_loaders/examples/airbyte_json.ipynb (new file, 171 lines)
@@ -0,0 +1,171 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1f3a5ebf",
"metadata": {},
"source": [
"# Airbyte JSON\n",
"This covers how to load any source from Airbyte into a local JSON file that can be read in as a document\n",
"\n",
"Prereqs:\n",
"Have Docker Desktop installed\n",
"\n",
"Steps:\n",
"\n",
"1) Clone Airbyte from GitHub - `git clone https://github.com/airbytehq/airbyte.git`\n",
"\n",
"2) Switch into Airbyte directory - `cd airbyte`\n",
"\n",
"3) Start Airbyte - `docker compose up`\n",
"\n",
"4) In your browser, just visit http://localhost:8000. You will be asked for a username and password. By default, that's username `airbyte` and password `password`.\n",
"\n",
"5) Setup any source you wish.\n",
"\n",
"6) Set destination as Local JSON, with specified destination path - let's say `/json_data`. Set up manual sync.\n",
"\n",
"7) Run the connection!\n",
"\n",
"8) To see what files are created, you can navigate to: `file:///tmp/airbyte_local`\n",
"\n",
"9) Find your data and copy its path. That path should be saved in the file variable below. It should start with `/tmp/airbyte_local`\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "180c8b74",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import AirbyteJSONLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4af10665",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_airbyte_raw_pokemon.jsonl\r\n"
]
}
],
"source": [
"!ls /tmp/airbyte_local/json_data/"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "721d9316",
"metadata": {},
"outputs": [],
"source": [
"loader = AirbyteJSONLoader('/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9858b946",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "fca024cb",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"abilities: \n",
"ability: \n",
"name: blaze\n",
"url: https://pokeapi.co/api/v2/ability/66/\n",
"\n",
"is_hidden: False\n",
"slot: 1\n",
"\n",
"\n",
"ability: \n",
"name: solar-power\n",
"url: https://pokeapi.co/api/v2/ability/94/\n",
"\n",
"is_hidden: True\n",
"slot: 3\n",
"\n",
"base_experience: 267\n",
"forms: \n",
"name: charizard\n",
"url: https://pokeapi.co/api/v2/pokemon-form/6/\n",
"\n",
"game_indices: \n",
"game_index: 180\n",
"version: \n",
"name: red\n",
"url: https://pokeapi.co/api/v2/version/1/\n",
"\n",
"\n",
"\n",
"game_index: 180\n",
"version: \n",
"name: blue\n",
"url: https://pokeapi.co/api/v2/version/2/\n",
"\n",
"\n",
"\n",
"game_index: 180\n",
"version: \n",
"n\n"
]
}
],
"source": [
"print(data[0].page_content[:500])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fa002a5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
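Note: `AirbyteJSONLoader.load()` returns a single `Document` whose `page_content` is the flattened "key: value" text shown above, so for search use cases it usually needs splitting. A hedged sketch combining it with the splitter used elsewhere in this changeset:

```python
from langchain.document_loaders import AirbyteJSONLoader
from langchain.text_splitter import CharacterTextSplitter

# Example path from the walkthrough above; yours will differ under /tmp/airbyte_local.
loader = AirbyteJSONLoader('/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl')
data = loader.load()  # one Document containing all records

# Chunk the flattened text before embedding or indexing it.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_documents(data)
```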
@@ -61,10 +61,61 @@
"data"
]
},
{
"cell_type": "markdown",
"id": "8bf50cba",
"metadata": {},
"source": [
"## Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b9592eaf",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredEmailLoader('example_data/fake-email.eml', mode=\"elements\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0b16d03f",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d7bdc5e5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='This is a test email to use for unit tests.', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "4ef9a5f4",
+ "id": "6a074515",
"metadata": {},
"outputs": [],
"source": []
@@ -86,7 +137,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
docs/modules/document_loaders/examples/everynote.ipynb (new file, 80 lines)
@@ -0,0 +1,80 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "56ac1584",
"metadata": {},
"source": [
"# EveryNote\n",
"\n",
"How to load an EveryNote file from disk."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1a53ece0",
"metadata": {},
"outputs": [],
"source": [
"# !pip install pypandoc\n",
"# import pypandoc\n",
"\n",
"# pypandoc.download_pandoc()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "88df766f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='testing this\\n\\nwhat happens?\\n\\nto the world?\\n', lookup_str='', metadata={'source': 'example_data/testing.enex'}, lookup_index=0)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.document_loaders import EveryNoteLoader\n",
"\n",
"loader = EveryNoteLoader(\"example_data/testing.enex\")\n",
"loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1329905",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20230309T035336Z" application="Evernote" version="10.53.2">
<note>
<title>testing</title>
<created>20230209T034746Z</created>
<updated>20230209T035328Z</updated>
<note-attributes>
<author>Harrison Chase</author>
</note-attributes>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>testing this</div><div>what happens?</div><div>to the world?</div></en-note> ]]>
</content>
</note>
</en-export>
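Note: putting the notebook's setup and load steps together, a minimal sketch (the notebook installs `pypandoc` first, presumably for converting the ENML content above):

```python
# One-time setup, per the notebook's first cell:
# !pip install pypandoc
import pypandoc
pypandoc.download_pandoc()  # fetches the pandoc binary

from langchain.document_loaders import EveryNoteLoader

docs = EveryNoteLoader("example_data/testing.enex").load()
print(docs[0].page_content)  # the three <div> blocks from testing.enex, as plain text
```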
@@ -61,10 +61,61 @@
"data"
]
},
{
"cell_type": "markdown",
"id": "5d1472e9",
"metadata": {},
"source": [
"## Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "93abf60b",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredDocxLoader('example_data/fake.docx', mode=\"elements\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c35cdbcc",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fae2d730",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "61953c83",
+ "id": "961a7b1d",
"metadata": {},
"outputs": [],
"source": []
@@ -86,7 +137,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
docs/modules/document_loaders/examples/online_pdf.ipynb (new file, 130 lines)
File diff suppressed because one or more lines are too long
@@ -10,9 +10,136 @@
"This covers how to load pdfs into a document format that we can use downstream."
]
},
{
"cell_type": "markdown",
"id": "743f9413",
"metadata": {},
"source": [
"## Using PyPDF\n",
"\n",
"Allows for tracking of page numbers as well."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c428b0c5",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PagedPDFSplitter\n",
"\n",
"loader = PagedPDFSplitter(\"example_data/layout-parser-paper.pdf\")\n",
"pages = loader.load_and_split()"
]
},
{
"cell_type": "markdown",
"id": "ebd895e4",
"metadata": {},
"source": [
"An advantage of this approach is that documents can be retrieved with page numbers."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "87fa7b3a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9: 10 Z. Shen et al.\n",
"Fig. 4: Illustration of (a) the original historical Japanese document with layout\n",
"detection results and (b) a recreated version of the document image that achieves\n",
"much better character recognition recall. The reorganization algorithm rearranges\n",
"the tokens based on the their detected bounding boxes given a maximum allowed\n",
"height.\n",
"4LayoutParser Community Platform\n",
"Another focus of LayoutParser is promoting the reusability of layout detection\n",
"models and full digitization pipelines. Similar to many existing deep learning\n",
"libraries, LayoutParser comes with a community model hub for distributing\n",
"layout models. End-users can upload their self-trained models to the model hub,\n",
"and these models can be loaded into a similar interface as the currently available\n",
"LayoutParser pre-trained models. For example, the model trained on the News\n",
"Navigator dataset [17] has been incorporated in the model hub.\n",
"Beyond DL models, LayoutParser also promotes the sharing of entire doc-\n",
"ument digitization pipelines. For example, sometimes the pipeline requires the\n",
"combination of multiple DL models to achieve better accuracy. Currently, pipelines\n",
"are mainly described in academic papers and implementations are often not pub-\n",
"licly available. To this end, the LayoutParser community platform also enables\n",
"the sharing of layout pipelines to promote the discussion and reuse of techniques.\n",
"For each shared pipeline, it has a dedicated project page, with links to the source\n",
"code, documentation, and an outline of the approaches. A discussion panel is\n",
"provided for exchanging ideas. Combined with the core LayoutParser library,\n",
"users can easily build reusable components based on the shared pipelines and\n",
"apply them to solve their unique problems.\n",
"5 Use Cases\n",
"The core objective of LayoutParser is to make it easier to create both large-scale\n",
"and light-weight document digitization pipelines. Large-scale document processing\n",
"3: 4 Z. Shen et al.\n",
"Efficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images \n",
"T h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e\n",
"Fig. 1: The overall architecture of LayoutParser . For an input document image,\n",
"the core LayoutParser library provides a set of o\u000b",
"-the-shelf tools for layout\n",
"detection, OCR, visualization, and storage, backed by a carefully designed layout\n",
"data structure. LayoutParser also supports high level customization via e\u000ecient\n",
"layout annotation and model training functions. These improve model accuracy\n",
"on the target samples. The community platform enables the easy sharing of DIA\n",
"models and whole digitization pipelines to promote reusability and reproducibility.\n",
"A collection of detailed documentation, tutorials and exemplar projects make\n",
"LayoutParser easy to learn and use.\n",
"AllenNLP [ 8] and transformers [ 34] have provided the community with complete\n",
"DL-based support for developing and deploying models for general computer\n",
"vision and natural language processing problems. LayoutParser , on the other\n",
"hand, specializes speci\f",
"cally in DIA tasks. LayoutParser is also equipped with a\n",
"community platform inspired by established model hubs such as Torch Hub [23]\n",
"andTensorFlow Hub [1]. It enables the sharing of pretrained models as well as\n",
"full document processing pipelines that are unique to DIA tasks.\n",
"There have been a variety of document data collections to facilitate the\n",
"development of DL models. Some examples include PRImA [ 3](magazine layouts),\n",
"PubLayNet [ 38](academic paper layouts), Table Bank [ 18](tables in academic\n",
"papers), Newspaper Navigator Dataset [ 16,17](newspaper \f",
"gure layouts) and\n",
"HJDataset [31](historical Japanese document layouts). A spectrum of models\n",
"trained on these datasets are currently available in the LayoutParser model zoo\n",
"to support di\u000b",
"erent use cases.\n",
"3 The Core LayoutParser Library\n",
"At the core of LayoutParser is an o\u000b",
"-the-shelf toolkit that streamlines DL-\n",
"based document image analysis. Five components support a simple interface\n",
"with comprehensive functionalities: 1) The layout detection models enable using\n",
"pre-trained or self-trained DL models for layout detection with just four lines\n",
"of code. 2) The detected layout information is stored in carefully engineered\n"
]
}
],
"source": [
"from langchain.vectorstores import FAISS\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"\n",
"faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())\n",
"docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n",
"for doc in docs:\n",
"    print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)"
]
},
{
"cell_type": "markdown",
"id": "09d64998",
"metadata": {},
"source": [
"## Using Unstructured"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0cc0cd42",
"metadata": {},
"outputs": [],
@@ -22,7 +149,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 4,
"id": "082d557c",
"metadata": {},
"outputs": [],
@@ -32,8 +159,38 @@
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "5c41106f",
+ "execution_count": null,
+ "id": "df11c953",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "09957371",
"metadata": {},
"source": [
"### Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0fab833b",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\", mode=\"elements\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3e8ff1b",
"metadata": {},
"outputs": [],
"source": [
@@ -43,7 +200,55 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "54fb6b62",
+ "id": "43c23d2d",
"metadata": {},
"outputs": [],
"source": [
"data[0]"
]
},
{
"cell_type": "markdown",
"id": "21998d18",
"metadata": {},
"source": [
"## Using PDFMiner"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2f0cc9ff",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PDFMinerLoader"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "42b531e8",
"metadata": {},
"outputs": [],
"source": [
"loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "010d5cdd",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7301c473",
"metadata": {},
"outputs": [],
"source": []
@@ -65,7 +270,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
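Note: of the three PDF loaders exercised above, only `PagedPDFSplitter` keeps a per-page `metadata["page"]`, which is what lets the similarity-search results print page numbers. A small sketch using only calls shown in the notebook:

```python
from langchain.document_loaders import PagedPDFSplitter

loader = PagedPDFSplitter("example_data/layout-parser-paper.pdf")
pages = loader.load_and_split()  # one Document per page

# Page numbers survive into metadata, so results can cite their source page.
for page in pages[:3]:
    print(page.metadata["page"], page.page_content[:80])
```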
@@ -61,10 +61,61 @@
"data"
]
},
{
"cell_type": "markdown",
"id": "525d6b67",
"metadata": {},
"source": [
"## Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "064f9162",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\", mode=\"elements\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "abefbbdb",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a547c534",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Adding a Bullet Slide', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "0c55f1cf",
+ "id": "381d4139",
"metadata": {},
"outputs": [],
"source": []
@@ -86,7 +137,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
@@ -12,6 +12,40 @@
{
"cell_type": "code",
"execution_count": 1,
"id": "2886982e",
"metadata": {},
"outputs": [],
"source": [
"# # Install package\n",
"# !pip install unstructured"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54d62efd",
"metadata": {},
"outputs": [],
"source": [
"# # Install other dependencies\n",
"# # https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst\n",
"# !brew install libmagic"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "af6a64f5",
"metadata": {},
"outputs": [],
"source": [
"# import nltk\n",
"# nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "79d3e549",
"metadata": {},
"outputs": [],
@@ -21,7 +55,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 5,
"id": "2593d1dc",
"metadata": {},
"outputs": [],
@@ -31,7 +65,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"id": "fe34e941",
"metadata": {},
"outputs": [],
@@ -39,10 +73,86 @@
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ee449788",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\\n\\nLast year COVID-19 kept us apart. This year we are finally together again.\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\\n\\nWith a duty to one another to the American people to the Constit'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].page_content[:400]"
]
},
{
"cell_type": "markdown",
"id": "7874d01d",
"metadata": {},
"source": [
"## Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ff5b616d",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\", mode=\"elements\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "feca3b6c",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fec5bbac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
" Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
" Document(page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
" Document(page_content='With a duty to one another to the American people to the Constitution.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
" Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "24e577e5",
+ "id": "8ca8a648",
"metadata": {},
"outputs": [],
"source": []
@@ -64,7 +174,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
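Note: a side-by-side sketch of the two modes demonstrated above (default combining vs. `mode="elements"`); the exact element count depends on how Unstructured partitions the file:

```python
from langchain.document_loaders import UnstructuredFileLoader

combined = UnstructuredFileLoader("../../state_of_the_union.txt").load()
elements = UnstructuredFileLoader("../../state_of_the_union.txt", mode="elements").load()

# Default mode combines elements into one Document; elements mode keeps them apart.
print(len(combined), len(elements))
```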
docs/modules/document_loaders/examples/url.ipynb (new file, 78 lines)
@@ -0,0 +1,78 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2dfc4698",
"metadata": {},
"source": [
"# URL\n",
"\n",
"This covers how to load HTML documents from a list of URLs into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "16c3699e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredURLLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "836fbac1",
"metadata": {},
"outputs": [],
"source": [
"urls = [\n",
"    \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023\",\n",
"    \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023\"\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "00f46fda",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredURLLoader(urls=urls)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b68a26b3",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
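Note: the loaded URL Documents plug into the same downstream flow as the other loaders in this changeset; a hedged sketch reusing the FAISS pattern from the PDF notebook:

```python
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

urls = [
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023",
]
data = UnstructuredURLLoader(urls=urls).load()

# Index the loaded Documents for semantic search, as in the PDF example.
index = FAISS.from_documents(data, OpenAIEmbeddings())
```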
@@ -27,6 +27,8 @@ There are a lot of different document loaders that LangChain supports. Below are

`Roam <./examples/roam.html>`_: A walkthrough of how to load data from a Roam file export.

+ `EveryNote <./examples/everynote.html>`_: A walkthrough of how to load data from an EveryNote (`.enex`) file.

`YouTube <./examples/youtube.html>`_: A walkthrough of how to load the transcript from a YouTube video.

`s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3.

@@ -47,6 +49,10 @@ There are a lot of different document loaders that LangChain supports. Below are

`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.

+ `Airbyte Json <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.

+ `Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.

.. toctree::
   :maxdepth: 1
   :glob:
docs/modules/llms/examples/fake_llm.ipynb (new file, 138 lines)
@@ -0,0 +1,138 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "052dfe58",
"metadata": {},
"source": [
"# Fake LLM\n",
"We expose a fake LLM class that can be used for testing. This allows you to mock out calls to the LLM and simulate what would happen if the LLM responded in a certain way.\n",
"\n",
"In this notebook we go over how to use this.\n",
"\n",
"We start by using the FakeLLM in an agent."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ef97ac4d",
"metadata": {},
"outputs": [],
"source": [
"from langchain.llms.fake import FakeListLLM"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9a0a160f",
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents import load_tools\n",
"from langchain.agents import initialize_agent"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b272258c",
"metadata": {},
"outputs": [],
"source": [
"tools = load_tools([\"python_repl\"])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "94096c4c",
"metadata": {},
"outputs": [],
"source": [
"responses=[\n",
"    \"Action: Python REPL\\nAction Input: print(2 + 2)\",\n",
"    \"Final Answer: 4\"\n",
"]\n",
"llm = FakeListLLM(responses=responses)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "da226d02",
"metadata": {},
"outputs": [],
"source": [
"agent = initialize_agent(tools, llm, agent=\"zero-shot-react-description\", verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "44c13426",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3mAction: Python REPL\n",
"Action Input: print(2 + 2)\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3m4\n",
"\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mFinal Answer: 4\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"'4'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.run(\"whats 2 + 2\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "814c2858",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
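Note: because `FakeListLLM` replays its `responses` list in order (the agent run above consumes them one per step), it also makes chain behavior deterministic in unit tests. A sketch, with the `LLMChain`/`PromptTemplate` usage mirroring other notebooks in this changeset:

```python
from langchain.chains import LLMChain
from langchain.llms.fake import FakeListLLM
from langchain.prompts import PromptTemplate

def test_chain_returns_canned_response():
    llm = FakeListLLM(responses=["stub answer"])
    prompt = PromptTemplate(template="Q: {question}\nA:", input_variables=["question"])
    chain = LLMChain(prompt=prompt, llm=llm)
    # The fake LLM ignores the prompt content and returns its canned response.
    assert chain.run(question="anything") == "stub answer"
```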
@@ -11,6 +11,8 @@ The examples here all address certain "how-to" guides for working with LLMs.

`Token Usage Tracking <./examples/token_usage_tracking.html>`_: How to track the token usage of various chains/agents/LLM calls.

+ `Fake LLM <./examples/fake_llm.html>`_: How to create and use a fake LLM for testing and debugging purposes.


.. toctree::
   :maxdepth: 1
@@ -20,7 +20,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "The Seattle Seahawks won the Super Bowl in 2010. Justin Beiber was born in 2010. The\n"
+ "The Seattle Seahawks won the Super Bowl in 2010. Justin Beiber was born in 2010. The final answer: Seattle Seahawks.\n"
]
}
],
@@ -31,7 +31,7 @@
"\n",
"Answer: Let's think step by step.\"\"\"\n",
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n",
- "llm_chain = LLMChain(prompt=prompt, llm=HuggingFaceHub(repo_id=\"google/flan-t5-xl\", model_kwargs={\"temperature\":1e-10}))\n",
+ "llm_chain = LLMChain(prompt=prompt, llm=HuggingFaceHub(repo_id=\"google/flan-t5-xl\", model_kwargs={\"temperature\":0, \"max_length\":64}))\n",
"\n",
"question = \"What NFL team won the Super Bowl in the year Justin Beiber was born?\"\n",
"\n",
@@ -77,7 +77,7 @@
"    memory=ConversationalBufferWindowMemory(k=2),\n",
")\n",
"\n",
- "output = chatgpt_chain.predict(human_input=\"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\")\n",
+ "output = chatgpt_chain.predict(human_input=\"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\")\n",
"print(output)"
]
},
@@ -103,7 +103,7 @@
"\n",
"Overall, Assistant is a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.\n",
"\n",
- "Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
+ "Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
"AI: \n",
"```\n",
"$ pwd\n",
@@ -148,7 +148,7 @@
"\n",
"Overall, Assistant is a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.\n",
"\n",
- "Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
+ "Human: I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\n",
"AI: \n",
"```\n",
"$ pwd\n",
@@ -915,14 +915,14 @@
"    \"response\": \"Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning (the acquisition of information and rules for using the information), reasoning (using the rules to reach approximate or definite conclusions) and self-correction. AI is used to develop computer systems that can think and act like humans.\"\n",
"}\n",
"```\n",
- "Human: curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
+ "Human: curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
"Assistant:\u001b[0m\n",
"\n",
"\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
" \n",
"\n",
"```\n",
- "$ curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
+ "$ curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\n",
"\n",
"{\n",
"    \"response\": \"```\\n/current/working/directory\\n```\"\n",
@@ -932,7 +932,7 @@
}
],
"source": [
- "output = chatgpt_chain.predict(human_input=\"\"\"curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply wiht the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\"\"\")\n",
+ "output = chatgpt_chain.predict(human_input=\"\"\"curl --header \"Content-Type:application/json\" --request POST --data '{\"message\": \"I want you to act as a Linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. Do not write explanations. Do not type commands unless I instruct you to do so. When I need to tell you something in English I will do so by putting text inside curly brackets {like this}. My first command is pwd.\"}' https://chat.openai.com/chat\"\"\")\n",
"print(output)"
]
},
@@ -68,7 +68,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 4,
"id": "67baf32e",
"metadata": {
"pycharm": {
@@ -98,6 +98,68 @@
"print(docs[0].page_content)"
]
},
{
"cell_type": "markdown",
"id": "fb6baaf8",
"metadata": {},
"source": [
"## Add texts\n",
"You can easily add text to a vectorstore with the `add_texts` method. It will return a list of document IDs (in case you need to use them downstream)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "70758e4f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['64108bd0-4d91-485c-9743-1e18debdd59e']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docsearch.add_texts([\"Ankush went to Princeton\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4edeb88f",
"metadata": {},
"outputs": [],
"source": [
"query = \"Where did Ankush go to college?\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1cba64a2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Ankush went to Princeton', lookup_str='', metadata={}, lookup_index=0)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "markdown",
"id": "bbf5ec44",
@@ -646,7 +708,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.9.1"
}
},
"nbformat": 4,
@@ -39,6 +39,7 @@ class ConversationalAgent(Agent):
        tools: List[Tool],
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
+       format_instructions: str = FORMAT_INSTRUCTIONS,
        ai_prefix: str = "AI",
        human_prefix: str = "Human",
        input_variables: Optional[List[str]] = None,
@@ -61,7 +62,7 @@
            [f"> {tool.name}: {tool.description}" for tool in tools]
        )
        tool_names = ", ".join([tool.name for tool in tools])
-       format_instructions = FORMAT_INSTRUCTIONS.format(
+       format_instructions = format_instructions.format(
            tool_names=tool_names, ai_prefix=ai_prefix, human_prefix=human_prefix
        )
        template = "\n\n".join([prefix, tool_strings, format_instructions, suffix])
@@ -93,6 +94,7 @@
        callback_manager: Optional[BaseCallbackManager] = None,
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
+       format_instructions: str = FORMAT_INSTRUCTIONS,
        ai_prefix: str = "AI",
        human_prefix: str = "Human",
        input_variables: Optional[List[str]] = None,
@@ -106,6 +108,7 @@
            human_prefix=human_prefix,
            prefix=prefix,
            suffix=suffix,
+           format_instructions=format_instructions,
            input_variables=input_variables,
        )
        llm_chain = LLMChain(
@@ -72,6 +72,7 @@ class ZeroShotAgent(Agent):
        tools: List[Tool],
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
+       format_instructions: str = FORMAT_INSTRUCTIONS,
        input_variables: Optional[List[str]] = None,
    ) -> PromptTemplate:
        """Create prompt in the style of the zero shot agent.
@@ -88,7 +89,7 @@
        """
        tool_strings = "\n".join([f"{tool.name}: {tool.description}" for tool in tools])
        tool_names = ", ".join([tool.name for tool in tools])
-       format_instructions = FORMAT_INSTRUCTIONS.format(tool_names=tool_names)
+       format_instructions = format_instructions.format(tool_names=tool_names)
        template = "\n\n".join([prefix, tool_strings, format_instructions, suffix])
        if input_variables is None:
            input_variables = ["input", "agent_scratchpad"]
@@ -102,13 +103,18 @@
        callback_manager: Optional[BaseCallbackManager] = None,
        prefix: str = PREFIX,
        suffix: str = SUFFIX,
+       format_instructions: str = FORMAT_INSTRUCTIONS,
        input_variables: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Agent:
        """Construct an agent from an LLM and tools."""
        cls._validate_tools(tools)
        prompt = cls.create_prompt(
-           tools, prefix=prefix, suffix=suffix, input_variables=input_variables
+           tools,
+           prefix=prefix,
+           suffix=suffix,
+           format_instructions=format_instructions,
+           input_variables=input_variables,
        )
        llm_chain = LLMChain(
            llm=llm,
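Note: the two agent diffs above fix the same issue: `create_prompt` previously formatted the module-level `FORMAT_INSTRUCTIONS` constant, so a caller-supplied template had no effect; now the parameter is threaded through and actually used. A hedged usage sketch (the instruction text is illustrative; the `{tool_names}` placeholder is the one the code formats in):

```python
from langchain.agents import ZeroShotAgent, load_tools

tools = load_tools(["python_repl"])

# Illustrative custom template; it must contain {tool_names} because
# create_prompt() calls format_instructions.format(tool_names=...).
my_instructions = """Use this format:
Question: the input question
Action: one of [{tool_names}]
Action Input: the input to the action
Final Answer: the final answer"""

prompt = ZeroShotAgent.create_prompt(tools, format_instructions=my_instructions)
```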
@@ -4,7 +4,12 @@ from typing import Any, Dict, List, Optional, Tuple

from sqlalchemy import Column, Integer, String, create_engine, select
from sqlalchemy.engine.base import Engine
- from sqlalchemy.orm import Session, declarative_base
+ from sqlalchemy.orm import Session

+ try:
+     from sqlalchemy.orm import declarative_base
+ except ImportError:
+     from sqlalchemy.ext.declarative import declarative_base

from langchain.schema import Generation
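Note: SQLAlchemy 1.4 moved `declarative_base` into `sqlalchemy.orm`; on older 1.3-era installs it only exists in `sqlalchemy.ext.declarative`, so the `try`/`except ImportError` above keeps the cache importable on both.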
@@ -93,7 +93,6 @@ def _load_refine_chain(
    verbose: Optional[bool] = None,
    **kwargs: Any,
) -> RefineDocumentsChain:

    initial_chain = LLMChain(llm=llm, prompt=question_prompt, verbose=verbose)
    _refine_llm = refine_llm or llm
    refine_chain = LLMChain(llm=_refine_llm, prompt=refine_prompt, verbose=verbose)
@@ -1,10 +1,12 @@
"""All different types of document loaders."""

+ from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
from langchain.document_loaders.azlyrics import AZLyricsLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.docx import UnstructuredDocxLoader
from langchain.document_loaders.email import UnstructuredEmailLoader
+ from langchain.document_loaders.everynote import EveryNoteLoader
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders.gcs_file import GCSFileLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader
@@ -13,18 +15,23 @@ from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.imsdb import IMSDbLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.obsidian import ObsidianLoader
- from langchain.document_loaders.pdf import UnstructuredPDFLoader
+ from langchain.document_loaders.online_pdf import OnlinePDFLoader
+ from langchain.document_loaders.paged_pdf import PagedPDFSplitter
+ from langchain.document_loaders.pdf import PDFMinerLoader, UnstructuredPDFLoader
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
+ from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader

__all__ = [
    "UnstructuredFileLoader",
+   "UnstructuredURLLoader",
    "DirectoryLoader",
    "NotionDirectoryLoader",
    "ReadTheDocsLoader",
@@ -38,6 +45,7 @@ __all__ = [
    "RoamLoader",
    "YoutubeLoader",
    "S3FileLoader",
+   "TextLoader",
    "S3DirectoryLoader",
    "GCSFileLoader",
    "GCSDirectoryLoader",
@@ -46,4 +54,9 @@
    "AZLyricsLoader",
    "CollegeConfidentialLoader",
    "GutenbergLoader",
+   "PagedPDFSplitter",
+   "EveryNoteLoader",
+   "AirbyteJSONLoader",
+   "OnlinePDFLoader",
+   "PDFMinerLoader",
]
langchain/document_loaders/airbyte_json.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+"""Loader that loads local airbyte json files."""
+import json
+from typing import Any, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+def _stringify_value(val: Any) -> str:
+    if isinstance(val, str):
+        return val
+    elif isinstance(val, dict):
+        return "\n" + _stringify_dict(val)
+    elif isinstance(val, list):
+        return "\n".join(_stringify_value(v) for v in val)
+    else:
+        return str(val)
+
+
+def _stringify_dict(data: dict) -> str:
+    text = ""
+    for key, value in data.items():
+        text += key + ": " + _stringify_value(data[key]) + "\n"
+    return text
+
+
+class AirbyteJSONLoader(BaseLoader):
+    """Loader that loads local airbyte json files."""
+
+    def __init__(self, file_path: str):
+        """Initialize with file path. This should start with '/tmp/airbyte_local/'."""
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Load file."""
+        text = ""
+        for line in open(self.file_path, "r"):
+            data = json.loads(line)["_airbyte_data"]
+            text += _stringify_dict(data)
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
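
Not part of the diff: a minimal usage sketch for the new loader. The JSONL path is hypothetical; Airbyte's local JSON destination writes line-delimited records carrying an `_airbyte_data` payload.

from langchain.document_loaders import AirbyteJSONLoader

# Each line is parsed and its `_airbyte_data` payload is flattened
# into one combined text Document.
loader = AirbyteJSONLoader("/tmp/airbyte_local/_airbyte_raw_users.jsonl")
docs = loader.load()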
@@ -1,29 +1,13 @@
 """Loader that loads Microsoft Word files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredDocxLoader(BaseLoader):
+class UnstructuredDocxLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load Microsoft Word files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.docx import partition_docx

-        elements = partition_docx(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_docx(filename=self.file_path)

@@ -1,29 +1,13 @@
 """Loader that loads email files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredEmailLoader(BaseLoader):
+class UnstructuredEmailLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load email files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.email import partition_email

-        elements = partition_email(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_email(filename=self.file_path)

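
These loader refactors share one pattern: the dependency check, mode handling, and Document assembly move into UnstructuredFileLoader, and each format-specific subclass only supplies `_get_elements()`. A sketch of what a new loader would look like under this pattern; the markdown loader and the `partition_md` call are assumptions for illustration, not part of this diff.

from typing import List

from langchain.document_loaders.unstructured import UnstructuredFileLoader


class UnstructuredMarkdownLoader(UnstructuredFileLoader):
    """Hypothetical loader: only element extraction is format-specific."""

    def _get_elements(self) -> List:
        from unstructured.partition.md import partition_md  # assumed API

        return partition_md(filename=self.file_path)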
langchain/document_loaders/everynote.py (new file, 82 lines)
@@ -0,0 +1,82 @@
+"""Load documents from Evernote.
+
+https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
+"""
+import hashlib
+from base64 import b64decode
+from time import strptime
+from typing import Any, Dict, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+def _parse_content(content: str) -> str:
+    from pypandoc import convert_text
+
+    text = convert_text(content, "org", format="html")
+    return text
+
+
+def _parse_resource(resource: list) -> dict:
+    rsc_dict: Dict[str, Any] = {}
+    for elem in resource:
+        if elem.tag == "data":
+            # Sometimes elem.text is None
+            rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
+            rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
+        else:
+            rsc_dict[elem.tag] = elem.text
+
+    return rsc_dict
+
+
+def _parse_note(note: List) -> dict:
+    note_dict: Dict[str, Any] = {}
+    resources = []
+    for elem in note:
+        if elem.tag == "content":
+            note_dict[elem.tag] = _parse_content(elem.text)
+            # A copy of original content
+            note_dict["content-raw"] = elem.text
+        elif elem.tag == "resource":
+            resources.append(_parse_resource(elem))
+        elif elem.tag == "created" or elem.tag == "updated":
+            note_dict[elem.tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
+        else:
+            note_dict[elem.tag] = elem.text
+
+    note_dict["resource"] = resources
+
+    return note_dict
+
+
+def _parse_note_xml(xml_file: str) -> str:
+    """Parse Evernote xml."""
+    # Without huge_tree set to True, parser may complain about huge text node
+    # Try to recover, because there may be "&nbsp;", which will cause
+    # "XMLSyntaxError: Entity 'nbsp' not defined"
+    from lxml import etree
+
+    context = etree.iterparse(
+        xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
+    )
+    result_string = ""
+    for action, elem in context:
+        if elem.tag == "note":
+            result_string += _parse_note(elem)["content"]
+    return result_string
+
+
+class EveryNoteLoader(BaseLoader):
+    """Loader to load Evernote files."""
+
+    def __init__(self, file_path: str):
+        """Initialize with file path."""
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Load document from Evernote file."""
+        text = _parse_note_xml(self.file_path)
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
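
Not part of the diff: a usage sketch, assuming an Evernote .enex export (file name hypothetical) and the lxml and pypandoc packages installed.

from langchain.document_loaders import EveryNoteLoader

loader = EveryNoteLoader("my_notebook.enex")
docs = loader.load()  # one Document containing all note contents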
@@ -1,29 +1,13 @@
 """Loader that loads HTML files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredHTMLLoader(BaseLoader):
+class UnstructuredHTMLLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load HTML files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.html import partition_html

-        elements = partition_html(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_html(filename=self.file_path)

langchain/document_loaders/online_pdf.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+"""Loader that loads online PDF files."""
+
+import tempfile
+from typing import List
+
+import requests
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.pdf import UnstructuredPDFLoader
+
+
+class OnlinePDFLoader(BaseLoader):
+    """Loader that loads online PDFs."""
+
+    def __init__(self, web_path: str):
+        """Initialize with web path."""
+        self.web_path = web_path
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        r = requests.get(self.web_path)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            file_path = f"{temp_dir}/online_file.pdf"
+            file = open(file_path, "wb")
+            file.write(r.content)
+            file.close()
+            loader = UnstructuredPDFLoader(file_path)
+            return loader.load()
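
Not part of the diff: a usage sketch with a hypothetical URL; the PDF is fetched with requests, written to a temporary file, and handed to UnstructuredPDFLoader.

from langchain.document_loaders import OnlinePDFLoader

loader = OnlinePDFLoader("https://example.com/paper.pdf")
docs = loader.load()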
langchain/document_loaders/paged_pdf.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+"""Loads a PDF with pypdf and chunks at character level."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class PagedPDFSplitter(BaseLoader):
+    """Loads a PDF with pypdf and chunks at character level.
+
+    Loader also stores page numbers in metadatas.
+    """
+
+    def __init__(self, file_path: str):
+        """Initialize with file path."""
+        try:
+            import pypdf  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "pypdf package not found, please install it with " "`pip install pypdf`"
+            )
+        self._file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Load given path as pages."""
+        import pypdf
+
+        pdf_file_obj = open(self._file_path, "rb")
+        pdf_reader = pypdf.PdfReader(pdf_file_obj)
+        docs = []
+        for i, page in enumerate(pdf_reader.pages):
+            text = page.extract_text()
+            metadata = {"source": self._file_path, "page": i}
+            docs.append(Document(page_content=text, metadata=metadata))
+        pdf_file_obj.close()
+        return docs
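
Not part of the diff: a usage sketch (any local PDF path works); the integration test later in this changeset exercises the same flow.

from langchain.document_loaders import PagedPDFSplitter

loader = PagedPDFSplitter("hello.pdf")
docs = loader.load()
# Every page is a separate Document; the page index lands in metadata.
print(docs[0].metadata)  # e.g. {'source': 'hello.pdf', 'page': 0}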
@@ -3,27 +3,36 @@ from typing import List

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredPDFLoader(BaseLoader):
+class UnstructuredPDFLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load PDF files."""

+    def _get_elements(self) -> List:
+        from unstructured.partition.pdf import partition_pdf
+
+        return partition_pdf(filename=self.file_path)
+
+
+class PDFMinerLoader(BaseLoader):
+    """Loader that uses PDFMiner to load PDF files."""
+
     def __init__(self, file_path: str):
         """Initialize with file path."""
         try:
-            import unstructured  # noqa:F401
+            from pdfminer.high_level import extract_text  # noqa:F401
         except ImportError:
             raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
+                "pdfminer package not found, please install it with "
+                "`pip install pdfminer.six`"
             )
         self.file_path = file_path

     def load(self) -> List[Document]:
         """Load file."""
-        from unstructured.partition.pdf import partition_pdf
+        from pdfminer.high_level import extract_text

-        elements = partition_pdf(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
+        text = extract_text(self.file_path)
         metadata = {"source": self.file_path}
         return [Document(page_content=text, metadata=metadata)]

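
Not part of the diff: a usage sketch for the new loader, a pure-Python alternative to the unstructured-based PDF loader that only needs `pip install pdfminer.six`.

from langchain.document_loaders import PDFMinerLoader

loader = PDFMinerLoader("hello.pdf")  # any local PDF path
docs = loader.load()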
@@ -1,29 +1,13 @@
 """Loader that loads powerpoint files."""
 from typing import List

-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader


-class UnstructuredPowerPointLoader(BaseLoader):
+class UnstructuredPowerPointLoader(UnstructuredFileLoader):
     """Loader that uses unstructured to load powerpoint files."""

-    def __init__(self, file_path: str):
-        """Initialize with file path."""
-        try:
-            import unstructured  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "unstructured package not found, please install it with "
-                "`pip install unstructured`"
-            )
-        self.file_path = file_path
-
-    def load(self) -> List[Document]:
-        """Load file."""
+    def _get_elements(self) -> List:
         from unstructured.partition.pptx import partition_pptx

-        elements = partition_pptx(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        return partition_pptx(filename=self.file_path)

langchain/document_loaders/text.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+"""Load text files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class TextLoader(BaseLoader):
+    """Load text files."""
+
+    def __init__(self, file_path: str):
+        """Initialize with file path."""
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Load from file path."""
+        with open(self.file_path) as f:
+            text = f.read()
+        metadata = {"source": self.file_path}
+        return [Document(page_content=text, metadata=metadata)]
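
Not part of the diff: a usage sketch for the simplest of the new loaders.

from langchain.document_loaders import TextLoader

loader = TextLoader("state_of_the_union.txt")  # any plain-text file
docs = loader.load()  # a single Document; the path is stored as `source`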
@@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
 class UnstructuredFileLoader(BaseLoader):
     """Loader that uses unstructured to load files."""

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: str, mode: str = "single"):
         """Initialize with file path."""
         try:
             import unstructured  # noqa:F401
@@ -17,13 +17,30 @@ class UnstructuredFileLoader(BaseLoader):
                 "unstructured package not found, please install it with "
                 "`pip install unstructured`"
             )
+        _valid_modes = {"single", "elements"}
+        if mode not in _valid_modes:
+            raise ValueError(
+                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
+            )
         self.file_path = file_path
+        self.mode = mode
+
+    def _get_elements(self) -> List:
+        from unstructured.partition.auto import partition
+
+        return partition(filename=self.file_path)

     def load(self) -> List[Document]:
         """Load file."""
-        from unstructured.partition.auto import partition
-
-        elements = partition(filename=self.file_path)
-        text = "\n\n".join([str(el) for el in elements])
+        elements = self._get_elements()
         metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+        if self.mode == "elements":
+            docs = [
+                Document(page_content=str(el), metadata=metadata) for el in elements
+            ]
+        elif self.mode == "single":
+            text = "\n\n".join([str(el) for el in elements])
+            docs = [Document(page_content=text, metadata=metadata)]
+        else:
+            raise ValueError(f"mode of {self.mode} not supported.")
+        return docs

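
Not part of the diff: a usage sketch for the new `mode` parameter (the file name is hypothetical).

from langchain.document_loaders import UnstructuredFileLoader

# mode="elements" returns one Document per unstructured element instead
# of joining everything into a single string (the default, mode="single").
loader = UnstructuredFileLoader("report.docx", mode="elements")
docs = loader.load()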
langchain/document_loaders/url.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+"""Loader that loads web pages."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredURLLoader(BaseLoader):
+    """Loader that uses unstructured to load web pages."""
+
+    def __init__(self, urls: List[str]):
+        """Initialize with URLs."""
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+        self.urls = urls
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        from unstructured.partition.html import partition_html
+
+        docs: List[Document] = list()
+        for url in self.urls:
+            elements = partition_html(url=url)
+            text = "\n\n".join([str(el) for el in elements])
+            metadata = {"source": url}
+            docs.append(Document(page_content=text, metadata=metadata))
+        return docs
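
Not part of the diff: a usage sketch with hypothetical URLs; each page becomes one Document with its URL recorded as `source`.

from langchain.document_loaders import UnstructuredURLLoader

loader = UnstructuredURLLoader(urls=["https://example.com", "https://example.org"])
docs = loader.load()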
@@ -75,20 +75,27 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         text = text.replace("\n", " ")
         return self.client.create(input=[text], engine=engine)["data"][0]["embedding"]

-    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+    def embed_documents(
+        self, texts: List[str], chunk_size: int = 1000
+    ) -> List[List[float]]:
         """Call out to OpenAI's embedding endpoint for embedding search docs.

         Args:
             texts: The list of texts to embed.
+            chunk_size: The maximum number of texts to send to OpenAI at once
+                (max 1000).

         Returns:
             List of embeddings, one for each text.
         """
-        responses = [
-            self._embedding_func(text, engine=self.document_model_name)
-            for text in texts
-        ]
-        return responses
+        # handle large batches of texts
+        results = []
+        for i in range(0, len(texts), chunk_size):
+            response = self.client.create(
+                input=texts[i : i + chunk_size], engine=self.document_model_name
+            )
+            results += [r["embedding"] for r in response["data"]]
+        return results

     def embed_query(self, text: str) -> List[float]:
         """Call out to OpenAI's embedding endpoint for embedding query text.
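
Not part of the diff: a usage sketch of the new batching behavior; the texts mirror the new integration test further down.

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
# Documents are now embedded in batches of up to `chunk_size` inputs per
# API request, instead of one request per text.
vectors = embeddings.embed_documents(["foo bar", "bar foo", "foo"], chunk_size=2)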
langchain/llms/fake.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+"""Fake LLM wrapper for testing purposes."""
+from typing import Any, List, Mapping, Optional
+
+from pydantic import BaseModel
+
+from langchain.llms.base import LLM
+
+
+class FakeListLLM(LLM, BaseModel):
+    """Fake LLM wrapper for testing purposes."""
+
+    responses: List
+    i: int = 0
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "fake-list"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Return the next response in the list."""
+        response = self.responses[self.i]
+        self.i += 1
+        return response
+
+    @property
+    def _identifying_params(self) -> Mapping[str, Any]:
+        return {}
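
Not part of the diff: a usage sketch; calling the instance directly goes through the base LLM's __call__.

from langchain.llms.fake import FakeListLLM

# Deterministic stand-in for a real LLM in tests: responses come back in order.
llm = FakeListLLM(responses=["first answer", "second answer"])
assert llm("any prompt") == "first answer"
assert llm("any prompt") == "second answer"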
@@ -17,40 +17,30 @@ class SQLDatabase:
         ignore_tables: Optional[List[str]] = None,
         include_tables: Optional[List[str]] = None,
         sample_rows_in_table_info: int = 0,
-        # TODO: deprecate.
-        sample_row_in_table_info: bool = False,
     ):
         """Create engine from database URI."""
-        if sample_row_in_table_info and sample_rows_in_table_info > 0:
-            raise ValueError(
-                "Only one of `sample_row_in_table_info` "
-                "and `sample_rows_in_table_info` should be set"
-            )
         self._engine = engine
         self._schema = schema
         if include_tables and ignore_tables:
             raise ValueError("Cannot specify both include_tables and ignore_tables")

         self._inspector = inspect(self._engine)
-        self._all_tables = self._inspector.get_table_names(schema=schema)
-        self._include_tables = include_tables or []
+        self._all_tables = set(self._inspector.get_table_names(schema=schema))
+        self._include_tables = set(include_tables) if include_tables else set()
         if self._include_tables:
-            missing_tables = set(self._include_tables).difference(self._all_tables)
+            missing_tables = self._include_tables - self._all_tables
             if missing_tables:
                 raise ValueError(
                     f"include_tables {missing_tables} not found in database"
                 )
-        self._ignore_tables = ignore_tables or []
+        self._ignore_tables = set(ignore_tables) if ignore_tables else set()
         if self._ignore_tables:
-            missing_tables = set(self._ignore_tables).difference(self._all_tables)
+            missing_tables = self._ignore_tables - self._all_tables
             if missing_tables:
                 raise ValueError(
                     f"ignore_tables {missing_tables} not found in database"
                 )
         self._sample_rows_in_table_info = sample_rows_in_table_info
-        # TODO: deprecate
-        if sample_row_in_table_info:
-            self._sample_rows_in_table_info = 1

     @classmethod
     def from_uri(cls, database_uri: str, **kwargs: Any) -> SQLDatabase:
@@ -66,7 +56,7 @@ class SQLDatabase:
         """Get names of tables available."""
         if self._include_tables:
             return self._include_tables
-        return set(self._all_tables) - set(self._ignore_tables)
+        return self._all_tables - self._ignore_tables

     @property
     def table_info(self) -> str:
@@ -91,7 +81,6 @@ class SQLDatabase:

         tables = []
         for table_name in all_table_names:
-
             columns = []
             for column in self._inspector.get_columns(table_name, schema=self._schema):
                 columns.append(f"{column['name']} ({str(column['type'])})")

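
Not part of the diff: a usage sketch. The sqlite path is hypothetical, and `get_table_names` is assumed to be the table-name accessor shown in the second hunk; the filters are now plain sets, so lookups and set differences are direct.

from langchain.sql_database import SQLDatabase

db = SQLDatabase.from_uri("sqlite:///example.db", include_tables=["users"])
print(db.get_table_names())  # {'users'}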
@@ -3,7 +3,17 @@ from __future__ import annotations

 import logging
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List, Optional
+from typing import (
+    AbstractSet,
+    Any,
+    Callable,
+    Collection,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Union,
+)

 from langchain.docstore.document import Document

@@ -114,7 +124,11 @@ class TextSplitter(ABC):

     @classmethod
     def from_tiktoken_encoder(
-        cls, encoding_name: str = "gpt2", **kwargs: Any
+        cls,
+        encoding_name: str = "gpt2",
+        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        **kwargs: Any,
     ) -> TextSplitter:
         """Text splitter that uses tiktoken encoder to count length."""
         try:
@@ -125,11 +139,19 @@ class TextSplitter(ABC):
                 "This is needed in order to calculate max_tokens_for_prompt. "
                 "Please install it with `pip install tiktoken`."
             )

         # create a GPT-3 encoder instance
         enc = tiktoken.get_encoding(encoding_name)

-        def _tiktoken_encoder(text: str) -> int:
-            return len(enc.encode(text))
+        def _tiktoken_encoder(text: str, **kwargs: Any) -> int:
+            return len(
+                enc.encode(
+                    text,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                    **kwargs,
+                )
+            )

         return cls(length_function=_tiktoken_encoder, **kwargs)

@@ -155,7 +177,13 @@ class CharacterTextSplitter(TextSplitter):
 class TokenTextSplitter(TextSplitter):
     """Implementation of splitting text that looks at tokens."""

-    def __init__(self, encoding_name: str = "gpt2", **kwargs: Any):
+    def __init__(
+        self,
+        encoding_name: str = "gpt2",
+        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        **kwargs: Any,
+    ):
         """Create a new TextSplitter."""
         super().__init__(**kwargs)
         try:
@@ -168,11 +196,17 @@ class TokenTextSplitter(TextSplitter):
             )
         # create a GPT-3 encoder instance
         self._tokenizer = tiktoken.get_encoding(encoding_name)
+        self._allowed_special = allowed_special
+        self._disallowed_special = disallowed_special

     def split_text(self, text: str) -> List[str]:
         """Split incoming text and return chunks."""
         splits = []
-        input_ids = self._tokenizer.encode(text)
+        input_ids = self._tokenizer.encode(
+            text,
+            allowed_special=self._allowed_special,
+            disallowed_special=self._disallowed_special,
+        )
         start_idx = 0
         cur_idx = min(start_idx + self._chunk_size, len(input_ids))
         chunk_ids = input_ids[start_idx:cur_idx]

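
Not part of the diff: a usage sketch of the new special-token handling; tiktoken raises on special tokens by default, and they can now be allowed explicitly when counting or splitting.

from langchain.text_splitter import TokenTextSplitter

splitter = TokenTextSplitter(allowed_special={"<|endoftext|>"})
chunks = splitter.split_text("foo bar <|endoftext|> baz")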
poetry.lock (generated, 783 lines changed)
File diff suppressed because it is too large.
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langchain"
-version = "0.0.81"
+version = "0.0.84"
 description = "Building applications with LLMs through composability"
 authors = []
 license = "MIT"
@@ -37,7 +37,15 @@ qdrant-client = {version = "^0.11.7", optional = true}
 dataclasses-json = "^0.5.7"
 tensorflow-text = {version = "^2.11.0", optional = true, python = "^3.10, <3.12"}
 tenacity = "^8.1.0"
+cohere = {version = "^3", optional = true}
+openai = {version = "^0", optional = true}
+nlpcloud = {version = "^1", optional = true}
+huggingface_hub = {version = "^0", optional = true}
+google-search-results = {version = "^2", optional = true}
+sentence-transformers = {version = "^2", optional = true}
 aiohttp = "^3.8.3"
+pypdf = {version = "^3.4.0", optional = true}


 [tool.poetry.group.docs.dependencies]
 autodoc_pydantic = "^1.8.0"
@@ -65,11 +73,11 @@ pytest-asyncio = "^0.20.3"

 [tool.poetry.group.lint.dependencies]
 flake8-docstrings = "^1.6.0"
-black = "^22.10.0"
 isort = "^5.10.1"
 flake8 = "^6.0.0"
 types-toml = "^0.10.8.1"
 types-redis = "^4.3.21.6"
+black = "^23.1.0"

 [tool.poetry.group.typing.dependencies]
 mypy = "^0.991"
@@ -85,7 +93,7 @@ playwright = "^1.28.0"

 [tool.poetry.extras]
 llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
-all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text"]
+all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf"]

 [tool.isort]
 profile = "black"

@@ -8,7 +8,18 @@ def test_openai_embedding_documents() -> None:
     embedding = OpenAIEmbeddings()
     output = embedding.embed_documents(documents)
     assert len(output) == 1
-    assert len(output[0]) == 2048
+    assert len(output[0]) == 1536
+
+
+def test_openai_embedding_documents_multiple() -> None:
+    """Test openai embeddings."""
+    documents = ["foo bar", "bar foo", "foo"]
+    embedding = OpenAIEmbeddings()
+    output = embedding.embed_documents(documents, chunk_size=2)
+    assert len(output) == 3
+    assert len(output[0]) == 1536
+    assert len(output[1]) == 1536
+    assert len(output[2]) == 1536


 def test_openai_embedding_query() -> None:
@@ -16,4 +27,4 @@ def test_openai_embedding_query() -> None:
     document = "foo bar"
     embedding = OpenAIEmbeddings()
     output = embedding.embed_query(document)
-    assert len(output) == 2048
+    assert len(output) == 1536

tests/integration_tests/examples/hello.pdf (new binary file)
Binary file not shown.
tests/integration_tests/test_pdf_pagesplitter.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+"""Test splitting with page numbers included."""
+import os
+
+from langchain.document_loaders import PagedPDFSplitter
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+
+
+def test_pdf_pagesplitter() -> None:
+    """Test splitting with page numbers included."""
+    script_dir = os.path.dirname(__file__)
+    loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
+    docs = loader.load()
+    assert "page" in docs[0].metadata
+    assert "source" in docs[0].metadata
+
+    faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
+    docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
+    assert "Hello world" in docs[0].page_content
@@ -1,6 +1,10 @@
 """Test base LLM functionality."""
 from sqlalchemy import Column, Integer, Sequence, String, create_engine
-from sqlalchemy.orm import declarative_base
+
+try:
+    from sqlalchemy.orm import declarative_base
+except ImportError:
+    from sqlalchemy.ext.declarative import declarative_base

 import langchain
 from langchain.cache import InMemoryCache, SQLAlchemyCache