mirror of
https://github.com/hwchase17/langchain.git
synced 2026-04-21 19:27:58 +00:00
Compare commits
20 Commits
langchain-
...
cc/depreca
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
39d1759b66 | ||
|
|
7005c4fe5b | ||
|
|
1eace6523d | ||
|
|
bacf4c58ef | ||
|
|
b71b5bd3d7 | ||
|
|
31364de10c | ||
|
|
8a70754dfe | ||
|
|
9bd4459f9a | ||
|
|
50c1ecc5f1 | ||
|
|
f51a9024ae | ||
|
|
8afbab4cf6 | ||
|
|
66e30efa61 | ||
|
|
ba167dc158 | ||
|
|
44f69063b1 | ||
|
|
f18b77fd59 | ||
|
|
966b408634 | ||
|
|
bd261456f6 | ||
|
|
15254d1027 | ||
|
|
d38c9c7026 | ||
|
|
d249318f94 |
@@ -182,7 +182,7 @@ pprint(data)
|
||||
</CodeOutputBlock>
|
||||
|
||||
|
||||
Another option is set `jq_schema='.'` and provide `content_key`:
|
||||
Another option is to set `jq_schema='.'` and provide `content_key`:
|
||||
|
||||
```python
|
||||
loader = JSONLoader(
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -53,7 +53,8 @@
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"TOGETHER_API_KEY\"] = getpass.getpass(\"Enter your Together API key: \")"
|
||||
"if \"TOGETHER_API_KEY\" not in os.environ:\n",
|
||||
" os.environ[\"TOGETHER_API_KEY\"] = getpass.getpass(\"Enter your Together API key: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -87,21 +88,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "652d6238-1f87-422a-b135-f5abbb8652fc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-together"
|
||||
]
|
||||
@@ -113,14 +103,12 @@
|
||||
"source": [
|
||||
"## Instantiation\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and generate chat completions:\n",
|
||||
"\n",
|
||||
"- TODO: Update model instantiation with relevant params."
|
||||
"Now we can instantiate our model object and generate chat completions:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 3,
|
||||
"id": "cb09c344-1836-4e0c-acf8-11d13ac1dbae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -147,7 +135,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 4,
|
||||
"id": "62e0dbc3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
@@ -156,10 +144,10 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"J'adore la programmation.\", response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 35, 'total_tokens': 44}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-79efa49b-dbaf-4ef8-9dce-958533823ef6-0', usage_metadata={'input_tokens': 35, 'output_tokens': 9, 'total_tokens': 44})"
|
||||
"AIMessage(content=\"J'adore la programmation.\", response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 35, 'total_tokens': 44}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-eabcbe33-cdd8-45b8-ab0b-f90b6e7dfad8-0', usage_metadata={'input_tokens': 35, 'output_tokens': 9, 'total_tokens': 44})"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -178,7 +166,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 5,
|
||||
"id": "d86145b3-bfef-46e8-b227-4dda5c9c2705",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -206,17 +194,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"id": "e197d1d7-a070-4c96-9f8a-a0e86d046e0b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='Ich liebe das Programmieren.', response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 30, 'total_tokens': 37}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-80bba5fa-1723-4242-8d5a-c09b76b8350b-0', usage_metadata={'input_tokens': 30, 'output_tokens': 7, 'total_tokens': 37})"
|
||||
"AIMessage(content='Ich liebe das Programmieren.', response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 30, 'total_tokens': 37}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-a249aa24-ee31-46ba-9bf9-f4eb135b0a95-0', usage_metadata={'input_tokens': 30, 'output_tokens': 7, 'total_tokens': 37})"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -271,7 +259,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
243
docs/docs/integrations/document_loaders/bshtml.ipynb
Normal file
243
docs/docs/integrations/document_loaders/bshtml.ipynb
Normal file
@@ -0,0 +1,243 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# BSHTMLLoader\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with BeautifulSoup4 [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [BSHTMLLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| BSHTMLLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access BSHTMLLoader document loader you'll need to install the `langchain-community` integration package and the `bs4` python package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use the `BSHTMLLoader` class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **bs4**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community bs4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and load documents:\n",
|
||||
"\n",
|
||||
"- TODO: Update model instantiation with relevant params."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import BSHTMLLoader\n",
|
||||
"\n",
|
||||
"loader = BSHTMLLoader(\n",
|
||||
" file_path=\"./example_data/fake-content.html\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}, page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/fake-content.html', 'title': 'Test Title'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}, page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []\n",
|
||||
"page[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Adding separator to BS4\n",
|
||||
"\n",
|
||||
"We can also pass a separator to use when calling get_text on the soup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='\n",
|
||||
", Test Title, \n",
|
||||
", \n",
|
||||
", \n",
|
||||
", My First Heading, \n",
|
||||
", My first paragraph., \n",
|
||||
", \n",
|
||||
", \n",
|
||||
"' metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = BSHTMLLoader(\n",
|
||||
" file_path=\"./example_data/fake-content.html\", get_text_separator=\", \"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all BSHTMLLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
# Sample Markdown Document
|
||||
|
||||
## Introduction
|
||||
|
||||
Welcome to this sample Markdown document. Markdown is a lightweight markup language used for formatting text. It's widely used for documentation, readme files, and more.
|
||||
|
||||
## Features
|
||||
|
||||
### Headers
|
||||
|
||||
Markdown supports multiple levels of headers:
|
||||
|
||||
- **Header 1**: `# Header 1`
|
||||
- **Header 2**: `## Header 2`
|
||||
- **Header 3**: `### Header 3`
|
||||
|
||||
### Lists
|
||||
|
||||
#### Unordered List
|
||||
|
||||
- Item 1
|
||||
- Item 2
|
||||
- Subitem 2.1
|
||||
- Subitem 2.2
|
||||
|
||||
#### Ordered List
|
||||
|
||||
1. First item
|
||||
2. Second item
|
||||
3. Third item
|
||||
|
||||
### Links
|
||||
|
||||
[OpenAI](https://www.openai.com) is an AI research organization.
|
||||
|
||||
### Images
|
||||
|
||||
Here's an example image:
|
||||
|
||||

|
||||
|
||||
### Code
|
||||
|
||||
#### Inline Code
|
||||
|
||||
Use `code` for inline code snippets.
|
||||
|
||||
#### Code Block
|
||||
|
||||
```python
|
||||
def greet(name):
|
||||
return f"Hello, {name}!"
|
||||
|
||||
print(greet("World"))
|
||||
```
|
||||
@@ -30,6 +30,7 @@
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595060730,
|
||||
"content": "",
|
||||
"photos": [
|
||||
{"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059}
|
||||
]
|
||||
|
||||
@@ -21,24 +21,24 @@ loader = CSVLoader(
|
||||
data = loader.load()
|
||||
```
|
||||
|
||||
## Common File Types
|
||||
|
||||
The below document loaders allow you to load data from common data formats.
|
||||
|
||||
<CategoryTable category="common_loaders" />
|
||||
|
||||
## PDFs
|
||||
|
||||
The below document loaders allow you to load documents.
|
||||
|
||||
<CategoryTable category="pdf_loaders" />
|
||||
|
||||
## Webpages
|
||||
|
||||
The below document loaders allow you to load webpages.
|
||||
|
||||
<CategoryTable category="webpage_loaders" />
|
||||
|
||||
## PDFs
|
||||
|
||||
The below document loaders allow you to load PDF documents.
|
||||
|
||||
<CategoryTable category="pdf_loaders" />
|
||||
|
||||
## Common File Types
|
||||
|
||||
The below document loaders allow you to load data from common data formats.
|
||||
|
||||
<CategoryTable category="common_loaders" />
|
||||
|
||||
|
||||
## All document loaders
|
||||
|
||||
|
||||
348
docs/docs/integrations/document_loaders/json.ipynb
Normal file
348
docs/docs/integrations/document_loaders/json.ipynb
Normal file
@@ -0,0 +1,348 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# JSONLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with JSON [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all JSONLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html).\n",
|
||||
"\n",
|
||||
"- TODO: Add any other relevant links, like information about underlying API, etc.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/file_loaders/json/)|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [JSONLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ✅ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| JSONLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access JSON document loader you'll need to install the `langchain-community` integration package as well as the ``jq`` python package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are required to use the `JSONLoader` class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **jq**:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community jq "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and load documents:\n",
|
||||
"\n",
|
||||
"- TODO: Update model instantiation with relevant params."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import JSONLoader\n",
|
||||
"\n",
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat.json\",\n",
|
||||
" jq_schema=\".messages[].content\",\n",
|
||||
" text_content=False,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}, page_content='Bye!')"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pages = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" pages.append(doc)\n",
|
||||
" if len(pages) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(pages)\n",
|
||||
"\n",
|
||||
" pages = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Read from JSON Lines file\n",
|
||||
"\n",
|
||||
"If you want to load documents from a JSON Lines file, you pass `json_lines=True`\n",
|
||||
"and specify `jq_schema` to extract `page_content` from a single JSON object."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='Bye!' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat_messages.jsonl\",\n",
|
||||
" jq_schema=\".content\",\n",
|
||||
" text_content=False,\n",
|
||||
" json_lines=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Read specific content keys\n",
|
||||
"\n",
|
||||
"Another option is to set `jq_schema='.'` and provide a `content_key` in order to only load specific content:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='User 2' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat_messages.jsonl\",\n",
|
||||
" jq_schema=\".\",\n",
|
||||
" content_key=\"sender_name\",\n",
|
||||
" json_lines=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## JSON file with jq schema `content_key`\n",
|
||||
"\n",
|
||||
"To load documents from a JSON file using the `content_key` within the jq schema, set `is_content_key_jq_parsable=True`. Ensure that `content_key` is compatible and can be parsed using the jq schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='Bye!' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat.json\",\n",
|
||||
" jq_schema=\".messages[]\",\n",
|
||||
" content_key=\".content\",\n",
|
||||
" is_content_key_jq_parsable=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Extracting metadata\n",
|
||||
"\n",
|
||||
"Generally, we want to include metadata available in the JSON file into the documents that we create from the content.\n",
|
||||
"\n",
|
||||
"The following demonstrates how metadata can be extracted using the `JSONLoader`.\n",
|
||||
"\n",
|
||||
"There are some key changes to be noted. In the previous example where we didn't collect the metadata, we managed to directly specify in the schema where the value for the `page_content` can be extracted from.\n",
|
||||
"\n",
|
||||
"In this example, we have to tell the loader to iterate over the records in the `messages` field. The jq_schema then has to be `.messages[]`\n",
|
||||
"\n",
|
||||
"This allows us to pass the records (dict) into the `metadata_func` that has to be implemented. The `metadata_func` is responsible for identifying which pieces of information in the record should be included in the metadata stored in the final `Document` object.\n",
|
||||
"\n",
|
||||
"Additionally, we now have to explicitly specify in the loader, via the `content_key` argument, the key from the record where the value for the `page_content` needs to be extracted from."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Define the metadata extraction function.\n",
|
||||
"def metadata_func(record: dict, metadata: dict) -> dict:\n",
|
||||
" metadata[\"sender_name\"] = record.get(\"sender_name\")\n",
|
||||
" metadata[\"timestamp_ms\"] = record.get(\"timestamp_ms\")\n",
|
||||
"\n",
|
||||
" return metadata\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"loader = JSONLoader(\n",
|
||||
" file_path=\"./example_data/facebook_chat.json\",\n",
|
||||
" jq_schema=\".messages[]\",\n",
|
||||
" content_key=\"content\",\n",
|
||||
" metadata_func=metadata_func,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all JSONLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
178
docs/docs/integrations/document_loaders/mathpix.ipynb
Normal file
178
docs/docs/integrations/document_loaders/mathpix.ipynb
Normal file
@@ -0,0 +1,178 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# MathPixPDFLoader\n",
|
||||
"\n",
|
||||
"Inspired by Daniel Gross's snippet here: [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [MathPixPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| MathPixPDFLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"Sign up for Mathpix and [create an API key](https://mathpix.com/docs/ocr/creating-an-api-key) to set the `MATHPIX_API_KEY` variables in your environment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"if \"MATHPIX_API_KEY\" not in os.environ:\n",
|
||||
" os.environ[\"MATHPIX_API_KEY\"] = getpass.getpass(\"Enter your Mathpix API key: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we are ready to initialize our loader:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import MathpixPDFLoader\n",
|
||||
"\n",
|
||||
"file_path = \"./example_data/layout-parser-paper.pdf\"\n",
|
||||
"loader = MathpixPDFLoader(file_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all MathpixPDFLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
317
docs/docs/integrations/document_loaders/pdfminer.ipynb
Normal file
317
docs/docs/integrations/document_loaders/pdfminer.ipynb
Normal file
File diff suppressed because one or more lines are too long
183
docs/docs/integrations/document_loaders/pdfplumber.ipynb
Normal file
183
docs/docs/integrations/document_loaders/pdfplumber.ipynb
Normal file
@@ -0,0 +1,183 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PDFPlumber\n",
|
||||
"\n",
|
||||
"Like PyMuPDF, the output Documents contain detailed metadata about the PDF and its pages, and returns one document per page.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PDFPlumberLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PDFPlumberLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use this loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PDFPlumberLoader\n",
|
||||
"\n",
|
||||
"loader = PDFPlumberLoader(\"./example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recentadvancesindocumentimageanalysis(DIA)havebeen\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomescouldbeeasilydeployedinproductionandextendedforfurther\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportantinnovationsbyawideaudience.Thoughtherehavebeenon-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopmentindisciplineslikenaturallanguageprocessingandcomputer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademicresearchacross awiderangeof disciplinesinthesocialsciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. 
The core LayoutParser library comes with a set of simple and\\nintuitiveinterfacesforapplyingandcustomizingDLmodelsforlayoutde-\\ntection,characterrecognition,andmanyotherdocumentprocessingtasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: DocumentImageAnalysis·DeepLearning·LayoutAnalysis\\n· Character Recognition · Open Source library · Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocumentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,\\n1202\\nnuJ\\n12\\n]VC.sc[\\n2v84351.3012:viXra\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PDFPlumberLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
185
docs/docs/integrations/document_loaders/pymupdf.ipynb
Normal file
185
docs/docs/integrations/document_loaders/pymupdf.ipynb
Normal file
@@ -0,0 +1,185 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PyMuPDF\n",
|
||||
"\n",
|
||||
"`PyMuPDF` is optimized for speed, and contains detailed metadata about the PDF and its pages. It returns one document per page.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PyMuPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PyMuPDFLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use the `PyMuPDFLoader`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **pymupdf**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-community pymupdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can initialize our loader and start loading documents. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyMuPDFLoader\n",
|
||||
"\n",
|
||||
"loader = PyMuPDFLoader(\"./example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load\n",
|
||||
"\n",
|
||||
"You can pass along any of the options from the [PyMuPDF documentation](https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text/) as keyword arguments in the `load` call, and it will be passed along to the `get_text()` call."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210622012710Z', 'modDate': 'D:20210622012710Z', 'trapped': ''}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 (\\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. 
The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: Document Image Analysis · Deep Learning · Layout Analysis\\n· Character Recognition · Open Source library · Toolkit.\\n1\\nIntroduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [11,\\narXiv:2103.15348v2 [cs.CV] 21 Jun 2021\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210622012710Z', 'modDate': 'D:20210622012710Z', 'trapped': ''}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PyMuPDFLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
187
docs/docs/integrations/document_loaders/pypdfdirectory.ipynb
Normal file
187
docs/docs/integrations/document_loaders/pypdfdirectory.ipynb
Normal file
@@ -0,0 +1,187 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PyPDFDirectoryLoader\n",
|
||||
"\n",
|
||||
"This loader loads all PDF files from a specific directory.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PyPDFDirectoryLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PyPDFDirectoryLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed for this loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyPDFDirectoryLoader\n",
|
||||
"\n",
|
||||
"directory_path = (\n",
|
||||
" \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n",
|
||||
")\n",
|
||||
"loader = PyPDFDirectoryLoader(\"example_data/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': 'example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser : A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1( \\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1Allen Institute for AI\\nshannons@allenai.org\\n2Brown University\\nruochen zhang@brown.edu\\n3Harvard University\\n{melissadell,jacob carlson }@fas.harvard.edu\\n4University of Washington\\nbcgl@cs.washington.edu\\n5University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser , an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io .\\nKeywords: Document Image Analysis ·Deep Learning ·Layout Analysis\\n·Character Recognition ·Open Source library ·Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [ 11,arXiv:2103.15348v2 [cs.CV] 21 Jun 2021')"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': 'example_data/layout-parser-paper.pdf', 'page': 0}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PyPDFDirectoryLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
188
docs/docs/integrations/document_loaders/pypdfium2.ipynb
Normal file
188
docs/docs/integrations/document_loaders/pypdfium2.ipynb
Normal file
@@ -0,0 +1,188 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PyPDFium2Loader\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with PyPDFium2 [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all PyPDFium2Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PyPDFium2Loader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PyPDFium2Loader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"To access PyPDFium2 document loader you'll need to install the `langchain-community` integration package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PyPDFium2Loader\n",
|
||||
"\n",
|
||||
"file_path = \"./example_data/layout-parser-paper.pdf\"\n",
|
||||
"loader = PyPDFium2Loader(file_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser: A Unified Toolkit for Deep\\r\\nLearning Based Document Image Analysis\\r\\nZejiang Shen\\r\\n1\\r\\n(), Ruochen Zhang\\r\\n2\\r\\n, Melissa Dell\\r\\n3\\r\\n, Benjamin Charles Germain\\r\\nLee\\r\\n4\\r\\n, Jacob Carlson\\r\\n3\\r\\n, and Weining Li\\r\\n5\\r\\n1 Allen Institute for AI\\r\\nshannons@allenai.org 2 Brown University\\r\\nruochen zhang@brown.edu 3 Harvard University\\r\\n{melissadell,jacob carlson}@fas.harvard.edu\\r\\n4 University of Washington\\r\\nbcgl@cs.washington.edu 5 University of Waterloo\\r\\nw422li@uwaterloo.ca\\r\\nAbstract. Recent advances in document image analysis (DIA) have been\\r\\nprimarily driven by the application of neural networks. Ideally, research\\r\\noutcomes could be easily deployed in production and extended for further\\r\\ninvestigation. However, various factors like loosely organized codebases\\r\\nand sophisticated model configurations complicate the easy reuse of im\\x02portant innovations by a wide audience. Though there have been on-going\\r\\nefforts to improve reusability and simplify deep learning (DL) model\\r\\ndevelopment in disciplines like natural language processing and computer\\r\\nvision, none of them are optimized for challenges in the domain of DIA.\\r\\nThis represents a major gap in the existing toolkit, as DIA is central to\\r\\nacademic research across a wide range of disciplines in the social sciences\\r\\nand humanities. This paper introduces LayoutParser, an open-source\\r\\nlibrary for streamlining the usage of DL in DIA research and applica\\x02tions. 
The core LayoutParser library comes with a set of simple and\\r\\nintuitive interfaces for applying and customizing DL models for layout de\\x02tection, character recognition, and many other document processing tasks.\\r\\nTo promote extensibility, LayoutParser also incorporates a community\\r\\nplatform for sharing both pre-trained models and full document digiti\\x02zation pipelines. We demonstrate that LayoutParser is helpful for both\\r\\nlightweight and large-scale digitization pipelines in real-word use cases.\\r\\nThe library is publicly available at https://layout-parser.github.io.\\r\\nKeywords: Document Image Analysis· Deep Learning· Layout Analysis\\r\\n· Character Recognition· Open Source library· Toolkit.\\r\\n1 Introduction\\r\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\r\\ndocument image analysis (DIA) tasks including document image classification [11,\\r\\narXiv:2103.15348v2 [cs.CV] 21 Jun 2021\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'page': 0}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PyPDFium2Loader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,269 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# UnstructuredMarkdownLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with UnstructuredMarkdown [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/file_loaders/unstructured/)|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [UnstructuredMarkdownLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | ❌ | ✅ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| UnstructuredMarkdownLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access UnstructuredMarkdownLoader document loader you'll need to install the `langchain-community` integration package and the `unstructured` python package.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use this loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **unstructured**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community unstructured"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our model object and load documents. \n",
|
||||
"\n",
|
||||
"You can run the loader in one of two modes: \"single\" and \"elements\". If you use \"single\" mode, the document will be returned as a single `Document` object. If you use \"elements\" mode, the unstructured library will split the document into elements such as `Title` and `NarrativeText`. You can pass in additional `unstructured` kwargs after mode to apply different `unstructured` settings."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import UnstructuredMarkdownLoader\n",
|
||||
"\n",
|
||||
"loader = UnstructuredMarkdownLoader(\n",
|
||||
" \"./example_data/example.md\",\n",
|
||||
" mode=\"single\",\n",
|
||||
" strategy=\"fast\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/example.md'}, page_content='Sample Markdown Document\\n\\nIntroduction\\n\\nWelcome to this sample Markdown document. Markdown is a lightweight markup language used for formatting text. It\\'s widely used for documentation, readme files, and more.\\n\\nFeatures\\n\\nHeaders\\n\\nMarkdown supports multiple levels of headers:\\n\\nHeader 1: # Header 1\\n\\nHeader 2: ## Header 2\\n\\nHeader 3: ### Header 3\\n\\nLists\\n\\nUnordered List\\n\\nItem 1\\n\\nItem 2\\n\\nSubitem 2.1\\n\\nSubitem 2.2\\n\\nOrdered List\\n\\nFirst item\\n\\nSecond item\\n\\nThird item\\n\\nLinks\\n\\nOpenAI is an AI research organization.\\n\\nImages\\n\\nHere\\'s an example image:\\n\\nCode\\n\\nInline Code\\n\\nUse code for inline code snippets.\\n\\nCode Block\\n\\n```python def greet(name): return f\"Hello, {name}!\"\\n\\nprint(greet(\"World\")) ```')"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/example.md'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/example.md', 'link_texts': ['OpenAI'], 'link_urls': ['https://www.openai.com'], 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'parent_id': 'de1f74bf226224377ab4d8b54f215bb9', 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'NarrativeText', 'element_id': '898a542a261f7dc65e0072d1e847d535'}, page_content='OpenAI is an AI research organization.')"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []\n",
|
||||
"page[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Elements\n",
|
||||
"\n",
|
||||
"In this example we will load in the `elements` mode, which will return a list of the different elements in the markdown document:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"29"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import UnstructuredMarkdownLoader\n",
|
||||
"\n",
|
||||
"loader = UnstructuredMarkdownLoader(\n",
|
||||
" \"./example_data/example.md\",\n",
|
||||
" mode=\"elements\",\n",
|
||||
" strategy=\"fast\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you see there are 29 elements that were pulled from the `example.md` file. The first element is the title of the document as expected:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Sample Markdown Document'"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0].page_content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all UnstructuredMarkdownLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
332
docs/docs/versions/migrating_chains/constitutional_chain.ipynb
Normal file
332
docs/docs/versions/migrating_chains/constitutional_chain.ipynb
Normal file
@@ -0,0 +1,332 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b57124cc-60a0-4c18-b7ce-3e483d1024a2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"title: Migrating from ConstitutionalChain\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ce8457ed-c0b1-4a74-abbd-9d3d2211270f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"[ConstitutionalChain](https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.base.ConstitutionalChain.html) allowed for a LLM to critique and revise generations based on [principles](https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.models.ConstitutionalPrinciple.html), structured as combinations of critique and revision requests. For example, a principle might include a request to identify harmful content, and a request to rewrite the content.\n",
|
||||
"\n",
|
||||
"In `ConstitutionalChain`, this structure of critique requests and associated revisions was formatted into a LLM prompt and parsed out of string responses. This is more naturally achieved via [structured output](/docs/how_to/structured_output/) features of chat models. We can construct a simple chain in [LangGraph](https://langchain-ai.github.io/langgraph/) for this purpose. Some advantages of this approach include:\n",
|
||||
"\n",
|
||||
"- Leverage tool-calling capabilities of chat models that have been fine-tuned for this purpose;\n",
|
||||
"- Reduce parsing errors from extracting expression from a string LLM response;\n",
|
||||
"- Delegation of instructions to [message roles](/docs/concepts/#messages) (e.g., chat models can understand what a `ToolMessage` represents without the need for additional prompting);\n",
|
||||
"- Support for streaming, both of individual tokens and chain steps."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b99b47ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain-openai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "717c8673",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from getpass import getpass\n",
|
||||
"\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = getpass()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e3621b62-a037-42b8-8faa-59575608bb8b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Legacy\n",
|
||||
"\n",
|
||||
"<details open>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f91c9809-8ee7-4e38-881d-0ace4f6ea883",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import ConstitutionalChain, LLMChain\n",
|
||||
"from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple\n",
|
||||
"from langchain_core.prompts import PromptTemplate\n",
|
||||
"from langchain_openai import OpenAI\n",
|
||||
"\n",
|
||||
"llm = OpenAI()\n",
|
||||
"\n",
|
||||
"qa_prompt = PromptTemplate(\n",
|
||||
" template=\"Q: {question} A:\",\n",
|
||||
" input_variables=[\"question\"],\n",
|
||||
")\n",
|
||||
"qa_chain = LLMChain(llm=llm, prompt=qa_prompt)\n",
|
||||
"\n",
|
||||
"constitutional_chain = ConstitutionalChain.from_llm(\n",
|
||||
" llm=llm,\n",
|
||||
" chain=qa_chain,\n",
|
||||
" constitutional_principles=[\n",
|
||||
" ConstitutionalPrinciple(\n",
|
||||
" critique_request=\"Tell if this answer is good.\",\n",
|
||||
" revision_request=\"Give a better answer.\",\n",
|
||||
" )\n",
|
||||
" ],\n",
|
||||
" return_intermediate_steps=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"result = constitutional_chain.invoke(\"What is the meaning of life?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "fa3d11a1-ac1f-4a9a-9ab3-b7b244daa506",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the meaning of life?',\n",
|
||||
" 'output': 'The meaning of life is a deeply personal and ever-evolving concept. It is a journey of self-discovery and growth, and can be different for each individual. Some may find meaning in relationships, others in achieving their goals, and some may never find a concrete answer. Ultimately, the meaning of life is what we make of it.',\n",
|
||||
" 'initial_output': ' The meaning of life is a subjective concept that can vary from person to person. Some may believe that the purpose of life is to find happiness and fulfillment, while others may see it as a journey of self-discovery and personal growth. Ultimately, the meaning of life is something that each individual must determine for themselves.',\n",
|
||||
" 'critiques_and_revisions': [('This answer is good in that it recognizes and acknowledges the subjective nature of the question and provides a valid and thoughtful response. However, it could have also mentioned that the meaning of life is a complex and deeply personal concept that can also change and evolve over time for each individual. Critique Needed.',\n",
|
||||
" 'The meaning of life is a deeply personal and ever-evolving concept. It is a journey of self-discovery and growth, and can be different for each individual. Some may find meaning in relationships, others in achieving their goals, and some may never find a concrete answer. Ultimately, the meaning of life is what we make of it.')]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "374ae108-f1a0-4723-9237-5259c8123c04",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Above, we've returned intermediate steps showing:\n",
|
||||
"\n",
|
||||
"- The original question;\n",
|
||||
"- The initial output;\n",
|
||||
"- Critiques and revisions;\n",
|
||||
"- The final output (matching a revision)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cdc3b527-c09e-4c77-9711-c3cc4506cd95",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"</details>\n",
|
||||
"\n",
|
||||
"## LangGraph\n",
|
||||
"\n",
|
||||
"<details open>\n",
|
||||
"\n",
|
||||
"Below, we use the [.with_structured_output](/docs/how_to/structured_output/) method to simultaneously generate (1) a judgment of whether a critique is needed, and (2) the critique. We surface all prompts involved for clarity and ease of customizability.\n",
|
||||
"\n",
|
||||
"Note that we are also able to stream intermediate steps with this implementation, so we can monitor and if needed intervene during its execution."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "917fdb73-2411-4fcc-9add-c32dc5c745da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import List, Optional, Tuple\n",
|
||||
"\n",
|
||||
"from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple\n",
|
||||
"from langchain.chains.constitutional_ai.prompts import (\n",
|
||||
" CRITIQUE_PROMPT,\n",
|
||||
" REVISION_PROMPT,\n",
|
||||
")\n",
|
||||
"from langchain_core.output_parsers import StrOutputParser\n",
|
||||
"from langchain_core.prompts import ChatPromptTemplate\n",
|
||||
"from langchain_openai import ChatOpenAI\n",
|
||||
"from langgraph.graph import END, START, StateGraph\n",
|
||||
"from typing_extensions import Annotated, TypedDict\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model=\"gpt-4o-mini\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Critique(TypedDict):\n",
|
||||
" \"\"\"Generate a critique, if needed.\"\"\"\n",
|
||||
"\n",
|
||||
" critique_needed: Annotated[bool, ..., \"Whether or not a critique is needed.\"]\n",
|
||||
" critique: Annotated[str, ..., \"If needed, the critique.\"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"critique_prompt = ChatPromptTemplate.from_template(\n",
|
||||
" \"Critique this response according to the critique request. \"\n",
|
||||
" \"If no critique is needed, specify that.\\n\\n\"\n",
|
||||
" \"Query: {query}\\n\\n\"\n",
|
||||
" \"Response: {response}\\n\\n\"\n",
|
||||
" \"Critique request: {critique_request}\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"revision_prompt = ChatPromptTemplate.from_template(\n",
|
||||
" \"Revise this response according to the critique and reivsion request.\\n\\n\"\n",
|
||||
" \"Query: {query}\\n\\n\"\n",
|
||||
" \"Response: {response}\\n\\n\"\n",
|
||||
" \"Critique request: {critique_request}\\n\\n\"\n",
|
||||
" \"Critique: {critique}\\n\\n\"\n",
|
||||
" \"If the critique does not identify anything worth changing, ignore the \"\n",
|
||||
" \"revision request and return 'No revisions needed'. If the critique \"\n",
|
||||
" \"does identify something worth changing, revise the response based on \"\n",
|
||||
" \"the revision request.\\n\\n\"\n",
|
||||
" \"Revision Request: {revision_request}\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"chain = llm | StrOutputParser()\n",
|
||||
"critique_chain = critique_prompt | llm.with_structured_output(Critique)\n",
|
||||
"revision_chain = revision_prompt | llm | StrOutputParser()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class State(TypedDict):\n",
|
||||
" query: str\n",
|
||||
" constitutional_principles: List[ConstitutionalPrinciple]\n",
|
||||
" initial_response: str\n",
|
||||
" critiques_and_revisions: List[Tuple[str, str]]\n",
|
||||
" response: str\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def generate_response(state: State):\n",
|
||||
" \"\"\"Generate initial response.\"\"\"\n",
|
||||
" response = await chain.ainvoke(state[\"query\"])\n",
|
||||
" return {\"response\": response, \"initial_response\": response}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def critique_and_revise(state: State):\n",
|
||||
" \"\"\"Critique and revise response according to principles.\"\"\"\n",
|
||||
" critiques_and_revisions = []\n",
|
||||
" response = state[\"initial_response\"]\n",
|
||||
" for principle in state[\"constitutional_principles\"]:\n",
|
||||
" critique = await critique_chain.ainvoke(\n",
|
||||
" {\n",
|
||||
" \"query\": state[\"query\"],\n",
|
||||
" \"response\": response,\n",
|
||||
" \"critique_request\": principle.critique_request,\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" if critique[\"critique_needed\"]:\n",
|
||||
" revision = await revision_chain.ainvoke(\n",
|
||||
" {\n",
|
||||
" \"query\": state[\"query\"],\n",
|
||||
" \"response\": response,\n",
|
||||
" \"critique_request\": principle.critique_request,\n",
|
||||
" \"critique\": critique[\"critique\"],\n",
|
||||
" \"revision_request\": principle.revision_request,\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" response = revision\n",
|
||||
" critiques_and_revisions.append((critique[\"critique\"], revision))\n",
|
||||
" else:\n",
|
||||
" critiques_and_revisions.append((critique[\"critique\"], \"\"))\n",
|
||||
" return {\n",
|
||||
" \"critiques_and_revisions\": critiques_and_revisions,\n",
|
||||
" \"response\": response,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"graph = StateGraph(State)\n",
|
||||
"graph.add_node(\"generate_response\", generate_response)\n",
|
||||
"graph.add_node(\"critique_and_revise\", critique_and_revise)\n",
|
||||
"\n",
|
||||
"graph.add_edge(START, \"generate_response\")\n",
|
||||
"graph.add_edge(\"generate_response\", \"critique_and_revise\")\n",
|
||||
"graph.add_edge(\"critique_and_revise\", END)\n",
|
||||
"app = graph.compile()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "01aac88d-464e-431f-b92e-746dcb743e1b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{}\n",
|
||||
"{'initial_response': 'Finding purpose, connection, and joy in our experiences and relationships.', 'response': 'Finding purpose, connection, and joy in our experiences and relationships.'}\n",
|
||||
"{'initial_response': 'Finding purpose, connection, and joy in our experiences and relationships.', 'critiques_and_revisions': [(\"The response exceeds the 10-word limit, providing a more elaborate answer than requested. A concise response, such as 'To seek purpose and joy in life,' would better align with the query.\", 'To seek purpose and joy in life.')], 'response': 'To seek purpose and joy in life.'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"constitutional_principles = [\n",
|
||||
" ConstitutionalPrinciple(\n",
|
||||
" critique_request=\"Tell if this answer is good.\",\n",
|
||||
" revision_request=\"Give a better answer.\",\n",
|
||||
" )\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"query = \"What is the meaning of life? Answer in 10 words or fewer.\"\n",
|
||||
"\n",
|
||||
"async for step in app.astream(\n",
|
||||
" {\"query\": query, \"constitutional_principles\": constitutional_principles},\n",
|
||||
" stream_mode=\"values\",\n",
|
||||
"):\n",
|
||||
" subset = [\"initial_response\", \"critiques_and_revisions\", \"response\"]\n",
|
||||
" print({k: v for k, v in step.items() if k in subset})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b2717810",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"</details>\n",
|
||||
"\n",
|
||||
"## Next steps\n",
|
||||
"\n",
|
||||
"See guides for generating structured output [here](/docs/how_to/structured_output/).\n",
|
||||
"\n",
|
||||
"Check out the [LangGraph documentation](https://langchain-ai.github.io/langgraph/) for detail on building with LangGraph."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -45,5 +45,7 @@ The below pages assist with migration from various specific chains to LCEL and L
|
||||
- [RefineDocumentsChain](/docs/versions/migrating_chains/refine_docs_chain)
|
||||
- [LLMRouterChain](/docs/versions/migrating_chains/llm_router_chain)
|
||||
- [MultiPromptChain](/docs/versions/migrating_chains/multi_prompt_chain)
|
||||
- [LLMMathChain](/docs/versions/migrating_chains/llm_math_chain)
|
||||
- [ConstitutionalChain](/docs/versions/migrating_chains/constitutional_chain)
|
||||
|
||||
Check out the [LCEL conceptual docs](/docs/concepts/#langchain-expression-language-lcel) and [LangGraph docs](https://langchain-ai.github.io/langgraph/) for more background information.
|
||||
281
docs/docs/versions/migrating_chains/llm_math_chain.ipynb
Normal file
281
docs/docs/versions/migrating_chains/llm_math_chain.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -510,6 +510,55 @@ const FEATURE_TABLES = {
|
||||
source: "Uses AWS API to load PDFs",
|
||||
api: "API",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "MathPix",
|
||||
link: "mathpix",
|
||||
source: "Uses MathPix to laod PDFs",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PDFPlumber",
|
||||
link: "pdfplumber",
|
||||
source: "Load PDF files using PDFPlumber",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PyPDFDirectry",
|
||||
link: "pypdfdirectory",
|
||||
source: "Load a directory with PDF files",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PyPDFium2",
|
||||
link: "pypdfium2",
|
||||
source: "Load PDF files using PyPDFium2",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html"
|
||||
},
|
||||
{
|
||||
name: "UnstructuredPDFLoader",
|
||||
link: "unstructured_pdfloader",
|
||||
source: "Load PDF files using Unstructured",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.UnstructuredPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PyMuPDF",
|
||||
link: "pymupdf",
|
||||
source: "Load PDF files using PyMuPDF",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "PDFMiner",
|
||||
link: "pdfminer",
|
||||
source: "Load PDF files using PDFMiner",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFMinerLoader.html"
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -540,6 +589,24 @@ const FEATURE_TABLES = {
|
||||
source: "All file types",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
|
||||
},
|
||||
{
|
||||
name: "JSONLoader",
|
||||
link: "json",
|
||||
source: "JSON files",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html"
|
||||
},
|
||||
{
|
||||
name: "UnstructuredMarkdownLoader",
|
||||
link: "unstructured_markdown",
|
||||
source: "Markdown files",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html"
|
||||
},
|
||||
{
|
||||
name: "BSHTMLLoader",
|
||||
link: "bshtml",
|
||||
source: "HTML files",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html"
|
||||
}
|
||||
]
|
||||
},
|
||||
vectorstores: {
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
"""Implement a GPT-3 driven browser.
|
||||
|
||||
Heavily influenced from https://github.com/nat/natbot
|
||||
"""
|
||||
|
||||
from langchain_community.chains.natbot.base import NatBotChain
|
||||
|
||||
__all__ = ["NatBotChain"]
|
||||
3
libs/community/langchain_community/chains/natbot/base.py
Normal file
3
libs/community/langchain_community/chains/natbot/base.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from langchain.chains import NatBotChain
|
||||
|
||||
__all__ = ["NatBotChain"]
|
||||
@@ -0,0 +1,7 @@
|
||||
from langchain.chains.natbot.crawler import (
|
||||
Crawler,
|
||||
ElementInViewPort,
|
||||
black_listed_elements,
|
||||
)
|
||||
|
||||
__all__ = ["ElementInViewPort", "Crawler", "black_listed_elements"]
|
||||
@@ -0,0 +1,3 @@
|
||||
from langchain.chains.natbot.prompt import PROMPT
|
||||
|
||||
__all__ = ["PROMPT"]
|
||||
@@ -10,7 +10,74 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BSHTMLLoader(BaseLoader):
|
||||
"""Load `HTML` files and parse them with `beautiful soup`."""
|
||||
"""
|
||||
__ModuleName__ document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``langchain-community`` and ``bs4``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community bs4
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import BSHTMLLoader
|
||||
|
||||
loader = BSHTMLLoader(
|
||||
file_path="./example_data/fake-content.html",
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
Test Title
|
||||
|
||||
|
||||
My First Heading
|
||||
My first paragraph.
|
||||
|
||||
|
||||
|
||||
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
|
||||
Test Title
|
||||
|
||||
|
||||
My First Heading
|
||||
My first paragraph.
|
||||
|
||||
|
||||
|
||||
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -13,19 +13,60 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
||||
Setup:
|
||||
Install ``langchain-community``.
|
||||
|
||||
loader = UnstructuredMarkdownLoader(
|
||||
"example.md", mode="elements", strategy="fast",
|
||||
)
|
||||
docs = loader.load()
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
||||
|
||||
loader = UnstructuredMarkdownLoader(
|
||||
"./example_data/example.md",
|
||||
mode="elements",
|
||||
strategy="fast",
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Sample Markdown Document
|
||||
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
|
||||
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Sample Markdown Document
|
||||
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
|
||||
"""
|
||||
""" # noqa: E501
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
|
||||
@@ -6,14 +6,6 @@ from langchain_core.documents import Document
|
||||
from langchain_community.chat_models import ChatOpenAI
|
||||
|
||||
|
||||
def test_llm_construction_with_kwargs() -> None:
|
||||
llm_chain_kwargs = {"verbose": True}
|
||||
compressor = LLMChainExtractor.from_llm(
|
||||
ChatOpenAI(), llm_chain_kwargs=llm_chain_kwargs
|
||||
)
|
||||
assert compressor.llm_chain.verbose is True
|
||||
|
||||
|
||||
def test_llm_chain_extractor() -> None:
|
||||
texts = [
|
||||
"The Roman Empire followed the Roman Republic.",
|
||||
|
||||
@@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
|
||||
INDEX_NAME = "langchain-test-index"
|
||||
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
|
||||
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
||||
CONNECTION_STRING: str = "mongodb+srv://akataria:Basket24ball@akataria-vector-search-testing.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
|
||||
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
|
||||
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
||||
|
||||
num_lists = 3
|
||||
|
||||
@@ -2,11 +2,10 @@
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain.chains.natbot.base import NatBotChain
|
||||
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
|
||||
from langchain_core.language_models.llms import LLM
|
||||
|
||||
from langchain.chains.natbot.base import NatBotChain
|
||||
|
||||
|
||||
class FakeLLM(LLM):
|
||||
"""Fake LLM wrapper for testing purposes."""
|
||||
@@ -180,7 +180,11 @@ class SemanticChunker(BaseDocumentTransformer):
|
||||
x = max(min(self.number_of_chunks, x1), x2)
|
||||
|
||||
# Linear interpolation formula
|
||||
y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1)
|
||||
if x2 == x1:
|
||||
y = y2
|
||||
else:
|
||||
y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1)
|
||||
|
||||
y = min(max(y, 0), 100)
|
||||
|
||||
return cast(float, np.percentile(distances, y))
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.callbacks import CallbackManagerForChainRun
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.prompts import BasePromptTemplate
|
||||
@@ -13,9 +14,151 @@ from langchain.chains.constitutional_ai.prompts import CRITIQUE_PROMPT, REVISION
|
||||
from langchain.chains.llm import LLMChain
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.13",
|
||||
message=(
|
||||
"This class is deprecated and will be removed in langchain 1.0. "
|
||||
"See API reference for replacement: "
|
||||
"https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.base.ConstitutionalChain.html" # noqa: E501
|
||||
),
|
||||
removal="1.0",
|
||||
)
|
||||
class ConstitutionalChain(Chain):
|
||||
"""Chain for applying constitutional principles.
|
||||
|
||||
Note: this class is deprecated. See below for a replacement implementation
|
||||
using LangGraph. The benefits of this implementation are:
|
||||
|
||||
- Uses LLM tool calling features instead of parsing string responses;
|
||||
- Support for both token-by-token and step-by-step streaming;
|
||||
- Support for checkpointing and memory of chat history;
|
||||
- Easier to modify or extend (e.g., with additional tools, structured responses, etc.)
|
||||
|
||||
Install LangGraph with:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langgraph
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from langchain.chains.constitutional_ai.prompts import (
|
||||
CRITIQUE_PROMPT,
|
||||
REVISION_PROMPT,
|
||||
)
|
||||
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langgraph.graph import END, START, StateGraph
|
||||
from typing_extensions import Annotated, TypedDict
|
||||
|
||||
llm = ChatOpenAI(model="gpt-4o-mini")
|
||||
|
||||
class Critique(TypedDict):
|
||||
\"\"\"Generate a critique, if needed.\"\"\"
|
||||
critique_needed: Annotated[bool, ..., "Whether or not a critique is needed."]
|
||||
critique: Annotated[str, ..., "If needed, the critique."]
|
||||
|
||||
critique_prompt = ChatPromptTemplate.from_template(
|
||||
"Critique this response according to the critique request. "
|
||||
"If no critique is needed, specify that.\\n\\n"
|
||||
"Query: {query}\\n\\n"
|
||||
"Response: {response}\\n\\n"
|
||||
"Critique request: {critique_request}"
|
||||
)
|
||||
|
||||
revision_prompt = ChatPromptTemplate.from_template(
|
||||
"Revise this response according to the critique and reivsion request.\\n\\n"
|
||||
"Query: {query}\\n\\n"
|
||||
"Response: {response}\\n\\n"
|
||||
"Critique request: {critique_request}\\n\\n"
|
||||
"Critique: {critique}\\n\\n"
|
||||
"If the critique does not identify anything worth changing, ignore the "
|
||||
"revision request and return 'No revisions needed'. If the critique "
|
||||
"does identify something worth changing, revise the response based on "
|
||||
"the revision request.\\n\\n"
|
||||
"Revision Request: {revision_request}"
|
||||
)
|
||||
|
||||
chain = llm | StrOutputParser()
|
||||
critique_chain = critique_prompt | llm.with_structured_output(Critique)
|
||||
revision_chain = revision_prompt | llm | StrOutputParser()
|
||||
|
||||
|
||||
class State(TypedDict):
|
||||
query: str
|
||||
constitutional_principles: List[ConstitutionalPrinciple]
|
||||
initial_response: str
|
||||
critiques_and_revisions: List[Tuple[str, str]]
|
||||
response: str
|
||||
|
||||
|
||||
async def generate_response(state: State):
|
||||
\"\"\"Generate initial response.\"\"\"
|
||||
response = await chain.ainvoke(state["query"])
|
||||
return {"response": response, "initial_response": response}
|
||||
|
||||
async def critique_and_revise(state: State):
|
||||
\"\"\"Critique and revise response according to principles.\"\"\"
|
||||
critiques_and_revisions = []
|
||||
response = state["initial_response"]
|
||||
for principle in state["constitutional_principles"]:
|
||||
critique = await critique_chain.ainvoke(
|
||||
{
|
||||
"query": state["query"],
|
||||
"response": response,
|
||||
"critique_request": principle.critique_request,
|
||||
}
|
||||
)
|
||||
if critique["critique_needed"]:
|
||||
revision = await revision_chain.ainvoke(
|
||||
{
|
||||
"query": state["query"],
|
||||
"response": response,
|
||||
"critique_request": principle.critique_request,
|
||||
"critique": critique["critique"],
|
||||
"revision_request": principle.revision_request,
|
||||
}
|
||||
)
|
||||
response = revision
|
||||
critiques_and_revisions.append((critique["critique"], revision))
|
||||
else:
|
||||
critiques_and_revisions.append((critique["critique"], ""))
|
||||
return {
|
||||
"critiques_and_revisions": critiques_and_revisions,
|
||||
"response": response,
|
||||
}
|
||||
|
||||
graph = StateGraph(State)
|
||||
graph.add_node("generate_response", generate_response)
|
||||
graph.add_node("critique_and_revise", critique_and_revise)
|
||||
|
||||
graph.add_edge(START, "generate_response")
|
||||
graph.add_edge("generate_response", "critique_and_revise")
|
||||
graph.add_edge("critique_and_revise", END)
|
||||
app = graph.compile()
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
constitutional_principles=[
|
||||
ConstitutionalPrinciple(
|
||||
critique_request="Tell if this answer is good.",
|
||||
revision_request="Give a better answer.",
|
||||
)
|
||||
]
|
||||
|
||||
query = "What is the meaning of life? Answer in 10 words or fewer."
|
||||
|
||||
async for step in app.astream(
|
||||
{"query": query, "constitutional_principles": constitutional_principles},
|
||||
stream_mode="values",
|
||||
):
|
||||
subset = ["initial_response", "critiques_and_revisions", "response"]
|
||||
print({k: v for k, v in step.items() if k in subset})
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
@@ -44,7 +187,7 @@ class ConstitutionalChain(Chain):
|
||||
)
|
||||
|
||||
constitutional_chain.run(question="What is the meaning of life?")
|
||||
"""
|
||||
""" # noqa: E501
|
||||
|
||||
chain: LLMChain
|
||||
constitutional_principles: List[ConstitutionalPrinciple]
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||
|
||||
import numpy as np
|
||||
@@ -9,10 +8,12 @@ from langchain_core.callbacks import (
|
||||
CallbackManagerForChainRun,
|
||||
)
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.outputs import Generation
|
||||
from langchain_core.messages import AIMessage
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import BasePromptTemplate
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from langchain_core.runnables import Runnable
|
||||
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.flare.prompts import (
|
||||
@@ -23,51 +24,14 @@ from langchain.chains.flare.prompts import (
|
||||
from langchain.chains.llm import LLMChain
|
||||
|
||||
|
||||
class _ResponseChain(LLMChain):
|
||||
"""Base class for chains that generate responses."""
|
||||
|
||||
prompt: BasePromptTemplate = PROMPT
|
||||
|
||||
@classmethod
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
return False
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
return self.prompt.input_variables
|
||||
|
||||
def generate_tokens_and_log_probs(
|
||||
self,
|
||||
_input: Dict[str, Any],
|
||||
*,
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Tuple[Sequence[str], Sequence[float]]:
|
||||
llm_result = self.generate([_input], run_manager=run_manager)
|
||||
return self._extract_tokens_and_log_probs(llm_result.generations[0])
|
||||
|
||||
@abstractmethod
|
||||
def _extract_tokens_and_log_probs(
|
||||
self, generations: List[Generation]
|
||||
) -> Tuple[Sequence[str], Sequence[float]]:
|
||||
"""Extract tokens and log probs from response."""
|
||||
|
||||
|
||||
class _OpenAIResponseChain(_ResponseChain):
|
||||
"""Chain that generates responses from user input and context."""
|
||||
|
||||
llm: BaseLanguageModel
|
||||
|
||||
def _extract_tokens_and_log_probs(
|
||||
self, generations: List[Generation]
|
||||
) -> Tuple[Sequence[str], Sequence[float]]:
|
||||
tokens = []
|
||||
log_probs = []
|
||||
for gen in generations:
|
||||
if gen.generation_info is None:
|
||||
raise ValueError
|
||||
tokens.extend(gen.generation_info["logprobs"]["tokens"])
|
||||
log_probs.extend(gen.generation_info["logprobs"]["token_logprobs"])
|
||||
return tokens, log_probs
|
||||
def _extract_tokens_and_log_probs(response: AIMessage) -> Tuple[List[str], List[float]]:
|
||||
"""Extract tokens and log probabilities from chat model response."""
|
||||
tokens = []
|
||||
log_probs = []
|
||||
for token in response.response_metadata["logprobs"]["content"]:
|
||||
tokens.append(token["token"])
|
||||
log_probs.append(token["logprob"])
|
||||
return tokens, log_probs
|
||||
|
||||
|
||||
class QuestionGeneratorChain(LLMChain):
|
||||
@@ -111,9 +75,9 @@ class FlareChain(Chain):
|
||||
"""Chain that combines a retriever, a question generator,
|
||||
and a response generator."""
|
||||
|
||||
question_generator_chain: QuestionGeneratorChain
|
||||
question_generator_chain: Runnable
|
||||
"""Chain that generates questions from uncertain spans."""
|
||||
response_chain: _ResponseChain
|
||||
response_chain: Runnable
|
||||
"""Chain that generates responses from user input and context."""
|
||||
output_parser: FinishedOutputParser = Field(default_factory=FinishedOutputParser)
|
||||
"""Parser that determines whether the chain is finished."""
|
||||
@@ -152,12 +116,16 @@ class FlareChain(Chain):
|
||||
for question in questions:
|
||||
docs.extend(self.retriever.invoke(question))
|
||||
context = "\n\n".join(d.page_content for d in docs)
|
||||
result = self.response_chain.predict(
|
||||
user_input=user_input,
|
||||
context=context,
|
||||
response=response,
|
||||
callbacks=callbacks,
|
||||
result = self.response_chain.invoke(
|
||||
{
|
||||
"user_input": user_input,
|
||||
"context": context,
|
||||
"response": response,
|
||||
},
|
||||
{"callbacks": callbacks},
|
||||
)
|
||||
if isinstance(result, AIMessage):
|
||||
result = result.content
|
||||
marginal, finished = self.output_parser.parse(result)
|
||||
return marginal, finished
|
||||
|
||||
@@ -178,13 +146,18 @@ class FlareChain(Chain):
|
||||
for span in low_confidence_spans
|
||||
]
|
||||
callbacks = _run_manager.get_child()
|
||||
question_gen_outputs = self.question_generator_chain.apply(
|
||||
question_gen_inputs, callbacks=callbacks
|
||||
)
|
||||
questions = [
|
||||
output[self.question_generator_chain.output_keys[0]]
|
||||
for output in question_gen_outputs
|
||||
]
|
||||
if isinstance(self.question_generator_chain, LLMChain):
|
||||
question_gen_outputs = self.question_generator_chain.apply(
|
||||
question_gen_inputs, callbacks=callbacks
|
||||
)
|
||||
questions = [
|
||||
output[self.question_generator_chain.output_keys[0]]
|
||||
for output in question_gen_outputs
|
||||
]
|
||||
else:
|
||||
questions = self.question_generator_chain.batch(
|
||||
question_gen_inputs, config={"callbacks": callbacks}
|
||||
)
|
||||
_run_manager.on_text(
|
||||
f"Generated Questions: {questions}", color="yellow", end="\n"
|
||||
)
|
||||
@@ -206,8 +179,10 @@ class FlareChain(Chain):
|
||||
f"Current Response: {response}", color="blue", end="\n"
|
||||
)
|
||||
_input = {"user_input": user_input, "context": "", "response": response}
|
||||
tokens, log_probs = self.response_chain.generate_tokens_and_log_probs(
|
||||
_input, run_manager=_run_manager
|
||||
tokens, log_probs = _extract_tokens_and_log_probs(
|
||||
self.response_chain.invoke(
|
||||
_input, {"callbacks": _run_manager.get_child()}
|
||||
)
|
||||
)
|
||||
low_confidence_spans = _low_confidence_spans(
|
||||
tokens,
|
||||
@@ -251,18 +226,16 @@ class FlareChain(Chain):
|
||||
FlareChain class with the given language model.
|
||||
"""
|
||||
try:
|
||||
from langchain_openai import OpenAI
|
||||
from langchain_openai import ChatOpenAI
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"OpenAI is required for FlareChain. "
|
||||
"Please install langchain-openai."
|
||||
"pip install langchain-openai"
|
||||
)
|
||||
question_gen_chain = QuestionGeneratorChain(llm=llm)
|
||||
response_llm = OpenAI(
|
||||
max_tokens=max_generation_len, model_kwargs={"logprobs": 1}, temperature=0
|
||||
)
|
||||
response_chain = _OpenAIResponseChain(llm=response_llm)
|
||||
llm = ChatOpenAI(max_tokens=max_generation_len, logprobs=True, temperature=0)
|
||||
response_chain = PROMPT | llm
|
||||
question_gen_chain = QUESTION_GENERATOR_PROMPT | llm | StrOutputParser()
|
||||
return cls(
|
||||
question_generator_chain=question_gen_chain,
|
||||
response_chain=response_chain,
|
||||
|
||||
@@ -11,7 +11,9 @@ import numpy as np
|
||||
from langchain_core.callbacks import CallbackManagerForChainRun
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import BasePromptTemplate
|
||||
from langchain_core.runnables import Runnable
|
||||
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.hyde.prompts import PROMPT_MAP
|
||||
@@ -25,7 +27,7 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
|
||||
"""
|
||||
|
||||
base_embeddings: Embeddings
|
||||
llm_chain: LLMChain
|
||||
llm_chain: Runnable
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
@@ -34,12 +36,15 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
"""Input keys for Hyde's LLM chain."""
|
||||
return self.llm_chain.input_keys
|
||||
return self.llm_chain.input_schema.schema()["required"]
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""Output keys for Hyde's LLM chain."""
|
||||
return self.llm_chain.output_keys
|
||||
if isinstance(self.llm_chain, LLMChain):
|
||||
return self.llm_chain.output_keys
|
||||
else:
|
||||
return ["text"]
|
||||
|
||||
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
||||
"""Call the base embeddings."""
|
||||
@@ -51,9 +56,12 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
|
||||
|
||||
def embed_query(self, text: str) -> List[float]:
|
||||
"""Generate a hypothetical document and embedded it."""
|
||||
var_name = self.llm_chain.input_keys[0]
|
||||
result = self.llm_chain.generate([{var_name: text}])
|
||||
documents = [generation.text for generation in result.generations[0]]
|
||||
var_name = self.input_keys[0]
|
||||
result = self.llm_chain.invoke({var_name: text})
|
||||
if isinstance(self.llm_chain, LLMChain):
|
||||
documents = [result[self.output_keys[0]]]
|
||||
else:
|
||||
documents = [result]
|
||||
embeddings = self.embed_documents(documents)
|
||||
return self.combine_embeddings(embeddings)
|
||||
|
||||
@@ -64,7 +72,9 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
|
||||
) -> Dict[str, str]:
|
||||
"""Call the internal llm chain."""
|
||||
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
|
||||
return self.llm_chain(inputs, callbacks=_run_manager.get_child())
|
||||
return self.llm_chain.invoke(
|
||||
inputs, config={"callbacks": _run_manager.get_child()}
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
@@ -86,7 +96,7 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
|
||||
f"of {list(PROMPT_MAP.keys())}."
|
||||
)
|
||||
|
||||
llm_chain = LLMChain(llm=llm, prompt=prompt)
|
||||
llm_chain = prompt | llm | StrOutputParser()
|
||||
return cls(base_embeddings=base_embeddings, llm_chain=llm_chain, **kwargs)
|
||||
|
||||
@property
|
||||
|
||||
@@ -7,6 +7,7 @@ import re
|
||||
import warnings
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.callbacks import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
@@ -20,16 +21,132 @@ from langchain.chains.llm import LLMChain
|
||||
from langchain.chains.llm_math.prompt import PROMPT
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.13",
|
||||
message=(
|
||||
"This class is deprecated and will be removed in langchain 1.0. "
|
||||
"See API reference for replacement: "
|
||||
"https://api.python.langchain.com/en/latest/chains/langchain.chains.llm_math.base.LLMMathChain.html" # noqa: E501
|
||||
),
|
||||
removal="1.0",
|
||||
)
|
||||
class LLMMathChain(Chain):
|
||||
"""Chain that interprets a prompt and executes python code to do math.
|
||||
|
||||
Note: this class is deprecated. See below for a replacement implementation
|
||||
using LangGraph. The benefits of this implementation are:
|
||||
|
||||
- Uses LLM tool calling features;
|
||||
- Support for both token-by-token and step-by-step streaming;
|
||||
- Support for checkpointing and memory of chat history;
|
||||
- Easier to modify or extend (e.g., with additional tools, structured responses, etc.)
|
||||
|
||||
Install LangGraph with:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langgraph
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import math
|
||||
from typing import Annotated, Sequence
|
||||
|
||||
from langchain_core.messages import BaseMessage
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from langchain_core.tools import tool
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langgraph.graph import END, StateGraph
|
||||
from langgraph.graph.message import add_messages
|
||||
from langgraph.prebuilt.tool_node import ToolNode
|
||||
import numexpr
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
@tool
|
||||
def calculator(expression: str) -> str:
|
||||
\"\"\"Calculate expression using Python's numexpr library.
|
||||
|
||||
Expression should be a single line mathematical expression
|
||||
that solves the problem.
|
||||
|
||||
Examples:
|
||||
"37593 * 67" for "37593 times 67"
|
||||
"37593**(1/5)" for "37593^(1/5)"
|
||||
\"\"\"
|
||||
local_dict = {"pi": math.pi, "e": math.e}
|
||||
return str(
|
||||
numexpr.evaluate(
|
||||
expression.strip(),
|
||||
global_dict={}, # restrict access to globals
|
||||
local_dict=local_dict, # add common mathematical functions
|
||||
)
|
||||
)
|
||||
|
||||
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
||||
tools = [calculator]
|
||||
llm_with_tools = llm.bind_tools(tools, tool_choice="any")
|
||||
|
||||
class ChainState(TypedDict):
|
||||
\"\"\"LangGraph state.\"\"\"
|
||||
|
||||
messages: Annotated[Sequence[BaseMessage], add_messages]
|
||||
|
||||
async def acall_chain(state: ChainState, config: RunnableConfig):
|
||||
last_message = state["messages"][-1]
|
||||
response = await llm_with_tools.ainvoke(state["messages"], config)
|
||||
return {"messages": [response]}
|
||||
|
||||
async def acall_model(state: ChainState, config: RunnableConfig):
|
||||
response = await llm.ainvoke(state["messages"], config)
|
||||
return {"messages": [response]}
|
||||
|
||||
graph_builder = StateGraph(ChainState)
|
||||
graph_builder.add_node("call_tool", acall_chain)
|
||||
graph_builder.add_node("execute_tool", ToolNode(tools))
|
||||
graph_builder.add_node("call_model", acall_model)
|
||||
graph_builder.set_entry_point("call_tool")
|
||||
graph_builder.add_edge("call_tool", "execute_tool")
|
||||
graph_builder.add_edge("execute_tool", "call_model")
|
||||
graph_builder.add_edge("call_model", END)
|
||||
chain = graph_builder.compile()
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
example_query = "What is 551368 divided by 82"
|
||||
|
||||
events = chain.astream(
|
||||
{"messages": [("user", example_query)]},
|
||||
stream_mode="values",
|
||||
)
|
||||
async for event in events:
|
||||
event["messages"][-1].pretty_print()
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
================================ Human Message =================================
|
||||
|
||||
What is 551368 divided by 82
|
||||
================================== Ai Message ==================================
|
||||
Tool Calls:
|
||||
calculator (call_MEiGXuJjJ7wGU4aOT86QuGJS)
|
||||
Call ID: call_MEiGXuJjJ7wGU4aOT86QuGJS
|
||||
Args:
|
||||
expression: 551368 / 82
|
||||
================================= Tool Message =================================
|
||||
Name: calculator
|
||||
|
||||
6724.0
|
||||
================================== Ai Message ==================================
|
||||
|
||||
551368 divided by 82 equals 6724.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain.chains import LLMMathChain
|
||||
from langchain_community.llms import OpenAI
|
||||
llm_math = LLMMathChain.from_llm(OpenAI())
|
||||
"""
|
||||
""" # noqa: E501
|
||||
|
||||
llm_chain: LLMChain
|
||||
llm: Optional[BaseLanguageModel] = None
|
||||
|
||||
@@ -5,15 +5,27 @@ from __future__ import annotations
|
||||
import warnings
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.callbacks import CallbackManagerForChainRun
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.pydantic_v1 import root_validator
|
||||
from langchain_core.runnables import Runnable
|
||||
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.chains.natbot.prompt import PROMPT
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.13",
|
||||
message=(
|
||||
"Importing NatBotChain from langchain is deprecated and will be removed in "
|
||||
"langchain 1.0. Please import from langchain_community instead: "
|
||||
"from langchain_community.chains.natbot import NatBotChain. "
|
||||
"You may need to pip install -U langchain-community."
|
||||
),
|
||||
removal="1.0",
|
||||
)
|
||||
class NatBotChain(Chain):
|
||||
"""Implement an LLM driven browser.
|
||||
|
||||
@@ -37,7 +49,7 @@ class NatBotChain(Chain):
|
||||
natbot = NatBotChain.from_default("Buy me a new hat.")
|
||||
"""
|
||||
|
||||
llm_chain: LLMChain
|
||||
llm_chain: Runnable
|
||||
objective: str
|
||||
"""Objective that NatBot is tasked with completing."""
|
||||
llm: Optional[BaseLanguageModel] = None
|
||||
@@ -60,7 +72,7 @@ class NatBotChain(Chain):
|
||||
"class method."
|
||||
)
|
||||
if "llm_chain" not in values and values["llm"] is not None:
|
||||
values["llm_chain"] = LLMChain(llm=values["llm"], prompt=PROMPT)
|
||||
values["llm_chain"] = PROMPT | values["llm"] | StrOutputParser()
|
||||
return values
|
||||
|
||||
@classmethod
|
||||
@@ -77,7 +89,7 @@ class NatBotChain(Chain):
|
||||
cls, llm: BaseLanguageModel, objective: str, **kwargs: Any
|
||||
) -> NatBotChain:
|
||||
"""Load from LLM."""
|
||||
llm_chain = LLMChain(llm=llm, prompt=PROMPT)
|
||||
llm_chain = PROMPT | llm | StrOutputParser()
|
||||
return cls(llm_chain=llm_chain, objective=objective, **kwargs)
|
||||
|
||||
@property
|
||||
@@ -104,12 +116,14 @@ class NatBotChain(Chain):
|
||||
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
|
||||
url = inputs[self.input_url_key]
|
||||
browser_content = inputs[self.input_browser_content_key]
|
||||
llm_cmd = self.llm_chain.predict(
|
||||
objective=self.objective,
|
||||
url=url[:100],
|
||||
previous_command=self.previous_command,
|
||||
browser_content=browser_content[:4500],
|
||||
callbacks=_run_manager.get_child(),
|
||||
llm_cmd = self.llm_chain.invoke(
|
||||
{
|
||||
"objective": self.objective,
|
||||
"url": url[:100],
|
||||
"previous_command": self.previous_command,
|
||||
"browser_content": browser_content[:4500],
|
||||
},
|
||||
config={"callbacks": _run_manager.get_child()},
|
||||
)
|
||||
llm_cmd = llm_cmd.strip()
|
||||
self.previous_command = llm_cmd
|
||||
|
||||
@@ -27,11 +27,11 @@ from langchain_core.callbacks.manager import (
|
||||
from langchain_core.exceptions import OutputParserException
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.language_models.chat_models import BaseChatModel
|
||||
from langchain_core.output_parsers import BaseOutputParser
|
||||
from langchain_core.output_parsers import BaseOutputParser, StrOutputParser
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.runnables import Runnable
|
||||
from langchain_core.tools import BaseTool
|
||||
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.agents.trajectory_eval_prompt import (
|
||||
EVAL_CHAT_PROMPT,
|
||||
TOOL_FREE_EVAL_CHAT_PROMPT,
|
||||
@@ -147,7 +147,7 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
|
||||
|
||||
agent_tools: Optional[List[BaseTool]] = None
|
||||
"""A list of tools available to the agent."""
|
||||
eval_chain: LLMChain
|
||||
eval_chain: Runnable
|
||||
"""The language model chain used for evaluation."""
|
||||
output_parser: TrajectoryOutputParser = Field(
|
||||
default_factory=TrajectoryOutputParser
|
||||
@@ -253,7 +253,7 @@ The following is the expected answer. Use this to measure correctness:
|
||||
prompt = EVAL_CHAT_PROMPT
|
||||
else:
|
||||
prompt = TOOL_FREE_EVAL_CHAT_PROMPT
|
||||
eval_chain = LLMChain(llm=llm, prompt=prompt)
|
||||
eval_chain = prompt | llm | StrOutputParser()
|
||||
return cls(
|
||||
agent_tools=agent_tools, # type: ignore[arg-type]
|
||||
eval_chain=eval_chain,
|
||||
@@ -303,8 +303,8 @@ The following is the expected answer. Use this to measure correctness:
|
||||
if self.agent_tools:
|
||||
chain_input["tool_descriptions"] = self._tools_description
|
||||
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
|
||||
raw_output = self.eval_chain.run(
|
||||
chain_input, callbacks=_run_manager.get_child()
|
||||
raw_output = self.eval_chain.invoke(
|
||||
chain_input, {"callbacks": _run_manager.get_child()}
|
||||
)
|
||||
return cast(dict, self.output_parser.parse(raw_output))
|
||||
|
||||
@@ -327,8 +327,8 @@ The following is the expected answer. Use this to measure correctness:
|
||||
if self.agent_tools:
|
||||
chain_input["tool_descriptions"] = self._tools_description
|
||||
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
|
||||
raw_output = await self.eval_chain.arun(
|
||||
chain_input, callbacks=_run_manager.get_child()
|
||||
raw_output = await self.eval_chain.ainvoke(
|
||||
chain_input, {"callbacks": _run_manager.get_child()}
|
||||
)
|
||||
return cast(dict, self.output_parser.parse(raw_output))
|
||||
|
||||
|
||||
@@ -6,14 +6,15 @@ import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from langchain_core.callbacks.manager import Callbacks
|
||||
from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import BaseOutputParser
|
||||
from langchain_core.prompts.base import BasePromptTemplate
|
||||
from langchain_core.prompts.prompt import PromptTemplate
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
|
||||
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.comparison.prompt import (
|
||||
COMPARISON_TEMPLATE,
|
||||
COMPARISON_TEMPLATE_WITH_REFERENCE,
|
||||
@@ -151,7 +152,7 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
|
||||
}
|
||||
|
||||
|
||||
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
|
||||
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain):
|
||||
"""A chain for comparing two outputs, such as the outputs
|
||||
of two models, prompts, or outputs of a single model on similar inputs.
|
||||
|
||||
@@ -186,6 +187,10 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
|
||||
output_parser: BaseOutputParser = Field(
|
||||
default_factory=PairwiseStringResultOutputParser
|
||||
)
|
||||
llm: BaseLanguageModel
|
||||
"""The language model to use for scoring."""
|
||||
prompt: BasePromptTemplate
|
||||
"""The prompt to use for scoring."""
|
||||
|
||||
@classmethod
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
@@ -228,6 +233,22 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
|
||||
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
|
||||
)
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
"""Will be whatever keys the prompt expects.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return self.prompt.input_variables
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""Will always return text key.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return [self.output_key]
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
@@ -305,6 +326,19 @@ Performance may be significantly worse with other models."
|
||||
parsed[RUN_KEY] = result[RUN_KEY]
|
||||
return parsed
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
if run_manager:
|
||||
config = RunnableConfig(callbacks=run_manager.get_child())
|
||||
else:
|
||||
config = None
|
||||
chain = self.prompt | self.llm | self.output_parser
|
||||
response = chain.invoke(inputs, config=config)
|
||||
return {self.output_key: response}
|
||||
|
||||
def _evaluate_string_pairs(
|
||||
self,
|
||||
*,
|
||||
@@ -338,13 +372,17 @@ Performance may be significantly worse with other models."
|
||||
|
||||
"""
|
||||
input_ = self._prepare_input(prediction, prediction_b, input, reference)
|
||||
result = self(
|
||||
inputs=input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
|
||||
# prep config
|
||||
config: RunnableConfig = {}
|
||||
if callbacks is not None:
|
||||
config["callbacks"] = callbacks
|
||||
if tags is not None:
|
||||
config["tags"] = tags
|
||||
if metadata is not None:
|
||||
config["metadata"] = metadata
|
||||
|
||||
result = self.invoke(input_, config=config, include_run_info=include_run_info)
|
||||
return self._prepare_output(result)
|
||||
|
||||
async def _aevaluate_string_pairs(
|
||||
@@ -380,13 +418,20 @@ Performance may be significantly worse with other models."
|
||||
|
||||
"""
|
||||
input_ = self._prepare_input(prediction, prediction_b, input, reference)
|
||||
result = await self.acall(
|
||||
inputs=input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
|
||||
# prep config
|
||||
config: RunnableConfig = {}
|
||||
if callbacks is not None:
|
||||
config["callbacks"] = callbacks
|
||||
if tags is not None:
|
||||
config["tags"] = tags
|
||||
if metadata is not None:
|
||||
config["metadata"] = metadata
|
||||
|
||||
result = await self.ainvoke(
|
||||
input_, config=config, include_run_info=include_run_info
|
||||
)
|
||||
|
||||
return self._prepare_output(result)
|
||||
|
||||
|
||||
|
||||
@@ -4,14 +4,14 @@ import re
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Mapping, Optional, Union
|
||||
|
||||
from langchain_core.callbacks.manager import Callbacks
|
||||
from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import BaseOutputParser
|
||||
from langchain_core.prompts import BasePromptTemplate
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
|
||||
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
|
||||
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
|
||||
from langchain.schema import RUN_KEY
|
||||
@@ -164,7 +164,7 @@ def resolve_criteria(
|
||||
return criteria_
|
||||
|
||||
|
||||
class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
class CriteriaEvalChain(StringEvaluator, LLMEvalChain):
|
||||
"""LLM Chain for evaluating runs against criteria.
|
||||
|
||||
Parameters
|
||||
@@ -184,7 +184,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
reference labels in the prompt. Otherwise, the `PROMPT` template will be
|
||||
used, which is a reference-free prompt.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to the `LLMChain` constructor.
|
||||
Additional keyword arguments to pass to the `Chain` constructor.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -231,6 +231,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
criterion_name: str
|
||||
"""The name of the criterion being evaluated."""
|
||||
output_key: str = "results" #: :meta private:
|
||||
llm: BaseLanguageModel
|
||||
"""The language model to use for scoring."""
|
||||
prompt: BasePromptTemplate
|
||||
"""The prompt to use for scoring."""
|
||||
|
||||
@classmethod
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
@@ -267,6 +271,22 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
"\nTo use references, use the labeled_criteria instead."
|
||||
)
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
"""Will be whatever keys the prompt expects.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return self.prompt.input_variables
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""Will always return text key.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return [self.output_key]
|
||||
|
||||
@classmethod
|
||||
def _resolve_prompt(
|
||||
cls, prompt: Optional[BasePromptTemplate] = None
|
||||
@@ -332,7 +352,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
The prompt template to use for generating prompts. If not provided,
|
||||
a default prompt template will be used.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to the `LLMChain`
|
||||
Additional keyword arguments to pass to the `Chain`
|
||||
constructor.
|
||||
|
||||
Returns
|
||||
@@ -396,6 +416,19 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
parsed[RUN_KEY] = result[RUN_KEY]
|
||||
return parsed
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
if run_manager:
|
||||
config = RunnableConfig(callbacks=run_manager.get_child())
|
||||
else:
|
||||
config = None
|
||||
chain = self.prompt | self.llm | self.output_parser
|
||||
response = chain.invoke(inputs, config=config)
|
||||
return {self.output_key: response}
|
||||
|
||||
def _evaluate_strings(
|
||||
self,
|
||||
*,
|
||||
@@ -420,7 +453,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
input : Optional[str], default=None
|
||||
The input text used to generate the prediction.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to the `LLMChain` `__call__`
|
||||
Additional keyword arguments to pass to the `Chain` `invoke`
|
||||
method.
|
||||
|
||||
Returns
|
||||
@@ -442,13 +475,17 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
)
|
||||
"""
|
||||
input_ = self._get_eval_input(prediction, reference, input)
|
||||
result = self(
|
||||
input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
|
||||
# prep config
|
||||
config: RunnableConfig = {}
|
||||
if callbacks is not None:
|
||||
config["callbacks"] = callbacks
|
||||
if tags is not None:
|
||||
config["tags"] = tags
|
||||
if metadata is not None:
|
||||
config["metadata"] = metadata
|
||||
|
||||
result = self.invoke(input_, config=config, include_run_info=include_run_info)
|
||||
return self._prepare_output(result)
|
||||
|
||||
async def _aevaluate_strings(
|
||||
@@ -475,7 +512,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
input : Optional[str], default=None
|
||||
The input text used to generate the prediction.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to the `LLMChain` `acall`
|
||||
Additional keyword arguments to pass to the `Chain` `acall`
|
||||
method.
|
||||
|
||||
Returns
|
||||
@@ -497,12 +534,18 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
)
|
||||
"""
|
||||
input_ = self._get_eval_input(prediction, reference, input)
|
||||
result = await self.acall(
|
||||
input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
|
||||
# prep config
|
||||
config: RunnableConfig = {}
|
||||
if callbacks is not None:
|
||||
config["callbacks"] = callbacks
|
||||
if tags is not None:
|
||||
config["tags"] = tags
|
||||
if metadata is not None:
|
||||
config["metadata"] = metadata
|
||||
|
||||
result = await self.ainvoke(
|
||||
input_, config=config, include_run_info=include_run_info
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
|
||||
@@ -556,7 +599,7 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
|
||||
The prompt template to use for generating prompts. If not provided,
|
||||
a default prompt will be used.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to the `LLMChain`
|
||||
Additional keyword arguments to pass to the `Chain`
|
||||
constructor.
|
||||
|
||||
Returns
|
||||
|
||||
@@ -4,13 +4,15 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
import string
|
||||
from typing import Any, List, Optional, Sequence, Tuple
|
||||
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||
|
||||
from langchain_core.callbacks.manager import Callbacks
|
||||
from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
from langchain_core.prompts.base import BasePromptTemplate
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
|
||||
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
|
||||
from langchain.schema import RUN_KEY
|
||||
@@ -67,10 +69,14 @@ def _parse_string_eval_output(text: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
class QAEvalChain(StringEvaluator, LLMEvalChain):
|
||||
"""LLM Chain for evaluating question answering."""
|
||||
|
||||
output_key: str = "results" #: :meta private:
|
||||
llm: BaseLanguageModel
|
||||
"""The language model to use for scoring."""
|
||||
prompt: BasePromptTemplate
|
||||
"""The prompt to use for scoring."""
|
||||
|
||||
class Config:
|
||||
extra = "ignore"
|
||||
@@ -91,6 +97,35 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
def requires_input(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
"""Will be whatever keys the prompt expects.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return self.prompt.input_variables
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""Will always return text key.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return [self.output_key]
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
if run_manager:
|
||||
config = RunnableConfig(callbacks=run_manager.get_child())
|
||||
else:
|
||||
config = None
|
||||
chain = self.prompt | self.llm | StrOutputParser()
|
||||
response = chain.invoke(inputs, config=config)
|
||||
return {self.output_key: response}
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
@@ -141,8 +176,14 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
}
|
||||
for i, example in enumerate(examples)
|
||||
]
|
||||
if callbacks:
|
||||
config = RunnableConfig(callbacks=callbacks)
|
||||
else:
|
||||
config = None
|
||||
outputs = self.batch(inputs, config=config)
|
||||
|
||||
return self.apply(inputs, callbacks=callbacks)
|
||||
# Subset to output key only
|
||||
return [{self.output_key: output[self.output_key]} for output in outputs]
|
||||
|
||||
def _prepare_output(self, result: dict) -> dict:
|
||||
parsed_result = _parse_string_eval_output(result[self.output_key])
|
||||
@@ -174,13 +215,17 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
Returns:
|
||||
dict: The evaluation results containing the score or value.
|
||||
"""
|
||||
result = self(
|
||||
if callbacks:
|
||||
config = RunnableConfig(callbacks=callbacks)
|
||||
else:
|
||||
config = None
|
||||
result = self.invoke(
|
||||
{
|
||||
"query": input,
|
||||
"answer": reference,
|
||||
"result": prediction,
|
||||
},
|
||||
callbacks=callbacks,
|
||||
config=config,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
@@ -195,17 +240,31 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
include_run_info: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
result = await self.acall(
|
||||
inputs={"query": input, "answer": reference, "result": prediction},
|
||||
callbacks=callbacks,
|
||||
if callbacks:
|
||||
config = RunnableConfig(callbacks=callbacks)
|
||||
else:
|
||||
config = None
|
||||
result = await self.ainvoke(
|
||||
{
|
||||
"query": input,
|
||||
"answer": reference,
|
||||
"result": prediction,
|
||||
},
|
||||
config=config,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
|
||||
|
||||
class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
class ContextQAEvalChain(StringEvaluator, LLMEvalChain):
|
||||
"""LLM Chain for evaluating QA w/o GT based on context"""
|
||||
|
||||
output_key: str = "text" #: :meta private:
|
||||
llm: BaseLanguageModel
|
||||
"""The language model to use for scoring."""
|
||||
prompt: BasePromptTemplate
|
||||
"""The prompt to use for scoring."""
|
||||
|
||||
@classmethod
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
return False
|
||||
@@ -220,6 +279,22 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
"""Whether the chain requires an input string."""
|
||||
return True
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
"""Will be whatever keys the prompt expects.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return self.prompt.input_variables
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""Will always return text key.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return [self.output_key]
|
||||
|
||||
class Config:
|
||||
extra = "ignore"
|
||||
|
||||
@@ -236,6 +311,19 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
def evaluation_name(self) -> str:
|
||||
return "Contextual Accuracy"
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
if run_manager:
|
||||
config = RunnableConfig(callbacks=run_manager.get_child())
|
||||
else:
|
||||
config = None
|
||||
chain = self.prompt | self.llm | StrOutputParser()
|
||||
response = chain.invoke(inputs, config=config)
|
||||
return {self.output_key: response}
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
@@ -281,8 +369,13 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
}
|
||||
for i, example in enumerate(examples)
|
||||
]
|
||||
if callbacks:
|
||||
config = RunnableConfig(callbacks=callbacks)
|
||||
else:
|
||||
config = None
|
||||
outputs = self.batch(inputs, config=config)
|
||||
|
||||
return self.apply(inputs, callbacks=callbacks)
|
||||
return [{self.output_key: output[self.output_key]} for output in outputs]
|
||||
|
||||
def _prepare_output(self, result: dict) -> dict:
|
||||
parsed_result = _parse_string_eval_output(result[self.output_key])
|
||||
@@ -300,13 +393,17 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
include_run_info: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
result = self(
|
||||
if callbacks:
|
||||
config = RunnableConfig(callbacks=callbacks)
|
||||
else:
|
||||
config = None
|
||||
result = self.invoke(
|
||||
{
|
||||
"query": input,
|
||||
"context": reference,
|
||||
"result": prediction,
|
||||
},
|
||||
callbacks=callbacks,
|
||||
config=config,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
@@ -321,9 +418,17 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
|
||||
include_run_info: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
result = await self.acall(
|
||||
inputs={"query": input, "context": reference, "result": prediction},
|
||||
callbacks=callbacks,
|
||||
if callbacks:
|
||||
config = RunnableConfig(callbacks=callbacks)
|
||||
else:
|
||||
config = None
|
||||
result = await self.ainvoke(
|
||||
{
|
||||
"query": input,
|
||||
"context": reference,
|
||||
"result": prediction,
|
||||
},
|
||||
config=config,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import BaseLLMOutputParser
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
@@ -17,8 +18,44 @@ _QA_OUTPUT_PARSER = RegexParser(
|
||||
)
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.13",
|
||||
message=(
|
||||
"This class is deprecated and will be removed in langchain 1.0. "
|
||||
"See API reference for replacement: "
|
||||
"https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.generate_chain.QAGenerateChain.html" # noqa: E501
|
||||
),
|
||||
removal="1.0",
|
||||
)
|
||||
class QAGenerateChain(LLMChain):
|
||||
"""LLM Chain for generating examples for question answering."""
|
||||
"""LLM Chain for generating examples for question answering.
|
||||
|
||||
Note: this class is deprecated. See below for a replacement implementation
|
||||
that leverages LLM tool calling features.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_openai import ChatOpenAI
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
template = \"\"\"You are a teacher coming up with questions to ask on a quiz.
|
||||
Given the following document, please generate a question and answer based on that document.
|
||||
|
||||
These questions should be detailed and be based explicitly on information in the document.
|
||||
\"\"\"
|
||||
|
||||
prompt = ChatPromptTemplate.from_template(template)
|
||||
|
||||
class QuestionAndAnswer(TypedDict):
|
||||
\"\"\"Question and answer based on document.\"\"\"
|
||||
question: str
|
||||
answer: str
|
||||
|
||||
llm = ChatOpenAI(model="gpt-4o-mini").with_structured_output(QuestionAndAnswer)
|
||||
llm.invoke("...")
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
output_parser: BaseLLMOutputParser = Field(default=_QA_OUTPUT_PARSER)
|
||||
output_key: str = "qa_pairs"
|
||||
|
||||
@@ -6,14 +6,16 @@ import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from langchain_core.callbacks import CallbackManagerForChainRun
|
||||
from langchain_core.callbacks.manager import Callbacks
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import BaseOutputParser
|
||||
from langchain_core.prompts.base import BasePromptTemplate
|
||||
from langchain_core.prompts.prompt import PromptTemplate
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
|
||||
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.criteria.eval_chain import (
|
||||
CRITERIA_TYPE,
|
||||
Criteria,
|
||||
@@ -144,7 +146,7 @@ class ScoreStringResultOutputParser(BaseOutputParser[dict]):
|
||||
}
|
||||
|
||||
|
||||
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain):
|
||||
"""A chain for scoring on a scale of 1-10 the output of a model.
|
||||
|
||||
Attributes:
|
||||
@@ -178,10 +180,43 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
|
||||
"""The value to normalize the score by, if specified."""
|
||||
criterion_name: str
|
||||
"""The name of the criterion being evaluated."""
|
||||
llm: BaseLanguageModel
|
||||
"""The language model to use for scoring."""
|
||||
prompt: BasePromptTemplate
|
||||
"""The prompt to use for scoring."""
|
||||
|
||||
class Config:
|
||||
extra = "ignore"
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
"""Will be whatever keys the prompt expects.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return self.prompt.input_variables
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""Will always return text key.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return [self.output_key]
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
if run_manager:
|
||||
config = RunnableConfig(callbacks=run_manager.get_child())
|
||||
else:
|
||||
config = None
|
||||
chain = self.prompt | self.llm | self.output_parser
|
||||
response = chain.invoke(inputs, config=config)
|
||||
return {self.output_key: response}
|
||||
|
||||
@classmethod
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
return False
|
||||
@@ -348,13 +383,17 @@ Performance may be significantly worse with other models."
|
||||
|
||||
"""
|
||||
input_ = self._prepare_input(prediction, input, reference)
|
||||
result = self(
|
||||
inputs=input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
)
|
||||
|
||||
# prep config
|
||||
config: RunnableConfig = {}
|
||||
if callbacks is not None:
|
||||
config["callbacks"] = callbacks
|
||||
if tags is not None:
|
||||
config["tags"] = tags
|
||||
if metadata is not None:
|
||||
config["metadata"] = metadata
|
||||
|
||||
result = self.invoke(input_, config=config, include_run_info=include_run_info)
|
||||
return self._prepare_output(result)
|
||||
|
||||
async def _aevaluate_string_pairs(
|
||||
@@ -385,12 +424,18 @@ Performance may be significantly worse with other models."
|
||||
|
||||
"""
|
||||
input_ = self._prepare_input(prediction, input, reference)
|
||||
result = await self.acall(
|
||||
inputs=input_,
|
||||
callbacks=callbacks,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
include_run_info=include_run_info,
|
||||
|
||||
# prep config
|
||||
config: RunnableConfig = {}
|
||||
if callbacks is not None:
|
||||
config["callbacks"] = callbacks
|
||||
if tags is not None:
|
||||
config["tags"] = tags
|
||||
if metadata is not None:
|
||||
config["metadata"] = metadata
|
||||
|
||||
result = await self.ainvoke(
|
||||
input_, config=config, include_run_info=include_run_info
|
||||
)
|
||||
return self._prepare_output(result)
|
||||
|
||||
|
||||
@@ -8,8 +8,9 @@ from typing import Any, Callable, Dict, Optional, Sequence, cast
|
||||
from langchain_core.callbacks.manager import Callbacks
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import BaseOutputParser
|
||||
from langchain_core.output_parsers import BaseOutputParser, StrOutputParser
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
from langchain_core.runnables import Runnable
|
||||
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
|
||||
@@ -49,12 +50,15 @@ class LLMChainExtractor(BaseDocumentCompressor):
|
||||
"""Document compressor that uses an LLM chain to extract
|
||||
the relevant parts of documents."""
|
||||
|
||||
llm_chain: LLMChain
|
||||
llm_chain: Runnable
|
||||
"""LLM wrapper to use for compressing documents."""
|
||||
|
||||
get_input: Callable[[str, Document], dict] = default_get_input
|
||||
"""Callable for constructing the chain input from the query and a Document."""
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
def compress_documents(
|
||||
self,
|
||||
documents: Sequence[Document],
|
||||
@@ -65,10 +69,13 @@ class LLMChainExtractor(BaseDocumentCompressor):
|
||||
compressed_docs = []
|
||||
for doc in documents:
|
||||
_input = self.get_input(query, doc)
|
||||
output_dict = self.llm_chain.invoke(_input, config={"callbacks": callbacks})
|
||||
output = output_dict[self.llm_chain.output_key]
|
||||
if self.llm_chain.prompt.output_parser is not None:
|
||||
output = self.llm_chain.prompt.output_parser.parse(output)
|
||||
output_ = self.llm_chain.invoke(_input, config={"callbacks": callbacks})
|
||||
if isinstance(self.llm_chain, LLMChain):
|
||||
output = output_[self.llm_chain.output_key]
|
||||
if self.llm_chain.prompt.output_parser is not None:
|
||||
output = self.llm_chain.prompt.output_parser.parse(output)
|
||||
else:
|
||||
output = output_
|
||||
if len(output) == 0:
|
||||
continue
|
||||
compressed_docs.append(
|
||||
@@ -85,9 +92,7 @@ class LLMChainExtractor(BaseDocumentCompressor):
|
||||
"""Compress page content of raw documents asynchronously."""
|
||||
outputs = await asyncio.gather(
|
||||
*[
|
||||
self.llm_chain.apredict_and_parse(
|
||||
**self.get_input(query, doc), callbacks=callbacks
|
||||
)
|
||||
self.llm_chain.ainvoke(self.get_input(query, doc), callbacks=callbacks)
|
||||
for doc in documents
|
||||
]
|
||||
)
|
||||
@@ -111,5 +116,9 @@ class LLMChainExtractor(BaseDocumentCompressor):
|
||||
"""Initialize from LLM."""
|
||||
_prompt = prompt if prompt is not None else _get_default_chain_prompt()
|
||||
_get_input = get_input if get_input is not None else default_get_input
|
||||
llm_chain = LLMChain(llm=llm, prompt=_prompt, **(llm_chain_kwargs or {}))
|
||||
if _prompt.output_parser is not None:
|
||||
parser = _prompt.output_parser
|
||||
else:
|
||||
parser = StrOutputParser()
|
||||
llm_chain = _prompt | llm | parser
|
||||
return cls(llm_chain=llm_chain, get_input=_get_input) # type: ignore[arg-type]
|
||||
|
||||
@@ -5,7 +5,9 @@ from typing import Any, Callable, Dict, Optional, Sequence
|
||||
from langchain_core.callbacks.manager import Callbacks
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.language_models import BaseLanguageModel
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import BasePromptTemplate, PromptTemplate
|
||||
from langchain_core.runnables import Runnable
|
||||
from langchain_core.runnables.config import RunnableConfig
|
||||
|
||||
from langchain.chains import LLMChain
|
||||
@@ -32,13 +34,16 @@ def default_get_input(query: str, doc: Document) -> Dict[str, Any]:
|
||||
class LLMChainFilter(BaseDocumentCompressor):
|
||||
"""Filter that drops documents that aren't relevant to the query."""
|
||||
|
||||
llm_chain: LLMChain
|
||||
llm_chain: Runnable
|
||||
"""LLM wrapper to use for filtering documents.
|
||||
The chain prompt is expected to have a BooleanOutputParser."""
|
||||
|
||||
get_input: Callable[[str, Document], dict] = default_get_input
|
||||
"""Callable for constructing the chain input from the query and a Document."""
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
def compress_documents(
|
||||
self,
|
||||
documents: Sequence[Document],
|
||||
@@ -56,11 +61,15 @@ class LLMChainFilter(BaseDocumentCompressor):
|
||||
documents,
|
||||
)
|
||||
|
||||
for output_dict, doc in outputs:
|
||||
for output_, doc in outputs:
|
||||
include_doc = None
|
||||
output = output_dict[self.llm_chain.output_key]
|
||||
if self.llm_chain.prompt.output_parser is not None:
|
||||
include_doc = self.llm_chain.prompt.output_parser.parse(output)
|
||||
if isinstance(self.llm_chain, LLMChain):
|
||||
output = output_[self.llm_chain.output_key]
|
||||
if self.llm_chain.prompt.output_parser is not None:
|
||||
include_doc = self.llm_chain.prompt.output_parser.parse(output)
|
||||
else:
|
||||
if isinstance(output_, bool):
|
||||
include_doc = output_
|
||||
if include_doc:
|
||||
filtered_docs.append(doc)
|
||||
|
||||
@@ -82,11 +91,15 @@ class LLMChainFilter(BaseDocumentCompressor):
|
||||
),
|
||||
documents,
|
||||
)
|
||||
for output_dict, doc in outputs:
|
||||
for output_, doc in outputs:
|
||||
include_doc = None
|
||||
output = output_dict[self.llm_chain.output_key]
|
||||
if self.llm_chain.prompt.output_parser is not None:
|
||||
include_doc = self.llm_chain.prompt.output_parser.parse(output)
|
||||
if isinstance(self.llm_chain, LLMChain):
|
||||
output = output_[self.llm_chain.output_key]
|
||||
if self.llm_chain.prompt.output_parser is not None:
|
||||
include_doc = self.llm_chain.prompt.output_parser.parse(output)
|
||||
else:
|
||||
if isinstance(output_, bool):
|
||||
include_doc = output_
|
||||
if include_doc:
|
||||
filtered_docs.append(doc)
|
||||
|
||||
@@ -110,5 +123,9 @@ class LLMChainFilter(BaseDocumentCompressor):
|
||||
A LLMChainFilter that uses the given language model.
|
||||
"""
|
||||
_prompt = prompt if prompt is not None else _get_default_chain_prompt()
|
||||
llm_chain = LLMChain(llm=llm, prompt=_prompt)
|
||||
if _prompt.output_parser is not None:
|
||||
parser = _prompt.output_parser
|
||||
else:
|
||||
parser = StrOutputParser()
|
||||
llm_chain = _prompt | llm | parser
|
||||
return cls(llm_chain=llm_chain, **kwargs)
|
||||
|
||||
@@ -7,11 +7,11 @@ from langchain_core.callbacks import (
|
||||
)
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.language_models import BaseLLM
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import BasePromptTemplate
|
||||
from langchain_core.prompts.prompt import PromptTemplate
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain_core.runnables import Runnable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -30,7 +30,7 @@ class RePhraseQueryRetriever(BaseRetriever):
|
||||
Then, retrieve docs for the re-phrased query."""
|
||||
|
||||
retriever: BaseRetriever
|
||||
llm_chain: LLMChain
|
||||
llm_chain: Runnable
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
@@ -51,8 +51,7 @@ class RePhraseQueryRetriever(BaseRetriever):
|
||||
Returns:
|
||||
RePhraseQueryRetriever
|
||||
"""
|
||||
|
||||
llm_chain = LLMChain(llm=llm, prompt=prompt)
|
||||
llm_chain = prompt | llm | StrOutputParser()
|
||||
return cls(
|
||||
retriever=retriever,
|
||||
llm_chain=llm_chain,
|
||||
@@ -72,8 +71,9 @@ class RePhraseQueryRetriever(BaseRetriever):
|
||||
Returns:
|
||||
Relevant documents for re-phrased question
|
||||
"""
|
||||
response = self.llm_chain(query, callbacks=run_manager.get_child())
|
||||
re_phrased_question = response["text"]
|
||||
re_phrased_question = self.llm_chain.invoke(
|
||||
query, {"callbacks": run_manager.get_child()}
|
||||
)
|
||||
logger.info(f"Re-phrased question: {re_phrased_question}")
|
||||
docs = self.retriever.invoke(
|
||||
re_phrased_question, config={"callbacks": run_manager.get_child()}
|
||||
|
||||
4
libs/langchain/poetry.lock
generated
4
libs/langchain/poetry.lock
generated
@@ -1872,7 +1872,7 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "langchain-core"
|
||||
version = "0.2.31"
|
||||
version = "0.2.32"
|
||||
description = "Building applications with LLMs through composability"
|
||||
optional = false
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
@@ -4711,4 +4711,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "6503a7f00cec168c1c1b165ac5fa01cd239f20e0437225176683c98100e36d33"
|
||||
content-hash = "6fdb171d86fbc444f698e3a1835224bd47e76218facbc69d478cd2a8cb72406f"
|
||||
|
||||
@@ -33,7 +33,7 @@ langchain-server = "langchain.server:main"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.8.1,<4.0"
|
||||
langchain-core = "^0.2.31"
|
||||
langchain-core = "^0.2.32"
|
||||
langchain-text-splitters = "^0.2.0"
|
||||
langsmith = "^0.1.17"
|
||||
pydantic = ">=1,<3"
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.language_models import FakeListChatModel
|
||||
|
||||
from langchain.retrievers.document_compressors import LLMChainExtractor
|
||||
|
||||
|
||||
def test_llm_chain_extractor() -> None:
|
||||
documents = [
|
||||
Document(
|
||||
page_content=(
|
||||
"The sky is blue. Candlepin bowling is popular in New England."
|
||||
),
|
||||
metadata={"a": 1},
|
||||
),
|
||||
Document(
|
||||
page_content=(
|
||||
"Mercury is the closest planet to the Sun. "
|
||||
"Candlepin bowling balls are smaller."
|
||||
),
|
||||
metadata={"b": 2},
|
||||
),
|
||||
Document(page_content="The moon is round.", metadata={"c": 3}),
|
||||
]
|
||||
llm = FakeListChatModel(
|
||||
responses=[
|
||||
"Candlepin bowling is popular in New England.",
|
||||
"Candlepin bowling balls are smaller.",
|
||||
"NO_OUTPUT",
|
||||
]
|
||||
)
|
||||
doc_compressor = LLMChainExtractor.from_llm(llm)
|
||||
output = doc_compressor.compress_documents(
|
||||
documents, "Tell me about Candlepin bowling."
|
||||
)
|
||||
expected = documents = [
|
||||
Document(
|
||||
page_content="Candlepin bowling is popular in New England.",
|
||||
metadata={"a": 1},
|
||||
),
|
||||
Document(
|
||||
page_content="Candlepin bowling balls are smaller.", metadata={"b": 2}
|
||||
),
|
||||
]
|
||||
assert output == expected
|
||||
|
||||
|
||||
async def test_llm_chain_extractor_async() -> None:
|
||||
documents = [
|
||||
Document(
|
||||
page_content=(
|
||||
"The sky is blue. Candlepin bowling is popular in New England."
|
||||
),
|
||||
metadata={"a": 1},
|
||||
),
|
||||
Document(
|
||||
page_content=(
|
||||
"Mercury is the closest planet to the Sun. "
|
||||
"Candlepin bowling balls are smaller."
|
||||
),
|
||||
metadata={"b": 2},
|
||||
),
|
||||
Document(page_content="The moon is round.", metadata={"c": 3}),
|
||||
]
|
||||
llm = FakeListChatModel(
|
||||
responses=[
|
||||
"Candlepin bowling is popular in New England.",
|
||||
"Candlepin bowling balls are smaller.",
|
||||
"NO_OUTPUT",
|
||||
]
|
||||
)
|
||||
doc_compressor = LLMChainExtractor.from_llm(llm)
|
||||
output = await doc_compressor.acompress_documents(
|
||||
documents, "Tell me about Candlepin bowling."
|
||||
)
|
||||
expected = documents = [
|
||||
Document(
|
||||
page_content="Candlepin bowling is popular in New England.",
|
||||
metadata={"a": 1},
|
||||
),
|
||||
Document(
|
||||
page_content="Candlepin bowling balls are smaller.", metadata={"b": 2}
|
||||
),
|
||||
]
|
||||
assert output == expected
|
||||
@@ -0,0 +1,46 @@
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.language_models import FakeListChatModel
|
||||
|
||||
from langchain.retrievers.document_compressors import LLMChainFilter
|
||||
|
||||
|
||||
def test_llm_chain_filter() -> None:
|
||||
documents = [
|
||||
Document(
|
||||
page_content="Candlepin bowling is popular in New England.",
|
||||
metadata={"a": 1},
|
||||
),
|
||||
Document(
|
||||
page_content="Candlepin bowling balls are smaller.",
|
||||
metadata={"b": 2},
|
||||
),
|
||||
Document(page_content="The moon is round.", metadata={"c": 3}),
|
||||
]
|
||||
llm = FakeListChatModel(responses=["YES", "YES", "NO"])
|
||||
doc_compressor = LLMChainFilter.from_llm(llm)
|
||||
output = doc_compressor.compress_documents(
|
||||
documents, "Tell me about Candlepin bowling."
|
||||
)
|
||||
expected = documents[:2]
|
||||
assert output == expected
|
||||
|
||||
|
||||
async def test_llm_chain_extractor_async() -> None:
|
||||
documents = [
|
||||
Document(
|
||||
page_content="Candlepin bowling is popular in New England.",
|
||||
metadata={"a": 1},
|
||||
),
|
||||
Document(
|
||||
page_content="Candlepin bowling balls are smaller.",
|
||||
metadata={"b": 2},
|
||||
),
|
||||
Document(page_content="The moon is round.", metadata={"c": 3}),
|
||||
]
|
||||
llm = FakeListChatModel(responses=["YES", "YES", "NO"])
|
||||
doc_compressor = LLMChainFilter.from_llm(llm)
|
||||
output = await doc_compressor.acompress_documents(
|
||||
documents, "Tell me about Candlepin bowling."
|
||||
)
|
||||
expected = documents[:2]
|
||||
assert output == expected
|
||||
Reference in New Issue
Block a user