Compare commits

...

20 Commits

Author SHA1 Message Date
Chester Curme
39d1759b66 update docstring 2024-08-15 14:08:48 -04:00
Chester Curme
7005c4fe5b CriteriaEvalChain 2024-08-15 14:06:30 -04:00
Chester Curme
1eace6523d PairwiseStringEvalChain 2024-08-15 12:43:28 -04:00
Chester Curme
bacf4c58ef update 2024-08-15 12:40:11 -04:00
Chester Curme
b71b5bd3d7 update 2024-08-15 12:31:50 -04:00
Chester Curme
31364de10c propagate include_run_info 2024-08-15 12:23:25 -04:00
Chester Curme
8a70754dfe ContextQAEvalChain 2024-08-15 12:21:11 -04:00
Chester Curme
9bd4459f9a lint 2024-08-15 12:06:06 -04:00
Chester Curme
50c1ecc5f1 QAEvalChain 2024-08-15 11:55:39 -04:00
Chester Curme
f51a9024ae Merge branch 'master' into cc/deprecate_evaluators 2024-08-15 11:11:25 -04:00
ccurme
8afbab4cf6 langchain[patch]: deprecate various chains (#25310)
- [x] NatbotChain: move to community, deprecate langchain version.
Update to use `prompt | llm | output_parser` instead of LLMChain.
- [x] LLMMathChain: deprecate + add langgraph replacement example to API
ref
- [x] HypotheticalDocumentEmbedder (retriever): update to use `prompt |
llm | output_parser` instead of LLMChain
- [x] FlareChain: update to use `prompt | llm | output_parser` instead
of LLMChain
- [x] ConstitutionalChain: deprecate + add langgraph replacement example
to API ref
- [x] LLMChainExtractor (document compressor): update to use `prompt |
llm | output_parser` instead of LLMChain
- [x] LLMChainFilter (document compressor): update to use `prompt | llm
| output_parser` instead of LLMChain
- [x] RePhraseQueryRetriever (retriever): update to use `prompt | llm |
output_parser` instead of LLMChain
2024-08-15 10:49:26 -04:00
Luke
66e30efa61 experimental: Fix divide by 0 error (#25439)
Within the semantic chunker, when calling `_threshold_from_clusters`
there is the possibility for a divide by 0 error if the
`number_of_chunks` is equal to the length of `distances`.

Fix simply implements a check if these values match to prevent the error
and enable chunking to continue.
2024-08-15 14:46:30 +00:00
ccurme
ba167dc158 community[patch]: update connection string in azure cosmos integration test (#25438) 2024-08-15 14:07:54 +00:00
Eugene Yurtsev
44f69063b1 docs[patch]: Fix a few typos in the chat integration docs for TogetherAI (#25424)
Fix a few minor typos
2024-08-15 09:48:36 -04:00
Isaac Francisco
f18b77fd59 [docs]: pdf loaders (#25425) 2024-08-14 21:44:57 -07:00
Isaac Francisco
966b408634 [docs]: doc loader changes (#25417) 2024-08-14 19:46:33 -07:00
ccurme
bd261456f6 langchain: bump core to 0.2.32 (#25421) 2024-08-15 00:00:42 +00:00
Chester Curme
15254d1027 QAGenerateChain 2024-08-13 14:23:49 -04:00
Chester Curme
d38c9c7026 ScoreStringEvalChain 2024-08-13 14:22:09 -04:00
Chester Curme
d249318f94 TrajectoryEvalChain 2024-08-13 14:22:04 -04:00
48 changed files with 4180 additions and 824 deletions

View File

@@ -182,7 +182,7 @@ pprint(data)
</CodeOutputBlock>
Another option is set `jq_schema='.'` and provide `content_key`:
Another option is to set `jq_schema='.'` and provide `content_key`:
```python
loader = JSONLoader(

File diff suppressed because one or more lines are too long

View File

@@ -53,7 +53,8 @@
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"TOGETHER_API_KEY\"] = getpass.getpass(\"Enter your Together API key: \")"
"if \"TOGETHER_API_KEY\" not in os.environ:\n",
" os.environ[\"TOGETHER_API_KEY\"] = getpass.getpass(\"Enter your Together API key: \")"
]
},
{
@@ -87,21 +88,10 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "652d6238-1f87-422a-b135-f5abbb8652fc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"%pip install -qU langchain-together"
]
@@ -113,14 +103,12 @@
"source": [
"## Instantiation\n",
"\n",
"Now we can instantiate our model object and generate chat completions:\n",
"\n",
"- TODO: Update model instantiation with relevant params."
"Now we can instantiate our model object and generate chat completions:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"id": "cb09c344-1836-4e0c-acf8-11d13ac1dbae",
"metadata": {},
"outputs": [],
@@ -147,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"id": "62e0dbc3",
"metadata": {
"tags": []
@@ -156,10 +144,10 @@
{
"data": {
"text/plain": [
"AIMessage(content=\"J'adore la programmation.\", response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 35, 'total_tokens': 44}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-79efa49b-dbaf-4ef8-9dce-958533823ef6-0', usage_metadata={'input_tokens': 35, 'output_tokens': 9, 'total_tokens': 44})"
"AIMessage(content=\"J'adore la programmation.\", response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 35, 'total_tokens': 44}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-eabcbe33-cdd8-45b8-ab0b-f90b6e7dfad8-0', usage_metadata={'input_tokens': 35, 'output_tokens': 9, 'total_tokens': 44})"
]
},
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -178,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"id": "d86145b3-bfef-46e8-b227-4dda5c9c2705",
"metadata": {},
"outputs": [
@@ -206,17 +194,17 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"id": "e197d1d7-a070-4c96-9f8a-a0e86d046e0b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content='Ich liebe das Programmieren.', response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 30, 'total_tokens': 37}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-80bba5fa-1723-4242-8d5a-c09b76b8350b-0', usage_metadata={'input_tokens': 30, 'output_tokens': 7, 'total_tokens': 37})"
"AIMessage(content='Ich liebe das Programmieren.', response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 30, 'total_tokens': 37}, 'model_name': 'meta-llama/Llama-3-70b-chat-hf', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-a249aa24-ee31-46ba-9bf9-f4eb135b0a95-0', usage_metadata={'input_tokens': 30, 'output_tokens': 7, 'total_tokens': 37})"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -271,7 +259,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.4"
}
},
"nbformat": 4,

View File

@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BSHTMLLoader\n",
"\n",
"\n",
"This notebook provides a quick overview for getting started with BeautifulSoup4 [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html).\n",
"\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"\n",
"| Class | Package | Local | Serializable | JS support|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [BSHTMLLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| BSHTMLLoader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"To access BSHTMLLoader document loader you'll need to install the `langchain-community` integration package and the `bs4` python package.\n",
"\n",
"### Credentials\n",
"\n",
"No credentials are needed to use the `BSHTMLLoader` class."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community** and **bs4**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community bs4"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we can instantiate our model object and load documents:\n",
"\n",
"- TODO: Update model instantiation with relevant params."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import BSHTMLLoader\n",
"\n",
"loader = BSHTMLLoader(\n",
" file_path=\"./example_data/fake-content.html\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}, page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': './example_data/fake-content.html', 'title': 'Test Title'}\n"
]
}
],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}, page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"page = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []\n",
"page[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Adding separator to BS4\n",
"\n",
"We can also pass a separator to use when calling get_text on the soup"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='\n",
", Test Title, \n",
", \n",
", \n",
", My First Heading, \n",
", My first paragraph., \n",
", \n",
", \n",
"' metadata={'source': './example_data/fake-content.html', 'title': 'Test Title'}\n"
]
}
],
"source": [
"loader = BSHTMLLoader(\n",
" file_path=\"./example_data/fake-content.html\", get_text_separator=\", \"\n",
")\n",
"\n",
"docs = loader.load()\n",
"print(docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all BSHTMLLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,55 @@
# Sample Markdown Document
## Introduction
Welcome to this sample Markdown document. Markdown is a lightweight markup language used for formatting text. It's widely used for documentation, readme files, and more.
## Features
### Headers
Markdown supports multiple levels of headers:
- **Header 1**: `# Header 1`
- **Header 2**: `## Header 2`
- **Header 3**: `### Header 3`
### Lists
#### Unordered List
- Item 1
- Item 2
- Subitem 2.1
- Subitem 2.2
#### Ordered List
1. First item
2. Second item
3. Third item
### Links
[OpenAI](https://www.openai.com) is an AI research organization.
### Images
Here's an example image:
![Sample Image](https://via.placeholder.com/150)
### Code
#### Inline Code
Use `code` for inline code snippets.
#### Code Block
```python
def greet(name):
return f"Hello, {name}!"
print(greet("World"))
```

View File

@@ -30,6 +30,7 @@
{
"sender_name": "User 2",
"timestamp_ms": 1675595060730,
"content": "",
"photos": [
{"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059}
]

View File

@@ -21,24 +21,24 @@ loader = CSVLoader(
data = loader.load()
```
## Common File Types
The below document loaders allow you to load data from common data formats.
<CategoryTable category="common_loaders" />
## PDFs
The below document loaders allow you to load documents.
<CategoryTable category="pdf_loaders" />
## Webpages
The below document loaders allow you to load webpages.
<CategoryTable category="webpage_loaders" />
## PDFs
The below document loaders allow you to load PDF documents.
<CategoryTable category="pdf_loaders" />
## Common File Types
The below document loaders allow you to load data from common data formats.
<CategoryTable category="common_loaders" />
## All document loaders

View File

@@ -0,0 +1,348 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# JSONLoader\n",
"\n",
"This notebook provides a quick overview for getting started with JSON [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all JSONLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html).\n",
"\n",
"- TODO: Add any other relevant links, like information about underlying API, etc.\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/file_loaders/json/)|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [JSONLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ✅ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| JSONLoader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"To access JSON document loader you'll need to install the `langchain-community` integration package as well as the ``jq`` python package.\n",
"\n",
"### Credentials\n",
"\n",
"No credentials are required to use the `JSONLoader` class."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community** and **jq**:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community jq "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we can instantiate our model object and load documents:\n",
"\n",
"- TODO: Update model instantiation with relevant params."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import JSONLoader\n",
"\n",
"loader = JSONLoader(\n",
" file_path=\"./example_data/facebook_chat.json\",\n",
" jq_schema=\".messages[].content\",\n",
" text_content=False,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}, page_content='Bye!')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}\n"
]
}
],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"pages = []\n",
"for doc in loader.lazy_load():\n",
" pages.append(doc)\n",
" if len(pages) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(pages)\n",
"\n",
" pages = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read from JSON Lines file\n",
"\n",
"If you want to load documents from a JSON Lines file, you pass `json_lines=True`\n",
"and specify `jq_schema` to extract `page_content` from a single JSON object."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='Bye!' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}\n"
]
}
],
"source": [
"loader = JSONLoader(\n",
" file_path=\"./example_data/facebook_chat_messages.jsonl\",\n",
" jq_schema=\".content\",\n",
" text_content=False,\n",
" json_lines=True,\n",
")\n",
"\n",
"docs = loader.load()\n",
"print(docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read specific content keys\n",
"\n",
"Another option is to set `jq_schema='.'` and provide a `content_key` in order to only load specific content:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='User 2' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}\n"
]
}
],
"source": [
"loader = JSONLoader(\n",
" file_path=\"./example_data/facebook_chat_messages.jsonl\",\n",
" jq_schema=\".\",\n",
" content_key=\"sender_name\",\n",
" json_lines=True,\n",
")\n",
"\n",
"docs = loader.load()\n",
"print(docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSON file with jq schema `content_key`\n",
"\n",
"To load documents from a JSON file using the `content_key` within the jq schema, set `is_content_key_jq_parsable=True`. Ensure that `content_key` is compatible and can be parsed using the jq schema."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='Bye!' metadata={'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1}\n"
]
}
],
"source": [
"loader = JSONLoader(\n",
" file_path=\"./example_data/facebook_chat.json\",\n",
" jq_schema=\".messages[]\",\n",
" content_key=\".content\",\n",
" is_content_key_jq_parsable=True,\n",
")\n",
"\n",
"docs = loader.load()\n",
"print(docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extracting metadata\n",
"\n",
"Generally, we want to include metadata available in the JSON file into the documents that we create from the content.\n",
"\n",
"The following demonstrates how metadata can be extracted using the `JSONLoader`.\n",
"\n",
"There are some key changes to be noted. In the previous example where we didn't collect the metadata, we managed to directly specify in the schema where the value for the `page_content` can be extracted from.\n",
"\n",
"In this example, we have to tell the loader to iterate over the records in the `messages` field. The jq_schema then has to be `.messages[]`\n",
"\n",
"This allows us to pass the records (dict) into the `metadata_func` that has to be implemented. The `metadata_func` is responsible for identifying which pieces of information in the record should be included in the metadata stored in the final `Document` object.\n",
"\n",
"Additionally, we now have to explicitly specify in the loader, via the `content_key` argument, the key from the record where the value for the `page_content` needs to be extracted from."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': '/Users/isaachershenson/Documents/langchain/docs/docs/integrations/document_loaders/example_data/facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}\n"
]
}
],
"source": [
"# Define the metadata extraction function.\n",
"def metadata_func(record: dict, metadata: dict) -> dict:\n",
" metadata[\"sender_name\"] = record.get(\"sender_name\")\n",
" metadata[\"timestamp_ms\"] = record.get(\"timestamp_ms\")\n",
"\n",
" return metadata\n",
"\n",
"\n",
"loader = JSONLoader(\n",
" file_path=\"./example_data/facebook_chat.json\",\n",
" jq_schema=\".messages[]\",\n",
" content_key=\"content\",\n",
" metadata_func=metadata_func,\n",
")\n",
"\n",
"docs = loader.load()\n",
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all JSONLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MathPixPDFLoader\n",
"\n",
"Inspired by Daniel Gross's snippet here: [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"| Class | Package | Local | Serializable | JS support|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [MathPixPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| MathPixPDFLoader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"### Credentials\n",
"\n",
"Sign up for Mathpix and [create an API key](https://mathpix.com/docs/ocr/creating-an-api-key) to set the `MATHPIX_API_KEY` variables in your environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"if \"MATHPIX_API_KEY\" not in os.environ:\n",
" os.environ[\"MATHPIX_API_KEY\"] = getpass.getpass(\"Enter your Mathpix API key: \")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we are ready to initialize our loader:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import MathpixPDFLoader\n",
"\n",
"file_path = \"./example_data/layout-parser-paper.pdf\"\n",
"loader = MathpixPDFLoader(file_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"page = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all MathpixPDFLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,183 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PDFPlumber\n",
"\n",
"Like PyMuPDF, the output Documents contain detailed metadata about the PDF and its pages, and returns one document per page.\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"| Class | Package | Local | Serializable | JS support|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [PDFPlumberLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| PDFPlumberLoader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"### Credentials\n",
"\n",
"No credentials are needed to use this loader."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we can instantiate our model object and load documents:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PDFPlumberLoader\n",
"\n",
"loader = PDFPlumberLoader(\"./example_data/layout-parser-paper.pdf\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recentadvancesindocumentimageanalysis(DIA)havebeen\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomescouldbeeasilydeployedinproductionandextendedforfurther\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportantinnovationsbyawideaudience.Thoughtherehavebeenon-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopmentindisciplineslikenaturallanguageprocessingandcomputer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademicresearchacross awiderangeof disciplinesinthesocialsciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. 
The core LayoutParser library comes with a set of simple and\\nintuitiveinterfacesforapplyingandcustomizingDLmodelsforlayoutde-\\ntection,characterrecognition,andmanyotherdocumentprocessingtasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: DocumentImageAnalysis·DeepLearning·LayoutAnalysis\\n· Character Recognition · Open Source library · Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocumentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,\\n1202\\nnuJ\\n12\\n]VC.sc[\\n2v84351.3012:viXra\\n')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}\n"
]
}
],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"page = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all PDFPlumberLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,185 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PyMuPDF\n",
"\n",
"`PyMuPDF` is optimized for speed, and contains detailed metadata about the PDF and its pages. It returns one document per page.\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"| Class | Package | Local | Serializable | JS support|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [PyMuPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| PyMuPDFLoader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"### Credentials\n",
"\n",
"No credentials are needed to use the `PyMuPDFLoader`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community** and **pymupdf**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain-community pymupdf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we can initialize our loader and start loading documents. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyMuPDFLoader\n",
"\n",
"loader = PyMuPDFLoader(\"./example_data/layout-parser-paper.pdf\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load\n",
"\n",
"You can pass along any of the options from the [PyMuPDF documentation](https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text/) as keyword arguments in the `load` call, and it will be pass along to the `get_text()` call."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210622012710Z', 'modDate': 'D:20210622012710Z', 'trapped': ''}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 (\\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. 
The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: Document Image Analysis · Deep Learning · Layout Analysis\\n· Character Recognition · Open Source library · Toolkit.\\n1\\nIntroduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [11,\\narXiv:2103.15348v2 [cs.CV] 21 Jun 2021\\n')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210622012710Z', 'modDate': 'D:20210622012710Z', 'trapped': ''}\n"
]
}
],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"page = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all PyMuPDFLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PyPDFDirectoryLoader\n",
"\n",
"This loader loads all PDF files from a specific directory.\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"\n",
"| Class | Package | Local | Serializable | JS support|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [PyPDFDirectoryLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| PyPDFDirectoryLoader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"### Credentials\n",
"\n",
"No credentials are needed for this loader."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we can instantiate our model object and load documents:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyPDFDirectoryLoader\n",
"\n",
"directory_path = (\n",
" \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n",
")\n",
"loader = PyPDFDirectoryLoader(\"example_data/\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': 'example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser : A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1( \\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1Allen Institute for AI\\nshannons@allenai.org\\n2Brown University\\nruochen zhang@brown.edu\\n3Harvard University\\n{melissadell,jacob carlson }@fas.harvard.edu\\n4University of Washington\\nbcgl@cs.washington.edu\\n5University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser , an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io .\\nKeywords: Document Image Analysis ·Deep Learning ·Layout Analysis\\n·Character Recognition ·Open Source library ·Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [ 11,arXiv:2103.15348v2 [cs.CV] 21 Jun 2021')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': 'example_data/layout-parser-paper.pdf', 'page': 0}\n"
]
}
],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"page = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all PyPDFDirectoryLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,188 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PyPDFium2Loader\n",
"\n",
"\n",
"This notebook provides a quick overview for getting started with PyPDFium2 [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html).\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"| Class | Package | Local | Serializable | JS support|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [PyPDFium2Loader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅ | ❌ | ❌ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| PyPDFium2Loader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"\n",
"To access PyPDFium2 document loader you'll need to install the `langchain-community` integration package.\n",
"\n",
"### Credentials\n",
"\n",
"No credentials are needed."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we can instantiate our model object and load documents:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyPDFium2Loader\n",
"\n",
"file_path = \"./example_data/layout-parser-paper.pdf\"\n",
"loader = PyPDFium2Loader(file_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'page': 0}, page_content='LayoutParser: A Unified Toolkit for Deep\\r\\nLearning Based Document Image Analysis\\r\\nZejiang Shen\\r\\n1\\r\\n(), Ruochen Zhang\\r\\n2\\r\\n, Melissa Dell\\r\\n3\\r\\n, Benjamin Charles Germain\\r\\nLee\\r\\n4\\r\\n, Jacob Carlson\\r\\n3\\r\\n, and Weining Li\\r\\n5\\r\\n1 Allen Institute for AI\\r\\nshannons@allenai.org 2 Brown University\\r\\nruochen zhang@brown.edu 3 Harvard University\\r\\n{melissadell,jacob carlson}@fas.harvard.edu\\r\\n4 University of Washington\\r\\nbcgl@cs.washington.edu 5 University of Waterloo\\r\\nw422li@uwaterloo.ca\\r\\nAbstract. Recent advances in document image analysis (DIA) have been\\r\\nprimarily driven by the application of neural networks. Ideally, research\\r\\noutcomes could be easily deployed in production and extended for further\\r\\ninvestigation. However, various factors like loosely organized codebases\\r\\nand sophisticated model configurations complicate the easy reuse of im\\x02portant innovations by a wide audience. Though there have been on-going\\r\\nefforts to improve reusability and simplify deep learning (DL) model\\r\\ndevelopment in disciplines like natural language processing and computer\\r\\nvision, none of them are optimized for challenges in the domain of DIA.\\r\\nThis represents a major gap in the existing toolkit, as DIA is central to\\r\\nacademic research across a wide range of disciplines in the social sciences\\r\\nand humanities. This paper introduces LayoutParser, an open-source\\r\\nlibrary for streamlining the usage of DL in DIA research and applica\\x02tions. 
The core LayoutParser library comes with a set of simple and\\r\\nintuitive interfaces for applying and customizing DL models for layout de\\x02tection, character recognition, and many other document processing tasks.\\r\\nTo promote extensibility, LayoutParser also incorporates a community\\r\\nplatform for sharing both pre-trained models and full document digiti\\x02zation pipelines. We demonstrate that LayoutParser is helpful for both\\r\\nlightweight and large-scale digitization pipelines in real-word use cases.\\r\\nThe library is publicly available at https://layout-parser.github.io.\\r\\nKeywords: Document Image Analysis· Deep Learning· Layout Analysis\\r\\n· Character Recognition· Open Source library· Toolkit.\\r\\n1 Introduction\\r\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\r\\ndocument image analysis (DIA) tasks including document image classification [11,\\r\\narXiv:2103.15348v2 [cs.CV] 21 Jun 2021\\n')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': './example_data/layout-parser-paper.pdf', 'page': 0}\n"
]
}
],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"page = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all PyPDFium2Loader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,269 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# UnstructuredMarkdownLoader\n",
"\n",
"This notebook provides a quick overview for getting started with UnstructuredMarkdown [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html).\n",
"\n",
"## Overview\n",
"### Integration details\n",
"\n",
"\n",
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/file_loaders/unstructured/)|\n",
"| :--- | :--- | :---: | :---: | :---: |\n",
"| [UnstructuredMarkdownLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | ❌ | ✅ | \n",
"### Loader features\n",
"| Source | Document Lazy Loading | Native Async Support\n",
"| :---: | :---: | :---: | \n",
"| UnstructuredMarkdownLoader | ✅ | ❌ | \n",
"\n",
"## Setup\n",
"\n",
"To access UnstructuredMarkdownLoader document loader you'll need to install the `langchain-community` integration package and the `unstructured` python package.\n",
"\n",
"### Credentials\n",
"\n",
"No credentials are needed to use this loader."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation\n",
"\n",
"Install **langchain_community** and **unstructured**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community unstructured"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialization\n",
"\n",
"Now we can instantiate our model object and load documents. \n",
"\n",
"You can run the loader in one of two modes: \"single\" and \"elements\". If you use \"single\" mode, the document will be returned as a single `Document` object. If you use \"elements\" mode, the unstructured library will split the document into elements such as `Title` and `NarrativeText`. You can pass in additional `unstructured` kwargs after mode to apply different `unstructured` settings."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import UnstructuredMarkdownLoader\n",
"\n",
"loader = UnstructuredMarkdownLoader(\n",
" \"./example_data/example.md\",\n",
" mode=\"single\",\n",
" strategy=\"fast\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/example.md'}, page_content='Sample Markdown Document\\n\\nIntroduction\\n\\nWelcome to this sample Markdown document. Markdown is a lightweight markup language used for formatting text. It\\'s widely used for documentation, readme files, and more.\\n\\nFeatures\\n\\nHeaders\\n\\nMarkdown supports multiple levels of headers:\\n\\nHeader 1: # Header 1\\n\\nHeader 2: ## Header 2\\n\\nHeader 3: ### Header 3\\n\\nLists\\n\\nUnordered List\\n\\nItem 1\\n\\nItem 2\\n\\nSubitem 2.1\\n\\nSubitem 2.2\\n\\nOrdered List\\n\\nFirst item\\n\\nSecond item\\n\\nThird item\\n\\nLinks\\n\\nOpenAI is an AI research organization.\\n\\nImages\\n\\nHere\\'s an example image:\\n\\nCode\\n\\nInline Code\\n\\nUse code for inline code snippets.\\n\\nCode Block\\n\\n```python def greet(name): return f\"Hello, {name}!\"\\n\\nprint(greet(\"World\")) ```')"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': './example_data/example.md'}\n"
]
}
],
"source": [
"print(docs[0].metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lazy Load"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/example.md', 'link_texts': ['OpenAI'], 'link_urls': ['https://www.openai.com'], 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'parent_id': 'de1f74bf226224377ab4d8b54f215bb9', 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'NarrativeText', 'element_id': '898a542a261f7dc65e0072d1e847d535'}, page_content='OpenAI is an AI research organization.')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"page = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []\n",
"page[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Elements\n",
"\n",
"In this example we will load in the `elements` mode, which will return a list of the different elements in the markdown document:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"29"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain_community.document_loaders import UnstructuredMarkdownLoader\n",
"\n",
"loader = UnstructuredMarkdownLoader(\n",
" \"./example_data/example.md\",\n",
" mode=\"elements\",\n",
" strategy=\"fast\",\n",
")\n",
"\n",
"docs = loader.load()\n",
"len(docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As you see there are 29 elements that were pulled from the `example.md` file. The first element is the title of the document as expected:"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Sample Markdown Document'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].page_content"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API reference\n",
"\n",
"For detailed documentation of all UnstructuredMarkdownLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,332 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b57124cc-60a0-4c18-b7ce-3e483d1024a2",
"metadata": {},
"source": [
"---\n",
"title: Migrating from ConstitutionalChain\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "ce8457ed-c0b1-4a74-abbd-9d3d2211270f",
"metadata": {},
"source": [
"[ConstitutionalChain](https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.base.ConstitutionalChain.html) allowed for a LLM to critique and revise generations based on [principles](https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.models.ConstitutionalPrinciple.html), structured as combinations of critique and revision requests. For example, a principle might include a request to identify harmful content, and a request to rewrite the content.\n",
"\n",
"In `ConstitutionalChain`, this structure of critique requests and associated revisions was formatted into a LLM prompt and parsed out of string responses. This is more naturally achieved via [structured output](/docs/how_to/structured_output/) features of chat models. We can construct a simple chain in [LangGraph](https://langchain-ai.github.io/langgraph/) for this purpose. Some advantages of this approach include:\n",
"\n",
"- Leverage tool-calling capabilities of chat models that have been fine-tuned for this purpose;\n",
"- Reduce parsing errors from extracting expression from a string LLM response;\n",
"- Delegation of instructions to [message roles](/docs/concepts/#messages) (e.g., chat models can understand what a `ToolMessage` represents without the need for additional prompting);\n",
"- Support for streaming, both of individual tokens and chain steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b99b47ec",
"metadata": {},
"outputs": [],
"source": [
"%pip install --upgrade --quiet langchain-openai"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "717c8673",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass()"
]
},
{
"cell_type": "markdown",
"id": "e3621b62-a037-42b8-8faa-59575608bb8b",
"metadata": {},
"source": [
"## Legacy\n",
"\n",
"<details open>"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f91c9809-8ee7-4e38-881d-0ace4f6ea883",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains import ConstitutionalChain, LLMChain\n",
"from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple\n",
"from langchain_core.prompts import PromptTemplate\n",
"from langchain_openai import OpenAI\n",
"\n",
"llm = OpenAI()\n",
"\n",
"qa_prompt = PromptTemplate(\n",
" template=\"Q: {question} A:\",\n",
" input_variables=[\"question\"],\n",
")\n",
"qa_chain = LLMChain(llm=llm, prompt=qa_prompt)\n",
"\n",
"constitutional_chain = ConstitutionalChain.from_llm(\n",
" llm=llm,\n",
" chain=qa_chain,\n",
" constitutional_principles=[\n",
" ConstitutionalPrinciple(\n",
" critique_request=\"Tell if this answer is good.\",\n",
" revision_request=\"Give a better answer.\",\n",
" )\n",
" ],\n",
" return_intermediate_steps=True,\n",
")\n",
"\n",
"result = constitutional_chain.invoke(\"What is the meaning of life?\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fa3d11a1-ac1f-4a9a-9ab3-b7b244daa506",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'question': 'What is the meaning of life?',\n",
" 'output': 'The meaning of life is a deeply personal and ever-evolving concept. It is a journey of self-discovery and growth, and can be different for each individual. Some may find meaning in relationships, others in achieving their goals, and some may never find a concrete answer. Ultimately, the meaning of life is what we make of it.',\n",
" 'initial_output': ' The meaning of life is a subjective concept that can vary from person to person. Some may believe that the purpose of life is to find happiness and fulfillment, while others may see it as a journey of self-discovery and personal growth. Ultimately, the meaning of life is something that each individual must determine for themselves.',\n",
" 'critiques_and_revisions': [('This answer is good in that it recognizes and acknowledges the subjective nature of the question and provides a valid and thoughtful response. However, it could have also mentioned that the meaning of life is a complex and deeply personal concept that can also change and evolve over time for each individual. Critique Needed.',\n",
" 'The meaning of life is a deeply personal and ever-evolving concept. It is a journey of self-discovery and growth, and can be different for each individual. Some may find meaning in relationships, others in achieving their goals, and some may never find a concrete answer. Ultimately, the meaning of life is what we make of it.')]}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result"
]
},
{
"cell_type": "markdown",
"id": "374ae108-f1a0-4723-9237-5259c8123c04",
"metadata": {},
"source": [
"Above, we've returned intermediate steps showing:\n",
"\n",
"- The original question;\n",
"- The initial output;\n",
"- Critiques and revisions;\n",
"- The final output (matching a revision)."
]
},
{
"cell_type": "markdown",
"id": "cdc3b527-c09e-4c77-9711-c3cc4506cd95",
"metadata": {},
"source": [
"</details>\n",
"\n",
"## LangGraph\n",
"\n",
"<details open>\n",
"\n",
"Below, we use the [.with_structured_output](/docs/how_to/structured_output/) method to simultaneously generate (1) a judgment of whether a critique is needed, and (2) the critique. We surface all prompts involved for clarity and ease of customizability.\n",
"\n",
"Note that we are also able to stream intermediate steps with this implementation, so we can monitor and if needed intervene during its execution."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "917fdb73-2411-4fcc-9add-c32dc5c745da",
"metadata": {},
"outputs": [],
"source": [
"from typing import List, Optional, Tuple\n",
"\n",
"from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple\n",
"from langchain.chains.constitutional_ai.prompts import (\n",
" CRITIQUE_PROMPT,\n",
" REVISION_PROMPT,\n",
")\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.prompts import ChatPromptTemplate\n",
"from langchain_openai import ChatOpenAI\n",
"from langgraph.graph import END, START, StateGraph\n",
"from typing_extensions import Annotated, TypedDict\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-4o-mini\")\n",
"\n",
"\n",
"class Critique(TypedDict):\n",
" \"\"\"Generate a critique, if needed.\"\"\"\n",
"\n",
" critique_needed: Annotated[bool, ..., \"Whether or not a critique is needed.\"]\n",
" critique: Annotated[str, ..., \"If needed, the critique.\"]\n",
"\n",
"\n",
"critique_prompt = ChatPromptTemplate.from_template(\n",
" \"Critique this response according to the critique request. \"\n",
" \"If no critique is needed, specify that.\\n\\n\"\n",
" \"Query: {query}\\n\\n\"\n",
" \"Response: {response}\\n\\n\"\n",
" \"Critique request: {critique_request}\"\n",
")\n",
"\n",
"revision_prompt = ChatPromptTemplate.from_template(\n",
" \"Revise this response according to the critique and reivsion request.\\n\\n\"\n",
" \"Query: {query}\\n\\n\"\n",
" \"Response: {response}\\n\\n\"\n",
" \"Critique request: {critique_request}\\n\\n\"\n",
" \"Critique: {critique}\\n\\n\"\n",
" \"If the critique does not identify anything worth changing, ignore the \"\n",
" \"revision request and return 'No revisions needed'. If the critique \"\n",
" \"does identify something worth changing, revise the response based on \"\n",
" \"the revision request.\\n\\n\"\n",
" \"Revision Request: {revision_request}\"\n",
")\n",
"\n",
"chain = llm | StrOutputParser()\n",
"critique_chain = critique_prompt | llm.with_structured_output(Critique)\n",
"revision_chain = revision_prompt | llm | StrOutputParser()\n",
"\n",
"\n",
"class State(TypedDict):\n",
" query: str\n",
" constitutional_principles: List[ConstitutionalPrinciple]\n",
" initial_response: str\n",
" critiques_and_revisions: List[Tuple[str, str]]\n",
" response: str\n",
"\n",
"\n",
"async def generate_response(state: State):\n",
" \"\"\"Generate initial response.\"\"\"\n",
" response = await chain.ainvoke(state[\"query\"])\n",
" return {\"response\": response, \"initial_response\": response}\n",
"\n",
"\n",
"async def critique_and_revise(state: State):\n",
" \"\"\"Critique and revise response according to principles.\"\"\"\n",
" critiques_and_revisions = []\n",
" response = state[\"initial_response\"]\n",
" for principle in state[\"constitutional_principles\"]:\n",
" critique = await critique_chain.ainvoke(\n",
" {\n",
" \"query\": state[\"query\"],\n",
" \"response\": response,\n",
" \"critique_request\": principle.critique_request,\n",
" }\n",
" )\n",
" if critique[\"critique_needed\"]:\n",
" revision = await revision_chain.ainvoke(\n",
" {\n",
" \"query\": state[\"query\"],\n",
" \"response\": response,\n",
" \"critique_request\": principle.critique_request,\n",
" \"critique\": critique[\"critique\"],\n",
" \"revision_request\": principle.revision_request,\n",
" }\n",
" )\n",
" response = revision\n",
" critiques_and_revisions.append((critique[\"critique\"], revision))\n",
" else:\n",
" critiques_and_revisions.append((critique[\"critique\"], \"\"))\n",
" return {\n",
" \"critiques_and_revisions\": critiques_and_revisions,\n",
" \"response\": response,\n",
" }\n",
"\n",
"\n",
"graph = StateGraph(State)\n",
"graph.add_node(\"generate_response\", generate_response)\n",
"graph.add_node(\"critique_and_revise\", critique_and_revise)\n",
"\n",
"graph.add_edge(START, \"generate_response\")\n",
"graph.add_edge(\"generate_response\", \"critique_and_revise\")\n",
"graph.add_edge(\"critique_and_revise\", END)\n",
"app = graph.compile()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "01aac88d-464e-431f-b92e-746dcb743e1b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n",
"{'initial_response': 'Finding purpose, connection, and joy in our experiences and relationships.', 'response': 'Finding purpose, connection, and joy in our experiences and relationships.'}\n",
"{'initial_response': 'Finding purpose, connection, and joy in our experiences and relationships.', 'critiques_and_revisions': [(\"The response exceeds the 10-word limit, providing a more elaborate answer than requested. A concise response, such as 'To seek purpose and joy in life,' would better align with the query.\", 'To seek purpose and joy in life.')], 'response': 'To seek purpose and joy in life.'}\n"
]
}
],
"source": [
"constitutional_principles = [\n",
" ConstitutionalPrinciple(\n",
" critique_request=\"Tell if this answer is good.\",\n",
" revision_request=\"Give a better answer.\",\n",
" )\n",
"]\n",
"\n",
"query = \"What is the meaning of life? Answer in 10 words or fewer.\"\n",
"\n",
"async for step in app.astream(\n",
" {\"query\": query, \"constitutional_principles\": constitutional_principles},\n",
" stream_mode=\"values\",\n",
"):\n",
" subset = [\"initial_response\", \"critiques_and_revisions\", \"response\"]\n",
" print({k: v for k, v in step.items() if k in subset})"
]
},
{
"cell_type": "markdown",
"id": "b2717810",
"metadata": {},
"source": [
"</details>\n",
"\n",
"## Next steps\n",
"\n",
"See guides for generating structured output [here](/docs/how_to/structured_output/).\n",
"\n",
"Check out the [LangGraph documentation](https://langchain-ai.github.io/langgraph/) for detail on building with LangGraph."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -45,5 +45,7 @@ The below pages assist with migration from various specific chains to LCEL and L
- [RefineDocumentsChain](/docs/versions/migrating_chains/refine_docs_chain)
- [LLMRouterChain](/docs/versions/migrating_chains/llm_router_chain)
- [MultiPromptChain](/docs/versions/migrating_chains/multi_prompt_chain)
- [LLMMathChain](/docs/versions/migrating_chains/llm_math_chain)
- [ConstitutionalChain](/docs/versions/migrating_chains/constitutional_chain)
Check out the [LCEL conceptual docs](/docs/concepts/#langchain-expression-language-lcel) and [LangGraph docs](https://langchain-ai.github.io/langgraph/) for more background information.

File diff suppressed because one or more lines are too long

View File

@@ -510,6 +510,55 @@ const FEATURE_TABLES = {
source: "Uses AWS API to load PDFs",
api: "API",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
},
{
name: "MathPix",
link: "mathpix",
source: "Uses MathPix to laod PDFs",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.MathpixPDFLoader.html"
},
{
name: "PDFPlumber",
link: "pdfplumber",
source: "Load PDF files using PDFPlumber",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
},
{
name: "PyPDFDirectry",
link: "pypdfdirectory",
source: "Load a directory with PDF files",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html"
},
{
name: "PyPDFium2",
link: "pypdfium2",
source: "Load PDF files using PyPDFium2",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFium2Loader.html"
},
{
name: "UnstructuredPDFLoader",
link: "unstructured_pdfloader",
source: "Load PDF files using Unstructured",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.UnstructuredPDFLoader.html"
},
{
name: "PyMuPDF",
link: "pymupdf",
source: "Load PDF files using PyMuPDF",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyMuPDFLoader.html"
},
{
name: "PDFMiner",
link: "pdfminer",
source: "Load PDF files using PDFMiner",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PDFMinerLoader.html"
}
]
},
@@ -540,6 +589,24 @@ const FEATURE_TABLES = {
source: "All file types",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
},
{
name: "JSONLoader",
link: "json",
source: "JSON files",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.json_loader.JSONLoader.html"
},
{
name: "UnstructuredMarkdownLoader",
link: "unstructured_markdown",
source: "Markdown files",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html"
},
{
name: "BSHTMLLoader",
link: "bshtml",
source: "HTML files",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.html_bs.BSHTMLLoader.html"
}
]
},
vectorstores: {

View File

@@ -0,0 +1,8 @@
"""Implement a GPT-3 driven browser.
Heavily influenced from https://github.com/nat/natbot
"""
from langchain_community.chains.natbot.base import NatBotChain
__all__ = ["NatBotChain"]

View File

@@ -0,0 +1,3 @@
from langchain.chains import NatBotChain
__all__ = ["NatBotChain"]

View File

@@ -0,0 +1,7 @@
from langchain.chains.natbot.crawler import (
Crawler,
ElementInViewPort,
black_listed_elements,
)
__all__ = ["ElementInViewPort", "Crawler", "black_listed_elements"]

View File

@@ -0,0 +1,3 @@
from langchain.chains.natbot.prompt import PROMPT
__all__ = ["PROMPT"]

View File

@@ -10,7 +10,74 @@ logger = logging.getLogger(__name__)
class BSHTMLLoader(BaseLoader):
"""Load `HTML` files and parse them with `beautiful soup`."""
"""
__ModuleName__ document loader integration
Setup:
Install ``langchain-community`` and ``bs4``.
.. code-block:: bash
pip install -U langchain-community bs4
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import BSHTMLLoader
loader = BSHTMLLoader(
file_path="./example_data/fake-content.html",
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Test Title
My First Heading
My first paragraph.
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Test Title
My First Heading
My first paragraph.
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
""" # noqa: E501
def __init__(
self,

View File

@@ -13,19 +13,60 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain_community.document_loaders import UnstructuredMarkdownLoader
Setup:
Install ``langchain-community``.
loader = UnstructuredMarkdownLoader(
"example.md", mode="elements", strategy="fast",
)
docs = loader.load()
.. code-block:: bash
pip install -U langchain-community
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import UnstructuredMarkdownLoader
loader = UnstructuredMarkdownLoader(
"./example_data/example.md",
mode="elements",
strategy="fast",
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Sample Markdown Document
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Sample Markdown Document
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
References
----------
https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
"""
""" # noqa: E501
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__

View File

@@ -6,14 +6,6 @@ from langchain_core.documents import Document
from langchain_community.chat_models import ChatOpenAI
def test_llm_construction_with_kwargs() -> None:
llm_chain_kwargs = {"verbose": True}
compressor = LLMChainExtractor.from_llm(
ChatOpenAI(), llm_chain_kwargs=llm_chain_kwargs
)
assert compressor.llm_chain.verbose is True
def test_llm_chain_extractor() -> None:
texts = [
"The Roman Empire followed the Roman Republic.",

View File

@@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
INDEX_NAME = "langchain-test-index"
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING: str = "mongodb+srv://akataria:Basket24ball@akataria-vector-search-testing.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
num_lists = 3

View File

@@ -2,11 +2,10 @@
from typing import Any, Dict, List, Optional
from langchain.chains.natbot.base import NatBotChain
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain.chains.natbot.base import NatBotChain
class FakeLLM(LLM):
"""Fake LLM wrapper for testing purposes."""

View File

@@ -180,7 +180,11 @@ class SemanticChunker(BaseDocumentTransformer):
x = max(min(self.number_of_chunks, x1), x2)
# Linear interpolation formula
y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1)
if x2 == x1:
y = y2
else:
y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1)
y = min(max(y, 0), 100)
return cast(float, np.percentile(distances, y))

View File

@@ -2,6 +2,7 @@
from typing import Any, Dict, List, Optional
from langchain_core._api import deprecated
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import BasePromptTemplate
@@ -13,9 +14,151 @@ from langchain.chains.constitutional_ai.prompts import CRITIQUE_PROMPT, REVISION
from langchain.chains.llm import LLMChain
@deprecated(
since="0.2.13",
message=(
"This class is deprecated and will be removed in langchain 1.0. "
"See API reference for replacement: "
"https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.base.ConstitutionalChain.html" # noqa: E501
),
removal="1.0",
)
class ConstitutionalChain(Chain):
"""Chain for applying constitutional principles.
Note: this class is deprecated. See below for a replacement implementation
using LangGraph. The benefits of this implementation are:
- Uses LLM tool calling features instead of parsing string responses;
- Support for both token-by-token and step-by-step streaming;
- Support for checkpointing and memory of chat history;
- Easier to modify or extend (e.g., with additional tools, structured responses, etc.)
Install LangGraph with:
.. code-block:: bash
pip install -U langgraph
.. code-block:: python
from typing import List, Optional, Tuple
from langchain.chains.constitutional_ai.prompts import (
CRITIQUE_PROMPT,
REVISION_PROMPT,
)
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langgraph.graph import END, START, StateGraph
from typing_extensions import Annotated, TypedDict
llm = ChatOpenAI(model="gpt-4o-mini")
class Critique(TypedDict):
\"\"\"Generate a critique, if needed.\"\"\"
critique_needed: Annotated[bool, ..., "Whether or not a critique is needed."]
critique: Annotated[str, ..., "If needed, the critique."]
critique_prompt = ChatPromptTemplate.from_template(
"Critique this response according to the critique request. "
"If no critique is needed, specify that.\\n\\n"
"Query: {query}\\n\\n"
"Response: {response}\\n\\n"
"Critique request: {critique_request}"
)
revision_prompt = ChatPromptTemplate.from_template(
"Revise this response according to the critique and reivsion request.\\n\\n"
"Query: {query}\\n\\n"
"Response: {response}\\n\\n"
"Critique request: {critique_request}\\n\\n"
"Critique: {critique}\\n\\n"
"If the critique does not identify anything worth changing, ignore the "
"revision request and return 'No revisions needed'. If the critique "
"does identify something worth changing, revise the response based on "
"the revision request.\\n\\n"
"Revision Request: {revision_request}"
)
chain = llm | StrOutputParser()
critique_chain = critique_prompt | llm.with_structured_output(Critique)
revision_chain = revision_prompt | llm | StrOutputParser()
class State(TypedDict):
query: str
constitutional_principles: List[ConstitutionalPrinciple]
initial_response: str
critiques_and_revisions: List[Tuple[str, str]]
response: str
async def generate_response(state: State):
\"\"\"Generate initial response.\"\"\"
response = await chain.ainvoke(state["query"])
return {"response": response, "initial_response": response}
async def critique_and_revise(state: State):
\"\"\"Critique and revise response according to principles.\"\"\"
critiques_and_revisions = []
response = state["initial_response"]
for principle in state["constitutional_principles"]:
critique = await critique_chain.ainvoke(
{
"query": state["query"],
"response": response,
"critique_request": principle.critique_request,
}
)
if critique["critique_needed"]:
revision = await revision_chain.ainvoke(
{
"query": state["query"],
"response": response,
"critique_request": principle.critique_request,
"critique": critique["critique"],
"revision_request": principle.revision_request,
}
)
response = revision
critiques_and_revisions.append((critique["critique"], revision))
else:
critiques_and_revisions.append((critique["critique"], ""))
return {
"critiques_and_revisions": critiques_and_revisions,
"response": response,
}
graph = StateGraph(State)
graph.add_node("generate_response", generate_response)
graph.add_node("critique_and_revise", critique_and_revise)
graph.add_edge(START, "generate_response")
graph.add_edge("generate_response", "critique_and_revise")
graph.add_edge("critique_and_revise", END)
app = graph.compile()
.. code-block:: python
constitutional_principles=[
ConstitutionalPrinciple(
critique_request="Tell if this answer is good.",
revision_request="Give a better answer.",
)
]
query = "What is the meaning of life? Answer in 10 words or fewer."
async for step in app.astream(
{"query": query, "constitutional_principles": constitutional_principles},
stream_mode="values",
):
subset = ["initial_response", "critiques_and_revisions", "response"]
print({k: v for k, v in step.items() if k in subset})
Example:
.. code-block:: python
@@ -44,7 +187,7 @@ class ConstitutionalChain(Chain):
)
constitutional_chain.run(question="What is the meaning of life?")
"""
""" # noqa: E501
chain: LLMChain
constitutional_principles: List[ConstitutionalPrinciple]

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import re
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Sequence, Tuple
import numpy as np
@@ -9,10 +8,12 @@ from langchain_core.callbacks import (
CallbackManagerForChainRun,
)
from langchain_core.language_models import BaseLanguageModel
from langchain_core.outputs import Generation
from langchain_core.messages import AIMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import BasePromptTemplate
from langchain_core.pydantic_v1 import Field
from langchain_core.retrievers import BaseRetriever
from langchain_core.runnables import Runnable
from langchain.chains.base import Chain
from langchain.chains.flare.prompts import (
@@ -23,51 +24,14 @@ from langchain.chains.flare.prompts import (
from langchain.chains.llm import LLMChain
class _ResponseChain(LLMChain):
"""Base class for chains that generate responses."""
prompt: BasePromptTemplate = PROMPT
@classmethod
def is_lc_serializable(cls) -> bool:
return False
@property
def input_keys(self) -> List[str]:
return self.prompt.input_variables
def generate_tokens_and_log_probs(
self,
_input: Dict[str, Any],
*,
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Tuple[Sequence[str], Sequence[float]]:
llm_result = self.generate([_input], run_manager=run_manager)
return self._extract_tokens_and_log_probs(llm_result.generations[0])
@abstractmethod
def _extract_tokens_and_log_probs(
self, generations: List[Generation]
) -> Tuple[Sequence[str], Sequence[float]]:
"""Extract tokens and log probs from response."""
class _OpenAIResponseChain(_ResponseChain):
"""Chain that generates responses from user input and context."""
llm: BaseLanguageModel
def _extract_tokens_and_log_probs(
self, generations: List[Generation]
) -> Tuple[Sequence[str], Sequence[float]]:
tokens = []
log_probs = []
for gen in generations:
if gen.generation_info is None:
raise ValueError
tokens.extend(gen.generation_info["logprobs"]["tokens"])
log_probs.extend(gen.generation_info["logprobs"]["token_logprobs"])
return tokens, log_probs
def _extract_tokens_and_log_probs(response: AIMessage) -> Tuple[List[str], List[float]]:
"""Extract tokens and log probabilities from chat model response."""
tokens = []
log_probs = []
for token in response.response_metadata["logprobs"]["content"]:
tokens.append(token["token"])
log_probs.append(token["logprob"])
return tokens, log_probs
class QuestionGeneratorChain(LLMChain):
@@ -111,9 +75,9 @@ class FlareChain(Chain):
"""Chain that combines a retriever, a question generator,
and a response generator."""
question_generator_chain: QuestionGeneratorChain
question_generator_chain: Runnable
"""Chain that generates questions from uncertain spans."""
response_chain: _ResponseChain
response_chain: Runnable
"""Chain that generates responses from user input and context."""
output_parser: FinishedOutputParser = Field(default_factory=FinishedOutputParser)
"""Parser that determines whether the chain is finished."""
@@ -152,12 +116,16 @@ class FlareChain(Chain):
for question in questions:
docs.extend(self.retriever.invoke(question))
context = "\n\n".join(d.page_content for d in docs)
result = self.response_chain.predict(
user_input=user_input,
context=context,
response=response,
callbacks=callbacks,
result = self.response_chain.invoke(
{
"user_input": user_input,
"context": context,
"response": response,
},
{"callbacks": callbacks},
)
if isinstance(result, AIMessage):
result = result.content
marginal, finished = self.output_parser.parse(result)
return marginal, finished
@@ -178,13 +146,18 @@ class FlareChain(Chain):
for span in low_confidence_spans
]
callbacks = _run_manager.get_child()
question_gen_outputs = self.question_generator_chain.apply(
question_gen_inputs, callbacks=callbacks
)
questions = [
output[self.question_generator_chain.output_keys[0]]
for output in question_gen_outputs
]
if isinstance(self.question_generator_chain, LLMChain):
question_gen_outputs = self.question_generator_chain.apply(
question_gen_inputs, callbacks=callbacks
)
questions = [
output[self.question_generator_chain.output_keys[0]]
for output in question_gen_outputs
]
else:
questions = self.question_generator_chain.batch(
question_gen_inputs, config={"callbacks": callbacks}
)
_run_manager.on_text(
f"Generated Questions: {questions}", color="yellow", end="\n"
)
@@ -206,8 +179,10 @@ class FlareChain(Chain):
f"Current Response: {response}", color="blue", end="\n"
)
_input = {"user_input": user_input, "context": "", "response": response}
tokens, log_probs = self.response_chain.generate_tokens_and_log_probs(
_input, run_manager=_run_manager
tokens, log_probs = _extract_tokens_and_log_probs(
self.response_chain.invoke(
_input, {"callbacks": _run_manager.get_child()}
)
)
low_confidence_spans = _low_confidence_spans(
tokens,
@@ -251,18 +226,16 @@ class FlareChain(Chain):
FlareChain class with the given language model.
"""
try:
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
except ImportError:
raise ImportError(
"OpenAI is required for FlareChain. "
"Please install langchain-openai."
"pip install langchain-openai"
)
question_gen_chain = QuestionGeneratorChain(llm=llm)
response_llm = OpenAI(
max_tokens=max_generation_len, model_kwargs={"logprobs": 1}, temperature=0
)
response_chain = _OpenAIResponseChain(llm=response_llm)
llm = ChatOpenAI(max_tokens=max_generation_len, logprobs=True, temperature=0)
response_chain = PROMPT | llm
question_gen_chain = QUESTION_GENERATOR_PROMPT | llm | StrOutputParser()
return cls(
question_generator_chain=question_gen_chain,
response_chain=response_chain,

View File

@@ -11,7 +11,9 @@ import numpy as np
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import BasePromptTemplate
from langchain_core.runnables import Runnable
from langchain.chains.base import Chain
from langchain.chains.hyde.prompts import PROMPT_MAP
@@ -25,7 +27,7 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
"""
base_embeddings: Embeddings
llm_chain: LLMChain
llm_chain: Runnable
class Config:
arbitrary_types_allowed = True
@@ -34,12 +36,15 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
@property
def input_keys(self) -> List[str]:
"""Input keys for Hyde's LLM chain."""
return self.llm_chain.input_keys
return self.llm_chain.input_schema.schema()["required"]
@property
def output_keys(self) -> List[str]:
"""Output keys for Hyde's LLM chain."""
return self.llm_chain.output_keys
if isinstance(self.llm_chain, LLMChain):
return self.llm_chain.output_keys
else:
return ["text"]
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Call the base embeddings."""
@@ -51,9 +56,12 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
def embed_query(self, text: str) -> List[float]:
"""Generate a hypothetical document and embedded it."""
var_name = self.llm_chain.input_keys[0]
result = self.llm_chain.generate([{var_name: text}])
documents = [generation.text for generation in result.generations[0]]
var_name = self.input_keys[0]
result = self.llm_chain.invoke({var_name: text})
if isinstance(self.llm_chain, LLMChain):
documents = [result[self.output_keys[0]]]
else:
documents = [result]
embeddings = self.embed_documents(documents)
return self.combine_embeddings(embeddings)
@@ -64,7 +72,9 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
) -> Dict[str, str]:
"""Call the internal llm chain."""
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
return self.llm_chain(inputs, callbacks=_run_manager.get_child())
return self.llm_chain.invoke(
inputs, config={"callbacks": _run_manager.get_child()}
)
@classmethod
def from_llm(
@@ -86,7 +96,7 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
f"of {list(PROMPT_MAP.keys())}."
)
llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain = prompt | llm | StrOutputParser()
return cls(base_embeddings=base_embeddings, llm_chain=llm_chain, **kwargs)
@property

View File

@@ -7,6 +7,7 @@ import re
import warnings
from typing import Any, Dict, List, Optional
from langchain_core._api import deprecated
from langchain_core.callbacks import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
@@ -20,16 +21,132 @@ from langchain.chains.llm import LLMChain
from langchain.chains.llm_math.prompt import PROMPT
@deprecated(
since="0.2.13",
message=(
"This class is deprecated and will be removed in langchain 1.0. "
"See API reference for replacement: "
"https://api.python.langchain.com/en/latest/chains/langchain.chains.llm_math.base.LLMMathChain.html" # noqa: E501
),
removal="1.0",
)
class LLMMathChain(Chain):
"""Chain that interprets a prompt and executes python code to do math.
Note: this class is deprecated. See below for a replacement implementation
using LangGraph. The benefits of this implementation are:
- Uses LLM tool calling features;
- Support for both token-by-token and step-by-step streaming;
- Support for checkpointing and memory of chat history;
- Easier to modify or extend (e.g., with additional tools, structured responses, etc.)
Install LangGraph with:
.. code-block:: bash
pip install -U langgraph
.. code-block:: python
import math
from typing import Annotated, Sequence
from langchain_core.messages import BaseMessage
from langchain_core.runnables import RunnableConfig
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt.tool_node import ToolNode
import numexpr
from typing_extensions import TypedDict
@tool
def calculator(expression: str) -> str:
\"\"\"Calculate expression using Python's numexpr library.
Expression should be a single line mathematical expression
that solves the problem.
Examples:
"37593 * 67" for "37593 times 67"
"37593**(1/5)" for "37593^(1/5)"
\"\"\"
local_dict = {"pi": math.pi, "e": math.e}
return str(
numexpr.evaluate(
expression.strip(),
global_dict={}, # restrict access to globals
local_dict=local_dict, # add common mathematical functions
)
)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
tools = [calculator]
llm_with_tools = llm.bind_tools(tools, tool_choice="any")
class ChainState(TypedDict):
\"\"\"LangGraph state.\"\"\"
messages: Annotated[Sequence[BaseMessage], add_messages]
async def acall_chain(state: ChainState, config: RunnableConfig):
last_message = state["messages"][-1]
response = await llm_with_tools.ainvoke(state["messages"], config)
return {"messages": [response]}
async def acall_model(state: ChainState, config: RunnableConfig):
response = await llm.ainvoke(state["messages"], config)
return {"messages": [response]}
graph_builder = StateGraph(ChainState)
graph_builder.add_node("call_tool", acall_chain)
graph_builder.add_node("execute_tool", ToolNode(tools))
graph_builder.add_node("call_model", acall_model)
graph_builder.set_entry_point("call_tool")
graph_builder.add_edge("call_tool", "execute_tool")
graph_builder.add_edge("execute_tool", "call_model")
graph_builder.add_edge("call_model", END)
chain = graph_builder.compile()
.. code-block:: python
example_query = "What is 551368 divided by 82"
events = chain.astream(
{"messages": [("user", example_query)]},
stream_mode="values",
)
async for event in events:
event["messages"][-1].pretty_print()
.. code-block:: none
================================ Human Message =================================
What is 551368 divided by 82
================================== Ai Message ==================================
Tool Calls:
calculator (call_MEiGXuJjJ7wGU4aOT86QuGJS)
Call ID: call_MEiGXuJjJ7wGU4aOT86QuGJS
Args:
expression: 551368 / 82
================================= Tool Message =================================
Name: calculator
6724.0
================================== Ai Message ==================================
551368 divided by 82 equals 6724.
Example:
.. code-block:: python
from langchain.chains import LLMMathChain
from langchain_community.llms import OpenAI
llm_math = LLMMathChain.from_llm(OpenAI())
"""
""" # noqa: E501
llm_chain: LLMChain
llm: Optional[BaseLanguageModel] = None

View File

@@ -5,15 +5,27 @@ from __future__ import annotations
import warnings
from typing import Any, Dict, List, Optional
from langchain_core._api import deprecated
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import root_validator
from langchain_core.runnables import Runnable
from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain.chains.natbot.prompt import PROMPT
@deprecated(
since="0.2.13",
message=(
"Importing NatBotChain from langchain is deprecated and will be removed in "
"langchain 1.0. Please import from langchain_community instead: "
"from langchain_community.chains.natbot import NatBotChain. "
"You may need to pip install -U langchain-community."
),
removal="1.0",
)
class NatBotChain(Chain):
"""Implement an LLM driven browser.
@@ -37,7 +49,7 @@ class NatBotChain(Chain):
natbot = NatBotChain.from_default("Buy me a new hat.")
"""
llm_chain: LLMChain
llm_chain: Runnable
objective: str
"""Objective that NatBot is tasked with completing."""
llm: Optional[BaseLanguageModel] = None
@@ -60,7 +72,7 @@ class NatBotChain(Chain):
"class method."
)
if "llm_chain" not in values and values["llm"] is not None:
values["llm_chain"] = LLMChain(llm=values["llm"], prompt=PROMPT)
values["llm_chain"] = PROMPT | values["llm"] | StrOutputParser()
return values
@classmethod
@@ -77,7 +89,7 @@ class NatBotChain(Chain):
cls, llm: BaseLanguageModel, objective: str, **kwargs: Any
) -> NatBotChain:
"""Load from LLM."""
llm_chain = LLMChain(llm=llm, prompt=PROMPT)
llm_chain = PROMPT | llm | StrOutputParser()
return cls(llm_chain=llm_chain, objective=objective, **kwargs)
@property
@@ -104,12 +116,14 @@ class NatBotChain(Chain):
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
url = inputs[self.input_url_key]
browser_content = inputs[self.input_browser_content_key]
llm_cmd = self.llm_chain.predict(
objective=self.objective,
url=url[:100],
previous_command=self.previous_command,
browser_content=browser_content[:4500],
callbacks=_run_manager.get_child(),
llm_cmd = self.llm_chain.invoke(
{
"objective": self.objective,
"url": url[:100],
"previous_command": self.previous_command,
"browser_content": browser_content[:4500],
},
config={"callbacks": _run_manager.get_child()},
)
llm_cmd = llm_cmd.strip()
self.previous_command = llm_cmd

View File

@@ -27,11 +27,11 @@ from langchain_core.callbacks.manager import (
from langchain_core.exceptions import OutputParserException
from langchain_core.language_models import BaseLanguageModel
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.output_parsers import BaseOutputParser, StrOutputParser
from langchain_core.pydantic_v1 import Field
from langchain_core.runnables import Runnable
from langchain_core.tools import BaseTool
from langchain.chains.llm import LLMChain
from langchain.evaluation.agents.trajectory_eval_prompt import (
EVAL_CHAT_PROMPT,
TOOL_FREE_EVAL_CHAT_PROMPT,
@@ -147,7 +147,7 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
agent_tools: Optional[List[BaseTool]] = None
"""A list of tools available to the agent."""
eval_chain: LLMChain
eval_chain: Runnable
"""The language model chain used for evaluation."""
output_parser: TrajectoryOutputParser = Field(
default_factory=TrajectoryOutputParser
@@ -253,7 +253,7 @@ The following is the expected answer. Use this to measure correctness:
prompt = EVAL_CHAT_PROMPT
else:
prompt = TOOL_FREE_EVAL_CHAT_PROMPT
eval_chain = LLMChain(llm=llm, prompt=prompt)
eval_chain = prompt | llm | StrOutputParser()
return cls(
agent_tools=agent_tools, # type: ignore[arg-type]
eval_chain=eval_chain,
@@ -303,8 +303,8 @@ The following is the expected answer. Use this to measure correctness:
if self.agent_tools:
chain_input["tool_descriptions"] = self._tools_description
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
raw_output = self.eval_chain.run(
chain_input, callbacks=_run_manager.get_child()
raw_output = self.eval_chain.invoke(
chain_input, {"callbacks": _run_manager.get_child()}
)
return cast(dict, self.output_parser.parse(raw_output))
@@ -327,8 +327,8 @@ The following is the expected answer. Use this to measure correctness:
if self.agent_tools:
chain_input["tool_descriptions"] = self._tools_description
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
raw_output = await self.eval_chain.arun(
chain_input, callbacks=_run_manager.get_child()
raw_output = await self.eval_chain.ainvoke(
chain_input, {"callbacks": _run_manager.get_child()}
)
return cast(dict, self.output_parser.parse(raw_output))

View File

@@ -6,14 +6,15 @@ import logging
import re
from typing import Any, Dict, List, Optional, Union
from langchain_core.callbacks.manager import Callbacks
from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts.base import BasePromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import Field
from langchain_core.runnables import RunnableConfig
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.comparison.prompt import (
COMPARISON_TEMPLATE,
COMPARISON_TEMPLATE_WITH_REFERENCE,
@@ -151,7 +152,7 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
}
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain):
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs.
@@ -186,6 +187,10 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
output_parser: BaseOutputParser = Field(
default_factory=PairwiseStringResultOutputParser
)
llm: BaseLanguageModel
"""The language model to use for scoring."""
prompt: BasePromptTemplate
"""The prompt to use for scoring."""
@classmethod
def is_lc_serializable(cls) -> bool:
@@ -228,6 +233,22 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
)
@property
def input_keys(self) -> List[str]:
"""Will be whatever keys the prompt expects.
:meta private:
"""
return self.prompt.input_variables
@property
def output_keys(self) -> List[str]:
"""Will always return text key.
:meta private:
"""
return [self.output_key]
@classmethod
def from_llm(
cls,
@@ -305,6 +326,19 @@ Performance may be significantly worse with other models."
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
if run_manager:
config = RunnableConfig(callbacks=run_manager.get_child())
else:
config = None
chain = self.prompt | self.llm | self.output_parser
response = chain.invoke(inputs, config=config)
return {self.output_key: response}
def _evaluate_string_pairs(
self,
*,
@@ -338,13 +372,17 @@ Performance may be significantly worse with other models."
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
# prep config
config: RunnableConfig = {}
if callbacks is not None:
config["callbacks"] = callbacks
if tags is not None:
config["tags"] = tags
if metadata is not None:
config["metadata"] = metadata
result = self.invoke(input_, config=config, include_run_info=include_run_info)
return self._prepare_output(result)
async def _aevaluate_string_pairs(
@@ -380,13 +418,20 @@ Performance may be significantly worse with other models."
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
# prep config
config: RunnableConfig = {}
if callbacks is not None:
config["callbacks"] = callbacks
if tags is not None:
config["tags"] = tags
if metadata is not None:
config["metadata"] = metadata
result = await self.ainvoke(
input_, config=config, include_run_info=include_run_info
)
return self._prepare_output(result)

View File

@@ -4,14 +4,14 @@ import re
from enum import Enum
from typing import Any, Dict, List, Mapping, Optional, Union
from langchain_core.callbacks.manager import Callbacks
from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import BasePromptTemplate
from langchain_core.pydantic_v1 import Field
from langchain_core.runnables import RunnableConfig
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY
@@ -164,7 +164,7 @@ def resolve_criteria(
return criteria_
class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
class CriteriaEvalChain(StringEvaluator, LLMEvalChain):
"""LLM Chain for evaluating runs against criteria.
Parameters
@@ -184,7 +184,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
reference labels in the prompt. Otherwise, the `PROMPT` template will be
used, which is a reference-free prompt.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain` constructor.
Additional keyword arguments to pass to the `Chain` constructor.
Returns
-------
@@ -231,6 +231,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criterion_name: str
"""The name of the criterion being evaluated."""
output_key: str = "results" #: :meta private:
llm: BaseLanguageModel
"""The language model to use for scoring."""
prompt: BasePromptTemplate
"""The prompt to use for scoring."""
@classmethod
def is_lc_serializable(cls) -> bool:
@@ -267,6 +271,22 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
"\nTo use references, use the labeled_criteria instead."
)
@property
def input_keys(self) -> List[str]:
"""Will be whatever keys the prompt expects.
:meta private:
"""
return self.prompt.input_variables
@property
def output_keys(self) -> List[str]:
"""Will always return text key.
:meta private:
"""
return [self.output_key]
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
@@ -332,7 +352,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
The prompt template to use for generating prompts. If not provided,
a default prompt template will be used.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
Additional keyword arguments to pass to the `Chain`
constructor.
Returns
@@ -396,6 +416,19 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
if run_manager:
config = RunnableConfig(callbacks=run_manager.get_child())
else:
config = None
chain = self.prompt | self.llm | self.output_parser
response = chain.invoke(inputs, config=config)
return {self.output_key: response}
def _evaluate_strings(
self,
*,
@@ -420,7 +453,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
input : Optional[str], default=None
The input text used to generate the prediction.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain` `__call__`
Additional keyword arguments to pass to the `Chain` `invoke`
method.
Returns
@@ -442,13 +475,17 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
"""
input_ = self._get_eval_input(prediction, reference, input)
result = self(
input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
# prep config
config: RunnableConfig = {}
if callbacks is not None:
config["callbacks"] = callbacks
if tags is not None:
config["tags"] = tags
if metadata is not None:
config["metadata"] = metadata
result = self.invoke(input_, config=config, include_run_info=include_run_info)
return self._prepare_output(result)
async def _aevaluate_strings(
@@ -475,7 +512,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
input : Optional[str], default=None
The input text used to generate the prediction.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain` `acall`
Additional keyword arguments to pass to the `Chain` `acall`
method.
Returns
@@ -497,12 +534,18 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
"""
input_ = self._get_eval_input(prediction, reference, input)
result = await self.acall(
input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
# prep config
config: RunnableConfig = {}
if callbacks is not None:
config["callbacks"] = callbacks
if tags is not None:
config["tags"] = tags
if metadata is not None:
config["metadata"] = metadata
result = await self.ainvoke(
input_, config=config, include_run_info=include_run_info
)
return self._prepare_output(result)
@@ -556,7 +599,7 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
The prompt template to use for generating prompts. If not provided,
a default prompt will be used.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
Additional keyword arguments to pass to the `Chain`
constructor.
Returns

View File

@@ -4,13 +4,15 @@ from __future__ import annotations
import re
import string
from typing import Any, List, Optional, Sequence, Tuple
from typing import Any, Dict, List, Optional, Sequence, Tuple
from langchain_core.callbacks.manager import Callbacks
from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts.base import BasePromptTemplate
from langchain_core.runnables import RunnableConfig
from langchain.chains.llm import LLMChain
from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY
@@ -67,10 +69,14 @@ def _parse_string_eval_output(text: str) -> dict:
}
class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
class QAEvalChain(StringEvaluator, LLMEvalChain):
"""LLM Chain for evaluating question answering."""
output_key: str = "results" #: :meta private:
llm: BaseLanguageModel
"""The language model to use for scoring."""
prompt: BasePromptTemplate
"""The prompt to use for scoring."""
class Config:
extra = "ignore"
@@ -91,6 +97,35 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
def requires_input(self) -> bool:
return True
@property
def input_keys(self) -> List[str]:
"""Will be whatever keys the prompt expects.
:meta private:
"""
return self.prompt.input_variables
@property
def output_keys(self) -> List[str]:
"""Will always return text key.
:meta private:
"""
return [self.output_key]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
if run_manager:
config = RunnableConfig(callbacks=run_manager.get_child())
else:
config = None
chain = self.prompt | self.llm | StrOutputParser()
response = chain.invoke(inputs, config=config)
return {self.output_key: response}
@classmethod
def from_llm(
cls,
@@ -141,8 +176,14 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
}
for i, example in enumerate(examples)
]
if callbacks:
config = RunnableConfig(callbacks=callbacks)
else:
config = None
outputs = self.batch(inputs, config=config)
return self.apply(inputs, callbacks=callbacks)
# Subset to output key only
return [{self.output_key: output[self.output_key]} for output in outputs]
def _prepare_output(self, result: dict) -> dict:
parsed_result = _parse_string_eval_output(result[self.output_key])
@@ -174,13 +215,17 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
Returns:
dict: The evaluation results containing the score or value.
"""
result = self(
if callbacks:
config = RunnableConfig(callbacks=callbacks)
else:
config = None
result = self.invoke(
{
"query": input,
"answer": reference,
"result": prediction,
},
callbacks=callbacks,
config=config,
include_run_info=include_run_info,
)
return self._prepare_output(result)
@@ -195,17 +240,31 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = await self.acall(
inputs={"query": input, "answer": reference, "result": prediction},
callbacks=callbacks,
if callbacks:
config = RunnableConfig(callbacks=callbacks)
else:
config = None
result = await self.ainvoke(
{
"query": input,
"answer": reference,
"result": prediction,
},
config=config,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
class ContextQAEvalChain(StringEvaluator, LLMEvalChain):
"""LLM Chain for evaluating QA w/o GT based on context"""
output_key: str = "text" #: :meta private:
llm: BaseLanguageModel
"""The language model to use for scoring."""
prompt: BasePromptTemplate
"""The prompt to use for scoring."""
@classmethod
def is_lc_serializable(cls) -> bool:
return False
@@ -220,6 +279,22 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
"""Whether the chain requires an input string."""
return True
@property
def input_keys(self) -> List[str]:
"""Will be whatever keys the prompt expects.
:meta private:
"""
return self.prompt.input_variables
@property
def output_keys(self) -> List[str]:
"""Will always return text key.
:meta private:
"""
return [self.output_key]
class Config:
extra = "ignore"
@@ -236,6 +311,19 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
def evaluation_name(self) -> str:
return "Contextual Accuracy"
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
if run_manager:
config = RunnableConfig(callbacks=run_manager.get_child())
else:
config = None
chain = self.prompt | self.llm | StrOutputParser()
response = chain.invoke(inputs, config=config)
return {self.output_key: response}
@classmethod
def from_llm(
cls,
@@ -281,8 +369,13 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
}
for i, example in enumerate(examples)
]
if callbacks:
config = RunnableConfig(callbacks=callbacks)
else:
config = None
outputs = self.batch(inputs, config=config)
return self.apply(inputs, callbacks=callbacks)
return [{self.output_key: output[self.output_key]} for output in outputs]
def _prepare_output(self, result: dict) -> dict:
parsed_result = _parse_string_eval_output(result[self.output_key])
@@ -300,13 +393,17 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = self(
if callbacks:
config = RunnableConfig(callbacks=callbacks)
else:
config = None
result = self.invoke(
{
"query": input,
"context": reference,
"result": prediction,
},
callbacks=callbacks,
config=config,
include_run_info=include_run_info,
)
return self._prepare_output(result)
@@ -321,9 +418,17 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = await self.acall(
inputs={"query": input, "context": reference, "result": prediction},
callbacks=callbacks,
if callbacks:
config = RunnableConfig(callbacks=callbacks)
else:
config = None
result = await self.ainvoke(
{
"query": input,
"context": reference,
"result": prediction,
},
config=config,
include_run_info=include_run_info,
)
return self._prepare_output(result)

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
from typing import Any
from langchain_core._api import deprecated
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import BaseLLMOutputParser
from langchain_core.pydantic_v1 import Field
@@ -17,8 +18,44 @@ _QA_OUTPUT_PARSER = RegexParser(
)
@deprecated(
since="0.2.13",
message=(
"This class is deprecated and will be removed in langchain 1.0. "
"See API reference for replacement: "
"https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.generate_chain.QAGenerateChain.html" # noqa: E501
),
removal="1.0",
)
class QAGenerateChain(LLMChain):
"""LLM Chain for generating examples for question answering."""
"""LLM Chain for generating examples for question answering.
Note: this class is deprecated. See below for a replacement implementation
that leverages LLM tool calling features.
.. code-block:: python
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from typing_extensions import TypedDict
template = \"\"\"You are a teacher coming up with questions to ask on a quiz.
Given the following document, please generate a question and answer based on that document.
These questions should be detailed and be based explicitly on information in the document.
\"\"\"
prompt = ChatPromptTemplate.from_template(template)
class QuestionAndAnswer(TypedDict):
\"\"\"Question and answer based on document.\"\"\"
question: str
answer: str
llm = ChatOpenAI(model="gpt-4o-mini").with_structured_output(QuestionAndAnswer)
llm.invoke("...")
""" # noqa: E501
output_parser: BaseLLMOutputParser = Field(default=_QA_OUTPUT_PARSER)
output_key: str = "qa_pairs"

View File

@@ -6,14 +6,16 @@ import logging
import re
from typing import Any, Dict, List, Optional, Union
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.callbacks.manager import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts.base import BasePromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import Field
from langchain_core.runnables import RunnableConfig
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.eval_chain import (
CRITERIA_TYPE,
Criteria,
@@ -144,7 +146,7 @@ class ScoreStringResultOutputParser(BaseOutputParser[dict]):
}
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain):
"""A chain for scoring on a scale of 1-10 the output of a model.
Attributes:
@@ -178,10 +180,43 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
"""The value to normalize the score by, if specified."""
criterion_name: str
"""The name of the criterion being evaluated."""
llm: BaseLanguageModel
"""The language model to use for scoring."""
prompt: BasePromptTemplate
"""The prompt to use for scoring."""
class Config:
extra = "ignore"
@property
def input_keys(self) -> List[str]:
"""Will be whatever keys the prompt expects.
:meta private:
"""
return self.prompt.input_variables
@property
def output_keys(self) -> List[str]:
"""Will always return text key.
:meta private:
"""
return [self.output_key]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
if run_manager:
config = RunnableConfig(callbacks=run_manager.get_child())
else:
config = None
chain = self.prompt | self.llm | self.output_parser
response = chain.invoke(inputs, config=config)
return {self.output_key: response}
@classmethod
def is_lc_serializable(cls) -> bool:
return False
@@ -348,13 +383,17 @@ Performance may be significantly worse with other models."
"""
input_ = self._prepare_input(prediction, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
# prep config
config: RunnableConfig = {}
if callbacks is not None:
config["callbacks"] = callbacks
if tags is not None:
config["tags"] = tags
if metadata is not None:
config["metadata"] = metadata
result = self.invoke(input_, config=config, include_run_info=include_run_info)
return self._prepare_output(result)
async def _aevaluate_string_pairs(
@@ -385,12 +424,18 @@ Performance may be significantly worse with other models."
"""
input_ = self._prepare_input(prediction, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
# prep config
config: RunnableConfig = {}
if callbacks is not None:
config["callbacks"] = callbacks
if tags is not None:
config["tags"] = tags
if metadata is not None:
config["metadata"] = metadata
result = await self.ainvoke(
input_, config=config, include_run_info=include_run_info
)
return self._prepare_output(result)

View File

@@ -8,8 +8,9 @@ from typing import Any, Callable, Dict, Optional, Sequence, cast
from langchain_core.callbacks.manager import Callbacks
from langchain_core.documents import Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.output_parsers import BaseOutputParser, StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import Runnable
from langchain.chains.llm import LLMChain
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
@@ -49,12 +50,15 @@ class LLMChainExtractor(BaseDocumentCompressor):
"""Document compressor that uses an LLM chain to extract
the relevant parts of documents."""
llm_chain: LLMChain
llm_chain: Runnable
"""LLM wrapper to use for compressing documents."""
get_input: Callable[[str, Document], dict] = default_get_input
"""Callable for constructing the chain input from the query and a Document."""
class Config:
arbitrary_types_allowed = True
def compress_documents(
self,
documents: Sequence[Document],
@@ -65,10 +69,13 @@ class LLMChainExtractor(BaseDocumentCompressor):
compressed_docs = []
for doc in documents:
_input = self.get_input(query, doc)
output_dict = self.llm_chain.invoke(_input, config={"callbacks": callbacks})
output = output_dict[self.llm_chain.output_key]
if self.llm_chain.prompt.output_parser is not None:
output = self.llm_chain.prompt.output_parser.parse(output)
output_ = self.llm_chain.invoke(_input, config={"callbacks": callbacks})
if isinstance(self.llm_chain, LLMChain):
output = output_[self.llm_chain.output_key]
if self.llm_chain.prompt.output_parser is not None:
output = self.llm_chain.prompt.output_parser.parse(output)
else:
output = output_
if len(output) == 0:
continue
compressed_docs.append(
@@ -85,9 +92,7 @@ class LLMChainExtractor(BaseDocumentCompressor):
"""Compress page content of raw documents asynchronously."""
outputs = await asyncio.gather(
*[
self.llm_chain.apredict_and_parse(
**self.get_input(query, doc), callbacks=callbacks
)
self.llm_chain.ainvoke(self.get_input(query, doc), callbacks=callbacks)
for doc in documents
]
)
@@ -111,5 +116,9 @@ class LLMChainExtractor(BaseDocumentCompressor):
"""Initialize from LLM."""
_prompt = prompt if prompt is not None else _get_default_chain_prompt()
_get_input = get_input if get_input is not None else default_get_input
llm_chain = LLMChain(llm=llm, prompt=_prompt, **(llm_chain_kwargs or {}))
if _prompt.output_parser is not None:
parser = _prompt.output_parser
else:
parser = StrOutputParser()
llm_chain = _prompt | llm | parser
return cls(llm_chain=llm_chain, get_input=_get_input) # type: ignore[arg-type]

View File

@@ -5,7 +5,9 @@ from typing import Any, Callable, Dict, Optional, Sequence
from langchain_core.callbacks.manager import Callbacks
from langchain_core.documents import Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import BasePromptTemplate, PromptTemplate
from langchain_core.runnables import Runnable
from langchain_core.runnables.config import RunnableConfig
from langchain.chains import LLMChain
@@ -32,13 +34,16 @@ def default_get_input(query: str, doc: Document) -> Dict[str, Any]:
class LLMChainFilter(BaseDocumentCompressor):
"""Filter that drops documents that aren't relevant to the query."""
llm_chain: LLMChain
llm_chain: Runnable
"""LLM wrapper to use for filtering documents.
The chain prompt is expected to have a BooleanOutputParser."""
get_input: Callable[[str, Document], dict] = default_get_input
"""Callable for constructing the chain input from the query and a Document."""
class Config:
arbitrary_types_allowed = True
def compress_documents(
self,
documents: Sequence[Document],
@@ -56,11 +61,15 @@ class LLMChainFilter(BaseDocumentCompressor):
documents,
)
for output_dict, doc in outputs:
for output_, doc in outputs:
include_doc = None
output = output_dict[self.llm_chain.output_key]
if self.llm_chain.prompt.output_parser is not None:
include_doc = self.llm_chain.prompt.output_parser.parse(output)
if isinstance(self.llm_chain, LLMChain):
output = output_[self.llm_chain.output_key]
if self.llm_chain.prompt.output_parser is not None:
include_doc = self.llm_chain.prompt.output_parser.parse(output)
else:
if isinstance(output_, bool):
include_doc = output_
if include_doc:
filtered_docs.append(doc)
@@ -82,11 +91,15 @@ class LLMChainFilter(BaseDocumentCompressor):
),
documents,
)
for output_dict, doc in outputs:
for output_, doc in outputs:
include_doc = None
output = output_dict[self.llm_chain.output_key]
if self.llm_chain.prompt.output_parser is not None:
include_doc = self.llm_chain.prompt.output_parser.parse(output)
if isinstance(self.llm_chain, LLMChain):
output = output_[self.llm_chain.output_key]
if self.llm_chain.prompt.output_parser is not None:
include_doc = self.llm_chain.prompt.output_parser.parse(output)
else:
if isinstance(output_, bool):
include_doc = output_
if include_doc:
filtered_docs.append(doc)
@@ -110,5 +123,9 @@ class LLMChainFilter(BaseDocumentCompressor):
A LLMChainFilter that uses the given language model.
"""
_prompt = prompt if prompt is not None else _get_default_chain_prompt()
llm_chain = LLMChain(llm=llm, prompt=_prompt)
if _prompt.output_parser is not None:
parser = _prompt.output_parser
else:
parser = StrOutputParser()
llm_chain = _prompt | llm | parser
return cls(llm_chain=llm_chain, **kwargs)

View File

@@ -7,11 +7,11 @@ from langchain_core.callbacks import (
)
from langchain_core.documents import Document
from langchain_core.language_models import BaseLLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import BasePromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.retrievers import BaseRetriever
from langchain.chains.llm import LLMChain
from langchain_core.runnables import Runnable
logger = logging.getLogger(__name__)
@@ -30,7 +30,7 @@ class RePhraseQueryRetriever(BaseRetriever):
Then, retrieve docs for the re-phrased query."""
retriever: BaseRetriever
llm_chain: LLMChain
llm_chain: Runnable
@classmethod
def from_llm(
@@ -51,8 +51,7 @@ class RePhraseQueryRetriever(BaseRetriever):
Returns:
RePhraseQueryRetriever
"""
llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain = prompt | llm | StrOutputParser()
return cls(
retriever=retriever,
llm_chain=llm_chain,
@@ -72,8 +71,9 @@ class RePhraseQueryRetriever(BaseRetriever):
Returns:
Relevant documents for re-phrased question
"""
response = self.llm_chain(query, callbacks=run_manager.get_child())
re_phrased_question = response["text"]
re_phrased_question = self.llm_chain.invoke(
query, {"callbacks": run_manager.get_child()}
)
logger.info(f"Re-phrased question: {re_phrased_question}")
docs = self.retriever.invoke(
re_phrased_question, config={"callbacks": run_manager.get_child()}

View File

@@ -1872,7 +1872,7 @@ files = [
[[package]]
name = "langchain-core"
version = "0.2.31"
version = "0.2.32"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.8.1,<4.0"
@@ -4711,4 +4711,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "6503a7f00cec168c1c1b165ac5fa01cd239f20e0437225176683c98100e36d33"
content-hash = "6fdb171d86fbc444f698e3a1835224bd47e76218facbc69d478cd2a8cb72406f"

View File

@@ -33,7 +33,7 @@ langchain-server = "langchain.server:main"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain-core = "^0.2.31"
langchain-core = "^0.2.32"
langchain-text-splitters = "^0.2.0"
langsmith = "^0.1.17"
pydantic = ">=1,<3"

View File

@@ -0,0 +1,84 @@
from langchain_core.documents import Document
from langchain_core.language_models import FakeListChatModel
from langchain.retrievers.document_compressors import LLMChainExtractor
def test_llm_chain_extractor() -> None:
    """LLMChainExtractor keeps extracted content and drops NO_OUTPUT docs.

    The fake LLM returns one canned response per document; the third
    response is "NO_OUTPUT", so the third document must be filtered out
    while the first two are replaced by their extracted content.
    """
    documents = [
        Document(
            page_content=(
                "The sky is blue. Candlepin bowling is popular in New England."
            ),
            metadata={"a": 1},
        ),
        Document(
            page_content=(
                "Mercury is the closest planet to the Sun. "
                "Candlepin bowling balls are smaller."
            ),
            metadata={"b": 2},
        ),
        Document(page_content="The moon is round.", metadata={"c": 3}),
    ]
    llm = FakeListChatModel(
        responses=[
            "Candlepin bowling is popular in New England.",
            "Candlepin bowling balls are smaller.",
            "NO_OUTPUT",
        ]
    )
    doc_compressor = LLMChainExtractor.from_llm(llm)
    output = doc_compressor.compress_documents(
        documents, "Tell me about Candlepin bowling."
    )
    # Bug fix: the original wrote `expected = documents = [...]`, which
    # silently rebound `documents` as well. Bind only `expected`.
    expected = [
        Document(
            page_content="Candlepin bowling is popular in New England.",
            metadata={"a": 1},
        ),
        Document(
            page_content="Candlepin bowling balls are smaller.", metadata={"b": 2}
        ),
    ]
    assert output == expected
async def test_llm_chain_extractor_async() -> None:
    """Async variant: LLMChainExtractor drops NO_OUTPUT docs, keeps the rest.

    Mirrors test_llm_chain_extractor but exercises acompress_documents.
    """
    documents = [
        Document(
            page_content=(
                "The sky is blue. Candlepin bowling is popular in New England."
            ),
            metadata={"a": 1},
        ),
        Document(
            page_content=(
                "Mercury is the closest planet to the Sun. "
                "Candlepin bowling balls are smaller."
            ),
            metadata={"b": 2},
        ),
        Document(page_content="The moon is round.", metadata={"c": 3}),
    ]
    llm = FakeListChatModel(
        responses=[
            "Candlepin bowling is popular in New England.",
            "Candlepin bowling balls are smaller.",
            "NO_OUTPUT",
        ]
    )
    doc_compressor = LLMChainExtractor.from_llm(llm)
    output = await doc_compressor.acompress_documents(
        documents, "Tell me about Candlepin bowling."
    )
    # Bug fix: the original wrote `expected = documents = [...]`, which
    # silently rebound `documents` as well. Bind only `expected`.
    expected = [
        Document(
            page_content="Candlepin bowling is popular in New England.",
            metadata={"a": 1},
        ),
        Document(
            page_content="Candlepin bowling balls are smaller.", metadata={"b": 2}
        ),
    ]
    assert output == expected

View File

@@ -0,0 +1,46 @@
from langchain_core.documents import Document
from langchain_core.language_models import FakeListChatModel
from langchain.retrievers.document_compressors import LLMChainFilter
def test_llm_chain_filter() -> None:
    """LLMChainFilter keeps exactly the docs for which the LLM answers YES."""
    docs = [
        Document(
            page_content="Candlepin bowling is popular in New England.",
            metadata={"a": 1},
        ),
        Document(
            page_content="Candlepin bowling balls are smaller.",
            metadata={"b": 2},
        ),
        Document(page_content="The moon is round.", metadata={"c": 3}),
    ]
    # One canned verdict per document: keep, keep, drop.
    fake_llm = FakeListChatModel(responses=["YES", "YES", "NO"])
    compressor = LLMChainFilter.from_llm(fake_llm)
    result = compressor.compress_documents(
        docs, "Tell me about Candlepin bowling."
    )
    assert result == docs[:2]
async def test_llm_chain_filter_async() -> None:
    """Async variant: LLMChainFilter keeps the docs the LLM answers YES for.

    Bug fix: renamed from ``test_llm_chain_extractor_async`` — the name was
    copy-pasted from the extractor tests, but this test exercises
    ``LLMChainFilter.acompress_documents``.
    """
    documents = [
        Document(
            page_content="Candlepin bowling is popular in New England.",
            metadata={"a": 1},
        ),
        Document(
            page_content="Candlepin bowling balls are smaller.",
            metadata={"b": 2},
        ),
        Document(page_content="The moon is round.", metadata={"c": 3}),
    ]
    llm = FakeListChatModel(responses=["YES", "YES", "NO"])
    doc_compressor = LLMChainFilter.from_llm(llm)
    output = await doc_compressor.acompress_documents(
        documents, "Tell me about Candlepin bowling."
    )
    expected = documents[:2]
    assert output == expected