mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 16:43:35 +00:00
Chat Loaders (#9708)
Still working out interface/notebooks + need discord data dump to test out things other than copy+paste Update: - Going to remove the 'user_id' arg in the loaders themselves and just standardize on putting the "sender" arg in the extra kwargs. Then can provide a utility function to map these to ai and human messages - Going to move the discord one into just a notebook since I don't have a good dump to test on and copy+paste maybe isn't the greatest thing to support in v0 - Need to do more testing on slack since it seems the dump only includes channels and NOT 1 on 1 convos - --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
0f48e6c36e
commit
dff00ea91e
325
docs/extras/integrations/chat_loaders/discord.ipynb
Normal file
325
docs/extras/integrations/chat_loaders/discord.ipynb
Normal file
@ -0,0 +1,325 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "c4ff9336-1cf3-459e-bd70-d1314c1da6a0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Discord\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to create your own chat loader that works on copy-pasted messages (from dms) to a list of LangChain messages.\n",
|
||||||
|
"\n",
|
||||||
|
"The process has four steps:\n",
|
||||||
|
"1. Create the chat .txt file by copying chats from the Discord app and pasting them in a file on your local computer\n",
|
||||||
|
"2. Copy the chat loader definition from below to a local file.\n",
|
||||||
|
"3. Initialize the `DiscordChatLoader` with the file path pointed to the text file.\n",
|
||||||
|
"4. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n",
|
||||||
|
"\n",
|
||||||
|
    "## 1. Create message dump\n",
|
||||||
|
"\n",
|
||||||
|
"Currently (2023/08/23) this loader only supports .txt files in the format generated by copying messages in the app to your clipboard and pasting in a file. Below is an example."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "e4ccfdfa-6869-4d67-90a0-ab99f01b7553",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting discord_chats.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile discord_chats.txt\n",
|
||||||
|
"talkingtower — 08/15/2023 11:10 AM\n",
|
||||||
|
"Love music! Do you like jazz?\n",
|
||||||
|
"reporterbob — 08/15/2023 9:27 PM\n",
|
||||||
|
"Yes! Jazz is fantastic. Ever heard this one?\n",
|
||||||
|
"Website\n",
|
||||||
|
"Listen to classic jazz track...\n",
|
||||||
|
"\n",
|
||||||
|
"talkingtower — Yesterday at 5:03 AM\n",
|
||||||
|
"Indeed! Great choice. 🎷\n",
|
||||||
|
"reporterbob — Yesterday at 5:23 AM\n",
|
||||||
|
"Thanks! How about some virtual sightseeing?\n",
|
||||||
|
"Website\n",
|
||||||
|
"Virtual tour of famous landmarks...\n",
|
||||||
|
"\n",
|
||||||
|
"talkingtower — Today at 2:38 PM\n",
|
||||||
|
"Sounds fun! Let's explore.\n",
|
||||||
|
"reporterbob — Today at 2:56 PM\n",
|
||||||
|
"Enjoy the tour! See you around.\n",
|
||||||
|
"talkingtower — Today at 3:00 PM\n",
|
||||||
|
"Thank you! Goodbye! 👋\n",
|
||||||
|
"reporterbob — Today at 3:02 PM\n",
|
||||||
|
"Farewell! Happy exploring."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "359565a7-dad3-403c-a73c-6414b1295127",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 2. Define chat loader\n",
|
||||||
|
"\n",
|
||||||
|
    "LangChain currently does not support a built-in Discord chat loader, so we will define a custom one below."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "a429e0c4-4d7d-45f8-bbbb-c7fc5229f6af",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import logging\n",
|
||||||
|
"import re\n",
|
||||||
|
"from typing import Iterator, List\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain import schema\n",
|
||||||
|
"from langchain.chat_loaders import base as chat_loaders\n",
|
||||||
|
"\n",
|
||||||
|
"logger = logging.getLogger()\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"class DiscordChatLoader(chat_loaders.BaseChatLoader):\n",
|
||||||
|
" \n",
|
||||||
|
" def __init__(self, path: str):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Initialize the Discord chat loader.\n",
|
||||||
|
"\n",
|
||||||
|
" Args:\n",
|
||||||
|
" path: Path to the exported Discord chat text file.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" self.path = path\n",
|
||||||
|
" self._message_line_regex = re.compile(\n",
|
||||||
|
" r\"(.+?) — (\\w{3,9} \\d{1,2}(?:st|nd|rd|th)?(?:, \\d{4})? \\d{1,2}:\\d{2} (?:AM|PM)|Today at \\d{1,2}:\\d{2} (?:AM|PM)|Yesterday at \\d{1,2}:\\d{2} (?:AM|PM))\", # noqa\n",
|
||||||
|
" flags=re.DOTALL,\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" def _load_single_chat_session_from_txt(\n",
|
||||||
|
" self, file_path: str\n",
|
||||||
|
" ) -> chat_loaders.ChatSession:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Load a single chat session from a text file.\n",
|
||||||
|
"\n",
|
||||||
|
" Args:\n",
|
||||||
|
" file_path: Path to the text file containing the chat messages.\n",
|
||||||
|
"\n",
|
||||||
|
" Returns:\n",
|
||||||
|
" A `ChatSession` object containing the loaded chat messages.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||||
|
" lines = file.readlines()\n",
|
||||||
|
"\n",
|
||||||
|
" results: List[schema.BaseMessage] = []\n",
|
||||||
|
" current_sender = None\n",
|
||||||
|
" current_timestamp = None\n",
|
||||||
|
" current_content = []\n",
|
||||||
|
" for line in lines:\n",
|
||||||
|
" if re.match(\n",
|
||||||
|
" r\".+? — (\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2} (?:AM|PM)|Today at \\d{1,2}:\\d{2} (?:AM|PM)|Yesterday at \\d{1,2}:\\d{2} (?:AM|PM))\", # noqa\n",
|
||||||
|
" line,\n",
|
||||||
|
" ):\n",
|
||||||
|
" if current_sender and current_content:\n",
|
||||||
|
" results.append(\n",
|
||||||
|
" schema.HumanMessage(\n",
|
||||||
|
" content=\"\".join(current_content).strip(),\n",
|
||||||
|
" additional_kwargs={\n",
|
||||||
|
" \"sender\": current_sender,\n",
|
||||||
|
" \"events\": [{\"message_time\": current_timestamp}],\n",
|
||||||
|
" },\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
" current_sender, current_timestamp = line.split(\" — \")[:2]\n",
|
||||||
|
" current_content = [\n",
|
||||||
|
" line[len(current_sender) + len(current_timestamp) + 4 :].strip()\n",
|
||||||
|
" ]\n",
|
||||||
|
" elif re.match(r\"\\[\\d{1,2}:\\d{2} (?:AM|PM)\\]\", line.strip()):\n",
|
||||||
|
" results.append(\n",
|
||||||
|
" schema.HumanMessage(\n",
|
||||||
|
" content=\"\".join(current_content).strip(),\n",
|
||||||
|
" additional_kwargs={\n",
|
||||||
|
" \"sender\": current_sender,\n",
|
||||||
|
" \"events\": [{\"message_time\": current_timestamp}],\n",
|
||||||
|
" },\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
" current_timestamp = line.strip()[1:-1]\n",
|
||||||
|
" current_content = []\n",
|
||||||
|
" else:\n",
|
||||||
|
" current_content.append(\"\\n\" + line.strip())\n",
|
||||||
|
"\n",
|
||||||
|
" if current_sender and current_content:\n",
|
||||||
|
" results.append(\n",
|
||||||
|
" schema.HumanMessage(\n",
|
||||||
|
" content=\"\".join(current_content).strip(),\n",
|
||||||
|
" additional_kwargs={\n",
|
||||||
|
" \"sender\": current_sender,\n",
|
||||||
|
" \"events\": [{\"message_time\": current_timestamp}],\n",
|
||||||
|
" },\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" return chat_loaders.ChatSession(messages=results)\n",
|
||||||
|
"\n",
|
||||||
|
" def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Lazy load the messages from the chat file and yield them in the required format.\n",
|
||||||
|
"\n",
|
||||||
|
" Yields:\n",
|
||||||
|
" A `ChatSession` object containing the loaded chat messages.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" yield self._load_single_chat_session_from_txt(self.path)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "c8240393-48be-44d2-b0d6-52c215cd8ac2",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
    "## 3. Create loader\n",
|
||||||
|
"\n",
|
||||||
|
"We will point to the file we just wrote to disk."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "1268de40-b0e5-445d-9cd8-54856cd0293a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = DiscordChatLoader(\n",
|
||||||
|
" path=\"./discord_chats.txt\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4928df4b-ae31-48a7-bd76-be3ecee1f3e0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
    "## 4. Load Messages\n",
|
||||||
|
"\n",
|
||||||
|
"Assuming the format is correct, the loader will convert the chats to langchain messages."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "c8a0836d-4a22-4790-bfe9-97f2145bb0d6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import List\n",
|
||||||
|
"from langchain.chat_loaders.base import ChatSession\n",
|
||||||
|
"from langchain.chat_loaders.utils import (\n",
|
||||||
|
" map_ai_messages,\n",
|
||||||
|
" merge_chat_runs,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"raw_messages = loader.lazy_load()\n",
|
||||||
|
"# Merge consecutive messages from the same sender into a single message\n",
|
||||||
|
"merged_messages = merge_chat_runs(raw_messages)\n",
|
||||||
|
"# Convert messages from \"talkingtower\" to AI messages\n",
|
||||||
|
"messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"talkingtower\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "1913963b-c44e-4f7a-aba7-0423c9b8bd59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'messages': [AIMessage(content='Love music! Do you like jazz?', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': '08/15/2023 11:10 AM\\n'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content='Yes! Jazz is fantastic. Ever heard this one?\\nWebsite\\nListen to classic jazz track...', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': '08/15/2023 9:27 PM\\n'}]}, example=False),\n",
|
||||||
|
" AIMessage(content='Indeed! Great choice. 🎷', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Yesterday at 5:03 AM\\n'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content='Thanks! How about some virtual sightseeing?\\nWebsite\\nVirtual tour of famous landmarks...', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Yesterday at 5:23 AM\\n'}]}, example=False),\n",
|
||||||
|
" AIMessage(content=\"Sounds fun! Let's explore.\", additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Today at 2:38 PM\\n'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content='Enjoy the tour! See you around.', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Today at 2:56 PM\\n'}]}, example=False),\n",
|
||||||
|
" AIMessage(content='Thank you! Goodbye! 👋', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Today at 3:00 PM\\n'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content='Farewell! Happy exploring.', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Today at 3:02 PM\\n'}]}, example=False)]}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"messages"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8595a518-5c89-44aa-94a7-ca51e7e2a5fa",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Next Steps\n",
|
||||||
|
"\n",
|
||||||
|
    "You can then use these messages how you see fit, such as fine-tuning a model, few-shot example selection, or directly making predictions for the next message."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "08ff0a1e-fca0-4da3-aacd-d7401f99d946",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Thank you! Have a wonderful day! 🌟"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"llm = ChatOpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in llm.stream(messages[0]['messages']):\n",
|
||||||
|
" print(chunk.content, end=\"\", flush=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "50a5251f-074a-4a3c-a2b0-b1de85e0ac6a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
579
docs/extras/integrations/chat_loaders/facebook.ipynb
Normal file
579
docs/extras/integrations/chat_loaders/facebook.ipynb
Normal file
@ -0,0 +1,579 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e4bd269b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Facebook Messenger\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to load data from Facebook in a format you can finetune on. The overall steps are:\n",
|
||||||
|
"\n",
|
||||||
|
"1. Download your messenger data to disk.\n",
|
||||||
|
"2. Create the Chat Loader and call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n",
|
||||||
|
"3. Optionally use `merge_chat_runs` to combine message from the same sender in sequence, and/or `map_ai_messages` to convert messages from the specified sender to the \"AIMessage\" class. Once you've done this, call `convert_messages_for_finetuning` to prepare your data for fine-tuning.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"Once this has been done, you can fine-tune your model. To do so you would complete the following steps:\n",
|
||||||
|
"\n",
|
||||||
|
"4. Upload your messages to OpenAI and run a fine-tuning job.\n",
|
||||||
|
    "5. Use the resulting model in your LangChain app!\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"Let's begin.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## 1. Download Data\n",
|
||||||
|
"\n",
|
||||||
|
    "To download your own messenger data, follow the instructions [here](https://www.zapptales.com/en/download-facebook-messenger-chat-history-how-to/). IMPORTANT - make sure to download them in JSON format (not HTML).\n",
|
||||||
|
"\n",
|
||||||
|
"We are hosting an example dump at [this google drive link](https://drive.google.com/file/d/1rh1s1o2i7B-Sk1v9o8KNgivLVGwJ-osV/view?usp=sharing) that we will use in this walkthrough."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "647f2158-a42e-4634-b283-b8492caf542a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"File file.zip downloaded.\n",
|
||||||
|
"File file.zip has been unzipped.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This uses some example data\n",
|
||||||
|
"import requests\n",
|
||||||
|
"import zipfile\n",
|
||||||
|
"\n",
|
||||||
|
"def download_and_unzip(url: str, output_path: str = 'file.zip') -> None:\n",
|
||||||
|
" file_id = url.split('/')[-2]\n",
|
||||||
|
" download_url = f'https://drive.google.com/uc?export=download&id={file_id}'\n",
|
||||||
|
"\n",
|
||||||
|
" response = requests.get(download_url)\n",
|
||||||
|
" if response.status_code != 200:\n",
|
||||||
|
" print('Failed to download the file.')\n",
|
||||||
|
" return\n",
|
||||||
|
"\n",
|
||||||
|
" with open(output_path, 'wb') as file:\n",
|
||||||
|
" file.write(response.content)\n",
|
||||||
|
" print(f'File {output_path} downloaded.')\n",
|
||||||
|
"\n",
|
||||||
|
" with zipfile.ZipFile(output_path, 'r') as zip_ref:\n",
|
||||||
|
" zip_ref.extractall()\n",
|
||||||
|
" print(f'File {output_path} has been unzipped.')\n",
|
||||||
|
"\n",
|
||||||
|
"# URL of the file to download\n",
|
||||||
|
"url = 'https://drive.google.com/file/d/1rh1s1o2i7B-Sk1v9o8KNgivLVGwJ-osV/view?usp=sharing'\n",
|
||||||
|
"\n",
|
||||||
|
"# Download and unzip\n",
|
||||||
|
"download_and_unzip(url)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "48ef8bb1-fc28-453c-835a-94a552f05a91",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 2. Create Chat Loader\n",
|
||||||
|
"\n",
|
||||||
|
    "We have 2 different `FacebookMessengerChatLoader` classes, one for an entire directory of chats, and one to load individual files. We will demonstrate both below."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "a0869bc6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"directory_path = \"./hogwarts\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "0460bf25",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_loaders.facebook_messenger import (\n",
|
||||||
|
" SingleFileFacebookMessengerChatLoader,\n",
|
||||||
|
" FolderFacebookMessengerChatLoader,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "f61ee277",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = SingleFileFacebookMessengerChatLoader(\n",
|
||||||
|
" path=\"./hogwarts/inbox/HermioneGranger/messages_Hermione_Granger.json\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "ec466ad7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[HumanMessage(content=\"Hi Hermione! How's your summer going so far?\", additional_kwargs={'sender': 'Harry Potter'}, example=False),\n",
|
||||||
|
" HumanMessage(content=\"Harry! Lovely to hear from you. My summer is going well, though I do miss everyone. I'm spending most of my time going through my books and researching fascinating new topics. How about you?\", additional_kwargs={'sender': 'Hermione Granger'}, example=False),\n",
|
||||||
|
" HumanMessage(content=\"I miss you all too. The Dursleys are being their usual unpleasant selves but I'm getting by. At least I can practice some spells in my room without them knowing. Let me know if you find anything good in your researching!\", additional_kwargs={'sender': 'Harry Potter'}, example=False)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chat_session = loader.load()[0]\n",
|
||||||
|
"chat_session[\"messages\"][:3]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "8a3ee473",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = FolderFacebookMessengerChatLoader(\n",
|
||||||
|
" path=\"./hogwarts\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "9f41e122",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"9"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chat_sessions = loader.load()\n",
|
||||||
|
"len(chat_sessions)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d4aa3580-adc1-4b48-9bba-0e8e8d9f44ce",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 3. Prepare for fine-tuning\n",
|
||||||
|
"\n",
|
||||||
|
"Calling `load()` returns all the chat messages we could extract as human messages. When conversing with chat bots, conversations typically follow a more strict alternating dialogue pattern relative to real conversations. \n",
|
||||||
|
"\n",
|
||||||
|
"You can choose to merge message \"runs\" (consecutive messages from the same sender) and select a sender to represent the \"AI\". The fine-tuned LLM will learn to generate these AI messages."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "5a78030d-b757-4bbe-8a6c-841056f46df7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_loaders.utils import (\n",
|
||||||
|
" merge_chat_runs,\n",
|
||||||
|
" map_ai_messages,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"id": "ff35b028-78bf-4c5b-9ec6-939fe67de7f7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"merged_sessions = merge_chat_runs(chat_sessions)\n",
|
||||||
|
"alternating_sessions = list(map_ai_messages(merged_sessions, \"Harry Potter\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"id": "4b11906e-a496-4d01-9f0d-1938c14147bf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[AIMessage(content=\"Professor Snape, I was hoping I could speak with you for a moment about something that's been concerning me lately.\", additional_kwargs={'sender': 'Harry Potter'}, example=False),\n",
|
||||||
|
" HumanMessage(content=\"What is it, Potter? I'm quite busy at the moment.\", additional_kwargs={'sender': 'Severus Snape'}, example=False),\n",
|
||||||
|
" AIMessage(content=\"I apologize for the interruption, sir. I'll be brief. I've noticed some strange activity around the school grounds at night. I saw a cloaked figure lurking near the Forbidden Forest last night. I'm worried someone may be plotting something sinister.\", additional_kwargs={'sender': 'Harry Potter'}, example=False)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Now all of Harry Potter's messages will take the AI message class\n",
|
||||||
|
"# which maps to the 'assistant' role in OpenAI's training format\n",
|
||||||
|
"alternating_sessions[0]['messages'][:3]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d985478d-062e-47b9-ae9a-102f59be07c0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Now we can convert to OpenAI format dictionaries"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"id": "21372331",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.adapters.openai import convert_messages_for_finetuning"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 38,
|
||||||
|
"id": "92c5ae7a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Prepared 9 dialogues for training\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"training_data = convert_messages_for_finetuning(alternating_sessions)\n",
|
||||||
|
"print(f\"Prepared {len(training_data)} dialogues for training\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"id": "dfcbd181",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'role': 'assistant',\n",
|
||||||
|
" 'content': \"Professor Snape, I was hoping I could speak with you for a moment about something that's been concerning me lately.\"},\n",
|
||||||
|
" {'role': 'user',\n",
|
||||||
|
" 'content': \"What is it, Potter? I'm quite busy at the moment.\"},\n",
|
||||||
|
" {'role': 'assistant',\n",
|
||||||
|
" 'content': \"I apologize for the interruption, sir. I'll be brief. I've noticed some strange activity around the school grounds at night. I saw a cloaked figure lurking near the Forbidden Forest last night. I'm worried someone may be plotting something sinister.\"}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"training_data[0][:3]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f1a9fd64-4f9f-42d3-b5dc-2a340e51e9e7",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"OpenAI currently requires at least 10 training examples for a fine-tuning job, though they recommend between 50-100 for most tasks. Since we only have 9 chat sessions, we can subdivide them (optionally with some overlap) so that each training example is comprised of a portion of a whole conversation.\n",
|
||||||
|
"\n",
|
||||||
|
"Facebook chat sessions (1 per person) often span multiple days and conversations,\n",
|
||||||
|
"so the long-range dependencies may not be that important to model anyhow."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"id": "13cd290a-b1e9-4686-bb5e-d99de8b8612b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"100"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 42,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Our chat is alternating, we will make each datapoint a group of 8 messages,\n",
|
||||||
|
"# with 2 messages overlapping\n",
|
||||||
|
"chunk_size = 8\n",
|
||||||
|
"overlap = 2\n",
|
||||||
|
"\n",
|
||||||
|
"training_examples = [\n",
|
||||||
|
" conversation_messages[i: i + chunk_size] \n",
|
||||||
|
" for conversation_messages in training_data\n",
|
||||||
|
" for i in range(\n",
|
||||||
|
" 0, len(conversation_messages) - chunk_size + 1, \n",
|
||||||
|
" chunk_size - overlap)\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"len(training_examples)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "cc8baf41-ff07-4492-96bd-b2472ee7cef9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 4. Fine-tune the model\n",
|
||||||
|
"\n",
|
||||||
|
"It's time to fine-tune the model. Make sure you have `openai` installed\n",
|
||||||
|
"and have set your `OPENAI_API_KEY` appropriately"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"id": "95ce3f63-3c80-44b2-9060-534ad74e16fa",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# %pip install -U openai --quiet"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 58,
|
||||||
|
"id": "ab9e28eb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"File file-zCyNBeg4snpbBL7VkvsuhCz8 ready afer 30.55 seconds.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"from io import BytesIO\n",
|
||||||
|
"import time\n",
|
||||||
|
"\n",
|
||||||
|
"import openai\n",
|
||||||
|
"\n",
|
||||||
|
"# We will write the jsonl file in memory\n",
|
||||||
|
"my_file = BytesIO()\n",
|
||||||
|
"for m in training_examples:\n",
|
||||||
|
" my_file.write((json.dumps({\"messages\": m}) + \"\\n\").encode('utf-8'))\n",
|
||||||
|
"\n",
|
||||||
|
"my_file.seek(0)\n",
|
||||||
|
"training_file = openai.File.create(\n",
|
||||||
|
" file=my_file,\n",
|
||||||
|
" purpose='fine-tune'\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# OpenAI audits each training file for compliance reasons.\n",
|
||||||
|
    "# This may take a few minutes\n",
|
||||||
|
"status = openai.File.retrieve(training_file.id).status\n",
|
||||||
|
"start_time = time.time()\n",
|
||||||
|
"while status != \"processed\":\n",
|
||||||
|
" print(f\"Status=[{status}]... {time.time() - start_time:.2f}s\", end=\"\\r\", flush=True)\n",
|
||||||
|
" time.sleep(5)\n",
|
||||||
|
" status = openai.File.retrieve(training_file.id).status\n",
|
||||||
|
"print(f\"File {training_file.id} ready after {time.time() - start_time:.2f} seconds.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "759a7f51-fde9-4b75-aaa9-e600e6537bd1",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"With the file ready, it's time to kick off a training job."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 59,
|
||||||
|
"id": "3f451425",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"job = openai.FineTuningJob.create(\n",
|
||||||
|
" training_file=training_file.id,\n",
|
||||||
|
" model=\"gpt-3.5-turbo\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "489b23ef-5e14-42a9-bafb-44220ec6960b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Grab a cup of tea while your model is being prepared. This may take some time!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 60,
|
||||||
|
"id": "bac1637a-c087-4523-ade1-c47f9bf4c6f4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Status=[running]... 908.87s\r"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"status = openai.FineTuningJob.retrieve(job.id).status\n",
|
||||||
|
"start_time = time.time()\n",
|
||||||
|
"while status != \"succeeded\":\n",
|
||||||
|
" print(f\"Status=[{status}]... {time.time() - start_time:.2f}s\", end=\"\\r\", flush=True)\n",
|
||||||
|
" time.sleep(5)\n",
|
||||||
|
" job = openai.FineTuningJob.retrieve(job.id)\n",
|
||||||
|
" status = job.status"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 66,
|
||||||
|
"id": "535895e1-bc69-40e5-82ed-e24ed2baeeee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"ft:gpt-3.5-turbo-0613:personal::7rDwkaOq\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(job.fine_tuned_model)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "502ff73b-f9e9-49ce-ba45-401811e57946",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 5. Use in LangChain\n",
|
||||||
|
"\n",
|
||||||
|
    "You can use the resulting model ID directly in the `ChatOpenAI` model class."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 67,
|
||||||
|
"id": "3925d60d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"model = ChatOpenAI(\n",
|
||||||
|
" model=job.fine_tuned_model,\n",
|
||||||
|
" temperature=1,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 69,
|
||||||
|
"id": "7190cf2e-ab34-4ceb-bdad-45f24f069c29",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.prompts import ChatPromptTemplate\n",
|
||||||
|
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||||
|
"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_messages(\n",
|
||||||
|
" [\n",
|
||||||
|
" (\"human\", \"{input}\"),\n",
|
||||||
|
" ]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"chain = prompt | model | StrOutputParser()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 72,
|
||||||
|
"id": "f02057e9-f914-40b1-9c9d-9432ff594b98",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The usual - Potions, Transfiguration, Defense Against the Dark Arts. What about you?"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for tok in chain.stream({\"input\": \"What classes are you taking?\"}):\n",
|
||||||
|
" print(tok, end=\"\", flush=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "35331503-3cc6-4d64-955e-64afe6b5fef3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
188
docs/extras/integrations/chat_loaders/index.mdx
Normal file
188
docs/extras/integrations/chat_loaders/index.mdx
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
---
|
||||||
|
sidebar_position: 0
|
||||||
|
---
|
||||||
|
|
||||||
|
# Chat loaders
|
||||||
|
|
||||||
|
Like document loaders, chat loaders are utilities designed to help load conversations from popular communication platforms such as Facebook, Slack, Discord, etc. These are loaded into memory as LangChain chat message objects. Such utilities facilitate tasks such as fine-tuning a language model to match your personal style or voice.
|
||||||
|
|
||||||
|
This brief guide will illustrate the process using [OpenAI's fine-tuning API](https://platform.openai.com/docs/guides/fine-tuning), comprising six steps:
|
||||||
|
|
||||||
|
1. Export your Facebook Messenger chat data in a compatible format for your intended chat loader.
|
||||||
|
2. Load the chat data into memory as LangChain chat message objects. (_this is what is covered in each integration notebook in this section of the documentation_).
|
||||||
|
- Assign a person to the "AI" role and optionally filter, group, and merge messages.
|
||||||
|
3. Export these acquired messages in a format expected by the fine-tuning API.
|
||||||
|
4. Upload this data to OpenAI.
|
||||||
|
5. Fine-tune your model.
|
||||||
|
6. Implement the fine-tuned model in LangChain.
|
||||||
|
|
||||||
|
This guide is not wholly comprehensive but is designed to take you through the fundamentals of going from raw data to fine-tuned model.
|
||||||
|
|
||||||
|
We will demonstrate the procedure through an example of fine-tuning a `gpt-3.5-turbo` model on Facebook Messenger data.
|
||||||
|
|
||||||
|
### 1. Export your chat data
|
||||||
|
|
||||||
|
To export your Facebook messenger data, you can follow the [instructions here](https://www.zapptales.com/en/download-facebook-messenger-chat-history-how-to/).
|
||||||
|
|
||||||
|
:::important JSON format
|
||||||
|
You must select "JSON format" (instead of HTML) when exporting your data to be compatible with the current loader.
|
||||||
|
:::
|
||||||
|
|
||||||
|
OpenAI requires at least 10 examples to fine-tune your model, but they recommend between 50-100 for more optimal results.
|
||||||
|
You can use the example data stored at [this google drive link](https://drive.google.com/file/d/1rh1s1o2i7B-Sk1v9o8KNgivLVGwJ-osV/view?usp=sharing) to test the process.
|
||||||
|
|
||||||
|
### 2. Load the chat
|
||||||
|
|
||||||
|
Once you've obtained your chat data, you can load it into memory as LangChain chat message objects. Here’s an example of loading data using the Python code:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.chat_loaders.facebook_messenger import FolderFacebookMessengerChatLoader
|
||||||
|
|
||||||
|
loader = FolderFacebookMessengerChatLoader(
|
||||||
|
path="./facebook_messenger_chats",
|
||||||
|
)
|
||||||
|
|
||||||
|
chat_sessions = loader.load()
|
||||||
|
```
|
||||||
|
|
||||||
|
In this snippet, we point the loader to a directory of Facebook chat dumps which are then loaded as multiple "sessions" of messages, one session per conversation file.
|
||||||
|
|
||||||
|
Once you've loaded the messages, you should decide which person you want to fine-tune the model to (usually yourself). You can also decide to merge consecutive messages from the same sender into a single chat message.
|
||||||
|
For both of these tasks, you can use the chat_loaders utilities to do so:
|
||||||
|
|
||||||
|
```
|
||||||
|
from langchain.chat_loaders.utils import (
|
||||||
|
merge_chat_runs,
|
||||||
|
map_ai_messages,
|
||||||
|
)
|
||||||
|
|
||||||
|
merged_sessions = merge_chat_runs(chat_sessions)
|
||||||
|
alternating_sessions = list(map_ai_messages(merged_sessions, "My Name"))
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Export messages to OpenAI format
|
||||||
|
|
||||||
|
Convert the chat messages to dictionaries using the `convert_messages_for_finetuning` function. Then, group the data into chunks for better context modeling and overlap management.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.adapters.openai import convert_messages_for_finetuning
|
||||||
|
|
||||||
|
openai_messages = convert_messages_for_finetuning(chat_sessions)
|
||||||
|
```
|
||||||
|
|
||||||
|
At this point, the data is ready for upload to OpenAI. You can choose to split up conversations into smaller chunks for training if you
|
||||||
|
do not have enough conversations to train on. Feel free to play around with different chunk sizes or with adding system messages to the fine-tuning data.
|
||||||
|
|
||||||
|
```python
|
||||||
|
chunk_size = 8
|
||||||
|
overlap = 2
|
||||||
|
|
||||||
|
message_groups = [
|
||||||
|
conversation_messages[i: i + chunk_size]
|
||||||
|
for conversation_messages in openai_messages
|
||||||
|
for i in range(
|
||||||
|
0, len(conversation_messages) - chunk_size + 1,
|
||||||
|
chunk_size - overlap)
|
||||||
|
]
|
||||||
|
|
||||||
|
len(message_groups)
|
||||||
|
# 9
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Upload the data to OpenAI
|
||||||
|
|
||||||
|
Ensure you have set your OpenAI API key by following these [instructions](https://platform.openai.com/account/api-keys), then upload the training file.
|
||||||
|
An audit is performed to ensure data compliance, so you may have to wait a few minutes for the dataset to become ready for use.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import io
|
||||||
|
|
||||||
|
import openai
|
||||||
|
|
||||||
|
my_file = io.BytesIO()
|
||||||
|
for group in message_groups:
|
||||||
|
my_file.write((json.dumps({"messages": group}) + "\n").encode('utf-8'))
|
||||||
|
|
||||||
|
my_file.seek(0)
|
||||||
|
training_file = openai.File.create(
|
||||||
|
file=my_file,
|
||||||
|
purpose='fine-tune'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Wait while the file is processed
|
||||||
|
status = openai.File.retrieve(training_file.id).status
|
||||||
|
start_time = time.time()
|
||||||
|
while status != "processed":
|
||||||
|
print(f"Status=[{status}]... {time.time() - start_time:.2f}s", end="\r", flush=True)
|
||||||
|
time.sleep(5)
|
||||||
|
status = openai.File.retrieve(training_file.id).status
|
||||||
|
print(f"File {training_file.id} ready after {time.time() - start_time:.2f} seconds.")
|
||||||
|
```
|
||||||
|
|
||||||
|
Once this is done, you can proceed to the model training!
|
||||||
|
|
||||||
|
### 5. Fine-tune the model
|
||||||
|
|
||||||
|
Start the fine-tuning job with your chosen base model.
|
||||||
|
|
||||||
|
```python
|
||||||
|
job = openai.FineTuningJob.create(
|
||||||
|
training_file=training_file.id,
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
This might take a while. Check the status with `openai.FineTuningJob.retrieve(job.id).status` and wait for it to report `succeeded`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# It may take 10-20+ minutes to complete training.
|
||||||
|
status = openai.FineTuningJob.retrieve(job.id).status
|
||||||
|
start_time = time.time()
|
||||||
|
while status != "succeeded":
|
||||||
|
print(f"Status=[{status}]... {time.time() - start_time:.2f}s", end="\r", flush=True)
|
||||||
|
time.sleep(5)
|
||||||
|
job = openai.FineTuningJob.retrieve(job.id)
|
||||||
|
status = job.status
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Use the model in LangChain
|
||||||
|
|
||||||
|
You're almost there! Use the fine-tuned model in LangChain.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain import chat_models
|
||||||
|
|
||||||
|
model_name = job.fine_tuned_model
|
||||||
|
# Example: ft:gpt-3.5-turbo-0613:personal::5mty86jblapsed
|
||||||
|
model = chat_models.ChatOpenAI(model=model_name)
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.prompts import ChatPromptTemplate
|
||||||
|
from langchain.schema.output_parser import StrOutputParser
|
||||||
|
|
||||||
|
prompt = ChatPromptTemplate.from_messages(
|
||||||
|
[
|
||||||
|
("human", "{input}"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
chain = prompt | model | StrOutputParser()
|
||||||
|
|
||||||
|
for tok in chain.stream({"input": "What classes are you taking?"}):
|
||||||
|
print(tok, end="", flush=True)
|
||||||
|
|
||||||
|
# The usual - Potions, Transfiguration, Defense Against the Dark Arts. What about you?
|
||||||
|
```
|
||||||
|
|
||||||
|
And that's it! You've successfully fine-tuned a model and used it in LangChain.
|
||||||
|
|
||||||
|
## Supported Chat Loaders
|
||||||
|
|
||||||
|
LangChain currently supports the following chat loaders. Feel free to contribute more!
|
||||||
|
|
||||||
|
import DocCardList from "@theme/DocCardList";
|
||||||
|
|
||||||
|
<DocCardList />
|
163
docs/extras/integrations/chat_loaders/slack.ipynb
Normal file
163
docs/extras/integrations/chat_loaders/slack.ipynb
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "01fcfa2f-33a9-48f3-835a-b1956c394d6b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Slack\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to use the Slack chat loader. This class helps map exported slack conversations to LangChain chat messages.\n",
|
||||||
|
"\n",
|
||||||
|
"The process has three steps:\n",
|
||||||
|
"1. Export the desired conversation thread by following the [instructions here](https://slack.com/help/articles/1500001548241-Request-to-export-all-conversations).\n",
|
||||||
|
"2. Create the `SlackChatLoader` with the file path pointed to the json file or directory of JSON files\n",
|
||||||
|
"3. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion. Optionally use `merge_chat_runs` to combine message from the same sender in sequence, and/or `map_ai_messages` to convert messages from the specified sender to the \"AIMessage\" class.\n",
|
||||||
|
"\n",
|
||||||
|
"## 1. Create message dump\n",
|
||||||
|
"\n",
|
||||||
|
"Currently (2023/08/23) this loader best supports a zip directory of files in the format generated by exporting a direct message conversation from Slack. Follow up-to-date instructions from Slack on how to do so.\n",
|
||||||
|
"\n",
|
||||||
|
"We have an example in the LangChain repo."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "a79d35bf-5f21-4063-84bf-a60845c1c51f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import requests\n",
|
||||||
|
"\n",
|
||||||
|
"permalink = \"https://raw.githubusercontent.com/langchain-ai/langchain/342087bdfa3ac31d622385d0f2d09cf5e06c8db3/libs/langchain/tests/integration_tests/examples/slack_export.zip\"\n",
|
||||||
|
"response = requests.get(permalink)\n",
|
||||||
|
"with open(\"slack_dump.zip\", \"wb\") as f:\n",
|
||||||
|
" f.write(response.content)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "cf60f703-76f1-4602-a723-02c59535c1af",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 2. Create the Chat Loader\n",
|
||||||
|
"\n",
|
||||||
|
"Provide the loader with the file path to the zip directory. You can optionally specify the user id that maps to an ai message as well as configure whether to merge message runs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "4b8b432a-d2bc-49e1-b35f-761730a8fd6d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_loaders.slack import SlackChatLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "8ec6661b-0aca-48ae-9e2b-6412856c287b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = SlackChatLoader(\n",
|
||||||
|
" path=\"slack_dump.zip\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8805a7c5-84b4-49f5-8989-0022f2054ace",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 3. Load messages\n",
|
||||||
|
"\n",
|
||||||
|
"The `load()` (or `lazy_load`) methods return a list of \"ChatSessions\" that currently just contain a list of messages per loaded conversation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "fcd69b3e-020d-4a15-8a0d-61c2d34e1ee1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import List\n",
|
||||||
|
"from langchain.chat_loaders.base import ChatSession\n",
|
||||||
|
"from langchain.chat_loaders.utils import (\n",
|
||||||
|
" map_ai_messages,\n",
|
||||||
|
" merge_chat_runs,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"raw_messages = loader.lazy_load()\n",
|
||||||
|
"# Merge consecutive messages from the same sender into a single message\n",
|
||||||
|
"merged_messages = merge_chat_runs(raw_messages)\n",
|
||||||
|
"# Convert messages from \"U0500003428\" to AI messages\n",
|
||||||
|
"messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"U0500003428\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7d033f87-cd0c-4f44-a753-41b871c1e919",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Next Steps\n",
|
||||||
|
"\n",
|
||||||
|
"You can then use these messages how you see fit, such as finetuning a model, few-shot example selection, or directly make predictions for the next message. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "7d8a1629-5d9e-49b3-b978-3add57027d59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Hi, \n",
|
||||||
|
"\n",
|
||||||
|
"I hope you're doing well. I wanted to reach out and ask if you'd be available to meet up for coffee sometime next week. I'd love to catch up and hear about what's been going on in your life. Let me know if you're interested and we can find a time that works for both of us. \n",
|
||||||
|
"\n",
|
||||||
|
"Looking forward to hearing from you!\n",
|
||||||
|
"\n",
|
||||||
|
"Best, [Your Name]"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"llm = ChatOpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in llm.stream(messages[1]['messages']):\n",
|
||||||
|
" print(chunk.content, end=\"\", flush=True)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
206
docs/extras/integrations/chat_loaders/telegram.ipynb
Normal file
206
docs/extras/integrations/chat_loaders/telegram.ipynb
Normal file
@ -0,0 +1,206 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "735455a6-f82e-4252-b545-27385ef883f4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Telegram\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to use the Telegram chat loader. This class helps map exported Telegram conversations to LangChain chat messages.\n",
|
||||||
|
"\n",
|
||||||
|
"The process has three steps:\n",
|
||||||
|
"1. Export the chat history as a machine-readable JSON file using the Telegram Desktop app (see the steps below)\n",
|
||||||
|
"2. Create the `TelegramChatLoader` with the file path pointed to the json file or directory of JSON files\n",
|
||||||
|
"3. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion. Optionally use `merge_chat_runs` to combine message from the same sender in sequence, and/or `map_ai_messages` to convert messages from the specified sender to the \"AIMessage\" class.\n",
|
||||||
|
"\n",
|
||||||
|
"## 1. Create message dump\n",
|
||||||
|
"\n",
|
||||||
|
"Currently (2023/08/23) this loader best supports json files in the format generated by exporting your chat history from the [Telegram Desktop App](https://desktop.telegram.org/).\n",
|
||||||
|
"\n",
|
||||||
|
"**Important:** There are 'lite' versions of telegram such as \"Telegram for MacOS\" that lack the export functionality. Please make sure you use the correct app to export the file.\n",
|
||||||
|
"\n",
|
||||||
|
"To make the export:\n",
|
||||||
|
"1. Download and open telegram desktop\n",
|
||||||
|
"2. Select a conversation\n",
|
||||||
|
"3. Navigate to the conversation settings (currently the three dots in the top right corner)\n",
|
||||||
|
"4. Click \"Export Chat History\"\n",
|
||||||
|
"5. Unselect photos and other media. Select \"Machine-readable JSON\" format to export.\n",
|
||||||
|
"\n",
|
||||||
|
"An example is below: "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "285f2044-0f58-4b92-addb-9f8569076734",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting telegram_conversation.json\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile telegram_conversation.json\n",
|
||||||
|
"{\n",
|
||||||
|
" \"name\": \"Jiminy\",\n",
|
||||||
|
" \"type\": \"personal_chat\",\n",
|
||||||
|
" \"id\": 5965280513,\n",
|
||||||
|
" \"messages\": [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"id\": 1,\n",
|
||||||
|
" \"type\": \"message\",\n",
|
||||||
|
" \"date\": \"2023-08-23T13:11:23\",\n",
|
||||||
|
" \"date_unixtime\": \"1692821483\",\n",
|
||||||
|
" \"from\": \"Jiminy Cricket\",\n",
|
||||||
|
" \"from_id\": \"user123450513\",\n",
|
||||||
|
" \"text\": \"You better trust your conscience\",\n",
|
||||||
|
" \"text_entities\": [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"type\": \"plain\",\n",
|
||||||
|
" \"text\": \"You better trust your conscience\"\n",
|
||||||
|
" }\n",
|
||||||
|
" ]\n",
|
||||||
|
" },\n",
|
||||||
|
" {\n",
|
||||||
|
" \"id\": 2,\n",
|
||||||
|
" \"type\": \"message\",\n",
|
||||||
|
" \"date\": \"2023-08-23T13:13:20\",\n",
|
||||||
|
" \"date_unixtime\": \"1692821600\",\n",
|
||||||
|
" \"from\": \"Batman & Robin\",\n",
|
||||||
|
" \"from_id\": \"user6565661032\",\n",
|
||||||
|
" \"text\": \"What did you just say?\",\n",
|
||||||
|
" \"text_entities\": [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"type\": \"plain\",\n",
|
||||||
|
" \"text\": \"What did you just say?\"\n",
|
||||||
|
" }\n",
|
||||||
|
" ]\n",
|
||||||
|
" }\n",
|
||||||
|
" ]\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7cc109f4-4c92-4cd3-8143-c322776c3f03",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 2. Create the Chat Loader\n",
|
||||||
|
"\n",
|
||||||
|
"All that's required is the file path. You can optionally specify the user name that maps to an ai message as well as configure whether to merge message runs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "111f7767-573c-42d4-86f0-bd766bbaa071",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_loaders.telegram import TelegramChatLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "a4226efa-2640-4990-a20c-6861d1887329",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = TelegramChatLoader(\n",
|
||||||
|
" path=\"./telegram_conversation.json\", \n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "71699fb7-7815-4c89-8d96-30e8fada6923",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 3. Load messages\n",
|
||||||
|
"\n",
|
||||||
|
"The `load()` (or `lazy_load`) methods return a list of \"ChatSessions\" that currently just contain a list of messages per loaded conversation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "81121efb-c875-4a77-ad1e-fe26b3d7e812",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import List\n",
|
||||||
|
"from langchain.chat_loaders.base import ChatSession\n",
|
||||||
|
"from langchain.chat_loaders.utils import (\n",
|
||||||
|
" map_ai_messages,\n",
|
||||||
|
" merge_chat_runs,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"raw_messages = loader.lazy_load()\n",
|
||||||
|
"# Merge consecutive messages from the same sender into a single message\n",
|
||||||
|
"merged_messages = merge_chat_runs(raw_messages)\n",
|
||||||
|
"# Convert messages from \"Jiminy Cricket\" to AI messages\n",
|
||||||
|
"messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"Jiminy Cricket\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b9089c05-7375-41ca-a2f9-672a845314e4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Next Steps\n",
|
||||||
|
"\n",
|
||||||
|
"You can then use these messages how you see fit, such as finetuning a model, few-shot example selection, or directly make predictions for the next message "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "637a6f5d-6944-4722-9361-a76ef5e9dd2a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"I said, \"You better trust your conscience.\""
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"llm = ChatOpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in llm.stream(messages[0]['messages']):\n",
|
||||||
|
" print(chunk.content, end=\"\", flush=True)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
204
docs/extras/integrations/chat_loaders/whatsapp.ipynb
Normal file
204
docs/extras/integrations/chat_loaders/whatsapp.ipynb
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "735455a6-f82e-4252-b545-27385ef883f4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# WhatsApp\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to use the WhatsApp chat loader. This class helps map exported WhatsApp conversations to LangChain chat messages.\n",
|
||||||
|
"\n",
|
||||||
|
"The process has three steps:\n",
|
||||||
|
"1. Export the chat conversations to computer\n",
|
||||||
|
"2. Create the `WhatsAppChatLoader` with the file path pointed to the exported zip file, the unzipped directory, or one of the chat `.txt` files therein\n",
|
||||||
|
"3. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n",
|
||||||
|
"\n",
|
||||||
|
"## 1. Create message dump\n",
|
||||||
|
"\n",
|
||||||
|
"To make the export of your WhatsApp conversation(s), complete the following steps:\n",
|
||||||
|
"\n",
|
||||||
|
"1. Open the target conversation\n",
|
||||||
|
"2. Click the three dots in the top right corner and select \"More\".\n",
|
||||||
|
"3. Then select \"Export chat\" and choose \"Without media\".\n",
|
||||||
|
"\n",
|
||||||
|
"An example of the data format for each conversation is below: "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "285f2044-0f58-4b92-addb-9f8569076734",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Writing whatsapp_chat.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile whatsapp_chat.txt\n",
|
||||||
|
"[8/15/23, 9:12:33 AM] Dr. Feather: Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.\n",
|
||||||
|
"[8/15/23, 9:12:43 AM] Dr. Feather: I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!\n",
|
||||||
|
"[8/15/23, 9:12:48 AM] Dr. Feather: image omitted\n",
|
||||||
|
"[8/15/23, 9:13:15 AM] Jungle Jane: That's stunning! Were you able to observe its behavior?\n",
|
||||||
|
"[8/15/23, 9:13:23 AM] Dr. Feather: image omitted\n",
|
||||||
|
"[8/15/23, 9:14:02 AM] Dr. Feather: Yes, it seemed quite social with other macaws. They're known for their playful nature.\n",
|
||||||
|
"[8/15/23, 9:14:15 AM] Jungle Jane: How's the research going on parrot communication?\n",
|
||||||
|
"[8/15/23, 9:14:30 AM] Dr. Feather: image omitted\n",
|
||||||
|
"[8/15/23, 9:14:50 AM] Dr. Feather: It's progressing well. We're learning so much about how they use sound and color to communicate.\n",
|
||||||
|
"[8/15/23, 9:15:10 AM] Jungle Jane: That's fascinating! Can't wait to read your paper on it.\n",
|
||||||
|
"[8/15/23, 9:15:20 AM] Dr. Feather: Thank you! I'll send you a draft soon.\n",
|
||||||
|
"[8/15/23, 9:25:16 PM] Jungle Jane: Looking forward to it! Keep up the great work."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7cc109f4-4c92-4cd3-8143-c322776c3f03",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 2. Create the Chat Loader\n",
|
||||||
|
"\n",
|
||||||
|
"The WhatsAppChatLoader accepts the resulting zip file, unzipped directory, or the path to any of the chat `.txt` files therein.\n",
|
||||||
|
"\n",
|
||||||
|
"Provide that as well as the user name you want to take on the role of \"AI\" when finetuning."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "111f7767-573c-42d4-86f0-bd766bbaa071",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_loaders.whatsapp import WhatsAppChatLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "a4226efa-2640-4990-a20c-6861d1887329",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = WhatsAppChatLoader(\n",
|
||||||
|
" path=\"./whatsapp_chat.txt\", \n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "71699fb7-7815-4c89-8d96-30e8fada6923",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 3. Load messages\n",
|
||||||
|
"\n",
|
||||||
|
"The `load()` (or `lazy_load`) methods return a list of \"ChatSessions\" that currently store the list of messages per loaded conversation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "81121efb-c875-4a77-ad1e-fe26b3d7e812",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'messages': [AIMessage(content='I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!', additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:12:43 AM'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content=\"That's stunning! Were you able to observe its behavior?\", additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:13:15 AM'}]}, example=False),\n",
|
||||||
|
" AIMessage(content=\"Yes, it seemed quite social with other macaws. They're known for their playful nature.\", additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:14:02 AM'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content=\"How's the research going on parrot communication?\", additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:14:15 AM'}]}, example=False),\n",
|
||||||
|
" AIMessage(content=\"It's progressing well. We're learning so much about how they use sound and color to communicate.\", additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:14:50 AM'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content=\"That's fascinating! Can't wait to read your paper on it.\", additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:15:10 AM'}]}, example=False),\n",
|
||||||
|
" AIMessage(content=\"Thank you! I'll send you a draft soon.\", additional_kwargs={'sender': 'Dr. Feather', 'events': [{'message_time': '8/15/23, 9:15:20 AM'}]}, example=False),\n",
|
||||||
|
" HumanMessage(content='Looking forward to it! Keep up the great work.', additional_kwargs={'sender': 'Jungle Jane', 'events': [{'message_time': '8/15/23, 9:25:16 PM'}]}, example=False)]}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from typing import List\n",
|
||||||
|
"from langchain.chat_loaders.base import ChatSession\n",
|
||||||
|
"from langchain.chat_loaders.utils import (\n",
|
||||||
|
" map_ai_messages,\n",
|
||||||
|
" merge_chat_runs,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"raw_messages = loader.lazy_load()\n",
|
||||||
|
"# Merge consecutive messages from the same sender into a single message\n",
|
||||||
|
"merged_messages = merge_chat_runs(raw_messages)\n",
|
||||||
|
"# Convert messages from \"Dr. Feather\" to AI messages\n",
|
||||||
|
"messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"Dr. Feather\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b9089c05-7375-41ca-a2f9-672a845314e4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Next Steps\n",
|
||||||
|
"\n",
|
||||||
|
"You can then use these messages how you see fit, such as finetuning a model, few-shot example selection, or directly make predictions for the next message."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "637a6f5d-6944-4722-9361-a76ef5e9dd2a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Thank you for the encouragement! I'll do my best to continue studying and sharing fascinating insights about parrot communication."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"llm = ChatOpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in llm.stream(messages[0]['messages']):\n",
|
||||||
|
" print(chunk.content, end=\"\", flush=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "16156643-cfbd-444f-b4ae-198eb44f0267",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -15,6 +15,7 @@ from typing import (
|
|||||||
|
|
||||||
from typing_extensions import Literal
|
from typing_extensions import Literal
|
||||||
|
|
||||||
|
from langchain.chat_loaders.base import ChatSession
|
||||||
from langchain.schema.messages import (
|
from langchain.schema.messages import (
|
||||||
AIMessage,
|
AIMessage,
|
||||||
AIMessageChunk,
|
AIMessageChunk,
|
||||||
@ -206,3 +207,19 @@ class ChatCompletion:
|
|||||||
_convert_message_chunk_to_delta(c, i)
|
_convert_message_chunk_to_delta(c, i)
|
||||||
async for i, c in aenumerate(model_config.astream(converted_messages))
|
async for i, c in aenumerate(model_config.astream(converted_messages))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_assistant_message(session: ChatSession) -> bool:
|
||||||
|
"""Check if chat session has an assistant message."""
|
||||||
|
return any([isinstance(m, AIMessage) for m in session["messages"]])
|
||||||
|
|
||||||
|
|
||||||
|
def convert_messages_for_finetuning(
|
||||||
|
sessions: Iterable[ChatSession],
|
||||||
|
) -> List[List[dict]]:
|
||||||
|
"""Convert messages to a list of lists of dictionaries for fine-tuning."""
|
||||||
|
return [
|
||||||
|
[convert_message_to_dict(s) for s in session["messages"]]
|
||||||
|
for session in sessions
|
||||||
|
if _has_assistant_message(session)
|
||||||
|
]
|
||||||
|
6
libs/langchain/langchain/chat_loaders/__init__.py
Normal file
6
libs/langchain/langchain/chat_loaders/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
"""Load chat messages from common communications platforms for finetuning.
|
||||||
|
|
||||||
|
This module provides functions to load chat messages from various
|
||||||
|
communications platforms such as Facebook Messenger, Telegram, and
|
||||||
|
WhatsApp. The loaded chat messages can be used for finetuning models.
|
||||||
|
"""
|
31
libs/langchain/langchain/chat_loaders/base.py
Normal file
31
libs/langchain/langchain/chat_loaders/base.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
"""Base definitions for chat loaders.
|
||||||
|
|
||||||
|
A chat loader is a class that loads chat messages from an external
|
||||||
|
source such as a file or a database. The chat messages can then be
|
||||||
|
used for finetuning.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Iterator, List, Sequence, TypedDict
|
||||||
|
|
||||||
|
from langchain.schema.messages import BaseMessage
|
||||||
|
|
||||||
|
|
||||||
|
class ChatSession(TypedDict):
|
||||||
|
"""A chat session represents a single
|
||||||
|
conversation, channel, or other group of messages."""
|
||||||
|
|
||||||
|
messages: Sequence[BaseMessage]
|
||||||
|
"""The LangChain chat messages loaded from the source."""
|
||||||
|
|
||||||
|
|
||||||
|
class BaseChatLoader(ABC):
|
||||||
|
"""Base class for chat loaders."""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def lazy_load(self) -> Iterator[ChatSession]:
|
||||||
|
"""Lazy load the chat sessions."""
|
||||||
|
|
||||||
|
def load(self) -> List[ChatSession]:
|
||||||
|
"""Eagerly load the chat sessions into memory."""
|
||||||
|
return list(self.lazy_load())
|
77
libs/langchain/langchain/chat_loaders/facebook_messenger.py
Normal file
77
libs/langchain/langchain/chat_loaders/facebook_messenger.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator, Union
|
||||||
|
|
||||||
|
from langchain.chat_loaders.base import BaseChatLoader, ChatSession
|
||||||
|
from langchain.schema.messages import HumanMessage
|
||||||
|
|
||||||
|
logger = logging.getLogger(__file__)
|
||||||
|
|
||||||
|
|
||||||
|
class SingleFileFacebookMessengerChatLoader(BaseChatLoader):
|
||||||
|
"""A chat loader for loading Facebook Messenger chat data from a single file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (Union[Path, str]): The path to the chat file.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
path (Path): The path to the chat file.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, path: Union[Path, str]) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self.file_path = path if isinstance(path, Path) else Path(path)
|
||||||
|
|
||||||
|
def lazy_load(self) -> Iterator[ChatSession]:
|
||||||
|
"""Lazy loads the chat data from the file.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
ChatSession: A chat session containing the loaded messages.
|
||||||
|
|
||||||
|
"""
|
||||||
|
with open(self.file_path) as f:
|
||||||
|
data = json.load(f)
|
||||||
|
sorted_data = sorted(data["messages"], key=lambda x: x["timestamp_ms"])
|
||||||
|
messages = []
|
||||||
|
for m in sorted_data:
|
||||||
|
messages.append(
|
||||||
|
HumanMessage(
|
||||||
|
content=m["content"], additional_kwargs={"sender": m["sender_name"]}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
yield ChatSession(messages=messages)
|
||||||
|
|
||||||
|
|
||||||
|
class FolderFacebookMessengerChatLoader(BaseChatLoader):
|
||||||
|
"""A chat loader for loading Facebook Messenger chat data from a folder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (Union[str, Path]): The path to the directory
|
||||||
|
containing the chat files.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
path (Path): The path to the directory containing the chat files.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, path: Union[str, Path]) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self.directory_path = Path(path) if isinstance(path, str) else path
|
||||||
|
|
||||||
|
def lazy_load(self) -> Iterator[ChatSession]:
|
||||||
|
"""Lazy loads the chat data from the folder.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
ChatSession: A chat session containing the loaded messages.
|
||||||
|
|
||||||
|
"""
|
||||||
|
inbox_path = self.directory_path / "inbox"
|
||||||
|
for _dir in inbox_path.iterdir():
|
||||||
|
if _dir.is_dir():
|
||||||
|
for _file in _dir.iterdir():
|
||||||
|
if _file.suffix.lower() == ".json":
|
||||||
|
file_loader = SingleFileFacebookMessengerChatLoader(path=_file)
|
||||||
|
for result in file_loader.lazy_load():
|
||||||
|
yield result
|
84
libs/langchain/langchain/chat_loaders/slack.py
Normal file
84
libs/langchain/langchain/chat_loaders/slack.py
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterator, List, Union
|
||||||
|
|
||||||
|
from langchain import schema
|
||||||
|
from langchain.chat_loaders import base as chat_loaders
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SlackChatLoader(chat_loaders.BaseChatLoader):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
path: Union[str, Path],
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize the chat loader with the path to the exported Slack dump zip file.
|
||||||
|
|
||||||
|
:param path: Path to the exported Slack dump zip file.
|
||||||
|
"""
|
||||||
|
self.zip_path = path if isinstance(path, Path) else Path(path)
|
||||||
|
if not self.zip_path.exists():
|
||||||
|
raise FileNotFoundError(f"File {self.zip_path} not found")
|
||||||
|
|
||||||
|
def _load_single_chat_session(
|
||||||
|
self, messages: List[Dict]
|
||||||
|
) -> chat_loaders.ChatSession:
|
||||||
|
results: List[Union[schema.AIMessage, schema.HumanMessage]] = []
|
||||||
|
previous_sender = None
|
||||||
|
for message in messages:
|
||||||
|
if not isinstance(message, dict):
|
||||||
|
continue
|
||||||
|
text = message.get("text", "")
|
||||||
|
timestamp = message.get("ts", "")
|
||||||
|
sender = message.get("user", "")
|
||||||
|
if not sender:
|
||||||
|
continue
|
||||||
|
skip_pattern = re.compile(
|
||||||
|
r"<@U\d+> has joined the channel", flags=re.IGNORECASE
|
||||||
|
)
|
||||||
|
if skip_pattern.match(text):
|
||||||
|
continue
|
||||||
|
if sender == previous_sender:
|
||||||
|
results[-1].content += "\n\n" + text
|
||||||
|
results[-1].additional_kwargs["events"].append(
|
||||||
|
{"message_time": timestamp}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
results.append(
|
||||||
|
schema.HumanMessage(
|
||||||
|
role=sender,
|
||||||
|
content=text,
|
||||||
|
additional_kwargs={
|
||||||
|
"sender": sender,
|
||||||
|
"events": [{"message_time": timestamp}],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
previous_sender = sender
|
||||||
|
return chat_loaders.ChatSession(messages=results)
|
||||||
|
|
||||||
|
def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
|
||||||
|
"""Read JSON data from a zip subfile."""
|
||||||
|
with zip_file.open(file_path, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
if not isinstance(data, list):
|
||||||
|
raise ValueError(f"Expected list of dictionaries, got {type(data)}")
|
||||||
|
return data
|
||||||
|
|
||||||
|
def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
|
||||||
|
"""
|
||||||
|
Lazy load the chat sessions from the Slack dump file and yield them
|
||||||
|
in the required format.
|
||||||
|
|
||||||
|
:return: Iterator of chat sessions containing messages.
|
||||||
|
"""
|
||||||
|
with zipfile.ZipFile(str(self.zip_path), "r") as zip_file:
|
||||||
|
for file_path in zip_file.namelist():
|
||||||
|
if file_path.endswith(".json"):
|
||||||
|
messages = self._read_json(zip_file, file_path)
|
||||||
|
yield self._load_single_chat_session(messages)
|
152
libs/langchain/langchain/chat_loaders/telegram.py
Normal file
152
libs/langchain/langchain/chat_loaders/telegram.py
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator, List, Union
|
||||||
|
|
||||||
|
from langchain import schema
|
||||||
|
from langchain.chat_loaders import base as chat_loaders
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TelegramChatLoader(chat_loaders.BaseChatLoader):
|
||||||
|
"""A loading utility for converting telegram conversations
|
||||||
|
to LangChain chat messages.
|
||||||
|
|
||||||
|
To export, use the Telegram Desktop app from
|
||||||
|
https://desktop.telegram.org/, select a conversation, click the three dots
|
||||||
|
in the top right corner, and select "Export chat history". Then select
|
||||||
|
"Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
|
||||||
|
the desktop app (like "Telegram for MacOS") do not support exporting chat
|
||||||
|
history.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
path: Union[str, Path],
|
||||||
|
):
|
||||||
|
"""Initialize the TelegramChatLoader.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (Union[str, Path]): Path to the exported Telegram chat zip,
|
||||||
|
directory, json, or HTML file.
|
||||||
|
"""
|
||||||
|
self.path = path if isinstance(path, str) else str(path)
|
||||||
|
|
||||||
|
def _load_single_chat_session_html(
|
||||||
|
self, file_path: str
|
||||||
|
) -> chat_loaders.ChatSession:
|
||||||
|
"""Load a single chat session from an HTML file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path (str): Path to the HTML file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
chat_loaders.ChatSession: The loaded chat session.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"Please install the 'beautifulsoup4' package to load"
|
||||||
|
" Telegram HTML files. You can do this by running"
|
||||||
|
"'pip install beautifulsoup4' in your terminal."
|
||||||
|
)
|
||||||
|
with open(file_path, "r", encoding="utf-8") as file:
|
||||||
|
soup = BeautifulSoup(file, "html.parser")
|
||||||
|
|
||||||
|
results: List[Union[schema.HumanMessage, schema.AIMessage]] = []
|
||||||
|
previous_sender = None
|
||||||
|
for message in soup.select(".message.default"):
|
||||||
|
timestamp = message.select_one(".pull_right.date.details")["title"]
|
||||||
|
from_name_element = message.select_one(".from_name")
|
||||||
|
if from_name_element is None and previous_sender is None:
|
||||||
|
logger.debug("from_name not found in message")
|
||||||
|
continue
|
||||||
|
elif from_name_element is None:
|
||||||
|
from_name = previous_sender
|
||||||
|
else:
|
||||||
|
from_name = from_name_element.text.strip()
|
||||||
|
text = message.select_one(".text").text.strip()
|
||||||
|
results.append(
|
||||||
|
schema.HumanMessage(
|
||||||
|
content=text,
|
||||||
|
additional_kwargs={
|
||||||
|
"sender": from_name,
|
||||||
|
"events": [{"message_time": timestamp}],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
previous_sender = from_name
|
||||||
|
|
||||||
|
return chat_loaders.ChatSession(messages=results)
|
||||||
|
|
||||||
|
def _load_single_chat_session_json(
|
||||||
|
self, file_path: str
|
||||||
|
) -> chat_loaders.ChatSession:
|
||||||
|
"""Load a single chat session from a JSON file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path (str): Path to the JSON file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
chat_loaders.ChatSession: The loaded chat session.
|
||||||
|
"""
|
||||||
|
with open(file_path, "r", encoding="utf-8") as file:
|
||||||
|
data = json.load(file)
|
||||||
|
|
||||||
|
messages = data.get("messages", [])
|
||||||
|
results: List[schema.BaseMessage] = []
|
||||||
|
for message in messages:
|
||||||
|
text = message.get("text", "")
|
||||||
|
timestamp = message.get("date", "")
|
||||||
|
from_name = message.get("from", "")
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
schema.HumanMessage(
|
||||||
|
content=text,
|
||||||
|
additional_kwargs={
|
||||||
|
"sender": from_name,
|
||||||
|
"events": [{"message_time": timestamp}],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return chat_loaders.ChatSession(messages=results)
|
||||||
|
|
||||||
|
def _iterate_files(self, path: str) -> Iterator[str]:
|
||||||
|
"""Iterate over files in a directory or zip file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): Path to the directory or zip file.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
str: Path to each file.
|
||||||
|
"""
|
||||||
|
if os.path.isfile(path) and path.endswith((".html", ".json")):
|
||||||
|
yield path
|
||||||
|
elif os.path.isdir(path):
|
||||||
|
for root, _, files in os.walk(path):
|
||||||
|
for file in files:
|
||||||
|
if file.endswith((".html", ".json")):
|
||||||
|
yield os.path.join(root, file)
|
||||||
|
elif zipfile.is_zipfile(path):
|
||||||
|
with zipfile.ZipFile(path) as zip_file:
|
||||||
|
for file in zip_file.namelist():
|
||||||
|
if file.endswith((".html", ".json")):
|
||||||
|
yield zip_file.extract(file)
|
||||||
|
|
||||||
|
def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
|
||||||
|
"""Lazy load the messages from the chat file and yield them
|
||||||
|
in as chat sessions.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
chat_loaders.ChatSession: The loaded chat session.
|
||||||
|
"""
|
||||||
|
for file_path in self._iterate_files(self.path):
|
||||||
|
if file_path.endswith(".html"):
|
||||||
|
yield self._load_single_chat_session_html(file_path)
|
||||||
|
elif file_path.endswith(".json"):
|
||||||
|
yield self._load_single_chat_session_json(file_path)
|
86
libs/langchain/langchain/chat_loaders/utils.py
Normal file
86
libs/langchain/langchain/chat_loaders/utils.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
"""Utilities for chat loaders."""
|
||||||
|
from copy import deepcopy
|
||||||
|
from typing import Iterable, Iterator, List
|
||||||
|
|
||||||
|
from langchain import schema
|
||||||
|
from langchain.chat_loaders.base import ChatSession
|
||||||
|
from langchain.schema.messages import BaseMessage
|
||||||
|
|
||||||
|
|
||||||
|
def merge_chat_runs_in_session(
|
||||||
|
chat_session: ChatSession, delimiter: str = "\n\n"
|
||||||
|
) -> ChatSession:
|
||||||
|
"""Merge chat runs together in a chat session.
|
||||||
|
|
||||||
|
A chat run is a sequence of messages from the same sender.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chat_session: A chat session.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A chat session with merged chat runs.
|
||||||
|
"""
|
||||||
|
messages: List[BaseMessage] = []
|
||||||
|
for message in chat_session["messages"]:
|
||||||
|
if not messages:
|
||||||
|
messages.append(deepcopy(message))
|
||||||
|
elif (
|
||||||
|
isinstance(message, type(messages[-1]))
|
||||||
|
and messages[-1].additional_kwargs.get("sender") is not None
|
||||||
|
and messages[-1].additional_kwargs["sender"]
|
||||||
|
== message.additional_kwargs.get("sender")
|
||||||
|
):
|
||||||
|
messages[-1].content = (
|
||||||
|
messages[-1].content + delimiter + message.content
|
||||||
|
).strip()
|
||||||
|
messages[-1].additional_kwargs.get("events", []).extend(
|
||||||
|
message.additional_kwargs.get("events") or []
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
messages.append(deepcopy(message))
|
||||||
|
return ChatSession(messages=messages)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_chat_runs(chat_sessions: Iterable[ChatSession]) -> Iterator[ChatSession]:
|
||||||
|
"""Merge chat runs together.
|
||||||
|
|
||||||
|
A chat run is a sequence of messages from the same sender.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chat_sessions: A list of chat sessions.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of chat sessions with merged chat runs.
|
||||||
|
"""
|
||||||
|
for chat_session in chat_sessions:
|
||||||
|
yield merge_chat_runs_in_session(chat_session)
|
||||||
|
|
||||||
|
|
||||||
|
def map_ai_messages_in_session(chat_sessions: ChatSession, sender: str) -> ChatSession:
|
||||||
|
"""Convert messages from the specified 'sender' to AI messages.
|
||||||
|
|
||||||
|
This is useful for fine-tuning the AI to adapt to your voice.
|
||||||
|
"""
|
||||||
|
messages = []
|
||||||
|
num_converted = 0
|
||||||
|
for message in chat_sessions["messages"]:
|
||||||
|
if message.additional_kwargs.get("sender") == sender:
|
||||||
|
message = schema.AIMessage(
|
||||||
|
content=message.content,
|
||||||
|
additional_kwargs=message.additional_kwargs.copy(),
|
||||||
|
example=getattr(message, "example", None),
|
||||||
|
)
|
||||||
|
num_converted += 1
|
||||||
|
messages.append(message)
|
||||||
|
return ChatSession(messages=messages)
|
||||||
|
|
||||||
|
|
||||||
|
def map_ai_messages(
|
||||||
|
chat_sessions: Iterable[ChatSession], sender: str
|
||||||
|
) -> Iterator[ChatSession]:
|
||||||
|
"""Convert messages from the specified 'sender' to AI messages.
|
||||||
|
|
||||||
|
This is useful for fine-tuning the AI to adapt to your voice.
|
||||||
|
"""
|
||||||
|
for chat_session in chat_sessions:
|
||||||
|
yield map_ai_messages_in_session(chat_session, sender)
|
116
libs/langchain/langchain/chat_loaders/whatsapp.py
Normal file
116
libs/langchain/langchain/chat_loaders/whatsapp.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import zipfile
|
||||||
|
from typing import Iterator, List, Union
|
||||||
|
|
||||||
|
from langchain import schema
|
||||||
|
from langchain.chat_loaders import base as chat_loaders
|
||||||
|
from langchain.schema import messages
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class WhatsAppChatLoader(chat_loaders.BaseChatLoader):
|
||||||
|
def __init__(self, path: str):
|
||||||
|
"""Initialize the WhatsAppChatLoader.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): Path to the exported WhatsApp chat
|
||||||
|
zip directory, folder, or file.
|
||||||
|
|
||||||
|
To generate the dump, open the chat, click the three dots in the top
|
||||||
|
right corner, and select "More". Then select "Export chat" and
|
||||||
|
choose "Without media".
|
||||||
|
"""
|
||||||
|
self.path = path
|
||||||
|
ignore_lines = [
|
||||||
|
"This message was deleted",
|
||||||
|
"<Media omitted>",
|
||||||
|
"image omitted",
|
||||||
|
"Messages and calls are end-to-end encrypted. No one outside of this chat,"
|
||||||
|
" not even WhatsApp, can read or listen to them.",
|
||||||
|
]
|
||||||
|
self._ignore_lines = re.compile(
|
||||||
|
r"(" + "|".join([r"\u200E*" + line for line in ignore_lines]) + r")",
|
||||||
|
flags=re.IGNORECASE,
|
||||||
|
)
|
||||||
|
self._message_line_regex = re.compile(
|
||||||
|
r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)", # noqa
|
||||||
|
flags=re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _load_single_chat_session(self, file_path: str) -> chat_loaders.ChatSession:
|
||||||
|
"""Load a single chat session from a file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path (str): Path to the chat file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ChatSession: The loaded chat session.
|
||||||
|
"""
|
||||||
|
with open(file_path, "r", encoding="utf-8") as file:
|
||||||
|
txt = file.read()
|
||||||
|
|
||||||
|
# Split messages by newlines, but keep multi-line messages grouped
|
||||||
|
chat_lines: List[str] = []
|
||||||
|
current_message = ""
|
||||||
|
for line in txt.split("\n"):
|
||||||
|
if self._message_line_regex.match(line):
|
||||||
|
if current_message:
|
||||||
|
chat_lines.append(current_message)
|
||||||
|
current_message = line
|
||||||
|
else:
|
||||||
|
current_message += " " + line.strip()
|
||||||
|
if current_message:
|
||||||
|
chat_lines.append(current_message)
|
||||||
|
results: List[Union[messages.HumanMessage, messages.AIMessage]] = []
|
||||||
|
for line in chat_lines:
|
||||||
|
result = self._message_line_regex.match(line.strip())
|
||||||
|
if result:
|
||||||
|
timestamp, sender, text = result.groups()
|
||||||
|
if not self._ignore_lines.match(text.strip()):
|
||||||
|
results.append(
|
||||||
|
schema.HumanMessage(
|
||||||
|
role=sender,
|
||||||
|
content=text,
|
||||||
|
additional_kwargs={
|
||||||
|
"sender": sender,
|
||||||
|
"events": [{"message_time": timestamp}],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.debug(f"Could not parse line: {line}")
|
||||||
|
return chat_loaders.ChatSession(messages=results)
|
||||||
|
|
||||||
|
def _iterate_files(self, path: str) -> Iterator[str]:
|
||||||
|
"""Iterate over the files in a directory or zip file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): Path to the directory or zip file.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
str: The path to each file.
|
||||||
|
"""
|
||||||
|
if os.path.isfile(path):
|
||||||
|
yield path
|
||||||
|
elif os.path.isdir(path):
|
||||||
|
for root, _, files in os.walk(path):
|
||||||
|
for file in files:
|
||||||
|
if file.endswith(".txt"):
|
||||||
|
yield os.path.join(root, file)
|
||||||
|
elif zipfile.is_zipfile(path):
|
||||||
|
with zipfile.ZipFile(path) as zip_file:
|
||||||
|
for file in zip_file.namelist():
|
||||||
|
if file.endswith(".txt"):
|
||||||
|
yield zip_file.extract(file)
|
||||||
|
|
||||||
|
def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
|
||||||
|
"""Lazy load the messages from the chat file and yield
|
||||||
|
them as chat sessions.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Iterator[ChatSession]: The loaded chat sessions.
|
||||||
|
"""
|
||||||
|
yield self._load_single_chat_session(self.path)
|
@ -0,0 +1,166 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<meta charset="utf-8"/>
|
||||||
|
<title>Exported Data</title>
|
||||||
|
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
|
||||||
|
|
||||||
|
<link href="css/style.css" rel="stylesheet"/>
|
||||||
|
|
||||||
|
<script src="js/script.js" type="text/javascript">
|
||||||
|
|
||||||
|
</script>
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body onload="CheckLocation();">
|
||||||
|
|
||||||
|
<div class="page_wrap">
|
||||||
|
|
||||||
|
<div class="page_header">
|
||||||
|
|
||||||
|
<div class="content">
|
||||||
|
|
||||||
|
<div class="text bold">
|
||||||
|
Jimmeny Marvelton
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="page_body chat_page">
|
||||||
|
|
||||||
|
<div class="history">
|
||||||
|
|
||||||
|
<div class="message service" id="message-1">
|
||||||
|
|
||||||
|
<div class="body details">
|
||||||
|
23 August 2023
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="message default clearfix" id="message1">
|
||||||
|
|
||||||
|
<div class="pull_left userpic_wrap">
|
||||||
|
|
||||||
|
<div class="userpic userpic2" style="width: 42px; height: 42px">
|
||||||
|
|
||||||
|
<div class="initials" style="line-height: 42px">
|
||||||
|
bA
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="body">
|
||||||
|
|
||||||
|
<div class="pull_right date details" title="23.08.2023 13:11:23 UTC-08:00">
|
||||||
|
13:11
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="from_name">
|
||||||
|
Jimmeny Marvelton
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="text">
|
||||||
|
i refuse to converse with you
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="message default clearfix" id="message2">
|
||||||
|
|
||||||
|
<div class="pull_left userpic_wrap">
|
||||||
|
|
||||||
|
<div class="userpic userpic1" style="width: 42px; height: 42px">
|
||||||
|
|
||||||
|
<div class="initials" style="line-height: 42px">
|
||||||
|
WF
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="body">
|
||||||
|
|
||||||
|
<div class="pull_right date details" title="23.08.2023 13:13:20 UTC-08:00">
|
||||||
|
13:13
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="from_name">
|
||||||
|
Batman & Robin
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="text">
|
||||||
|
Hi nemesis
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="message default clearfix" id="message3">
|
||||||
|
|
||||||
|
<div class="pull_left userpic_wrap">
|
||||||
|
|
||||||
|
<div class="userpic userpic2" style="width: 42px; height: 42px">
|
||||||
|
|
||||||
|
<div class="initials" style="line-height: 42px">
|
||||||
|
bA
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="body">
|
||||||
|
|
||||||
|
<div class="pull_right date details" title="23.08.2023 13:15:35 UTC-08:00">
|
||||||
|
13:15
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="from_name">
|
||||||
|
Jimmeny Marvelton
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="text">
|
||||||
|
we meet again
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="message default clearfix joined" id="message4">
|
||||||
|
|
||||||
|
<div class="body">
|
||||||
|
|
||||||
|
<div class="pull_right date details" title="23.08.2023 13:15:53 UTC-08:00">
|
||||||
|
13:15
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="text">
|
||||||
|
you will not trick me this time
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
@ -0,0 +1,67 @@
|
|||||||
|
{
|
||||||
|
"name": "Jimmeny",
|
||||||
|
"type": "personal_chat",
|
||||||
|
"id": 5965280513,
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "message",
|
||||||
|
"date": "2023-08-23T13:11:23",
|
||||||
|
"date_unixtime": "1692821483",
|
||||||
|
"from": "Jimmeny Marvelton",
|
||||||
|
"from_id": "user123450513",
|
||||||
|
"text": "i refuse to converse with you",
|
||||||
|
"text_entities": [
|
||||||
|
{
|
||||||
|
"type": "plain",
|
||||||
|
"text": "i refuse to converse with you"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "message",
|
||||||
|
"date": "2023-08-23T13:13:20",
|
||||||
|
"date_unixtime": "1692821600",
|
||||||
|
"from": "Batman & Robin",
|
||||||
|
"from_id": "user6565661032",
|
||||||
|
"text": "Hi nemesis",
|
||||||
|
"text_entities": [
|
||||||
|
{
|
||||||
|
"type": "plain",
|
||||||
|
"text": "Hi nemesis"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "message",
|
||||||
|
"date": "2023-08-23T13:15:35",
|
||||||
|
"date_unixtime": "1692821735",
|
||||||
|
"from": "Jimmeny Marvelton",
|
||||||
|
"from_id": "user123450513",
|
||||||
|
"text": "we meet again",
|
||||||
|
"text_entities": [
|
||||||
|
{
|
||||||
|
"type": "plain",
|
||||||
|
"text": "we meet again"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "message",
|
||||||
|
"date": "2023-08-23T13:15:53",
|
||||||
|
"date_unixtime": "1692821753",
|
||||||
|
"from": "Jimmeny Marvelton",
|
||||||
|
"from_id": "user123450513",
|
||||||
|
"text": "you will not trick me this time",
|
||||||
|
"text_entities": [
|
||||||
|
{
|
||||||
|
"type": "plain",
|
||||||
|
"text": "you will not trick me this time"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
@ -0,0 +1,12 @@
|
|||||||
|
[8/15/23, 9:12:33 AM] Dr. Feather: Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.
|
||||||
|
[8/15/23, 9:12:43 AM] Dr. Feather: I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!
|
||||||
|
[8/15/23, 9:12:48 AM] Dr. Feather: image omitted
|
||||||
|
[8/15/23, 9:13:15 AM] Jungle Jane: That's stunning! Were you able to observe its behavior?
|
||||||
|
[8/15/23, 9:13:23 AM] Dr. Feather: image omitted
|
||||||
|
[8/15/23, 9:14:02 AM] Dr. Feather: Yes, it seemed quite social with other macaws. They're known for their playful nature.
|
||||||
|
[8/15/23, 9:14:15 AM] Jungle Jane: How's the research going on parrot communication?
|
||||||
|
[8/15/23, 9:14:30 AM] Dr. Feather: image omitted
|
||||||
|
[8/15/23, 9:14:50 AM] Dr. Feather: It's progressing well. We're learning so much about how they use sound and color to communicate.
|
||||||
|
[8/15/23, 9:15:10 AM] Jungle Jane: That's fascinating! Can't wait to read your paper on it.
|
||||||
|
[8/15/23, 9:15:20 AM] Dr. Feather: Thank you! I'll send you a draft soon.
|
||||||
|
[8/15/23, 9:25:16 PM] Jungle Jane: Looking forward to it! Keep up the great work.
|
24
libs/langchain/tests/unit_tests/chat_loaders/test_slack.py
Normal file
24
libs/langchain/tests/unit_tests/chat_loaders/test_slack.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import pathlib
|
||||||
|
|
||||||
|
from langchain.chat_loaders import slack, utils
|
||||||
|
|
||||||
|
|
||||||
|
def test_slack_chat_loader() -> None:
|
||||||
|
chat_path = (
|
||||||
|
pathlib.Path(__file__).parents[2]
|
||||||
|
/ "integration_tests"
|
||||||
|
/ "examples"
|
||||||
|
/ "slack_export.zip"
|
||||||
|
)
|
||||||
|
loader = slack.SlackChatLoader(str(chat_path))
|
||||||
|
|
||||||
|
chat_sessions = list(
|
||||||
|
utils.map_ai_messages(loader.lazy_load(), sender="U0500003428")
|
||||||
|
)
|
||||||
|
assert chat_sessions, "Chat sessions should not be empty"
|
||||||
|
|
||||||
|
assert chat_sessions[1]["messages"], "Chat messages should not be empty"
|
||||||
|
|
||||||
|
assert (
|
||||||
|
"Example message" in chat_sessions[1]["messages"][0].content
|
||||||
|
), "Chat content mismatch"
|
@ -0,0 +1,97 @@
|
|||||||
|
"""Test the telegram chat loader."""
|
||||||
|
import pathlib
|
||||||
|
import tempfile
|
||||||
|
import zipfile
|
||||||
|
from typing import Sequence
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain import schema
|
||||||
|
from langchain.chat_loaders import telegram, utils
|
||||||
|
|
||||||
|
|
||||||
|
def _assert_messages_are_equal(
    actual_messages: Sequence[schema.BaseMessage],
    expected_messages: Sequence[schema.BaseMessage],
) -> None:
    """Assert two message sequences match pairwise on content and sender."""
    assert len(actual_messages) == len(expected_messages)
    for got, want in zip(actual_messages, expected_messages):
        assert got.content == want.content
        got_sender = got.additional_kwargs["sender"]
        want_sender = want.additional_kwargs["sender"]
        assert got_sender == want_sender
|
||||||
|
|
||||||
|
|
||||||
|
def _check_telegram_chat_loader(path: str) -> None:
    """Run the Telegram loader against a fixture and verify its output.

    ``path`` is relative to the test ``data`` directory. A path ending in
    ``.zip`` is first zipped up from the matching directory into a temp
    location, so the same fixture exercises the zip-handling code path.
    """
    _data_dir = pathlib.Path(__file__).parent / "data"
    source_path = _data_dir / path
    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch = pathlib.Path(scratch_dir)
        if path.endswith(".zip"):
            # Build a fresh zip from the unzipped fixture directory.
            archive = scratch / "telegram_chat.zip"
            fixture_dir = _data_dir / path.replace(".zip", "")
            with zipfile.ZipFile(archive, "w") as zf:
                for member in fixture_dir.iterdir():
                    zf.write(member, arcname=member.name)
            source_path = archive

        loader = telegram.TelegramChatLoader(str(source_path))
        merged = utils.merge_chat_runs(loader.lazy_load())
        chat_sessions = list(
            utils.map_ai_messages(merged, sender="Batman & Robin")
        )
        assert len(chat_sessions) == 1
        session = chat_sessions[0]
        assert len(session["messages"]) > 0
        assert session["messages"][0].content == "i refuse to converse with you"
        expected_content = [
            schema.HumanMessage(
                content="i refuse to converse with you",
                additional_kwargs={
                    "sender": "Jimmeny Marvelton",
                    "events": [{"message_time": "23.08.2023 13:11:23 UTC-08:00"}],
                },
            ),
            schema.AIMessage(
                content="Hi nemesis",
                additional_kwargs={
                    "sender": "Batman & Robin",
                    "events": [{"message_time": "23.08.2023 13:13:20 UTC-08:00"}],
                },
            ),
            schema.HumanMessage(
                content="we meet again\n\nyou will not trick me this time",
                additional_kwargs={
                    "sender": "Jimmeny Marvelton",
                    "events": [{"message_time": "23.08.2023 13:15:35 UTC-08:00"}],
                },
            ),
        ]
        _assert_messages_are_equal(session["messages"], expected_content)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "path",
    [
        "telegram_chat_json",
        "telegram_chat_json.zip",
        "telegram_chat_json/result.json",
    ],
)
def test_telegram_chat_loader(path: str) -> None:
    """Exercise the JSON fixture as a directory, a zip, and a single file."""
    _check_telegram_chat_loader(path)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="requires bs4 but marking it as such doesn't seem to work")
@pytest.mark.parametrize(
    "path",
    [
        "telegram_chat_json",
        "telegram_chat_json.zip",
        "telegram_chat_json/result.json",
    ],
)
def test_telegram_chat_loader_html(path: str) -> None:
    """HTML-export variant of the loader test; skipped pending a bs4 marker."""
    _check_telegram_chat_loader(path)
|
@ -0,0 +1,20 @@
|
|||||||
|
import pathlib
|
||||||
|
|
||||||
|
from langchain.chat_loaders import utils, whatsapp
|
||||||
|
|
||||||
|
|
||||||
|
def test_whatsapp_chat_loader() -> None:
    """Load the WhatsApp text fixture and verify the first mapped message."""
    fixture = pathlib.Path(__file__).parent / "data" / "whatsapp_chat.txt"
    loader = whatsapp.WhatsAppChatLoader(str(fixture))

    chat_sessions = list(
        utils.map_ai_messages(loader.lazy_load(), sender="Dr. Feather")
    )
    assert chat_sessions, "Chat sessions should not be empty"
    first_session = chat_sessions[0]
    assert first_session["messages"], "Chat messages should not be empty"
    expected_snippet = (
        "I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest."
        " Such a magnificent creature!"
    )
    assert (
        expected_snippet in first_session["messages"][0].content
    ), "Chat content mismatch"
|
Loading…
Reference in New Issue
Block a user