Chat Loaders (#9708)

Still working out the interface and notebooks. A Discord data dump is
still needed to test anything other than copy+paste input.

Update:
- Going to remove the 'user_id' arg from the loaders themselves and
standardize on putting the "sender" value in the extra kwargs. A utility
function can then map these to AI and human messages
- Going to move the Discord loader into just a notebook, since I don't
have a good dump to test on and copy+paste maybe isn't the greatest thing
to support in v0
- Need to do more testing on Slack since it seems the dump only includes
channels and NOT one-on-one conversations

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
William FH
2023-08-24 17:23:27 -07:00
committed by GitHub
parent 0f48e6c36e
commit dff00ea91e
21 changed files with 2620 additions and 0 deletions

View File

@@ -15,6 +15,7 @@ from typing import (
from typing_extensions import Literal
from langchain.chat_loaders.base import ChatSession
from langchain.schema.messages import (
AIMessage,
AIMessageChunk,
@@ -206,3 +207,19 @@ class ChatCompletion:
_convert_message_chunk_to_delta(c, i)
async for i, c in aenumerate(model_config.astream(converted_messages))
)
def _has_assistant_message(session: ChatSession) -> bool:
    """Check if chat session has an assistant message.

    Args:
        session: The chat session whose ``"messages"`` entry is inspected.

    Returns:
        True if at least one message is an ``AIMessage``, otherwise False.
    """
    # A generator lets ``any`` stop at the first AI message instead of first
    # building a complete list of booleans.
    return any(isinstance(m, AIMessage) for m in session["messages"])
def convert_messages_for_finetuning(
    sessions: Iterable[ChatSession],
) -> List[List[dict]]:
    """Convert chat sessions into lists of message dictionaries for fine-tuning.

    Sessions that contain no assistant message are left out of the result.

    Args:
        sessions: The chat sessions to convert.

    Returns:
        One list per retained session, holding the dictionary form of each
        of that session's messages.
    """
    converted: List[List[dict]] = []
    for session in sessions:
        if not _has_assistant_message(session):
            continue
        session_dicts = [convert_message_to_dict(msg) for msg in session["messages"]]
        converted.append(session_dicts)
    return converted

View File

@@ -0,0 +1,6 @@
"""Load chat messages from common communications platforms for finetuning.

This package provides loaders for chat messages from various
communications platforms such as Facebook Messenger, Slack, Telegram, and
WhatsApp. The loaded chat messages can be used for finetuning models.
"""

View File

@@ -0,0 +1,31 @@
"""Base definitions for chat loaders.
A chat loader is a class that loads chat messages from an external
source such as a file or a database. The chat messages can then be
used for finetuning.
"""
from abc import ABC, abstractmethod
from typing import Iterator, List, Sequence, TypedDict
from langchain.schema.messages import BaseMessage
class ChatSession(TypedDict):
    """A chat session represents a single
    conversation, channel, or other group of messages.

    Declared as a ``TypedDict``, so a session is a plain dictionary at
    runtime and its messages are read with ``session["messages"]``.
    """

    # Ordered sequence of messages belonging to this session.
    messages: Sequence[BaseMessage]
    """The LangChain chat messages loaded from the source."""
class BaseChatLoader(ABC):
    """Base class for chat loaders.

    Subclasses must implement ``lazy_load``; ``load`` is derived from it.
    """

    @abstractmethod
    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy load the chat sessions.

        Returns:
            An iterator over the chat sessions provided by the subclass.
        """

    def load(self) -> List[ChatSession]:
        """Eagerly load the chat sessions into memory.

        Returns:
            A list holding every session produced by ``lazy_load``.
        """
        return list(self.lazy_load())

View File

@@ -0,0 +1,77 @@
import json
import logging
from pathlib import Path
from typing import Iterator, Union
from langchain.chat_loaders.base import BaseChatLoader, ChatSession
from langchain.schema.messages import HumanMessage
# Name the logger after the module (not the file path) so it takes part in
# the dotted logger hierarchy, matching the other chat loader modules.
logger = logging.getLogger(__name__)
class SingleFileFacebookMessengerChatLoader(BaseChatLoader):
    """A chat loader for loading Facebook Messenger chat data from a single file.

    Args:
        path (Union[Path, str]): The path to the chat file.

    Attributes:
        file_path (Path): The path to the chat file.
    """

    def __init__(self, path: Union[Path, str]) -> None:
        super().__init__()
        self.file_path = path if isinstance(path, Path) else Path(path)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy loads the chat data from the file.

        Messages are ordered by their ``timestamp_ms`` value. Entries that
        carry no ``content`` key are skipped instead of aborting the load.

        Yields:
            ChatSession: A chat session containing the loaded messages.

        Raises:
            KeyError: If the file has no ``messages`` entry, or a message
                lacks ``timestamp_ms`` or ``sender_name``.
        """
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # locale default.
        with open(self.file_path, encoding="utf-8") as f:
            data = json.load(f)
        sorted_data = sorted(data["messages"], key=lambda x: x["timestamp_ms"])
        messages = []
        for index, m in enumerate(sorted_data):
            if "content" not in m:
                logger.debug("Skipping message %d: no content present", index)
                continue
            messages.append(
                HumanMessage(
                    content=m["content"], additional_kwargs={"sender": m["sender_name"]}
                )
            )
        yield ChatSession(messages=messages)
class FolderFacebookMessengerChatLoader(BaseChatLoader):
    """A chat loader for loading Facebook Messenger chat data from a folder.

    Args:
        path (Union[str, Path]): The path to the directory
            containing the chat files.

    Attributes:
        directory_path (Path): The path to the directory containing the
            chat files.
    """

    def __init__(self, path: Union[str, Path]) -> None:
        super().__init__()
        self.directory_path = Path(path) if isinstance(path, str) else path

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy loads the chat data from the folder.

        Looks inside the ``inbox`` sub-directory and loads every ``.json``
        file found directly within each of its sub-directories.

        Yields:
            ChatSession: A chat session containing the loaded messages.
        """
        inbox_path = self.directory_path / "inbox"
        # Sort the listings: ``iterdir`` order is OS-dependent, and a stable
        # order makes the yielded sessions reproducible across runs.
        for _dir in sorted(inbox_path.iterdir()):
            if not _dir.is_dir():
                continue
            for _file in sorted(_dir.iterdir()):
                if _file.suffix.lower() == ".json":
                    file_loader = SingleFileFacebookMessengerChatLoader(path=_file)
                    yield from file_loader.lazy_load()

View File

@@ -0,0 +1,84 @@
import json
import logging
import re
import zipfile
from pathlib import Path
from typing import Dict, Iterator, List, Union
from langchain import schema
from langchain.chat_loaders import base as chat_loaders
logger = logging.getLogger(__name__)
class SlackChatLoader(chat_loaders.BaseChatLoader):
    """Load chat sessions from an exported Slack dump zip file.

    Each ``.json`` member of the archive is parsed as a list of message
    dictionaries and converted into one chat session. Consecutive messages
    from the same user are merged into a single message.
    """

    # Slack's automatic "joined the channel" notices are not conversation
    # content. The pattern is compiled once here rather than on every loop
    # iteration, and accepts alphanumeric user IDs (prefixed with U or W)
    # instead of digits only.
    _JOIN_NOTICE_PATTERN = re.compile(
        r"<@[UW][A-Z0-9]+> has joined the channel", flags=re.IGNORECASE
    )

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """
        Initialize the chat loader with the path to the exported Slack dump zip file.

        :param path: Path to the exported Slack dump zip file.
        :raises FileNotFoundError: If ``path`` does not exist.
        """
        self.zip_path = path if isinstance(path, Path) else Path(path)
        if not self.zip_path.exists():
            raise FileNotFoundError(f"File {self.zip_path} not found")

    def _load_single_chat_session(
        self, messages: List[Dict]
    ) -> chat_loaders.ChatSession:
        """
        Convert the raw message dictionaries of one JSON file into a chat session.

        Entries that are not dictionaries, have no ``user`` value, or are
        channel-join notices are skipped.

        :param messages: Raw message dictionaries read from the dump.
        :return: Chat session with runs of same-sender messages merged.
        """
        results: List[Union[schema.AIMessage, schema.HumanMessage]] = []
        previous_sender = None
        for message in messages:
            if not isinstance(message, dict):
                continue
            text = message.get("text", "")
            timestamp = message.get("ts", "")
            sender = message.get("user", "")
            if not sender:
                continue
            if self._JOIN_NOTICE_PATTERN.match(text):
                continue
            if sender == previous_sender:
                # Same sender as the previous kept message: extend that
                # message and record the extra timestamp.
                results[-1].content += "\n\n" + text
                results[-1].additional_kwargs["events"].append(
                    {"message_time": timestamp}
                )
            else:
                results.append(
                    schema.HumanMessage(
                        role=sender,
                        content=text,
                        additional_kwargs={
                            "sender": sender,
                            "events": [{"message_time": timestamp}],
                        },
                    )
                )
            previous_sender = sender
        return chat_loaders.ChatSession(messages=results)

    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
        """Read JSON data from a zip subfile.

        :param zip_file: The opened Slack dump archive.
        :param file_path: Name of the archive member to read.
        :return: The decoded JSON list.
        :raises ValueError: If the decoded JSON is not a list.
        """
        with zip_file.open(file_path, "r") as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError(f"Expected list of dictionaries, got {type(data)}")
        return data

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """
        Lazy load the chat sessions from the Slack dump file and yield them
        in the required format.

        :return: Iterator of chat sessions containing messages.
        """
        with zipfile.ZipFile(str(self.zip_path), "r") as zip_file:
            for file_path in zip_file.namelist():
                if file_path.endswith(".json"):
                    messages = self._read_json(zip_file, file_path)
                    yield self._load_single_chat_session(messages)

View File

@@ -0,0 +1,152 @@
import json
import logging
import os
import zipfile
from pathlib import Path
from typing import Iterator, List, Union
from langchain import schema
from langchain.chat_loaders import base as chat_loaders
logger = logging.getLogger(__name__)
class TelegramChatLoader(chat_loaders.BaseChatLoader):
    """A loading utility for converting telegram conversations
    to LangChain chat messages.

    To export, use the Telegram Desktop app from
    https://desktop.telegram.org/, select a conversation, click the three dots
    in the top right corner, and select "Export chat history". Then select
    "Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
    the desktop app (like "Telegram for MacOS") do not support exporting chat
    history.
    """

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """Initialize the TelegramChatLoader.

        Args:
            path (Union[str, Path]): Path to the exported Telegram chat zip,
                directory, json, or HTML file.
        """
        self.path = path if isinstance(path, str) else str(path)

    def _load_single_chat_session_html(
        self, file_path: str
    ) -> chat_loaders.ChatSession:
        """Load a single chat session from an HTML file.

        Args:
            file_path (str): Path to the HTML file.

        Returns:
            chat_loaders.ChatSession: The loaded chat session.

        Raises:
            ImportError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "Please install the 'beautifulsoup4' package to load"
                " Telegram HTML files. You can do this by running"
                " 'pip install beautifulsoup4' in your terminal."
            ) from e
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")
        results: List[Union[schema.HumanMessage, schema.AIMessage]] = []
        previous_sender = None
        for message in soup.select(".message.default"):
            # Tolerate messages lacking a date or text element instead of
            # failing on ``None``; an empty string mirrors the defaults used
            # by the JSON loader.
            date_element = message.select_one(".pull_right.date.details")
            timestamp = date_element.get("title", "") if date_element else ""
            from_name_element = message.select_one(".from_name")
            if from_name_element is None and previous_sender is None:
                logger.debug("from_name not found in message")
                continue
            elif from_name_element is None:
                # Messages without a sender name are attributed to the
                # sender of the preceding message.
                from_name = previous_sender
            else:
                from_name = from_name_element.text.strip()
            text_element = message.select_one(".text")
            text = text_element.text.strip() if text_element is not None else ""
            results.append(
                schema.HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )
            previous_sender = from_name
        return chat_loaders.ChatSession(messages=results)

    @staticmethod
    def _flatten_text(text: Union[str, list]) -> Union[str, list]:
        """Join a ``text`` value given as a list of parts into one string.

        Plain-string parts are used as-is and dictionary parts contribute
        their ``"text"`` value. Values that are not lists are returned
        unchanged.
        """
        if not isinstance(text, list):
            return text
        parts: List[str] = []
        for part in text:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict):
                parts.append(str(part.get("text", "")))
        return "".join(parts)

    def _load_single_chat_session_json(
        self, file_path: str
    ) -> chat_loaders.ChatSession:
        """Load a single chat session from a JSON file.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            chat_loaders.ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        messages = data.get("messages", [])
        results: List[schema.BaseMessage] = []
        for message in messages:
            # Formatted messages may carry ``text`` as a list of string and
            # entity parts; message content must be a single string.
            text = self._flatten_text(message.get("text", ""))
            timestamp = message.get("date", "")
            from_name = message.get("from", "")
            results.append(
                schema.HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )
        return chat_loaders.ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over files in a directory or zip file.

        Zip members are extracted into a temporary directory that is removed
        once iteration finishes, so a yielded path from a zip archive is only
        valid while the iteration is in progress.

        Args:
            path (str): Path to the directory or zip file.

        Yields:
            str: Path to each file.
        """
        if os.path.isfile(path) and path.endswith((".html", ".json")):
            yield path
        elif os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith((".html", ".json")):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            # Imported locally so this method stays self-contained.
            import tempfile

            # Extract to a throwaway directory instead of the current
            # working directory, which would be left littered with files.
            with tempfile.TemporaryDirectory() as temp_dir:
                with zipfile.ZipFile(path) as zip_file:
                    for file in zip_file.namelist():
                        if file.endswith((".html", ".json")):
                            yield zip_file.extract(file, path=temp_dir)

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """Lazy load the messages from the chat file and yield them
        in as chat sessions.

        Yields:
            chat_loaders.ChatSession: The loaded chat session.
        """
        for file_path in self._iterate_files(self.path):
            if file_path.endswith(".html"):
                yield self._load_single_chat_session_html(file_path)
            elif file_path.endswith(".json"):
                yield self._load_single_chat_session_json(file_path)

View File

@@ -0,0 +1,86 @@
"""Utilities for chat loaders."""
from copy import deepcopy
from typing import Iterable, Iterator, List
from langchain import schema
from langchain.chat_loaders.base import ChatSession
from langchain.schema.messages import BaseMessage
def merge_chat_runs_in_session(
    chat_session: ChatSession, delimiter: str = "\n\n"
) -> ChatSession:
    """Merge chat runs together in a chat session.

    A chat run is a sequence of messages from the same sender. Messages are
    merged only when they share the same message type and the same
    non-``None`` ``"sender"`` value. The input messages are deep-copied, so
    the messages of ``chat_session`` are not modified.

    Args:
        chat_session: A chat session.
        delimiter: Text inserted between the contents of merged messages.

    Returns:
        A chat session with merged chat runs.
    """
    messages: List[BaseMessage] = []
    for message in chat_session["messages"]:
        previous = messages[-1] if messages else None
        sender = message.additional_kwargs.get("sender")
        if (
            previous is not None
            and isinstance(message, type(previous))
            and sender is not None
            and previous.additional_kwargs.get("sender") == sender
        ):
            previous.content = (previous.content + delimiter + message.content).strip()
            new_events = message.additional_kwargs.get("events") or []
            if new_events:
                # Store the combined list back on the merged message so the
                # events are kept even when the first message of the run had
                # no "events" entry of its own.
                existing_events = previous.additional_kwargs.get("events") or []
                previous.additional_kwargs["events"] = [
                    *existing_events,
                    *new_events,
                ]
        else:
            messages.append(deepcopy(message))
    return ChatSession(messages=messages)
def merge_chat_runs(chat_sessions: Iterable[ChatSession]) -> Iterator[ChatSession]:
    """Merge chat runs together, one session at a time.

    A chat run is a sequence of messages from the same sender.

    Args:
        chat_sessions: The chat sessions to process.

    Yields:
        Each input chat session with its chat runs merged.
    """
    yield from (merge_chat_runs_in_session(session) for session in chat_sessions)
def map_ai_messages_in_session(chat_sessions: ChatSession, sender: str) -> ChatSession:
    """Convert messages from the specified 'sender' to AI messages.

    This is useful for fine-tuning the AI to adapt to your voice.

    Args:
        chat_sessions: The single chat session to convert (the plural name is
            kept for backward compatibility).
        sender: Value of the ``"sender"`` additional kwarg identifying the
            messages to convert.

    Returns:
        A new chat session in which matching messages are replaced by AI
        messages; all other messages are passed through unchanged.
    """
    messages = []
    for message in chat_sessions["messages"]:
        if message.additional_kwargs.get("sender") == sender:
            message = schema.AIMessage(
                content=message.content,
                additional_kwargs=message.additional_kwargs.copy(),
                # Fall back to a boolean rather than None for message types
                # that have no ``example`` attribute.
                # NOTE(review): assumes AIMessage.example is a bool field
                # defaulting to False - confirm against the schema.
                example=getattr(message, "example", False),
            )
        messages.append(message)
    return ChatSession(messages=messages)
def map_ai_messages(
    chat_sessions: Iterable[ChatSession], sender: str
) -> Iterator[ChatSession]:
    """Convert the given sender's messages to AI messages in every session.

    This is useful for fine-tuning the AI to adapt to your voice.

    Args:
        chat_sessions: The chat sessions to process.
        sender: The sender whose messages become AI messages.

    Yields:
        Each input chat session after conversion.
    """
    yield from (
        map_ai_messages_in_session(session, sender) for session in chat_sessions
    )

View File

@@ -0,0 +1,116 @@
import logging
import os
import re
import zipfile
from typing import Iterator, List, Union
from langchain import schema
from langchain.chat_loaders import base as chat_loaders
from langchain.schema import messages
logger = logging.getLogger(__name__)
class WhatsAppChatLoader(chat_loaders.BaseChatLoader):
    """Load chat sessions from an exported WhatsApp chat.

    The export may be given as a single chat file, a folder containing
    ``.txt`` files, or a zip archive containing ``.txt`` files.
    """

    def __init__(self, path: str):
        """Initialize the WhatsAppChatLoader.

        Args:
            path (str): Path to the exported WhatsApp chat
                zip directory, folder, or file.

        To generate the dump, open the chat, click the three dots in the top
        right corner, and select "More". Then select "Export chat" and
        choose "Without media".
        """
        self.path = path
        ignore_lines = [
            "This message was deleted",
            "<Media omitted>",
            "image omitted",
            "Messages and calls are end-to-end encrypted. No one outside of this chat,"
            " not even WhatsApp, can read or listen to them.",
        ]
        # The ignore lines are literal phrases, so escape them to keep
        # characters such as "." from acting as regex wildcards.
        escaped_lines = [r"\u200E*" + re.escape(line) for line in ignore_lines]
        self._ignore_lines = re.compile(
            r"(" + "|".join(escaped_lines) + r")",
            flags=re.IGNORECASE,
        )
        self._message_line_regex = re.compile(
            r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)",  # noqa
            flags=re.IGNORECASE,
        )

    def _load_single_chat_session(self, file_path: str) -> chat_loaders.ChatSession:
        """Load a single chat session from a file.

        Lines that do not start a new message are appended to the message
        before them. Messages whose text starts with one of the ignore
        phrases are dropped.

        Args:
            file_path (str): Path to the chat file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            txt = file.read()
        # Split messages by newlines, but keep multi-line messages grouped
        chat_lines: List[str] = []
        current_message = ""
        for line in txt.split("\n"):
            if self._message_line_regex.match(line):
                if current_message:
                    chat_lines.append(current_message)
                current_message = line
            else:
                current_message += " " + line.strip()
        if current_message:
            chat_lines.append(current_message)
        results: List[Union[messages.HumanMessage, messages.AIMessage]] = []
        for line in chat_lines:
            result = self._message_line_regex.match(line.strip())
            if result:
                timestamp, sender, text = result.groups()
                if not self._ignore_lines.match(text.strip()):
                    results.append(
                        schema.HumanMessage(
                            role=sender,
                            content=text,
                            additional_kwargs={
                                "sender": sender,
                                "events": [{"message_time": timestamp}],
                            },
                        )
                    )
            else:
                logger.debug(f"Could not parse line: {line}")
        return chat_loaders.ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over the files in a directory or zip file.

        Zip members are extracted into a temporary directory that is removed
        once iteration finishes, so a yielded path from a zip archive is only
        valid while the iteration is in progress.

        Args:
            path (str): Path to the directory or zip file.

        Yields:
            str: The path to each file.
        """
        if os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith(".txt"):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            # Checked before the plain-file case: a zip archive is also a
            # regular file, so testing ``isfile`` first made this branch
            # unreachable.
            # Imported locally so this method stays self-contained.
            import tempfile

            # Extract to a throwaway directory instead of the current
            # working directory, which would be left littered with files.
            with tempfile.TemporaryDirectory() as temp_dir:
                with zipfile.ZipFile(path) as zip_file:
                    for file in zip_file.namelist():
                        if file.endswith(".txt"):
                            yield zip_file.extract(file, path=temp_dir)
        elif os.path.isfile(path):
            yield path

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """Lazy load the messages from the chat file and yield
        them as chat sessions.

        Yields:
            Iterator[ChatSession]: The loaded chat sessions.

        Raises:
            FileNotFoundError: If the configured path does not exist.
        """
        # Keep failing loudly for a missing path, as opening it directly did.
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"File {self.path} not found")
        # Go through ``_iterate_files`` so folders and zip archives are
        # handled as the constructor documents.
        for file_path in self._iterate_files(self.path):
            yield self._load_single_chat_session(file_path)

View File

@@ -0,0 +1,166 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Exported Data</title>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="css/style.css" rel="stylesheet"/>
<script src="js/script.js" type="text/javascript">
</script>
</head>
<body onload="CheckLocation();">
<div class="page_wrap">
<div class="page_header">
<div class="content">
<div class="text bold">
Jimmeny Marvelton
</div>
</div>
</div>
<div class="page_body chat_page">
<div class="history">
<div class="message service" id="message-1">
<div class="body details">
23 August 2023
</div>
</div>
<div class="message default clearfix" id="message1">
<div class="pull_left userpic_wrap">
<div class="userpic userpic2" style="width: 42px; height: 42px">
<div class="initials" style="line-height: 42px">
bA
</div>
</div>
</div>
<div class="body">
<div class="pull_right date details" title="23.08.2023 13:11:23 UTC-08:00">
13:11
</div>
<div class="from_name">
Jimmeny Marvelton
</div>
<div class="text">
i refuse to converse with you
</div>
</div>
</div>
<div class="message default clearfix" id="message2">
<div class="pull_left userpic_wrap">
<div class="userpic userpic1" style="width: 42px; height: 42px">
<div class="initials" style="line-height: 42px">
WF
</div>
</div>
</div>
<div class="body">
<div class="pull_right date details" title="23.08.2023 13:13:20 UTC-08:00">
13:13
</div>
<div class="from_name">
Batman &amp; Robin
</div>
<div class="text">
Hi nemesis
</div>
</div>
</div>
<div class="message default clearfix" id="message3">
<div class="pull_left userpic_wrap">
<div class="userpic userpic2" style="width: 42px; height: 42px">
<div class="initials" style="line-height: 42px">
bA
</div>
</div>
</div>
<div class="body">
<div class="pull_right date details" title="23.08.2023 13:15:35 UTC-08:00">
13:15
</div>
<div class="from_name">
Jimmeny Marvelton
</div>
<div class="text">
we meet again
</div>
</div>
</div>
<div class="message default clearfix joined" id="message4">
<div class="body">
<div class="pull_right date details" title="23.08.2023 13:15:53 UTC-08:00">
13:15
</div>
<div class="text">
you will not trick me this time
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,67 @@
{
"name": "Jimmeny",
"type": "personal_chat",
"id": 5965280513,
"messages": [
{
"id": 1,
"type": "message",
"date": "2023-08-23T13:11:23",
"date_unixtime": "1692821483",
"from": "Jimmeny Marvelton",
"from_id": "user123450513",
"text": "i refuse to converse with you",
"text_entities": [
{
"type": "plain",
"text": "i refuse to converse with you"
}
]
},
{
"id": 2,
"type": "message",
"date": "2023-08-23T13:13:20",
"date_unixtime": "1692821600",
"from": "Batman & Robin",
"from_id": "user6565661032",
"text": "Hi nemesis",
"text_entities": [
{
"type": "plain",
"text": "Hi nemesis"
}
]
},
{
"id": 3,
"type": "message",
"date": "2023-08-23T13:15:35",
"date_unixtime": "1692821735",
"from": "Jimmeny Marvelton",
"from_id": "user123450513",
"text": "we meet again",
"text_entities": [
{
"type": "plain",
"text": "we meet again"
}
]
},
{
"id": 4,
"type": "message",
"date": "2023-08-23T13:15:53",
"date_unixtime": "1692821753",
"from": "Jimmeny Marvelton",
"from_id": "user123450513",
"text": "you will not trick me this time",
"text_entities": [
{
"type": "plain",
"text": "you will not trick me this time"
}
]
}
]
}

View File

@@ -0,0 +1,12 @@
[8/15/23, 9:12:33 AM] Dr. Feather: Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.
[8/15/23, 9:12:43 AM] Dr. Feather: I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!
[8/15/23, 9:12:48 AM] Dr. Feather: image omitted
[8/15/23, 9:13:15 AM] Jungle Jane: That's stunning! Were you able to observe its behavior?
[8/15/23, 9:13:23 AM] Dr. Feather: image omitted
[8/15/23, 9:14:02 AM] Dr. Feather: Yes, it seemed quite social with other macaws. They're known for their playful nature.
[8/15/23, 9:14:15 AM] Jungle Jane: How's the research going on parrot communication?
[8/15/23, 9:14:30 AM] Dr. Feather: image omitted
[8/15/23, 9:14:50 AM] Dr. Feather: It's progressing well. We're learning so much about how they use sound and color to communicate.
[8/15/23, 9:15:10 AM] Jungle Jane: That's fascinating! Can't wait to read your paper on it.
[8/15/23, 9:15:20 AM] Dr. Feather: Thank you! I'll send you a draft soon.
[8/15/23, 9:25:16 PM] Jungle Jane: Looking forward to it! Keep up the great work.

View File

@@ -0,0 +1,24 @@
import pathlib
from langchain.chat_loaders import slack, utils
def test_slack_chat_loader() -> None:
    """Load the example Slack export and check the second session's content."""
    tests_root = pathlib.Path(__file__).parents[2]
    chat_path = tests_root / "integration_tests" / "examples" / "slack_export.zip"
    raw_sessions = slack.SlackChatLoader(str(chat_path)).lazy_load()
    chat_sessions = list(utils.map_ai_messages(raw_sessions, sender="U0500003428"))
    assert chat_sessions, "Chat sessions should not be empty"
    second_session_messages = chat_sessions[1]["messages"]
    assert second_session_messages, "Chat messages should not be empty"
    first_content = second_session_messages[0].content
    assert "Example message" in first_content, "Chat content mismatch"

View File

@@ -0,0 +1,97 @@
"""Test the telegram chat loader."""
import pathlib
import tempfile
import zipfile
from typing import Sequence
import pytest
from langchain import schema
from langchain.chat_loaders import telegram, utils
def _assert_messages_are_equal(
    actual_messages: Sequence[schema.BaseMessage],
    expected_messages: Sequence[schema.BaseMessage],
) -> None:
    """Assert both sequences have equal length, contents, and senders.

    Only ``content`` and the ``"sender"`` additional kwarg are compared.
    """
    assert len(actual_messages) == len(expected_messages)
    for got, want in zip(actual_messages, expected_messages):
        assert got.content == want.content
        got_sender = got.additional_kwargs["sender"]
        want_sender = want.additional_kwargs["sender"]
        assert got_sender == want_sender
def _check_telegram_chat_loader(path: str) -> None:
    """Load the Telegram fixture at ``path`` and verify the parsed session.

    ``path`` is relative to the ``data`` directory beside this test file.
    When it ends with ``.zip``, an archive is first built in a temporary
    directory from the directory of the same name without the suffix.
    """
    data_root = pathlib.Path(__file__).parent / "data"
    source_path = data_root / path
    # The archive, when needed, lives in a temporary directory.
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = pathlib.Path(scratch)
        if path.endswith(".zip"):
            archive_path = scratch_dir / "telegram_chat.zip"
            with zipfile.ZipFile(archive_path, "w") as archive:
                unzipped_dir = data_root / path.replace(".zip", "")
                for member in unzipped_dir.iterdir():
                    archive.write(member, arcname=member.name)
            source_path = archive_path
        loader = telegram.TelegramChatLoader(str(source_path))
        merged_sessions = utils.merge_chat_runs(loader.lazy_load())
        mapped_sessions = utils.map_ai_messages(merged_sessions, sender="Batman & Robin")
        chat_sessions = list(mapped_sessions)
        assert len(chat_sessions) == 1
        session = chat_sessions[0]
        assert len(session["messages"]) > 0
        assert session["messages"][0].content == "i refuse to converse with you"
        first_human = schema.HumanMessage(
            content="i refuse to converse with you",
            additional_kwargs={
                "sender": "Jimmeny Marvelton",
                "events": [{"message_time": "23.08.2023 13:11:23 UTC-08:00"}],
            },
        )
        ai_reply = schema.AIMessage(
            content="Hi nemesis",
            additional_kwargs={
                "sender": "Batman & Robin",
                "events": [{"message_time": "23.08.2023 13:13:20 UTC-08:00"}],
            },
        )
        merged_human = schema.HumanMessage(
            content="we meet again\n\nyou will not trick me this time",
            additional_kwargs={
                "sender": "Jimmeny Marvelton",
                "events": [{"message_time": "23.08.2023 13:15:35 UTC-08:00"}],
            },
        )
        expected_content = [first_human, ai_reply, merged_human]
        _assert_messages_are_equal(session["messages"], expected_content)
@pytest.mark.parametrize(
    "path",
    [
        "telegram_chat_json",
        "telegram_chat_json.zip",
        "telegram_chat_json/result.json",
    ],
)
def test_telegram_chat_loader(path: str) -> None:
    """Run the shared Telegram loader check for each parametrized path.

    The parameters cover a directory name, a ``.zip`` name, and a path to a
    single JSON file.
    """
    _check_telegram_chat_loader(path)
@pytest.mark.skip(reason="requires bs4 but marking it as such doesn't seem to work")
@pytest.mark.parametrize(
    "path",
    [
        # NOTE(review): these are the same JSON fixture paths used by
        # test_telegram_chat_loader, so as written this test would not
        # exercise the HTML code path - confirm the intended HTML fixture
        # paths before enabling the test.
        "telegram_chat_json",
        "telegram_chat_json.zip",
        "telegram_chat_json/result.json",
    ],
)
def test_telegram_chat_loader_html(path: str) -> None:
    """Run the shared Telegram loader check (currently always skipped)."""
    _check_telegram_chat_loader(path)

View File

@@ -0,0 +1,20 @@
import pathlib
from langchain.chat_loaders import utils, whatsapp
def test_whatsapp_chat_loader() -> None:
    """Load the WhatsApp fixture and check the first message's content."""
    fixture = pathlib.Path(__file__).parent / "data" / "whatsapp_chat.txt"
    raw_sessions = whatsapp.WhatsAppChatLoader(str(fixture)).lazy_load()
    chat_sessions = list(utils.map_ai_messages(raw_sessions, sender="Dr. Feather"))
    assert chat_sessions, "Chat sessions should not be empty"
    first_session_messages = chat_sessions[0]["messages"]
    assert first_session_messages, "Chat messages should not be empty"
    expected_snippet = (
        "I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest."
        " Such a magnificent creature!"
    )
    assert (
        expected_snippet in first_session_messages[0].content
    ), "Chat content mismatch"