mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 23:12:38 +00:00
Chat Loaders (#9708)
Still working out interface/notebooks + need discord data dump to test out things other than copy+paste Update: - Going to remove the 'user_id' arg in the loaders themselves and just standardize on putting the "sender" arg in the extra kwargs. Then can provide a utility function to map these to ai and human messages - Going to move the discord one into just a notebook since I don't have a good dump to test on and copy+paste maybe isn't the greatest thing to support in v0 - Need to do more testing on slack since it seems the dump only includes channels and NOT 1 on 1 convos - --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@ from typing import (
|
||||
|
||||
from typing_extensions import Literal
|
||||
|
||||
from langchain.chat_loaders.base import ChatSession
|
||||
from langchain.schema.messages import (
|
||||
AIMessage,
|
||||
AIMessageChunk,
|
||||
@@ -206,3 +207,19 @@ class ChatCompletion:
|
||||
_convert_message_chunk_to_delta(c, i)
|
||||
async for i, c in aenumerate(model_config.astream(converted_messages))
|
||||
)
|
||||
|
||||
|
||||
def _has_assistant_message(session: ChatSession) -> bool:
    """Return True when at least one message in the session is an AIMessage."""
    for candidate in session["messages"]:
        if isinstance(candidate, AIMessage):
            return True
    return False
|
||||
|
||||
|
||||
def convert_messages_for_finetuning(
    sessions: Iterable[ChatSession],
) -> List[List[dict]]:
    """Turn chat sessions into per-session lists of message dictionaries.

    Sessions that contain no assistant (AI) message are left out of the
    result; every remaining session contributes one inner list, with one
    dictionary per message produced by ``convert_message_to_dict``.
    """
    converted: List[List[dict]] = []
    for chat in sessions:
        if not _has_assistant_message(chat):
            continue
        converted.append([convert_message_to_dict(msg) for msg in chat["messages"]])
    return converted
|
||||
|
6
libs/langchain/langchain/chat_loaders/__init__.py
Normal file
6
libs/langchain/langchain/chat_loaders/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Load chat messages from common communications platforms for finetuning.
|
||||
|
||||
This module provides loaders that read chat messages from various
|
||||
communications platforms such as Facebook Messenger, Telegram, and
|
||||
WhatsApp. The loaded chat messages can be used for finetuning models.
|
||||
"""
|
31
libs/langchain/langchain/chat_loaders/base.py
Normal file
31
libs/langchain/langchain/chat_loaders/base.py
Normal file
@@ -0,0 +1,31 @@
|
||||
"""Base definitions for chat loaders.
|
||||
|
||||
A chat loader is a class that loads chat messages from an external
|
||||
source such as a file or a database. The chat messages can then be
|
||||
used for finetuning.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Iterator, List, Sequence, TypedDict
|
||||
|
||||
from langchain.schema.messages import BaseMessage
|
||||
|
||||
|
||||
class ChatSession(TypedDict):
    """Typed dictionary describing one conversation, channel, or other
    group of messages."""

    # The LangChain chat messages loaded from the source.
    messages: Sequence[BaseMessage]
|
||||
|
||||
|
||||
class BaseChatLoader(ABC):
    """Abstract interface shared by all chat loaders."""

    @abstractmethod
    def lazy_load(self) -> Iterator[ChatSession]:
        """Produce chat sessions one at a time; subclasses must implement this."""

    def load(self) -> List[ChatSession]:
        """Materialize everything ``lazy_load`` produces into a list."""
        return [session for session in self.lazy_load()]
|
77
libs/langchain/langchain/chat_loaders/facebook_messenger.py
Normal file
77
libs/langchain/langchain/chat_loaders/facebook_messenger.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Union
|
||||
|
||||
from langchain.chat_loaders.base import BaseChatLoader, ChatSession
|
||||
from langchain.schema.messages import HumanMessage
|
||||
|
||||
# Name the logger after the module (not the file path) so it takes part in the
# dotted logger hierarchy, matching the other chat loader modules.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SingleFileFacebookMessengerChatLoader(BaseChatLoader):
    """A chat loader for loading Facebook Messenger chat data from a single file.

    Args:
        path (Union[Path, str]): The path to the chat file.

    Attributes:
        file_path (Path): The path to the chat file.

    """

    def __init__(self, path: Union[Path, str]) -> None:
        super().__init__()
        # Path() accepts both str and Path, so no isinstance dispatch is needed.
        self.file_path = Path(path)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy loads the chat data from the file.

        Messages are ordered by their ``timestamp_ms`` value. Entries that
        have no ``content`` key are skipped instead of aborting the load.

        Yields:
            ChatSession: A chat session containing the loaded messages.

        Raises:
            KeyError: If the file has no ``messages`` list, or a message lacks
                ``timestamp_ms`` or ``sender_name``.

        """
        # Read as UTF-8 explicitly rather than relying on the platform's
        # default locale encoding.
        with open(self.file_path, encoding="utf-8") as f:
            data = json.load(f)
        sorted_data = sorted(data["messages"], key=lambda x: x["timestamp_ms"])
        messages = []
        for m in sorted_data:
            content = m.get("content")
            if content is None:
                # Presumably a non-text entry (e.g. media only); there is
                # nothing to turn into a chat message.
                logger.debug(
                    "Skipping message without content in %s", self.file_path
                )
                continue
            messages.append(
                HumanMessage(
                    content=content, additional_kwargs={"sender": m["sender_name"]}
                )
            )
        yield ChatSession(messages=messages)
|
||||
|
||||
|
||||
class FolderFacebookMessengerChatLoader(BaseChatLoader):
    """Chat loader that reads Facebook Messenger chat data from a folder.

    Args:
        path (Union[str, Path]): The path to the directory
            containing the chat files.

    Attributes:
        directory_path (Path): The path to the directory containing the
            chat files.

    """

    def __init__(self, path: Union[str, Path]) -> None:
        super().__init__()
        self.directory_path = path if not isinstance(path, str) else Path(path)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Walk the ``inbox`` sub-folder and load every JSON file found in
        its immediate sub-directories.

        Yields:
            ChatSession: A chat session containing the loaded messages.

        """
        inbox_path = self.directory_path / "inbox"
        for conversation_dir in inbox_path.iterdir():
            if not conversation_dir.is_dir():
                continue
            for candidate in conversation_dir.iterdir():
                if candidate.suffix.lower() != ".json":
                    continue
                single_loader = SingleFileFacebookMessengerChatLoader(path=candidate)
                yield from single_loader.lazy_load()
|
84
libs/langchain/langchain/chat_loaders/slack.py
Normal file
84
libs/langchain/langchain/chat_loaders/slack.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Union
|
||||
|
||||
from langchain import schema
|
||||
from langchain.chat_loaders import base as chat_loaders
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SlackChatLoader(chat_loaders.BaseChatLoader):
    """Load chat sessions from the JSON files of an exported Slack dump zip.

    Every ``.json`` member of the archive yields one chat session.
    Consecutive messages from the same user are merged into one message.
    """

    # Compiled once instead of on every loop iteration.
    # NOTE(review): broadened from digits-only (``U\d+``) on the assumption
    # that Slack user IDs may also contain letters - confirm against Slack's
    # documented ID format.
    _JOIN_NOTICE = re.compile(
        r"<@[UW][A-Z0-9]+> has joined the channel", flags=re.IGNORECASE
    )

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """
        Initialize the chat loader with the path to the exported Slack dump zip file.

        :param path: Path to the exported Slack dump zip file.
        :raises FileNotFoundError: If ``path`` does not exist.
        """
        super().__init__()
        self.zip_path = Path(path)
        if not self.zip_path.exists():
            raise FileNotFoundError(f"File {self.zip_path} not found")

    def _load_single_chat_session(
        self, messages: List[Dict]
    ) -> chat_loaders.ChatSession:
        """Convert the raw message dictionaries of one JSON file to a session.

        Entries that are not dictionaries, have no ``user`` value, or are
        "has joined the channel" notices are skipped. A message from the same
        user as the previously kept message is appended to that message
        (separated by a blank line) and its timestamp is added to ``events``.

        :param messages: Parsed JSON list of Slack message dictionaries.
        :return: Chat session holding the converted messages.
        """
        results: List[Union[schema.AIMessage, schema.HumanMessage]] = []
        previous_sender = None
        for message in messages:
            if not isinstance(message, dict):
                continue
            # ``or ""`` also covers an explicit JSON null, which would
            # otherwise crash the regex match below.
            text = message.get("text") or ""
            timestamp = message.get("ts", "")
            sender = message.get("user", "")
            if not sender:
                continue
            if self._JOIN_NOTICE.match(text):
                continue
            if sender == previous_sender:
                results[-1].content += "\n\n" + text
                results[-1].additional_kwargs["events"].append(
                    {"message_time": timestamp}
                )
            else:
                results.append(
                    schema.HumanMessage(
                        role=sender,
                        content=text,
                        additional_kwargs={
                            "sender": sender,
                            "events": [{"message_time": timestamp}],
                        },
                    )
                )
            previous_sender = sender
        return chat_loaders.ChatSession(messages=results)

    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
        """Read JSON data from a zip subfile.

        :raises ValueError: If the decoded JSON document is not a list.
        """
        with zip_file.open(file_path, "r") as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError(f"Expected list of dictionaries, got {type(data)}")
        return data

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """
        Lazy load the chat sessions from the Slack dump file and yield them
        in the required format.

        :return: Iterator of chat sessions containing messages.
        """
        with zipfile.ZipFile(str(self.zip_path), "r") as zip_file:
            for file_path in zip_file.namelist():
                if file_path.endswith(".json"):
                    messages = self._read_json(zip_file, file_path)
                    yield self._load_single_chat_session(messages)
|
152
libs/langchain/langchain/chat_loaders/telegram.py
Normal file
152
libs/langchain/langchain/chat_loaders/telegram.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List, Union
|
||||
|
||||
from langchain import schema
|
||||
from langchain.chat_loaders import base as chat_loaders
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TelegramChatLoader(chat_loaders.BaseChatLoader):
    """A loading utility for converting telegram conversations
    to LangChain chat messages.

    To export, use the Telegram Desktop app from
    https://desktop.telegram.org/, select a conversation, click the three dots
    in the top right corner, and select "Export chat history". Then select
    "Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
    the desktop app (like "Telegram for MacOS") do not support exporting chat
    history.
    """

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """Initialize the TelegramChatLoader.

        Args:
            path (Union[str, Path]): Path to the exported Telegram chat zip,
                 directory, json, or HTML file.
        """
        super().__init__()
        self.path = path if isinstance(path, str) else str(path)

    @staticmethod
    def _flatten_text(text: object) -> str:
        """Collapse a message ``text`` value into a plain string.

        Strings are returned unchanged. For a list, string items and the
        ``"text"`` value of dictionary items are concatenated in order.
        ``None`` becomes an empty string.
        """
        if isinstance(text, str):
            return text
        if isinstance(text, list):
            parts: List[str] = []
            for part in text:
                if isinstance(part, str):
                    parts.append(part)
                elif isinstance(part, dict):
                    parts.append(str(part.get("text", "")))
            return "".join(parts)
        return "" if text is None else str(text)

    def _load_single_chat_session_html(
        self, file_path: str
    ) -> chat_loaders.ChatSession:
        """Load a single chat session from an HTML file.

        Messages without a ``.from_name`` element inherit the sender of the
        preceding message. Messages without a ``.text`` element are skipped.

        Args:
            file_path (str): Path to the HTML file.

        Returns:
            chat_loaders.ChatSession: The loaded chat session.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise ImportError(
                "Please install the 'beautifulsoup4' package to load"
                " Telegram HTML files. You can do this by running"
                " 'pip install beautifulsoup4' in your terminal."
            )
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        results: List[Union[schema.HumanMessage, schema.AIMessage]] = []
        previous_sender = None
        for message in soup.select(".message.default"):
            date_element = message.select_one(".pull_right.date.details")
            # Fall back to an empty timestamp instead of raising when the
            # date element or its title attribute is absent.
            timestamp = date_element.get("title", "") if date_element else ""
            from_name_element = message.select_one(".from_name")
            if from_name_element is None and previous_sender is None:
                logger.debug("from_name not found in message")
                continue
            elif from_name_element is None:
                from_name = previous_sender
            else:
                from_name = from_name_element.text.strip()
            # Remember the sender even if this message is skipped below, so
            # follow-up messages without a from_name are attributed correctly.
            previous_sender = from_name
            text_element = message.select_one(".text")
            if text_element is None:
                logger.debug("text not found in message")
                continue
            text = text_element.text.strip()
            results.append(
                schema.HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return chat_loaders.ChatSession(messages=results)

    def _load_single_chat_session_json(
        self, file_path: str
    ) -> chat_loaders.ChatSession:
        """Load a single chat session from a JSON file.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            chat_loaders.ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        messages = data.get("messages", [])
        results: List[schema.BaseMessage] = []
        for message in messages:
            # NOTE(review): assumes exports may store "text" as a list of
            # strings/entity dicts for formatted messages - confirm against
            # the Telegram export format.
            text = self._flatten_text(message.get("text", ""))
            timestamp = message.get("date", "")
            from_name = message.get("from", "")

            results.append(
                schema.HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return chat_loaders.ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over files in a directory or zip file.

        Zip members are extracted into a temporary directory that is removed
        when this generator finishes or is closed, so each yielded path must
        be consumed before requesting the next one.

        Args:
            path (str): Path to the directory or zip file.

        Yields:
            str: Path to each file.
        """
        import tempfile

        if os.path.isfile(path) and path.endswith((".html", ".json")):
            yield path
        elif os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith((".html", ".json")):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            # Extract to a throwaway directory rather than the current
            # working directory, which would otherwise be left littered
            # with extracted files.
            with tempfile.TemporaryDirectory() as extract_dir:
                with zipfile.ZipFile(path) as zip_file:
                    for file in zip_file.namelist():
                        if file.endswith((".html", ".json")):
                            yield zip_file.extract(file, path=extract_dir)

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """Lazy load the messages from the chat file and yield them
        in as chat sessions.

        Yields:
            chat_loaders.ChatSession: The loaded chat session.
        """
        for file_path in self._iterate_files(self.path):
            if file_path.endswith(".html"):
                yield self._load_single_chat_session_html(file_path)
            elif file_path.endswith(".json"):
                yield self._load_single_chat_session_json(file_path)
|
86
libs/langchain/langchain/chat_loaders/utils.py
Normal file
86
libs/langchain/langchain/chat_loaders/utils.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Utilities for chat loaders."""
|
||||
from copy import deepcopy
|
||||
from typing import Iterable, Iterator, List
|
||||
|
||||
from langchain import schema
|
||||
from langchain.chat_loaders.base import ChatSession
|
||||
from langchain.schema.messages import BaseMessage
|
||||
|
||||
|
||||
def merge_chat_runs_in_session(
    chat_session: ChatSession, delimiter: str = "\n\n"
) -> ChatSession:
    """Merge chat runs together in a chat session.

    A chat run is a sequence of consecutive messages of the same message
    type whose ``additional_kwargs["sender"]`` values are equal and not None.

    Args:
        chat_session: A chat session.
        delimiter: Text inserted between the contents of merged messages.

    Returns:
        A chat session with merged chat runs. The input messages are not
        modified; merged messages are deep copies.
    """
    messages: List[BaseMessage] = []
    for message in chat_session["messages"]:
        previous = messages[-1] if messages else None
        same_run = (
            previous is not None
            and isinstance(message, type(previous))
            and previous.additional_kwargs.get("sender") is not None
            and previous.additional_kwargs["sender"]
            == message.additional_kwargs.get("sender")
        )
        if not same_run:
            messages.append(deepcopy(message))
            continue

        previous.content = (previous.content + delimiter + message.content).strip()
        incoming_events = message.additional_kwargs.get("events")
        if incoming_events:
            merged_events = previous.additional_kwargs.get("events")
            if merged_events is None:
                # Attach a real list to the message; extending a throwaway
                # default would silently drop the incoming events.
                merged_events = []
                previous.additional_kwargs["events"] = merged_events
            # Copy so the merged message does not share event dicts with
            # the input session.
            merged_events.extend(deepcopy(incoming_events))
    return ChatSession(messages=messages)
|
||||
|
||||
|
||||
def merge_chat_runs(chat_sessions: Iterable[ChatSession]) -> Iterator[ChatSession]:
    """Lazily merge the chat runs of every session.

    A chat run is a sequence of messages from the same sender.

    Args:
        chat_sessions: An iterable of chat sessions.

    Yields:
        One chat session with merged chat runs per input session, in order.
    """
    yield from (merge_chat_runs_in_session(session) for session in chat_sessions)
|
||||
|
||||
|
||||
def map_ai_messages_in_session(chat_sessions: ChatSession, sender: str) -> ChatSession:
    """Convert messages from the specified 'sender' to AI messages.

    This is useful for fine-tuning the AI to adapt to your voice.

    Args:
        chat_sessions: A single chat session (the plural name is kept for
            backward compatibility with keyword callers).
        sender: Value of ``additional_kwargs["sender"]`` whose messages are
            converted.

    Returns:
        A new chat session in which matching messages are replaced by
        ``AIMessage`` objects; all other messages are passed through as-is.
    """
    messages = []
    for message in chat_sessions["messages"]:
        if message.additional_kwargs.get("sender") == sender:
            message = schema.AIMessage(
                content=message.content,
                additional_kwargs=message.additional_kwargs.copy(),
                # Fall back to False rather than None for message types that
                # have no ``example`` attribute.
                # NOTE(review): assumes AIMessage declares ``example`` as a
                # plain bool - confirm.
                example=getattr(message, "example", False),
            )
        messages.append(message)
    return ChatSession(messages=messages)
|
||||
|
||||
|
||||
def map_ai_messages(
    chat_sessions: Iterable[ChatSession], sender: str
) -> Iterator[ChatSession]:
    """Lazily convert the given sender's messages to AI messages in each session.

    This is useful for fine-tuning the AI to adapt to your voice.
    """
    yield from (
        map_ai_messages_in_session(session, sender) for session in chat_sessions
    )
|
116
libs/langchain/langchain/chat_loaders/whatsapp.py
Normal file
116
libs/langchain/langchain/chat_loaders/whatsapp.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from typing import Iterator, List, Union
|
||||
|
||||
from langchain import schema
|
||||
from langchain.chat_loaders import base as chat_loaders
|
||||
from langchain.schema import messages
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WhatsAppChatLoader(chat_loaders.BaseChatLoader):
    """Load exported WhatsApp chat text files as LangChain chat sessions."""

    def __init__(self, path: str):
        """Initialize the WhatsAppChatLoader.

        Args:
            path (str): Path to the exported WhatsApp chat
                zip directory, folder, or file.

        To generate the dump, open the chat, click the three dots in the top
        right corner, and select "More". Then select "Export chat" and
        choose "Without media".
        """
        super().__init__()
        self.path = path
        ignore_lines = [
            "This message was deleted",
            "<Media omitted>",
            "image omitted",
            "Messages and calls are end-to-end encrypted. No one outside of this chat,"
            " not even WhatsApp, can read or listen to them.",
        ]
        # Escape the phrases so characters such as "." are matched literally
        # instead of acting as regex metacharacters.
        self._ignore_lines = re.compile(
            r"("
            + "|".join([r"\u200E*" + re.escape(line) for line in ignore_lines])
            + r")",
            flags=re.IGNORECASE,
        )
        self._message_line_regex = re.compile(
            r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)",  # noqa
            flags=re.IGNORECASE,
        )

    def _load_single_chat_session(self, file_path: str) -> chat_loaders.ChatSession:
        """Load a single chat session from a file.

        Lines that do not start a new message are appended (space-separated)
        to the message before them. Messages whose text starts with one of
        the ignored phrases are dropped.

        Args:
            file_path (str): Path to the chat file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            txt = file.read()

        # Split messages by newlines, but keep multi-line messages grouped
        chat_lines: List[str] = []
        current_message = ""
        for line in txt.split("\n"):
            if self._message_line_regex.match(line):
                if current_message:
                    chat_lines.append(current_message)
                current_message = line
            else:
                current_message += " " + line.strip()
        if current_message:
            chat_lines.append(current_message)

        results: List[Union[messages.HumanMessage, messages.AIMessage]] = []
        for line in chat_lines:
            result = self._message_line_regex.match(line.strip())
            if result:
                timestamp, sender, text = result.groups()
                if not self._ignore_lines.match(text.strip()):
                    results.append(
                        schema.HumanMessage(
                            role=sender,
                            content=text,
                            additional_kwargs={
                                "sender": sender,
                                "events": [{"message_time": timestamp}],
                            },
                        )
                    )
            else:
                logger.debug(f"Could not parse line: {line}")
        return chat_loaders.ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over the files in a directory or zip file.

        Zip members are extracted into a temporary directory that is removed
        when this generator finishes or is closed, so each yielded path must
        be consumed before requesting the next one.

        Args:
            path (str): Path to the directory, zip file, or single chat file.

        Yields:
            str: The path to each file.
        """
        import tempfile

        if os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith(".txt"):
                        yield os.path.join(root, file)
        # A zip archive is also a regular file, so it must be tested before
        # the plain-file case or the archive is never unpacked.
        elif zipfile.is_zipfile(path):
            with tempfile.TemporaryDirectory() as extract_dir:
                with zipfile.ZipFile(path) as zip_file:
                    for file in zip_file.namelist():
                        if file.endswith(".txt"):
                            yield zip_file.extract(file, path=extract_dir)
        elif os.path.isfile(path):
            yield path

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """Lazy load the messages from the chat file and yield
        them as chat sessions.

        Yields:
            ChatSession: One chat session per chat file found at ``path``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        if not os.path.exists(self.path):
            # Keep failing loudly for a missing path instead of silently
            # yielding nothing.
            raise FileNotFoundError(f"File {self.path} not found")
        for file_path in self._iterate_files(self.path):
            yield self._load_single_chat_session(file_path)
|
@@ -0,0 +1,166 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8"/>
|
||||
<title>Exported Data</title>
|
||||
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
|
||||
|
||||
<link href="css/style.css" rel="stylesheet"/>
|
||||
|
||||
<script src="js/script.js" type="text/javascript">
|
||||
|
||||
</script>
|
||||
|
||||
</head>
|
||||
|
||||
<body onload="CheckLocation();">
|
||||
|
||||
<div class="page_wrap">
|
||||
|
||||
<div class="page_header">
|
||||
|
||||
<div class="content">
|
||||
|
||||
<div class="text bold">
|
||||
Jimmeny Marvelton
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="page_body chat_page">
|
||||
|
||||
<div class="history">
|
||||
|
||||
<div class="message service" id="message-1">
|
||||
|
||||
<div class="body details">
|
||||
23 August 2023
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="message default clearfix" id="message1">
|
||||
|
||||
<div class="pull_left userpic_wrap">
|
||||
|
||||
<div class="userpic userpic2" style="width: 42px; height: 42px">
|
||||
|
||||
<div class="initials" style="line-height: 42px">
|
||||
bA
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="body">
|
||||
|
||||
<div class="pull_right date details" title="23.08.2023 13:11:23 UTC-08:00">
|
||||
13:11
|
||||
</div>
|
||||
|
||||
<div class="from_name">
|
||||
Jimmeny Marvelton
|
||||
</div>
|
||||
|
||||
<div class="text">
|
||||
i refuse to converse with you
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="message default clearfix" id="message2">
|
||||
|
||||
<div class="pull_left userpic_wrap">
|
||||
|
||||
<div class="userpic userpic1" style="width: 42px; height: 42px">
|
||||
|
||||
<div class="initials" style="line-height: 42px">
|
||||
WF
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="body">
|
||||
|
||||
<div class="pull_right date details" title="23.08.2023 13:13:20 UTC-08:00">
|
||||
13:13
|
||||
</div>
|
||||
|
||||
<div class="from_name">
|
||||
Batman & Robin
|
||||
</div>
|
||||
|
||||
<div class="text">
|
||||
Hi nemesis
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="message default clearfix" id="message3">
|
||||
|
||||
<div class="pull_left userpic_wrap">
|
||||
|
||||
<div class="userpic userpic2" style="width: 42px; height: 42px">
|
||||
|
||||
<div class="initials" style="line-height: 42px">
|
||||
bA
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="body">
|
||||
|
||||
<div class="pull_right date details" title="23.08.2023 13:15:35 UTC-08:00">
|
||||
13:15
|
||||
</div>
|
||||
|
||||
<div class="from_name">
|
||||
Jimmeny Marvelton
|
||||
</div>
|
||||
|
||||
<div class="text">
|
||||
we meet again
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="message default clearfix joined" id="message4">
|
||||
|
||||
<div class="body">
|
||||
|
||||
<div class="pull_right date details" title="23.08.2023 13:15:53 UTC-08:00">
|
||||
13:15
|
||||
</div>
|
||||
|
||||
<div class="text">
|
||||
you will not trick me this time
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@@ -0,0 +1,67 @@
|
||||
{
|
||||
"name": "Jimmeny",
|
||||
"type": "personal_chat",
|
||||
"id": 5965280513,
|
||||
"messages": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "message",
|
||||
"date": "2023-08-23T13:11:23",
|
||||
"date_unixtime": "1692821483",
|
||||
"from": "Jimmeny Marvelton",
|
||||
"from_id": "user123450513",
|
||||
"text": "i refuse to converse with you",
|
||||
"text_entities": [
|
||||
{
|
||||
"type": "plain",
|
||||
"text": "i refuse to converse with you"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "message",
|
||||
"date": "2023-08-23T13:13:20",
|
||||
"date_unixtime": "1692821600",
|
||||
"from": "Batman & Robin",
|
||||
"from_id": "user6565661032",
|
||||
"text": "Hi nemesis",
|
||||
"text_entities": [
|
||||
{
|
||||
"type": "plain",
|
||||
"text": "Hi nemesis"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "message",
|
||||
"date": "2023-08-23T13:15:35",
|
||||
"date_unixtime": "1692821735",
|
||||
"from": "Jimmeny Marvelton",
|
||||
"from_id": "user123450513",
|
||||
"text": "we meet again",
|
||||
"text_entities": [
|
||||
{
|
||||
"type": "plain",
|
||||
"text": "we meet again"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "message",
|
||||
"date": "2023-08-23T13:15:53",
|
||||
"date_unixtime": "1692821753",
|
||||
"from": "Jimmeny Marvelton",
|
||||
"from_id": "user123450513",
|
||||
"text": "you will not trick me this time",
|
||||
"text_entities": [
|
||||
{
|
||||
"type": "plain",
|
||||
"text": "you will not trick me this time"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -0,0 +1,12 @@
|
||||
[8/15/23, 9:12:33 AM] Dr. Feather: Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.
|
||||
[8/15/23, 9:12:43 AM] Dr. Feather: I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!
|
||||
[8/15/23, 9:12:48 AM] Dr. Feather: image omitted
|
||||
[8/15/23, 9:13:15 AM] Jungle Jane: That's stunning! Were you able to observe its behavior?
|
||||
[8/15/23, 9:13:23 AM] Dr. Feather: image omitted
|
||||
[8/15/23, 9:14:02 AM] Dr. Feather: Yes, it seemed quite social with other macaws. They're known for their playful nature.
|
||||
[8/15/23, 9:14:15 AM] Jungle Jane: How's the research going on parrot communication?
|
||||
[8/15/23, 9:14:30 AM] Dr. Feather: image omitted
|
||||
[8/15/23, 9:14:50 AM] Dr. Feather: It's progressing well. We're learning so much about how they use sound and color to communicate.
|
||||
[8/15/23, 9:15:10 AM] Jungle Jane: That's fascinating! Can't wait to read your paper on it.
|
||||
[8/15/23, 9:15:20 AM] Dr. Feather: Thank you! I'll send you a draft soon.
|
||||
[8/15/23, 9:25:16 PM] Jungle Jane: Looking forward to it! Keep up the great work.
|
24
libs/langchain/tests/unit_tests/chat_loaders/test_slack.py
Normal file
24
libs/langchain/tests/unit_tests/chat_loaders/test_slack.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import pathlib
|
||||
|
||||
from langchain.chat_loaders import slack, utils
|
||||
|
||||
|
||||
def test_slack_chat_loader() -> None:
    """Load the example Slack export and check the second session's content."""
    examples_dir = pathlib.Path(__file__).parents[2] / "integration_tests" / "examples"
    chat_path = examples_dir / "slack_export.zip"
    loader = slack.SlackChatLoader(str(chat_path))

    mapped_sessions = utils.map_ai_messages(loader.lazy_load(), sender="U0500003428")
    chat_sessions = list(mapped_sessions)
    assert chat_sessions, "Chat sessions should not be empty"

    second_session_messages = chat_sessions[1]["messages"]
    assert second_session_messages, "Chat messages should not be empty"

    first_content = second_session_messages[0].content
    assert "Example message" in first_content, "Chat content mismatch"
|
@@ -0,0 +1,97 @@
|
||||
"""Test the telegram chat loader."""
|
||||
import pathlib
|
||||
import tempfile
|
||||
import zipfile
|
||||
from typing import Sequence
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain import schema
|
||||
from langchain.chat_loaders import telegram, utils
|
||||
|
||||
|
||||
def _assert_messages_are_equal(
    actual_messages: Sequence[schema.BaseMessage],
    expected_messages: Sequence[schema.BaseMessage],
) -> None:
    """Assert both sequences have equal length and that each pair of messages
    agrees on content and on the "sender" entry of ``additional_kwargs``."""
    assert len(actual_messages) == len(expected_messages)
    for position, expected in enumerate(expected_messages):
        actual = actual_messages[position]
        assert actual.content == expected.content
        actual_sender = actual.additional_kwargs["sender"]
        expected_sender = expected.additional_kwargs["sender"]
        assert actual_sender == expected_sender
|
||||
|
||||
|
||||
def _check_telegram_chat_loader(path: str) -> None:
    """Load the fixture at ``path`` (relative to the test ``data`` directory)
    and compare the merged, AI-mapped session with the expected messages.

    When ``path`` ends in ".zip", the archive is built on the fly in a
    temporary directory from the fixture directory of the same name.
    """
    data_root = pathlib.Path(__file__).parent / "data"
    source_path = data_root / path
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = pathlib.Path(scratch)
        if path.endswith(".zip"):
            # Build the archive from the matching fixture directory.
            archive_path = scratch_dir / "telegram_chat.zip"
            with zipfile.ZipFile(archive_path, "w") as archive:
                fixture_dir = data_root / path.replace(".zip", "")
                for member in fixture_dir.iterdir():
                    archive.write(member, arcname=member.name)
            source_path = archive_path
        loader = telegram.TelegramChatLoader(str(source_path))
        merged_sessions = utils.merge_chat_runs(loader.lazy_load())
        # The pipeline is lazy, so consume it while the temp directory exists.
        chat_sessions = list(
            utils.map_ai_messages(merged_sessions, sender="Batman & Robin")
        )
        assert len(chat_sessions) == 1
        session = chat_sessions[0]
        assert len(session["messages"]) > 0
        assert session["messages"][0].content == "i refuse to converse with you"
        expected_content = [
            schema.HumanMessage(
                content="i refuse to converse with you",
                additional_kwargs={
                    "sender": "Jimmeny Marvelton",
                    "events": [{"message_time": "23.08.2023 13:11:23 UTC-08:00"}],
                },
            ),
            schema.AIMessage(
                content="Hi nemesis",
                additional_kwargs={
                    "sender": "Batman & Robin",
                    "events": [{"message_time": "23.08.2023 13:13:20 UTC-08:00"}],
                },
            ),
            schema.HumanMessage(
                content="we meet again\n\nyou will not trick me this time",
                additional_kwargs={
                    "sender": "Jimmeny Marvelton",
                    "events": [{"message_time": "23.08.2023 13:15:35 UTC-08:00"}],
                },
            ),
        ]
        _assert_messages_are_equal(session["messages"], expected_content)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "path",
    [
        "telegram_chat_json",
        "telegram_chat_json.zip",
        "telegram_chat_json/result.json",
    ],
)
def test_telegram_chat_loader(path: str) -> None:
    """Run the shared loader check for a directory, a zip, and a single file."""
    _check_telegram_chat_loader(path)
|
||||
|
||||
|
||||
# NOTE(review): the parameters below are the JSON fixtures, identical to those
# of test_telegram_chat_loader - confirm whether HTML fixtures were intended.
@pytest.mark.skip(reason="requires bs4 but marking it as such doesn't seem to work")
@pytest.mark.parametrize(
    "path",
    [
        "telegram_chat_json",
        "telegram_chat_json.zip",
        "telegram_chat_json/result.json",
    ],
)
def test_telegram_chat_loader_html(path: str) -> None:
    """Run the shared loader check; currently always skipped."""
    _check_telegram_chat_loader(path)
|
@@ -0,0 +1,20 @@
|
||||
import pathlib
|
||||
|
||||
from langchain.chat_loaders import utils, whatsapp
|
||||
|
||||
|
||||
def test_whatsapp_chat_loader() -> None:
    """Load the WhatsApp fixture and check the first retained message."""
    data_dir = pathlib.Path(__file__).parent / "data"
    chat_path = data_dir / "whatsapp_chat.txt"
    loader = whatsapp.WhatsAppChatLoader(str(chat_path))

    mapped_sessions = utils.map_ai_messages(loader.lazy_load(), sender="Dr. Feather")
    chat_sessions = list(mapped_sessions)
    assert chat_sessions, "Chat sessions should not be empty"

    first_session_messages = chat_sessions[0]["messages"]
    assert first_session_messages, "Chat messages should not be empty"

    expected_fragment = (
        "I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest."
        " Such a magnificent creature!"
    )
    assert (
        expected_fragment in first_session_messages[0].content
    ), "Chat content mismatch"
|
Reference in New Issue
Block a user