mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-30 11:39:03 +00:00
Harrison/telegram chat loader (#4698)
Co-authored-by: Akinwande Komolafe <47945512+Sensei-akin@users.noreply.github.com> Co-authored-by: Akinwande Komolafe <akhinoz@gmail.com>
This commit is contained in:
parent
2b181e5a6c
commit
12b4ee1fc7
@ -19,7 +19,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain.document_loaders import TelegramChatLoader"
|
"from langchain.document_loaders import TelegramChatFileLoader, TelegramChatApiLoader"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -29,7 +29,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"loader = TelegramChatLoader(\"example_data/telegram.json\")"
|
"loader = TelegramChatFileLoader(\"example_data/telegram.json\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -41,7 +41,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"[Document(page_content=\"Henry on 2020-01-01T00:00:02: It's 2020...\\n\\nHenry on 2020-01-01T00:00:04: Fireworks!\\n\\nGrace 🧤 ðŸ\\x8d’ on 2020-01-01T00:00:05: You're a minute late!\\n\\n\", lookup_str='', metadata={'source': 'example_data/telegram.json'}, lookup_index=0)]"
|
"[Document(page_content=\"Henry on 2020-01-01T00:00:02: It's 2020...\\n\\nHenry on 2020-01-01T00:00:04: Fireworks!\\n\\nGrace 🧤 ðŸ\\x8d’ on 2020-01-01T00:00:05: You're a minute late!\\n\\n\", metadata={'source': 'example_data/telegram.json'})]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 3,
|
"execution_count": 3,
|
||||||
@ -53,10 +53,45 @@
|
|||||||
"loader.load()"
|
"loader.load()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3e64cac2",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`TelegramChatApiLoader` loads data directly from any specified channel from Telegram. In order to export the data, you will need to authenticate your Telegram account. \n",
|
||||||
|
"\n",
|
||||||
|
"You can get the API_HASH and API_ID from https://my.telegram.org/auth?to=apps\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "3e64cac2",
|
"id": "f05f75f3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = TelegramChatApiLoader(user_name =\"\"\\\n",
|
||||||
|
" chat_url=\"<CHAT_URL>\",\\\n",
|
||||||
|
" api_hash=\"<API HASH>\",\\\n",
|
||||||
|
" api_id=\"<API_ID>\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "40039f7b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "18e5af2b",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
@ -78,7 +113,10 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.6"
|
|
||||||
|
"version": "3.9.13"
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -79,7 +79,10 @@ from langchain.document_loaders.slack_directory import SlackDirectoryLoader
|
|||||||
from langchain.document_loaders.spreedly import SpreedlyLoader
|
from langchain.document_loaders.spreedly import SpreedlyLoader
|
||||||
from langchain.document_loaders.srt import SRTLoader
|
from langchain.document_loaders.srt import SRTLoader
|
||||||
from langchain.document_loaders.stripe import StripeLoader
|
from langchain.document_loaders.stripe import StripeLoader
|
||||||
from langchain.document_loaders.telegram import TelegramChatLoader
|
from langchain.document_loaders.telegram import (
|
||||||
|
TelegramChatApiLoader,
|
||||||
|
TelegramChatFileLoader,
|
||||||
|
)
|
||||||
from langchain.document_loaders.text import TextLoader
|
from langchain.document_loaders.text import TextLoader
|
||||||
from langchain.document_loaders.toml import TomlLoader
|
from langchain.document_loaders.toml import TomlLoader
|
||||||
from langchain.document_loaders.twitter import TwitterTweetLoader
|
from langchain.document_loaders.twitter import TwitterTweetLoader
|
||||||
@ -108,6 +111,9 @@ from langchain.document_loaders.youtube import (
|
|||||||
# Legacy: only for backwards compat. Use PyPDFLoader instead
|
# Legacy: only for backwards compat. Use PyPDFLoader instead
|
||||||
PagedPDFSplitter = PyPDFLoader
|
PagedPDFSplitter = PyPDFLoader
|
||||||
|
|
||||||
|
# For backwards compatability
|
||||||
|
TelegramChatLoader = TelegramChatFileLoader
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AZLyricsLoader",
|
"AZLyricsLoader",
|
||||||
"AirbyteJSONLoader",
|
"AirbyteJSONLoader",
|
||||||
@ -176,9 +182,10 @@ __all__ = [
|
|||||||
"SeleniumURLLoader",
|
"SeleniumURLLoader",
|
||||||
"SitemapLoader",
|
"SitemapLoader",
|
||||||
"SlackDirectoryLoader",
|
"SlackDirectoryLoader",
|
||||||
|
"TelegramChatFileLoader",
|
||||||
|
"TelegramChatApiLoader",
|
||||||
"SpreedlyLoader",
|
"SpreedlyLoader",
|
||||||
"StripeLoader",
|
"StripeLoader",
|
||||||
"TelegramChatLoader",
|
|
||||||
"TextLoader",
|
"TextLoader",
|
||||||
"TomlLoader",
|
"TomlLoader",
|
||||||
"TwitterTweetLoader",
|
"TwitterTweetLoader",
|
||||||
@ -201,4 +208,5 @@ __all__ = [
|
|||||||
"WhatsAppChatLoader",
|
"WhatsAppChatLoader",
|
||||||
"WikipediaLoader",
|
"WikipediaLoader",
|
||||||
"YoutubeLoader",
|
"YoutubeLoader",
|
||||||
|
"TelegramChatLoader",
|
||||||
]
|
]
|
||||||
|
@ -1,10 +1,17 @@
|
|||||||
"""Loader that loads Telegram chat json dump."""
|
"""Loader that loads Telegram chat json dump."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
def concatenate_rows(row: dict) -> str:
|
def concatenate_rows(row: dict) -> str:
|
||||||
@ -15,7 +22,7 @@ def concatenate_rows(row: dict) -> str:
|
|||||||
return f"{sender} on {date}: {text}\n\n"
|
return f"{sender} on {date}: {text}\n\n"
|
||||||
|
|
||||||
|
|
||||||
class TelegramChatLoader(BaseLoader):
|
class TelegramChatFileLoader(BaseLoader):
|
||||||
"""Loader that loads Telegram chat json directory dump."""
|
"""Loader that loads Telegram chat json directory dump."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
@ -37,3 +44,201 @@ class TelegramChatLoader(BaseLoader):
|
|||||||
metadata = {"source": str(p)}
|
metadata = {"source": str(p)}
|
||||||
|
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
return [Document(page_content=text, metadata=metadata)]
|
||||||
|
|
||||||
|
|
||||||
|
def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
|
||||||
|
"""Converts a string or list of strings to a list of Documents with metadata."""
|
||||||
|
if isinstance(text, str):
|
||||||
|
# Take a single string as one page
|
||||||
|
text = [text]
|
||||||
|
page_docs = [Document(page_content=page) for page in text]
|
||||||
|
|
||||||
|
# Add page numbers as metadata
|
||||||
|
for i, doc in enumerate(page_docs):
|
||||||
|
doc.metadata["page"] = i + 1
|
||||||
|
|
||||||
|
# Split pages into chunks
|
||||||
|
doc_chunks = []
|
||||||
|
|
||||||
|
for doc in page_docs:
|
||||||
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
|
chunk_size=800,
|
||||||
|
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
|
||||||
|
chunk_overlap=20,
|
||||||
|
)
|
||||||
|
chunks = text_splitter.split_text(doc.page_content)
|
||||||
|
for i, chunk in enumerate(chunks):
|
||||||
|
doc = Document(
|
||||||
|
page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
|
||||||
|
)
|
||||||
|
# Add sources a metadata
|
||||||
|
doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
|
||||||
|
doc_chunks.append(doc)
|
||||||
|
return doc_chunks
|
||||||
|
|
||||||
|
|
||||||
|
class TelegramChatApiLoader(BaseLoader):
|
||||||
|
"""Loader that loads Telegram chat json directory dump."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
chat_url: Optional[str] = None,
|
||||||
|
api_id: Optional[int] = None,
|
||||||
|
api_hash: Optional[str] = None,
|
||||||
|
username: Optional[str] = None,
|
||||||
|
):
|
||||||
|
"""Initialize with API parameters."""
|
||||||
|
self.chat_url = chat_url
|
||||||
|
self.api_id = api_id
|
||||||
|
self.api_hash = api_hash
|
||||||
|
self.username = username
|
||||||
|
|
||||||
|
async def fetch_data_from_telegram(self) -> None:
|
||||||
|
"""Fetch data from Telegram API and save it as a JSON file."""
|
||||||
|
from telethon.sync import TelegramClient
|
||||||
|
|
||||||
|
data = []
|
||||||
|
async with TelegramClient(self.username, self.api_id, self.api_hash) as client:
|
||||||
|
async for message in client.iter_messages(self.chat_url):
|
||||||
|
is_reply = message.reply_to is not None
|
||||||
|
reply_to_id = message.reply_to.reply_to_msg_id if is_reply else None
|
||||||
|
data.append(
|
||||||
|
{
|
||||||
|
"sender_id": message.sender_id,
|
||||||
|
"text": message.text,
|
||||||
|
"date": message.date.isoformat(),
|
||||||
|
"message.id": message.id,
|
||||||
|
"is_reply": is_reply,
|
||||||
|
"reply_to_id": reply_to_id,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
with open("telegram_data.json", "w", encoding="utf-8") as f:
|
||||||
|
json.dump(data, f, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
|
self.file_path = "telegram_data.json"
|
||||||
|
|
||||||
|
def _get_message_threads(self, data: pd.DataFrame) -> dict:
|
||||||
|
"""Create a dictionary of message threads from the given data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (pd.DataFrame): A DataFrame containing the conversation \
|
||||||
|
data with columns:
|
||||||
|
- message.sender_id
|
||||||
|
- text
|
||||||
|
- date
|
||||||
|
- message.id
|
||||||
|
- is_reply
|
||||||
|
- reply_to_id
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: A dictionary where the key is the parent message ID and \
|
||||||
|
the value is a list of message IDs in ascending order.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def find_replies(parent_id: int, reply_data: pd.DataFrame) -> List[int]:
|
||||||
|
"""
|
||||||
|
Recursively find all replies to a given parent message ID.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
parent_id (int): The parent message ID.
|
||||||
|
reply_data (pd.DataFrame): A DataFrame containing reply messages.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: A list of message IDs that are replies to the parent message ID.
|
||||||
|
"""
|
||||||
|
# Find direct replies to the parent message ID
|
||||||
|
direct_replies = reply_data[reply_data["reply_to_id"] == parent_id][
|
||||||
|
"message.id"
|
||||||
|
].tolist()
|
||||||
|
|
||||||
|
# Recursively find replies to the direct replies
|
||||||
|
all_replies = []
|
||||||
|
for reply_id in direct_replies:
|
||||||
|
all_replies += [reply_id] + find_replies(reply_id, reply_data)
|
||||||
|
|
||||||
|
return all_replies
|
||||||
|
|
||||||
|
# Filter out parent messages
|
||||||
|
parent_messages = data[data["is_reply"] is False]
|
||||||
|
|
||||||
|
# Filter out reply messages and drop rows with NaN in 'reply_to_id'
|
||||||
|
reply_messages = data[data["is_reply"] is True].dropna(subset=["reply_to_id"])
|
||||||
|
|
||||||
|
# Convert 'reply_to_id' to integer
|
||||||
|
reply_messages["reply_to_id"] = reply_messages["reply_to_id"].astype(int)
|
||||||
|
|
||||||
|
# Create a dictionary of message threads with parent message IDs as keys and \
|
||||||
|
# lists of reply message IDs as values
|
||||||
|
message_threads = {
|
||||||
|
parent_id: [parent_id] + find_replies(parent_id, reply_messages)
|
||||||
|
for parent_id in parent_messages["message.id"]
|
||||||
|
}
|
||||||
|
|
||||||
|
return message_threads
|
||||||
|
|
||||||
|
def _combine_message_texts(
|
||||||
|
self, message_threads: Dict[int, List[int]], data: pd.DataFrame
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Combine the message texts for each parent message ID based \
|
||||||
|
on the list of message threads.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message_threads (dict): A dictionary where the key is the parent message \
|
||||||
|
ID and the value is a list of message IDs in ascending order.
|
||||||
|
data (pd.DataFrame): A DataFrame containing the conversation data:
|
||||||
|
- message.sender_id
|
||||||
|
- text
|
||||||
|
- date
|
||||||
|
- message.id
|
||||||
|
- is_reply
|
||||||
|
- reply_to_id
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: A combined string of message texts sorted by date.
|
||||||
|
"""
|
||||||
|
combined_text = ""
|
||||||
|
|
||||||
|
# Iterate through sorted parent message IDs
|
||||||
|
for parent_id, message_ids in message_threads.items():
|
||||||
|
# Get the message texts for the message IDs and sort them by date
|
||||||
|
message_texts = (
|
||||||
|
data[data["message.id"].isin(message_ids)]
|
||||||
|
.sort_values(by="date")["text"]
|
||||||
|
.tolist()
|
||||||
|
)
|
||||||
|
message_texts = [str(elem) for elem in message_texts]
|
||||||
|
|
||||||
|
# Combine the message texts
|
||||||
|
combined_text += " ".join(message_texts) + ".\n"
|
||||||
|
|
||||||
|
return combined_text.strip()
|
||||||
|
|
||||||
|
def load(self) -> List[Document]:
|
||||||
|
"""Load documents."""
|
||||||
|
if self.chat_url is not None:
|
||||||
|
try:
|
||||||
|
import nest_asyncio
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
nest_asyncio.apply()
|
||||||
|
asyncio.run(self.fetch_data_from_telegram())
|
||||||
|
except ImportError:
|
||||||
|
raise ValueError(
|
||||||
|
"please install with `pip install nest_asyncio`,\
|
||||||
|
`pip install nest_asyncio` "
|
||||||
|
)
|
||||||
|
|
||||||
|
p = Path(self.file_path)
|
||||||
|
|
||||||
|
with open(p, encoding="utf8") as f:
|
||||||
|
d = json.load(f)
|
||||||
|
|
||||||
|
normalized_messages = pd.json_normalize(d)
|
||||||
|
df = pd.DataFrame(normalized_messages)
|
||||||
|
|
||||||
|
message_threads = self._get_message_threads(df)
|
||||||
|
combined_texts = self._combine_message_texts(message_threads, df)
|
||||||
|
|
||||||
|
return text_to_docs(combined_texts)
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from langchain.document_loaders import TelegramChatLoader
|
from langchain.document_loaders import TelegramChatFileLoader
|
||||||
|
|
||||||
|
|
||||||
def test_telegram_chat_loader() -> None:
|
def test_telegram_chat_file_loader() -> None:
|
||||||
"""Test TelegramChatLoader."""
|
"""Test TelegramChatLoader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/telegram.json"
|
file_path = Path(__file__).parent.parent / "examples/telegram.json"
|
||||||
loader = TelegramChatLoader(str(file_path))
|
loader = TelegramChatFileLoader(str(file_path))
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
Loading…
Reference in New Issue
Block a user