mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 13:54:48 +00:00
Harrison/telegram loader (#1080)
Co-authored-by: Maxime Vidal <max.vidal@hotmail.fr>
This commit is contained in:
parent
a1c296bc3c
commit
c60954d0f8
@ -93,7 +93,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.9"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -0,0 +1,31 @@
|
|||||||
|
{
|
||||||
|
"name": "Grace 🧤",
|
||||||
|
"type": "personal_chat",
|
||||||
|
"id": 2730825451,
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"id": 1980499,
|
||||||
|
"type": "message",
|
||||||
|
"date": "2020-01-01T00:00:02",
|
||||||
|
"from": "Henry",
|
||||||
|
"from_id": 4325636679,
|
||||||
|
"text": "It's 2020..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1980500,
|
||||||
|
"type": "message",
|
||||||
|
"date": "2020-01-01T00:00:04",
|
||||||
|
"from": "Henry",
|
||||||
|
"from_id": 4325636679,
|
||||||
|
"text": "Fireworks!"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1980501,
|
||||||
|
"type": "message",
|
||||||
|
"date": "2020-01-01T00:00:05",
|
||||||
|
"from": "Grace 🧤 ðŸ’",
|
||||||
|
"from_id": 4720225552,
|
||||||
|
"text": "You're a minute late!"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
84
docs/modules/document_loaders/examples/telegram.ipynb
Normal file
84
docs/modules/document_loaders/examples/telegram.ipynb
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "33205b12",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Telegram\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook covers how to load data from Telegram into a format that can be ingested into LangChain."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "90b69c94",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import TelegramChatLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "13deb0f5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = TelegramChatLoader(\"example_data/telegram.json\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "9ccc1e2f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content=\"Henry on 2020-01-01T00:00:02: It's 2020...\\n\\nHenry on 2020-01-01T00:00:04: Fireworks!\\n\\nGrace 🧤 ðŸ\\x8d’ on 2020-01-01T00:00:05: You're a minute late!\\n\\n\", lookup_str='', metadata={'source': 'example_data/telegram.json'}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3e64cac2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -6,7 +6,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL
|
|||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||||
from langchain.document_loaders.everynote import EveryNoteLoader
|
from langchain.document_loaders.evernote import EverNoteLoader
|
||||||
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
|
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
|
||||||
from langchain.document_loaders.gcs_file import GCSFileLoader
|
from langchain.document_loaders.gcs_file import GCSFileLoader
|
||||||
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
||||||
@ -23,6 +23,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
|||||||
from langchain.document_loaders.roam import RoamLoader
|
from langchain.document_loaders.roam import RoamLoader
|
||||||
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
||||||
from langchain.document_loaders.s3_file import S3FileLoader
|
from langchain.document_loaders.s3_file import S3FileLoader
|
||||||
|
from langchain.document_loaders.telegram import TelegramChatLoader
|
||||||
from langchain.document_loaders.text import TextLoader
|
from langchain.document_loaders.text import TextLoader
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
from langchain.document_loaders.url import UnstructuredURLLoader
|
from langchain.document_loaders.url import UnstructuredURLLoader
|
||||||
@ -55,8 +56,9 @@ __all__ = [
|
|||||||
"CollegeConfidentialLoader",
|
"CollegeConfidentialLoader",
|
||||||
"GutenbergLoader",
|
"GutenbergLoader",
|
||||||
"PagedPDFSplitter",
|
"PagedPDFSplitter",
|
||||||
"EveryNoteLoader",
|
"EverNoteLoader",
|
||||||
"AirbyteJSONLoader",
|
"AirbyteJSONLoader",
|
||||||
"OnlinePDFLoader",
|
"OnlinePDFLoader",
|
||||||
"PDFMinerLoader",
|
"PDFMinerLoader",
|
||||||
|
"TelegramChatLoader",
|
||||||
]
|
]
|
||||||
|
49
langchain/document_loaders/telegram.py
Normal file
49
langchain/document_loaders/telegram.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
"""Loader that loads Telegram chat json dump."""
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
def concatenate_rows(row: dict) -> str:
    """Render a single chat message as one human-readable entry.

    Looks up the ``"date"``, ``"from"`` and ``"text"`` entries of ``row`` and
    returns ``"<from> on <date>: <text>"`` followed by a blank line, so that
    consecutive entries can be concatenated directly.
    """
    when, who, body = row["date"], row["from"], row["text"]
    return "{} on {}: {}\n\n".format(who, when, body)
|
||||||
|
|
||||||
|
|
||||||
|
class TelegramChatLoader(BaseLoader):
    """Loader that loads a Telegram chat JSON export file.

    The file is parsed as JSON and must contain a top-level ``"messages"``
    list. Every plain text message in that list is rendered with
    ``concatenate_rows`` and the results are joined into one document.
    """

    def __init__(self, path: str):
        """Initialize with the path to the exported JSON file."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load the chat export as a single document.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the concatenation of all plain text messages (an empty string
            when there are none) and whose metadata records the source path.

        Raises:
            KeyError: If the export has no ``"messages"`` entry.
        """
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # Only keep plain text messages (no services, links, hashtags, code,
        # bold...): entries whose type is not "message" or whose text is not a
        # plain string are skipped. Filtering the parsed list directly (rather
        # than through a DataFrame) also keeps exports with zero messages, or
        # with no plain text message at all, from raising; they simply yield
        # empty content.
        text = "".join(
            concatenate_rows(message)
            for message in d["messages"]
            if message.get("type") == "message"
            and isinstance(message.get("text"), str)
        )

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]
|
Loading…
Reference in New Issue
Block a user