mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-13 08:27:03 +00:00
Add Slack Directory Loader (#2841)
Fixes linting issue from #2835. Adds a loader for Slack Exports, which can be a very valuable source of knowledge to use for internal QA bots and other use cases. ```py # Export data from your Slack Workspace first. from langchain.document_loaders import SlackDirectoryLoader SLACK_WORKSPACE_URL = "https://awesome.slack.com" loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL) docs = loader.load() ```
This commit is contained in:
parent
ed2ef5cbe4
commit
bf0887c486
@ -0,0 +1,81 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1dc7df1d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Slack (Local Exported Zipfile)\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook covers how to load documents from a Zipfile generated from a Slack export.\n",
|
||||||
|
"\n",
|
||||||
|
"In order to get this Slack export, follow these instructions:\n",
|
||||||
|
"\n",
|
||||||
|
"## 🧑 Instructions for ingesting your own dataset\n",
|
||||||
|
"\n",
|
||||||
|
"Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n",
|
||||||
|
"\n",
|
||||||
|
"The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n",
|
||||||
|
"\n",
|
||||||
|
"Copy the path to the `.zip` file, and assign it as `LOCAL_ZIPFILE` below."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "007c5cbf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import SlackDirectoryLoader "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a1caec59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Optionally set your Slack URL. This will give you proper URLs in the docs sources.\n",
|
||||||
|
"SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n",
|
||||||
|
"LOCAL_ZIPFILE = \"\" # Paste the local path to your Slack zip file here.\n",
|
||||||
|
"\n",
|
||||||
|
"loader = SlackDirectoryLoader(LOCAL_ZIPFILE, SLACK_WORKSPACE_URL)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b1c30ff7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()\n",
|
||||||
|
"docs"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -55,6 +55,7 @@ from langchain.document_loaders.roam import RoamLoader
|
|||||||
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
||||||
from langchain.document_loaders.s3_file import S3FileLoader
|
from langchain.document_loaders.s3_file import S3FileLoader
|
||||||
from langchain.document_loaders.sitemap import SitemapLoader
|
from langchain.document_loaders.sitemap import SitemapLoader
|
||||||
|
from langchain.document_loaders.slack_directory import SlackDirectoryLoader
|
||||||
from langchain.document_loaders.srt import SRTLoader
|
from langchain.document_loaders.srt import SRTLoader
|
||||||
from langchain.document_loaders.telegram import TelegramChatLoader
|
from langchain.document_loaders.telegram import TelegramChatLoader
|
||||||
from langchain.document_loaders.text import TextLoader
|
from langchain.document_loaders.text import TextLoader
|
||||||
@ -138,4 +139,5 @@ __all__ = [
|
|||||||
"DuckDBLoader",
|
"DuckDBLoader",
|
||||||
"BigQueryLoader",
|
"BigQueryLoader",
|
||||||
"BiliBiliLoader",
|
"BiliBiliLoader",
|
||||||
|
"SlackDirectoryLoader",
|
||||||
]
|
]
|
||||||
|
112
langchain/document_loaders/slack_directory.py
Normal file
112
langchain/document_loaders/slack_directory.py
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
"""Loader for documents from a Slack export."""
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class SlackDirectoryLoader(BaseLoader):
    """Loader for loading documents from a Slack directory dump.

    Every ``.json`` member of the zip archive that sits inside a folder is
    read as a list of messages, and the name of that folder is used as the
    channel name. Each message is converted into one ``Document``.
    """

    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
        """Initialize the SlackDirectoryLoader.

        The archive is opened once during construction to build the
        channel-name -> channel-ID map.

        Args:
            zip_path (str): The path to the Slack directory dump zip file.
            workspace_url (Optional[str]): The Slack workspace URL.
                Including the URL will turn sources into links.
                Defaults to None.
        """
        self.zip_path = Path(zip_path)
        self.workspace_url = workspace_url
        self.channel_id_map = self._get_channel_id_map(self.zip_path)

    @staticmethod
    def _get_channel_id_map(zip_path: Path) -> Dict[str, str]:
        """Get a dictionary mapping channel names to their respective IDs.

        Args:
            zip_path (Path): The path to the Slack directory dump zip file.

        Returns:
            Dict[str, str]: Channel name -> channel ID, read from the
                top-level ``channels.json`` member. An empty dict is
                returned when a ``KeyError`` occurs while reading it.
        """
        with zipfile.ZipFile(zip_path, "r") as zip_file:
            try:
                with zip_file.open("channels.json", "r") as f:
                    channels = json.load(f)
                return {channel["name"]: channel["id"] for channel in channels}
            except KeyError:
                # ZipFile.open raises KeyError when the member is missing;
                # an entry lacking "name"/"id" lands here as well.
                return {}

    def load(self) -> List[Document]:
        """Load and return documents from the Slack directory dump.

        Returns:
            List[Document]: One document per message, in archive order.
        """
        docs: List[Document] = []
        with zipfile.ZipFile(self.zip_path, "r") as zip_file:
            for channel_path in zip_file.namelist():
                # The folder that directly contains the member is the channel.
                channel_name = Path(channel_path).parent.name
                # Skip members without a parent folder name and anything
                # that is not a JSON file.
                if not channel_name or not channel_path.endswith(".json"):
                    continue
                docs.extend(
                    self._convert_message_to_document(message, channel_name)
                    for message in self._read_json(zip_file, channel_path)
                )
        return docs

    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
        """Read JSON data from a zip subfile.

        Args:
            zip_file (zipfile.ZipFile): The opened archive.
            file_path (str): The name of the member to parse.

        Returns:
            List[dict]: The parsed JSON content of the member.
        """
        with zip_file.open(file_path, "r") as f:
            return json.load(f)

    def _convert_message_to_document(
        self, message: dict, channel_name: str
    ) -> Document:
        """Convert a message to a Document object.

        Args:
            message (dict): A message in the form of a dictionary.
            channel_name (str): The name of the channel the message belongs to.

        Returns:
            Document: A Document object representing the message. The page
                content is the message's ``"text"`` value, or an empty
                string when that key is absent.
        """
        return Document(
            page_content=message.get("text", ""),
            metadata=self._get_message_metadata(message, channel_name),
        )

    def _get_message_metadata(self, message: dict, channel_name: str) -> dict:
        """Create and return metadata for a given message and channel.

        Args:
            message (dict): A message in the form of a dictionary.
            channel_name (str): The name of the channel the message belongs to.

        Returns:
            dict: A mapping with the keys ``"source"``, ``"channel"``,
                ``"timestamp"`` and ``"user"``. Missing ``"ts"`` / ``"user"``
                values default to an empty string.
        """
        timestamp = message.get("ts", "")
        user = message.get("user", "")
        return {
            "source": self._get_message_source(channel_name, user, timestamp),
            "channel": channel_name,
            "timestamp": timestamp,
            "user": user,
        }

    def _get_message_source(self, channel_name: str, user: str, timestamp: str) -> str:
        """Get the message source as a string.

        Args:
            channel_name (str): The name of the channel the message belongs to.
            user (str): The user ID who sent the message.
            timestamp (str): The timestamp of the message.

        Returns:
            str: A link built from the workspace URL when one was provided,
                otherwise a ``"channel - user - timestamp"`` label.
        """
        if not self.workspace_url:
            return f"{channel_name} - {user} - {timestamp}"

        # Drop trailing slashes so that a workspace URL such as
        # "https://xxx.slack.com/" does not yield "...com//archives/...".
        base_url = self.workspace_url.rstrip("/")
        channel_id = self.channel_id_map.get(channel_name, "")
        # The link uses the timestamp with its "." removed, prefixed by "p".
        return f"{base_url}/archives/{channel_id}/p{timestamp.replace('.', '')}"
|
23
tests/integration_tests/document_loaders/test_slack.py
Normal file
23
tests/integration_tests/document_loaders/test_slack.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
"""Tests for the Slack directory loader"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from langchain.document_loaders import SlackDirectoryLoader
|
||||||
|
|
||||||
|
|
||||||
|
def test_slack_directory_loader() -> None:
    """Load the sample Slack export and check how many documents come back."""
    export_zip = Path(__file__).parents[1] / "examples/slack_export.zip"
    documents = SlackDirectoryLoader(str(export_zip)).load()

    assert len(documents) == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_slack_directory_loader_urls() -> None:
    """Test that the workspace URL is passed through by SlackDirectoryLoader."""
    workspace_url = "example_workspace.com"
    export_zip = Path(__file__).parents[1] / "examples/slack_export.zip"
    documents = SlackDirectoryLoader(str(export_zip), workspace_url).load()
    for document in documents:
        source = document.metadata["source"]
        assert source.startswith(workspace_url)
|
BIN
tests/integration_tests/examples/slack_export.zip
Normal file
BIN
tests/integration_tests/examples/slack_export.zip
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user