From 5990651070d82a1f09b6856a54ac6c2540dc48c3 Mon Sep 17 00:00:00 2001 From: Patrick Loeber <50772274+patrickloeber@users.noreply.github.com> Date: Thu, 24 Aug 2023 07:51:19 +0200 Subject: [PATCH] Add new document_loader: AssemblyAIAudioTranscriptLoader (#9667) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds a new document loader `AssemblyAIAudioTranscriptLoader` that transcribes audio files with the [AssemblyAI API](https://www.assemblyai.com) and loads the transcribed text into documents. - Add new document_loader with class `AssemblyAIAudioTranscriptLoader` - Add optional dependency `assemblyai` - Add unit tests (using a Mock client) - Add docs notebook This is the equivalent of the JS integration already available in LangChain.js. See the [LangChain JS docs AssemblyAI page](https://js.langchain.com/docs/modules/data_connection/document_loaders/integrations/web_loaders/assemblyai_audio_transcription). At its simplest, you can use the loader to get a transcript back from an audio file like this: ```python from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader loader = AssemblyAIAudioTranscriptLoader(file_path="./testfile.mp3") docs = loader.load() ``` To use it, you need the `assemblyai` python package installed, and the environment variable `ASSEMBLYAI_API_KEY` set with your API key. Alternatively, the API key can also be passed as an argument. 
Twitter handles to shout out if so kindly 🙇 [@AssemblyAI](https://twitter.com/AssemblyAI) and [@patloeber](https://twitter.com/patloeber) --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Eugene Yurtsev --- .../document_loaders/assemblyai.ipynb | 224 ++++++++++++++++++ .../langchain/document_loaders/__init__.py | 2 + .../langchain/document_loaders/assemblyai.py | 111 +++++++++ libs/langchain/poetry.lock | 104 +++++++- libs/langchain/pyproject.toml | 2 + .../document_loaders/test_assemblyai.py | 50 ++++ 6 files changed, 491 insertions(+), 2 deletions(-) create mode 100644 docs/extras/integrations/document_loaders/assemblyai.ipynb create mode 100644 libs/langchain/langchain/document_loaders/assemblyai.py create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py diff --git a/docs/extras/integrations/document_loaders/assemblyai.ipynb b/docs/extras/integrations/document_loaders/assemblyai.ipynb new file mode 100644 index 00000000000..33fdef929d8 --- /dev/null +++ b/docs/extras/integrations/document_loaders/assemblyai.ipynb @@ -0,0 +1,224 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AssemblyAI Audio Transcripts\n", + "\n", + "The `AssemblyAIAudioTranscriptLoader` allows to transcribe audio files with the [AssemblyAI API](https://www.assemblyai.com) and loads the transcribed text into documents.\n", + "\n", + "To use it, you should have the `assemblyai` python package installed, and the\n", + "environment variable `ASSEMBLYAI_API_KEY` set with your API key. 
Alternatively, the API key can also be passed as an argument.\n", + "\n", + "More info about AssemblyAI:\n", + "\n", + "- [Website](https://www.assemblyai.com/)\n", + "- [Get a Free API key](https://www.assemblyai.com/dashboard/signup)\n", + "- [AssemblyAI API Docs](https://www.assemblyai.com/docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installation\n", + "\n", + "First, you need to install the `assemblyai` python package.\n", + "\n", + "You can find more info about it inside the [assemblyai-python-sdk GitHub repo](https://github.com/AssemblyAI/assemblyai-python-sdk)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install assemblyai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example\n", + "\n", + "The `AssemblyAIAudioTranscriptLoader` needs at least the `file_path` argument. Audio files can be specified as a URL or a local file path." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader\n", + "\n", + "audio_file = \"https://storage.googleapis.com/aai-docs-samples/nbc.mp3\"\n", + "# or a local file path: audio_file = \"./nbc.mp3\"\n", + "\n", + "loader = AssemblyAIAudioTranscriptLoader(file_path=audio_file)\n", + "\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: Calling `loader.load()` blocks until the transcription is finished." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The transcribed text is available in the `page_content`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs[0].page_content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "\"Load time, a new president and new congressional makeup. Same old ...\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `metadata` contains the full JSON response with more meta information:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs[0].metadata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "{'language_code': <LanguageCode.en_us: 'en_us'>,\n", + " 'audio_url': 'https://storage.googleapis.com/aai-docs-samples/nbc.mp3',\n", + " 'punctuate': True,\n", + " 'format_text': True,\n", + " ...\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transcript Formats\n", + "\n", + "You can specify the `transcript_format` argument for different formats.\n", + "\n", + "Depending on the format, one or more documents are returned. 
These are the different `TranscriptFormat` options:\n", + "\n", + "- `TEXT`: One document with the transcription text\n", + "- `SENTENCES`: Multiple documents, splits the transcription by each sentence\n", + "- `PARAGRAPHS`: Multiple documents, splits the transcription by each paragraph\n", + "- `SUBTITLES_SRT`: One document with the transcript exported in SRT subtitles format\n", + "- `SUBTITLES_VTT`: One document with the transcript exported in VTT subtitles format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.assemblyai import (\n", + " AssemblyAIAudioTranscriptLoader,\n", + " TranscriptFormat,\n", + ")\n", + "\n", + "loader = AssemblyAIAudioTranscriptLoader(\n", + " file_path=\"./your_file.mp3\",\n", + " transcript_format=TranscriptFormat.SENTENCES,\n", + ")\n", + "\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transcription Config\n", + "\n", + "You can also specify the `config` argument to use different audio intelligence models.\n", + "\n", + "Visit the [AssemblyAI API Documentation](https://www.assemblyai.com/docs) to get an overview of all available models!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import assemblyai as aai\n", + "\n", + "config = aai.TranscriptionConfig(speaker_labels=True,\n", + " auto_chapters=True,\n", + " entity_detection=True\n", + ")\n", + "\n", + "loader = AssemblyAIAudioTranscriptLoader(\n", + " file_path=\"./your_file.mp3\",\n", + " config=config\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pass the API Key as argument\n", + "\n", + "Next to setting the API key as environment variable `ASSEMBLYAI_API_KEY`, it is also possible to pass it as argument." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = AssemblyAIAudioTranscriptLoader(\n", + " file_path=\"./your_file.mp3\",\n", + " api_key=\"YOUR_KEY\"\n", + ")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py index 30f69659cee..ab9d3719137 100644 --- a/libs/langchain/langchain/document_loaders/__init__.py +++ b/libs/langchain/langchain/document_loaders/__init__.py @@ -31,6 +31,7 @@ from langchain.document_loaders.airtable import AirtableLoader from langchain.document_loaders.apify_dataset import ApifyDatasetLoader from langchain.document_loaders.arcgis_loader import ArcGISLoader from langchain.document_loaders.arxiv import ArxivLoader +from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader from langchain.document_loaders.async_html import AsyncHtmlLoader from langchain.document_loaders.azlyrics import AZLyricsLoader from langchain.document_loaders.azure_blob_storage_container import ( @@ -219,6 +220,7 @@ __all__ = [ "ApifyDatasetLoader", "ArcGISLoader", "ArxivLoader", + "AssemblyAIAudioTranscriptLoader", "AsyncHtmlLoader", "AzureBlobStorageContainerLoader", "AzureBlobStorageFileLoader", diff --git a/libs/langchain/langchain/document_loaders/assemblyai.py b/libs/langchain/langchain/document_loaders/assemblyai.py new file mode 100644 index 00000000000..d7b7ecb9be6 --- /dev/null +++ b/libs/langchain/langchain/document_loaders/assemblyai.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + +if TYPE_CHECKING: + import assemblyai + + +class TranscriptFormat(Enum): + 
"""Transcript format to use for the document loader.""" + + TEXT = "text" + """One document with the transcription text""" + SENTENCES = "sentences" + """Multiple documents, splits the transcription by each sentence""" + PARAGRAPHS = "paragraphs" + """Multiple documents, splits the transcription by each paragraph""" + SUBTITLES_SRT = "subtitles_srt" + """One document with the transcript exported in SRT subtitles format""" + SUBTITLES_VTT = "subtitles_vtt" + """One document with the transcript exported in VTT subtitles format""" + + +class AssemblyAIAudioTranscriptLoader(BaseLoader): + """ + Loader for AssemblyAI audio transcripts. + + It uses the AssemblyAI API to transcribe audio files + and loads the transcribed text into one or more Documents, + depending on the specified format. + + To use, you should have the ``assemblyai`` python package installed, and the + environment variable ``ASSEMBLYAI_API_KEY`` set with your API key. + Alternatively, the API key can also be passed as an argument. + + Audio files can be specified via an URL or a local file path. + """ + + def __init__( + self, + file_path: str, + *, + transcript_format: TranscriptFormat = TranscriptFormat.TEXT, + config: Optional[assemblyai.TranscriptionConfig] = None, + api_key: Optional[str] = None, + ): + """ + Initializes the AssemblyAI AudioTranscriptLoader. + + Args: + file_path: An URL or a local file path. + transcript_format: Transcript format to use. + See class ``TranscriptFormat`` for more info. + config: Transcription options and features. If ``None`` is given, + the Transcriber's default configuration will be used. + api_key: AssemblyAI API key. + """ + try: + import assemblyai + except ImportError: + raise ImportError( + "Could not import assemblyai python package. " + "Please install it with `pip install assemblyai`." 
+ ) + if api_key is not None: + assemblyai.settings.api_key = api_key + + self.file_path = file_path + self.transcript_format = transcript_format + self.transcriber = assemblyai.Transcriber(config=config) + + def load(self) -> List[Document]: + """Transcribes the audio file and loads the transcript into documents. + + It uses the AssemblyAI API to transcribe the audio file and blocks until + the transcription is finished. + """ + transcript = self.transcriber.transcribe(self.file_path) + # This will raise a ValueError if no API key is set. + + if transcript.error: + raise ValueError(f"Could not transcribe file: {transcript.error}") + + if self.transcript_format == TranscriptFormat.TEXT: + return [ + Document( + page_content=transcript.text, metadata=transcript.json_response + ) + ] + elif self.transcript_format == TranscriptFormat.SENTENCES: + sentences = transcript.get_sentences() + return [ + Document(page_content=s.text, metadata=s.dict(exclude={"text"})) + for s in sentences + ] + elif self.transcript_format == TranscriptFormat.PARAGRAPHS: + paragraphs = transcript.get_paragraphs() + return [ + Document(page_content=p.text, metadata=p.dict(exclude={"text"})) + for p in paragraphs + ] + elif self.transcript_format == TranscriptFormat.SUBTITLES_SRT: + return [Document(page_content=transcript.export_subtitles_srt())] + elif self.transcript_format == TranscriptFormat.SUBTITLES_VTT: + return [Document(page_content=transcript.export_subtitles_vtt())] + else: + raise ValueError("Unknown transcript format.") diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock index 3ea9c47341f..5b10733d9c8 100644 --- a/libs/langchain/poetry.lock +++ b/libs/langchain/poetry.lock @@ -436,6 +436,26 @@ files = [ [package.dependencies] feedparser = "*" +[[package]] +name = "assemblyai" +version = "0.17.0" +description = "AssemblyAI Python SDK" +optional = true +python-versions = ">=3.8" +files = [ + {file = "assemblyai-0.17.0-py3-none-any.whl", hash = 
"sha256:3bad8cc7545b5b831f243f1b2f01bc4cc0e8aad78babf44c8008f2293c540e36"}, + {file = "assemblyai-0.17.0.tar.gz", hash = "sha256:6d5bbfbbaa626ed021c3d3dec0ca52b3ebf6e6ef277ac76a7a6aed52182d531e"}, +] + +[package.dependencies] +httpx = ">=0.19.0" +pydantic = ">=1.7.0,<1.10.7 || >1.10.7" +typing-extensions = ">=3.7" +websockets = ">=11.0" + +[package.extras] +extras = ["pyaudio (>=0.2.13)"] + [[package]] name = "asttokens" version = "2.2.1" @@ -3522,6 +3542,7 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, + {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"}, ] [[package]] @@ -9857,6 +9878,85 @@ docs = ["Sphinx (>=6.0)", "sphinx-rtd-theme (>=1.1.0)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "11.0.3" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = true +python-versions = ">=3.7" +files = [ + {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac"}, + {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d"}, + {file = "websockets-11.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f"}, + {file = "websockets-11.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564"}, + {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11"}, + {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca"}, + {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54"}, + {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4"}, + {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526"}, + {file = "websockets-11.0.3-cp310-cp310-win32.whl", hash = "sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69"}, + {file = "websockets-11.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f"}, + {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb"}, + {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288"}, + {file = "websockets-11.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d"}, + {file = "websockets-11.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3"}, + {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b"}, + {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6"}, + {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97"}, + {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf"}, + {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd"}, + {file = "websockets-11.0.3-cp311-cp311-win32.whl", hash = "sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c"}, + {file = "websockets-11.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8"}, + {file = "websockets-11.0.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152"}, + {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f"}, + {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b"}, + {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb"}, + {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007"}, + {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0"}, + {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af"}, + {file = 
"websockets-11.0.3-cp37-cp37m-win32.whl", hash = "sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f"}, + {file = "websockets-11.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de"}, + {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0"}, + {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae"}, + {file = "websockets-11.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99"}, + {file = "websockets-11.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa"}, + {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86"}, + {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c"}, + {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0"}, + {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e"}, + {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788"}, + {file = "websockets-11.0.3-cp38-cp38-win32.whl", hash = "sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74"}, + {file = "websockets-11.0.3-cp38-cp38-win_amd64.whl", hash = 
"sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f"}, + {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8"}, + {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd"}, + {file = "websockets-11.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016"}, + {file = "websockets-11.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61"}, + {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b"}, + {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd"}, + {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7"}, + {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1"}, + {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311"}, + {file = "websockets-11.0.3-cp39-cp39-win32.whl", hash = "sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128"}, + {file = "websockets-11.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e"}, + {file = "websockets-11.0.3-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf"}, + {file = 
"websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5"}, + {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998"}, + {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b"}, + {file = "websockets-11.0.3-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb"}, + {file = "websockets-11.0.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20"}, + {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931"}, + {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9"}, + {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280"}, + {file = "websockets-11.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b"}, + {file = "websockets-11.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82"}, + {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c"}, + {file = 
"websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d"}, + {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4"}, + {file = "websockets-11.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602"}, + {file = "websockets-11.0.3-py3-none-any.whl", hash = "sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6"}, + {file = "websockets-11.0.3.tar.gz", hash = "sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016"}, +] + [[package]] name = "werkzeug" version = "2.3.7" @@ -10338,7 +10438,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] +extended-testing = ["amazon-textract-caller", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", 
"pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] javascript = ["esprima"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -10348,4 +10448,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "83280a03c352011c2a51081a29aca67bb5c4c23054ad1b7be94f89d6ce52460b" +content-hash = "fd56d0cf338f6efea449244f3e9e719ca6872dd4b3e136ccd67fd82912912cc2" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 296cbae35ba..60d8afb5a0a 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -127,6 +127,7 @@ xata = {version = "^1.0.0a7", optional = true} xmltodict = {version = "^0.13.0", optional = true} google-api-core = {version = "^2.11.1", optional = true} markdownify = {version = "^0.11.6", optional = true} +assemblyai = {version = "^0.17.0", optional = true} [tool.poetry.group.test.dependencies] @@ -299,6 +300,7 @@ all = [ # merge-conflicts extended_testing = [ "amazon-textract-caller", + "assemblyai", "beautifulsoup4", "bibtexparser", "cassio", diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py b/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py new file mode 100644 index 00000000000..a9b6112e7b1 --- /dev/null +++ b/libs/langchain/tests/unit_tests/document_loaders/test_assemblyai.py @@ -0,0 +1,50 @@ +import pytest +from pytest_mock import MockerFixture + +from langchain.document_loaders.assemblyai import ( + AssemblyAIAudioTranscriptLoader, + TranscriptFormat, +) + + +@pytest.mark.requires("assemblyai") +def test_initialization() -> None: + loader = AssemblyAIAudioTranscriptLoader( + file_path="./testfile.mp3", api_key="api_key" + ) + assert loader.file_path == "./testfile.mp3" + assert 
loader.transcript_format == TranscriptFormat.TEXT + + +@pytest.mark.requires("assemblyai") +def test_load(mocker: MockerFixture) -> None: + mocker.patch( + "assemblyai.Transcriber.transcribe", + return_value=mocker.MagicMock( + text="Test transcription text", json_response={"id": "1"}, error=None + ), + ) + + loader = AssemblyAIAudioTranscriptLoader( + file_path="./testfile.mp3", api_key="api_key" + ) + docs = loader.load() + assert len(docs) == 1 + assert docs[0].page_content == "Test transcription text" + assert docs[0].metadata == {"id": "1"} + + +@pytest.mark.requires("assemblyai") +def test_transcription_error(mocker: MockerFixture) -> None: + mocker.patch( + "assemblyai.Transcriber.transcribe", + return_value=mocker.MagicMock(error="Test error"), + ) + + loader = AssemblyAIAudioTranscriptLoader( + file_path="./testfile.mp3", api_key="api_key" + ) + + expected_error = "Could not transcribe file: Test error" + with pytest.raises(ValueError, match=expected_error): + loader.load()