mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-18 12:58:59 +00:00
Harrison/embass (#14242)
Co-authored-by: Julius Lipp <lipp.julius@gmail.com>
This commit is contained in:
parent
8504ec56e4
commit
e32185193e
@ -1,167 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"# Embaas\n",
|
|
||||||
"[embaas](https://embaas.io) is a fully managed NLP API service that offers features like embedding generation, document text extraction, document to embeddings and more. You can choose a [variety of pre-trained models](https://embaas.io/docs/models/embeddings).\n",
|
|
||||||
"\n",
|
|
||||||
"### Prerequisites\n",
|
|
||||||
"Create a free embaas account at [https://embaas.io/register](https://embaas.io/register) and generate an [API key](https://embaas.io/dashboard/api-keys)\n",
|
|
||||||
"\n",
|
|
||||||
"### Document Text Extraction API\n",
|
|
||||||
"The document text extraction API allows you to extract the text from a given document. The API supports a variety of document formats, including PDF, mp3, mp4 and more. For a full list of supported formats, check out the API docs (link below)."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Set API key\n",
|
|
||||||
"embaas_api_key = \"YOUR_API_KEY\"\n",
|
|
||||||
"# or set environment variable\n",
|
|
||||||
"os.environ[\"EMBAAS_API_KEY\"] = \"YOUR_API_KEY\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"#### Using a blob (bytes)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.document_loaders.blob_loaders import Blob\n",
|
|
||||||
"from langchain.document_loaders.embaas import EmbaasBlobLoader"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"blob_loader = EmbaasBlobLoader()\n",
|
|
||||||
"blob = Blob.from_path(\"example.pdf\")\n",
|
|
||||||
"documents = blob_loader.load(blob)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2023-06-12T22:19:48.380467Z",
|
|
||||||
"start_time": "2023-06-12T22:19:48.366886Z"
|
|
||||||
},
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# You can also directly create embeddings with your preferred embeddings model\n",
|
|
||||||
"blob_loader = EmbaasBlobLoader(params={\"model\": \"e5-large-v2\", \"should_embed\": True})\n",
|
|
||||||
"blob = Blob.from_path(\"example.pdf\")\n",
|
|
||||||
"documents = blob_loader.load(blob)\n",
|
|
||||||
"\n",
|
|
||||||
"print(documents[0][\"metadata\"][\"embedding\"])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"#### Using a file"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.document_loaders.embaas import EmbaasLoader"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"file_loader = EmbaasLoader(file_path=\"example.pdf\")\n",
|
|
||||||
"documents = file_loader.load()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 15,
|
|
||||||
"metadata": {
|
|
||||||
"ExecuteTime": {
|
|
||||||
"end_time": "2023-06-12T22:24:31.894665Z",
|
|
||||||
"start_time": "2023-06-12T22:24:31.880857Z"
|
|
||||||
},
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Disable automatic text splitting\n",
|
|
||||||
"file_loader = EmbaasLoader(file_path=\"example.mp3\", params={\"should_chunk\": False})\n",
|
|
||||||
"documents = file_loader.load()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"For more detailed information about the embaas document text extraction API, please refer to [the official embaas API documentation](https://embaas.io/api-reference)."
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 2
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython2",
|
|
||||||
"version": "2.7.6"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0
|
|
||||||
}
|
|
@ -77,7 +77,6 @@ from langchain.document_loaders.email import (
|
|||||||
OutlookMessageLoader,
|
OutlookMessageLoader,
|
||||||
UnstructuredEmailLoader,
|
UnstructuredEmailLoader,
|
||||||
)
|
)
|
||||||
from langchain.document_loaders.embaas import EmbaasBlobLoader, EmbaasLoader
|
|
||||||
from langchain.document_loaders.epub import UnstructuredEPubLoader
|
from langchain.document_loaders.epub import UnstructuredEPubLoader
|
||||||
from langchain.document_loaders.etherscan import EtherscanLoader
|
from langchain.document_loaders.etherscan import EtherscanLoader
|
||||||
from langchain.document_loaders.evernote import EverNoteLoader
|
from langchain.document_loaders.evernote import EverNoteLoader
|
||||||
@ -259,8 +258,6 @@ __all__ = [
|
|||||||
"Docx2txtLoader",
|
"Docx2txtLoader",
|
||||||
"DropboxLoader",
|
"DropboxLoader",
|
||||||
"DuckDBLoader",
|
"DuckDBLoader",
|
||||||
"EmbaasBlobLoader",
|
|
||||||
"EmbaasLoader",
|
|
||||||
"EtherscanLoader",
|
"EtherscanLoader",
|
||||||
"EverNoteLoader",
|
"EverNoteLoader",
|
||||||
"FacebookChatLoader",
|
"FacebookChatLoader",
|
||||||
|
@ -1,244 +0,0 @@
|
|||||||
import base64
|
|
||||||
import warnings
|
|
||||||
from typing import Any, Dict, Iterator, List, Optional
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from langchain_core.documents import Document
|
|
||||||
from langchain_core.pydantic_v1 import BaseModel, root_validator, validator
|
|
||||||
from typing_extensions import NotRequired, TypedDict
|
|
||||||
|
|
||||||
from langchain.document_loaders.base import BaseBlobParser, BaseLoader
|
|
||||||
from langchain.document_loaders.blob_loaders import Blob
|
|
||||||
from langchain.text_splitter import TextSplitter
|
|
||||||
from langchain.utils import get_from_dict_or_env
|
|
||||||
|
|
||||||
EMBAAS_DOC_API_URL = "https://api.embaas.io/v1/document/extract-text/bytes/"
|
|
||||||
|
|
||||||
|
|
||||||
class EmbaasDocumentExtractionParameters(TypedDict):
|
|
||||||
"""Parameters for the embaas document extraction API."""
|
|
||||||
|
|
||||||
mime_type: NotRequired[str]
|
|
||||||
"""The mime type of the document."""
|
|
||||||
file_extension: NotRequired[str]
|
|
||||||
"""The file extension of the document."""
|
|
||||||
file_name: NotRequired[str]
|
|
||||||
"""The file name of the document."""
|
|
||||||
|
|
||||||
should_chunk: NotRequired[bool]
|
|
||||||
"""Whether to chunk the document into pages."""
|
|
||||||
chunk_size: NotRequired[int]
|
|
||||||
"""The maximum size of the text chunks."""
|
|
||||||
chunk_overlap: NotRequired[int]
|
|
||||||
"""The maximum overlap allowed between chunks."""
|
|
||||||
chunk_splitter: NotRequired[str]
|
|
||||||
"""The text splitter class name for creating chunks."""
|
|
||||||
separators: NotRequired[List[str]]
|
|
||||||
"""The separators for chunks."""
|
|
||||||
|
|
||||||
should_embed: NotRequired[bool]
|
|
||||||
"""Whether to create embeddings for the document in the response."""
|
|
||||||
model: NotRequired[str]
|
|
||||||
"""The model to pass to the Embaas document extraction API."""
|
|
||||||
instruction: NotRequired[str]
|
|
||||||
"""The instruction to pass to the Embaas document extraction API."""
|
|
||||||
|
|
||||||
|
|
||||||
class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
    """Full request payload for the embaas document extraction API."""

    # Base64 encoded bytes of the document to extract text from.
    bytes: str
|
|
||||||
|
|
||||||
class BaseEmbaasLoader(BaseModel):
    """Shared configuration for loaders backed by the `Embaas` extraction API."""

    # API key; resolved from the EMBAAS_API_KEY environment variable when absent.
    embaas_api_key: Optional[str] = None
    # Endpoint that extraction requests are sent to.
    api_url: str = EMBAAS_DOC_API_URL
    # Extra parameters forwarded to the document extraction API.
    params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the API key from the given values or the environment."""
        values["embaas_api_key"] = get_from_dict_or_env(
            values, "embaas_api_key", "EMBAAS_API_KEY"
        )
        return values
|
|
||||||
|
|
||||||
class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
    """Load `Embaas` blob.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
    it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            # Default parsing
            from langchain.document_loaders.embaas import EmbaasBlobLoader
            loader = EmbaasBlobLoader()
            blob = Blob.from_path(path="example.mp3")
            documents = loader.parse(blob=blob)

            # Custom api parameters (create embeddings automatically)
            from langchain.document_loaders.embaas import EmbaasBlobLoader
            loader = EmbaasBlobLoader(
                params={
                    "should_embed": True,
                    "model": "e5-large-v2",
                    "chunk_size": 256,
                    "chunk_splitter": "CharacterTextSplitter"
                }
            )
            blob = Blob.from_path(path="example.pdf")
            documents = loader.parse(blob=blob)
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parses the blob lazily.

        Args:
            blob: The blob to parse.
        """
        yield from self._get_documents(blob=blob)

    @staticmethod
    def _api_response_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
        """Convert the API response chunks into a list of documents."""
        documents = []
        for chunk in chunks:
            chunk_metadata = chunk["metadata"]
            # Surface the embedding (returned when should_embed is set) in metadata.
            if chunk.get("embedding") is not None:
                chunk_metadata["embedding"] = chunk["embedding"]
            documents.append(
                Document(page_content=chunk["text"], metadata=chunk_metadata)
            )

        return documents

    def _generate_payload(self, blob: Blob) -> EmbaasDocumentExtractionPayload:
        """Build the API request payload from the blob and configured params."""
        encoded = base64.b64encode(blob.as_bytes()).decode()
        payload: EmbaasDocumentExtractionPayload = EmbaasDocumentExtractionPayload(
            bytes=encoded,
            # Workaround for mypy issue: https://github.com/python/mypy/issues/9408
            # type: ignore
            **self.params,
        )

        # Prefer the blob's own mime type when the caller didn't set one.
        if blob.mimetype is not None and payload.get("mime_type") is None:
            payload["mime_type"] = blob.mimetype

        return payload

    def _handle_request(
        self, payload: EmbaasDocumentExtractionPayload
    ) -> List[Document]:
        """Sends a request to the embaas API and handles the response."""
        headers = {
            "Authorization": f"Bearer {self.embaas_api_key}",
            "Content-Type": "application/json",
        }

        response = requests.post(self.api_url, headers=headers, json=payload)
        response.raise_for_status()

        parsed_response = response.json()
        return EmbaasBlobLoader._api_response_to_documents(
            chunks=parsed_response["data"]["chunks"]
        )

    def _get_documents(self, blob: Blob) -> Iterator[Document]:
        """Get the documents from the blob, translating API errors to ValueError."""
        payload = self._generate_payload(blob=blob)

        try:
            documents = self._handle_request(payload=payload)
        except requests.exceptions.RequestException as e:
            # No usable response body: wrap the transport-level error.
            if e.response is None or not e.response.text:
                raise ValueError(
                    f"Error raised by Embaas document text extraction API: {e}"
                )

            parsed_response = e.response.json()
            if "message" in parsed_response:
                raise ValueError(
                    f"Validation Error raised by Embaas document text extraction API:"
                    f" {parsed_response['message']}"
                )
            raise

        yield from documents
|
|
||||||
|
|
||||||
class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
    """Load from `Embaas`.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
    it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            # Default parsing
            from langchain.document_loaders.embaas import EmbaasLoader
            loader = EmbaasLoader(file_path="example.mp3")
            documents = loader.load()

            # Custom api parameters (create embeddings automatically)
            from langchain.document_loaders.embaas import EmbaasLoader
            loader = EmbaasLoader(
                file_path="example.pdf",
                params={
                    "should_embed": True,
                    "model": "e5-large-v2",
                    "chunk_size": 256,
                    "chunk_splitter": "CharacterTextSplitter"
                }
            )
            documents = loader.load()
    """

    # Path of the file to load.
    file_path: str
    # Blob loader to use; a default one is created when not provided.
    blob_loader: Optional[EmbaasBlobLoader]

    @validator("blob_loader", always=True)
    def validate_blob_loader(
        cls, v: EmbaasBlobLoader, values: Dict
    ) -> EmbaasBlobLoader:
        """Fall back to a blob loader that mirrors this loader's settings."""
        if v:
            return v
        return EmbaasBlobLoader(
            embaas_api_key=values["embaas_api_key"],
            api_url=values["api_url"],
            params=values["params"],
        )

    def lazy_load(self) -> Iterator[Document]:
        """Load the documents from the file path lazily."""
        blob = Blob.from_path(path=self.file_path)

        # Should never be None, but mypy doesn't know that.
        assert self.blob_loader is not None
        yield from self.blob_loader.lazy_parse(blob=blob)

    def load(self) -> List[Document]:
        """Eagerly load all documents from the file path."""
        return list(self.lazy_load())

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
        """Load then split locally; warns when API-side embedding is enabled."""
        if self.params.get("should_embed", False):
            warnings.warn(
                "Embeddings are not supported with load_and_split."
                " Use the API splitter to properly generate embeddings."
                " For more information see embaas.io docs."
            )
        return super().load_and_split(text_splitter=text_splitter)
|
|
@ -1,59 +0,0 @@
|
|||||||
from typing import Any
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import responses
|
|
||||||
|
|
||||||
from langchain.document_loaders import EmbaasBlobLoader, EmbaasLoader
|
|
||||||
from langchain.document_loaders.blob_loaders import Blob
|
|
||||||
from langchain.document_loaders.embaas import EMBAAS_DOC_API_URL
|
|
||||||
|
|
||||||
|
|
||||||
@responses.activate
def test_handle_request() -> None:
    """A successful API response is converted into Documents with metadata.

    The mocked chunk uses the singular ``"embedding"`` key, which is the key
    ``EmbaasBlobLoader._api_response_to_documents`` actually reads and copies
    into the document metadata (the previous ``"embeddings"`` key was never
    consumed by the loader, so the final assertion could not hold).
    """
    responses.add(
        responses.POST,
        EMBAAS_DOC_API_URL,
        json={
            "data": {
                "chunks": [
                    {
                        "text": "Hello",
                        "metadata": {"start_page": 1, "end_page": 2},
                        "embedding": [0.0],
                    }
                ]
            }
        },
        status=200,
    )

    loader = EmbaasBlobLoader(embaas_api_key="api_key", params={"should_embed": True})
    documents = loader.parse(blob=Blob.from_data(data="Hello"))
    assert len(documents) == 1
    assert documents[0].page_content == "Hello"
    assert documents[0].metadata["start_page"] == 1
    assert documents[0].metadata["end_page"] == 2
    # The loader surfaces the chunk embedding under metadata["embedding"].
    assert documents[0].metadata["embedding"] == [0.0]
|
|
||||||
|
|
||||||
@responses.activate
def test_handle_request_exception() -> None:
    """An API validation error surfaces as a ValueError carrying the message.

    The previous try/except had no failure branch, so the test silently
    passed when no exception was raised at all; the ``else`` clause makes
    the absence of the expected error a test failure.
    """
    responses.add(
        responses.POST,
        EMBAAS_DOC_API_URL,
        json={"message": "Invalid request"},
        status=400,
    )
    loader = EmbaasBlobLoader(embaas_api_key="api_key")
    try:
        loader.parse(blob=Blob.from_data(data="Hello"))
    except ValueError as e:
        assert "Invalid request" in str(e)
    else:
        raise AssertionError("expected the API error to raise a ValueError")
|
|
||||||
@patch.object(EmbaasBlobLoader, "_handle_request")
def test_load(mock_handle_request: Any) -> None:
    """EmbaasLoader.load delegates to the blob loader's request handling."""
    mock_handle_request.return_value = [MagicMock()]
    loader = EmbaasLoader(file_path="test_embaas.py", embaas_api_key="api_key")
    assert len(loader.load()) == 1
|
|
@ -52,8 +52,6 @@ EXPECTED_ALL = [
|
|||||||
"Docx2txtLoader",
|
"Docx2txtLoader",
|
||||||
"DropboxLoader",
|
"DropboxLoader",
|
||||||
"DuckDBLoader",
|
"DuckDBLoader",
|
||||||
"EmbaasBlobLoader",
|
|
||||||
"EmbaasLoader",
|
|
||||||
"EtherscanLoader",
|
"EtherscanLoader",
|
||||||
"EverNoteLoader",
|
"EverNoteLoader",
|
||||||
"FacebookChatLoader",
|
"FacebookChatLoader",
|
||||||
|
Loading…
Reference in New Issue
Block a user