langchain/libs/partners/box/langchain_box/utilities/box.py
Scott Hurrey 55fd2e2158
box: add langchain box package and DocumentLoader (#25506)
Thank you for contributing to LangChain!

-Description: Adding new package: `langchain-box`:

* `langchain_box.document_loaders.BoxLoader` — DocumentLoader
functionality
* `langchain_box.utilities.BoxAPIWrapper` — Box-specific code
* `langchain_box.utilities.BoxAuth` — Helper class for Box
authentication
* `langchain_box.utilities.BoxAuthType` — enum used by BoxAuth class

- Twitter handle: @boxplatform


- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Erick Friis <erickfriis@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
2024-08-21 02:23:43 +00:00

526 lines
18 KiB
Python

"""Util that calls Box APIs."""
from enum import Enum
from typing import Any, Dict, Optional
import box_sdk_gen # type: ignore
import requests
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
from langchain_core.utils import get_from_dict_or_env
class DocumentFiles(Enum):
DOC = "doc"
DOCX = "docx"
GDOC = "gdoc"
GSHEET = "gsheet"
NUMBERS = "numbers"
ODS = "ods"
ODT = "odt"
PAGES = "pages"
PDF = "pdf"
RTF = "rtf"
WPD = "wpd"
XLS = "xls"
XLSM = "xlsm"
XLSX = "xlsx"
AS = "as"
AS3 = "as3"
ASM = "asm"
BAT = "bat"
C = "c"
CC = "cc"
CMAKE = "cmake"
CPP = "cpp"
CS = "cs"
CSS = "css"
CSV = "csv"
CXX = "cxx"
DIFF = "diff"
ERB = "erb"
GROOVY = "groovy"
H = "h"
HAML = "haml"
HH = "hh"
HTM = "htm"
HTML = "html"
JAVA = "java"
JS = "js"
JSON = "json"
LESS = "less"
LOG = "log"
M = "m"
MAKE = "make"
MD = "md"
ML = "ml"
MM = "mm"
MSG = "msg"
PHP = "php"
PL = "pl"
PROPERTIES = "properties"
PY = "py"
RB = "rb"
RST = "rst"
SASS = "sass"
SCALA = "scala"
SCM = "scm"
SCRIPT = "script"
SH = "sh"
SML = "sml"
SQL = "sql"
TXT = "txt"
VI = "vi"
VIM = "vim"
WEBDOC = "webdoc"
XHTML = "xhtml"
XLSB = "xlsb"
XML = "xml"
XSD = "xsd"
XSL = "xsl"
YAML = "yaml"
GSLLIDE = "gslide"
GSLIDES = "gslides"
KEY = "key"
ODP = "odp"
PPT = "ppt"
PPTX = "pptx"
BOXNOTE = "boxnote"
class ImageFiles(Enum):
ARW = "arw"
BMP = "bmp"
CR2 = "cr2"
DCM = "dcm"
DICM = "dicm"
DICOM = "dicom"
DNG = "dng"
EPS = "eps"
EXR = "exr"
GIF = "gif"
HEIC = "heic"
INDD = "indd"
INDML = "indml"
INDT = "indt"
INX = "inx"
JPEG = "jpeg"
JPG = "jpg"
NEF = "nef"
PNG = "png"
SVG = "svg"
TIF = "tif"
TIFF = "tiff"
TGA = "tga"
SVS = "svs"
"""
BoxAuthType
an enum to tell BoxLoader how you wish to autheticate your Box connection.
Options are:
TOKEN - Use a developer token generated from the Box Deevloper Token.
Only recommended for development.
Provide `box_developer_token`.
CCG - Client Credentials Grant.
provide `box_client_id`, `box_client_secret`,
and `box_enterprise_id` or optionally `box_user_id`.
JWT - Use JWT for authentication. Config should be stored on the file
system accessible to your app.
provide `box_jwt_path`. Optionally, provide `box_user_id` to
act as a specific user
"""
class BoxAuthType(Enum):
"""Use a developer token or a token retrieved from box-sdk-gen"""
TOKEN = "token"
"""Use `client_credentials` type grant"""
CCG = "ccg"
"""Use JWT bearer token auth"""
JWT = "jwt"
"""
`BoxAuth` supports the following authentication methods:
* Token — either a developer token or any token generated through the Box SDK
* JWT with a service account
* JWT with a specified user
* CCG with a service account
* CCG with a specified user
:::note
If using JWT authentication, you will need to download the configuration from the
Box developer console after generating your public/private key pair. Place this
file in your application directory structure somewhere. You will use the path to
this file when using the `BoxAuth` helper class.
:::
For more information, learn about how to
[set up a Box application](https://developer.box.com/guides/getting-started/first-application/),
and check out the
[Box authentication guide](https://developer.box.com/guides/authentication/select/)
for more about our different authentication options.
Simple implementation
To instantiate, you must provide a ``langchain_box.utilities.BoxAuthType``.
BoxAuthType is an enum to tell BoxLoader how you wish to autheticate your
Box connection.
Options are:
TOKEN - Use a developer token generated from the Box Deevloper Token.
Only recommended for development.
Provide `box_developer_token`.
CCG - Client Credentials Grant.
provide `box_client_id`, `box_client_secret`,
and `box_enterprise_id` or optionally `box_user_id`.
JWT - Use JWT for authentication. Config should be stored on the file
system accessible to your app.
provide `box_jwt_path`. Optionally, provide `box_user_id` to
act as a specific user
.. code-block:: python
from langchain_box.document_loaders import BoxLoader
from langchain_box.utilities import BoxAuth, BoxAuthType
auth = BoxAuth(
auth_type=BoxAuthType.TOKEN,
box_developer_token=box_developer_token
)
loader = BoxLoader(
box_auth=auth,
...
)
To see examples for each supported authentication methodology, visit the
[Box providers](/docs/integrations/providers/box) page. If you want to
use OAuth 2.0 `authorization_code` flow, use
[box-sdk-gen](https://github.com/box/box-python-sdk-gen) SDK, get your
token, and use `BoxAuthType.TOKEN` type.
"""
class BoxAuth(BaseModel):
"""Authentication type to use. Must pass BoxAuthType enum"""
auth_type: BoxAuthType
""" If using BoxAuthType.TOKEN, provide your token here"""
box_developer_token: Optional[str] = None
"""If using BoxAuthType.JWT, provide local path to your
JWT configuration file"""
box_jwt_path: Optional[str] = None
"""If using BoxAuthType.CCG, provide your app's client ID"""
box_client_id: Optional[str] = None
"""If using BoxAuthType.CCG, provide your app's client secret"""
box_client_secret: Optional[str] = None
"""If using BoxAuthType.CCG, provide your enterprise ID.
Only required if you are not sending `box_user_id`"""
box_enterprise_id: Optional[str] = None
"""If using BoxAuthType.CCG or BoxAuthType.JWT, providing
`box_user_id` will act on behalf of a specific user"""
box_user_id: Optional[str] = None
box_client: Optional[box_sdk_gen.BoxClient] = None
custom_header: Dict = dict({"x-box-ai-library": "langchain"})
class Config:
arbitrary_types_allowed = True
use_enum_values = True
extra = "allow"
@root_validator()
def validate_box_auth_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""Validate auth_type is set"""
if not values.get("auth_type"):
raise ValueError("Auth type must be set.")
"""Validate that TOKEN auth type provides box_developer_token."""
if values.get("auth_type") == "token":
if not get_from_dict_or_env(
values, "box_developer_token", "BOX_DEVELOPER_TOKEN"
):
raise ValueError(
f"{values.get('auth_type')} requires box_developer_token to be set"
)
"""Validate that JWT auth type provides box_jwt_path."""
if values.get("auth_type") == "jwt":
if not get_from_dict_or_env(values, "box_jwt_path", "BOX_JWT_PATH"):
raise ValueError(
f"{values.get('auth_type')} requires box_jwt_path to be set"
)
"""Validate that CCG auth type provides box_client_id and
box_client_secret and either box_enterprise_id or box_user_id."""
if values.get("auth_type") == "ccg":
if (
not get_from_dict_or_env(values, "box_client_id", "BOX_CLIENT_ID")
or not get_from_dict_or_env(
values, "box_client_secret", "BOX_CLIENT_SECRET"
)
or (
not values.get("box_enterprise_id")
and not values.get("box_user_id")
)
):
raise ValueError(
f"{values.get('auth_type')} requires box_client_id, \
box_client_secret, and box_enterprise_id."
)
return values
def authorize(self) -> None:
match self.auth_type:
case "token":
try:
auth = box_sdk_gen.BoxDeveloperTokenAuth(
token=self.box_developer_token
)
self.box_client = box_sdk_gen.BoxClient(
auth=auth
).with_extra_headers(extra_headers=self.custom_header)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
f"Error getting client from developer token: {bse.message}"
)
except Exception as ex:
raise ValueError(
f"Invalid Box developer token. Please verify your \
token and try again.\n{ex}"
) from ex
case "jwt":
try:
jwt_config = box_sdk_gen.JWTConfig.from_config_file(
config_file_path=self.box_jwt_path
)
auth = box_sdk_gen.BoxJWTAuth(config=jwt_config)
self.box_client = box_sdk_gen.BoxClient(
auth=auth
).with_extra_headers(extra_headers=self.custom_header)
if self.box_user_id is not None:
user_auth = auth.with_user_subject(self.box_user_id)
self.box_client = box_sdk_gen.BoxClient(
auth=user_auth
).with_extra_headers(extra_headers=self.custom_header)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
f"Error getting client from jwt token: {bse.message}"
)
except Exception as ex:
raise ValueError(
"Error authenticating. Please verify your JWT config \
and try again."
) from ex
case "ccg":
try:
if self.box_user_id is not None:
ccg_config = box_sdk_gen.CCGConfig(
client_id=self.box_client_id,
client_secret=self.box_client_secret,
user_id=self.box_user_id,
)
else:
ccg_config = box_sdk_gen.CCGConfig(
client_id=self.box_client_id,
client_secret=self.box_client_secret,
enterprise_id=self.box_enterprise_id,
)
auth = box_sdk_gen.BoxCCGAuth(config=ccg_config)
self.box_client = box_sdk_gen.BoxClient(
auth=auth
).with_extra_headers(extra_headers=self.custom_header)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
f"Error getting client from ccg token: {bse.message}"
)
except Exception as ex:
raise ValueError(
"Error authenticating. Please verify you are providing a \
valid client id, secret and either a valid user ID or \
enterprise ID."
) from ex
case _:
raise ValueError(
f"{self.auth_type} is not a valid auth_type. Value must be \
TOKEN, CCG, or JWT."
)
def get_client(self) -> box_sdk_gen.BoxClient:
"""Instantiate the Box SDK."""
if self.box_client is None:
self.authorize()
return self.box_client
class BoxAPIWrapper(BaseModel):
"""Wrapper for Box API."""
"""String containing the Box Developer Token generated in the developer console"""
box_developer_token: Optional[str] = None
"""Configured langchain_box.utilities.BoxAuth object"""
box_auth: Optional[BoxAuth] = None
"""character_limit is an int that caps the number of characters to
return per document."""
character_limit: Optional[int] = -1
box: Optional[box_sdk_gen.BoxClient]
file_count: int = 0
class Config:
arbitrary_types_allowed = True
use_enum_values = True
extra = "allow"
@root_validator(allow_reuse=True)
def validate_box_api_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
values["box"] = None
"""Validate that TOKEN auth type provides box_developer_token."""
if not values.get("box_auth"):
if not get_from_dict_or_env(
values, "box_developer_token", "BOX_DEVELOPER_TOKEN"
):
raise ValueError(
"You must configure either box_developer_token of box_auth"
)
else:
box_auth = values.get("box_auth")
values["box"] = box_auth.get_client() # type: ignore[union-attr]
return values
def get_box_client(self) -> box_sdk_gen.BoxClient:
box_auth = BoxAuth(
auth_type=BoxAuthType.TOKEN, box_developer_token=self.box_developer_token
)
self.box = box_auth.get_client()
def _do_request(self, url: str) -> Any:
try:
access_token = self.box.auth.retrieve_token().access_token # type: ignore[union-attr]
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(f"Error getting client from jwt token: {bse.message}")
resp = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
resp.raise_for_status()
return resp.content
def get_folder_items(self, folder_id: str) -> box_sdk_gen.Items:
"""Get all the items in a folder. Accepts folder_id as str.
returns box_sdk_gen.Items"""
if self.box is None:
self.get_box_client()
try:
folder_contents = self.box.folders.get_folder_items( # type: ignore[union-attr]
folder_id, fields=["id", "type", "name"]
)
except box_sdk_gen.BoxAPIError as bae:
raise RuntimeError(
f"BoxAPIError: Error getting folder content: {bae.message}"
)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
f"BoxSDKError: Error getting folder content: {bse.message}"
)
return folder_contents.entries
def get_text_representation(self, file_id: str = "") -> tuple[str, str, str]:
try:
from box_sdk_gen import BoxAPIError, BoxSDKError
except ImportError:
raise ImportError("You must run `pip install box-sdk-gen`")
if self.box is None:
self.get_box_client()
try:
file = self.box.files.get_file_by_id( # type: ignore[union-attr]
file_id,
x_rep_hints="[extracted_text]",
fields=["name", "representations", "type"],
)
except BoxAPIError as bae:
raise RuntimeError(f"BoxAPIError: Error getting text rep: {bae.message}")
except BoxSDKError as bse:
raise RuntimeError(f"BoxSDKError: Error getting text rep: {bse.message}")
except Exception:
return None, None, None # type: ignore[return-value]
file_repr = file.representations.entries
if len(file_repr) <= 0:
return None, None, None # type: ignore[return-value]
for entry in file_repr:
if entry.representation == "extracted_text":
# If the file representation doesn't exist, calling
# info.url will generate text if possible
if entry.status.state == "none":
self._do_request(entry.info.url)
url = entry.content.url_template.replace("{+asset_path}", "")
file_name = file.name.replace(".", "_").replace(" ", "_")
try:
raw_content = self._do_request(url)
except requests.exceptions.HTTPError:
return None, None, None # type: ignore[return-value]
if self.character_limit > 0: # type: ignore[operator]
content = raw_content[0 : self.character_limit]
else:
content = raw_content
return file_name, content, url
return None, None, None # type: ignore[return-value]
def get_document_by_file_id(self, file_id: str) -> Optional[Document]:
"""Load a file from a Box id. Accepts file_id as str.
Returns `Document`"""
if self.box is None:
self.get_box_client()
file = self.box.files.get_file_by_id( # type: ignore[union-attr]
file_id, fields=["name", "type", "extension"]
)
if file.type == "file":
if hasattr(DocumentFiles, file.extension.upper()):
file_name, content, url = self.get_text_representation(file_id=file_id)
if file_name is None or content is None or url is None:
return None
metadata = {
"source": f"{url}",
"title": f"{file_name}",
}
return Document(page_content=content, metadata=metadata)
return None
return None