mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-17 15:35:14 +00:00
box: add langchain box package and DocumentLoader (#25506)
Thank you for contributing to LangChain! -Description: Adding new package: `langchain-box`: * `langchain_box.document_loaders.BoxLoader` — DocumentLoader functionality * `langchain_box.utilities.BoxAPIWrapper` — Box-specific code * `langchain_box.utilities.BoxAuth` — Helper class for Box authentication * `langchain_box.utilities.BoxAuthType` — enum used by BoxAuth class - Twitter handle: @boxplatform - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Erick Friis <erickfriis@gmail.com> Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
19
libs/partners/box/langchain_box/__init__.py
Normal file
19
libs/partners/box/langchain_box/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Package entry point: re-exports the public Box integration classes and
# exposes the installed distribution's version as ``__version__``.
from importlib import metadata

from langchain_box.document_loaders import BoxLoader
from langchain_box.utilities import BoxAPIWrapper, BoxAuth, BoxAuthType

try:
    # Read the version from the installed package's metadata.
    __version__ = metadata.version(__package__)
except metadata.PackageNotFoundError:
    # Case where package metadata is not available.
    __version__ = ""
del metadata  # optional, avoids polluting the results of dir(__package__)

# Names exported by ``from langchain_box import *``.
__all__ = [
    "BoxLoader",
    "BoxAuth",
    "BoxAuthType",
    "BoxAPIWrapper",
    "__version__",
]
|
@@ -0,0 +1,5 @@
|
||||
"""Box Document Loaders."""
|
||||
|
||||
from langchain_box.document_loaders.box import BoxLoader

# Names exported by ``from langchain_box.document_loaders import *``.
__all__ = ["BoxLoader"]
|
218
libs/partners/box/langchain_box/document_loaders/box.py
Normal file
218
libs/partners/box/langchain_box/document_loaders/box.py
Normal file
@@ -0,0 +1,218 @@
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
|
||||
from box_sdk_gen import FileBaseTypeField # type: ignore
|
||||
from langchain_core.document_loaders.base import BaseLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.pydantic_v1 import BaseModel, ConfigDict, root_validator
|
||||
|
||||
from langchain_box.utilities import BoxAPIWrapper, BoxAuth
|
||||
|
||||
|
||||
class BoxLoader(BaseLoader, BaseModel):
    """Load text representations of files stored in Box as ``Document`` objects.

    This class will help you load files from your Box instance. You must have a
    Box account. If you need one, you can sign up for a free developer account.
    You will also need a Box application created in the developer portal, where
    you can select your authorization type.

    If you wish to use either of the Box AI options, you must be on an Enterprise
    Plus plan or above. The free developer account does not have access to Box AI.

    In addition, using the Box AI API requires a few prerequisite steps:
    * Your administrator must enable the Box AI API
    * You must enable the `Manage AI` scope in your app in the developer console.
    * Your administrator must install and enable your application.

    Setup:
        Install ``langchain-box`` and set environment variable ``BOX_DEVELOPER_TOKEN``.

        .. code-block:: bash

            pip install -U langchain-box
            export BOX_DEVELOPER_TOKEN="your-api-key"

    This loader returns ``Document`` objects built from text representations of files
    in Box. It will skip any document without a text representation available. You can
    provide either a ``List[str]`` containing Box file IDs, or you can provide a
    ``str`` containing a Box folder ID. If providing a folder ID, you can also enable
    recursive mode to get the full tree under that folder.

    :::info
        A Box instance can contain Petabytes of files, and folders can contain millions
        of files. Be intentional when choosing what folders you choose to index. And we
        recommend never getting all files from folder 0 recursively. Folder ID 0 is your
        root folder.
    :::

    Instantiate:

        Initialization variables
        variable | description | type | required
        ---+---+---+---
        box_developer_token | token to use for auth. | string | no
        box_auth | configured ``BoxAuth`` object to use for auth | BoxAuth | no
        box_file_ids | Array of Box file Ids to retrieve | array of strings | no
        box_folder_id | Box folder id to retrieve | string | no
        recursive | whether to return subfolders, default False | bool | no
        character_limit | max characters per document, default -1 (no limit) | int | no

    Get files — this method requires you pass the ``box_file_ids`` parameter. This is a
    ``List[str]`` containing the file IDs you wish to index.

        .. code-block:: python

            from langchain_box.document_loaders import BoxLoader

            box_file_ids = ["1514555423624", "1514553902288"]

            loader = BoxLoader(
                box_file_ids=box_file_ids,
                character_limit=10000  # Optional. Defaults to no limit
            )

    Get files in a folder — this method requires you pass the ``box_folder_id``
    parameter. This is a ``str`` containing the folder ID you wish to index.

        .. code-block:: python

            from langchain_box.document_loaders import BoxLoader

            box_folder_id = "260932470532"

            loader = BoxLoader(
                box_folder_id=box_folder_id,
                recursive=False  # Optional. return entire tree, defaults to False
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            docs[0]

        .. code-block:: python

            Document(metadata={'source': 'https://dl.boxcloud.com/api/2.0/
            internal_files/1514555423624/versions/1663171610024/representations
            /extracted_text/content/', 'title': 'Invoice-A5555_txt'},
            page_content='Vendor: AstroTech Solutions\nInvoice Number: A5555\n\nLine
            Items:\n - Gravitational Wave Detector Kit: $800\n - Exoplanet
            Terrarium: $120\nTotal: $920')

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Document(metadata={'source': 'https://dl.boxcloud.com/api/2.0/
            internal_files/1514555423624/versions/1663171610024/representations
            /extracted_text/content/', 'title': 'Invoice-A5555_txt'},
            page_content='Vendor: AstroTech Solutions\nInvoice Number: A5555\n\nLine
            Items:\n - Gravitational Wave Detector Kit: $800\n - Exoplanet
            Terrarium: $120\nTotal: $920')
    """

    # NOTE(review): ``model_config`` is the pydantic v2 spelling, while this model
    # is built on ``langchain_core.pydantic_v1`` and is configured by the nested
    # ``Config`` class below. Kept unchanged; confirm whether it is still needed.
    model_config = ConfigDict(use_enum_values=True)

    box_developer_token: Optional[str] = None
    """String containing the Box Developer Token generated in the developer console"""

    box_auth: Optional[BoxAuth] = None
    """Configured langchain_box.utilities.BoxAuth object"""

    box_file_ids: Optional[List[str]] = None
    """List[str] containing Box file ids"""

    box_folder_id: Optional[str] = None
    """String containing box folder id to load files from"""

    recursive: Optional[bool] = False
    """If getting files by folder id, recursive is a bool to determine if you wish
    to traverse subfolders to return child documents. Default is False"""

    character_limit: Optional[int] = -1
    """character_limit is an int that caps the number of characters to
    return per document."""

    # API wrapper built by the root validator; not meant to be passed by callers.
    box: Optional[BoxAPIWrapper]

    class Config:
        arbitrary_types_allowed = True
        extra = "allow"

    @root_validator(allow_reuse=True)
    def validate_box_loader_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the loader inputs and build the ``BoxAPIWrapper``.

        Exactly one of ``box_file_ids`` / ``box_folder_id`` must be given, and
        credentials must be available either as ``box_auth`` or as a developer
        token (passed explicitly or via the ``BOX_DEVELOPER_TOKEN`` environment
        variable, as the class docstring advertises).

        Raises:
            ValueError: if the inputs are missing or mutually exclusive.
        """
        import os  # local import: only needed for the environment fallback

        has_file_ids = bool(values.get("box_file_ids"))
        has_folder_id = bool(values.get("box_folder_id"))

        if not has_file_ids and not has_folder_id:
            raise ValueError("You must provide box_file_ids or box_folder_id.")

        if has_file_ids and has_folder_id:
            raise ValueError(
                "You must provide either box_file_ids or box_folder_id, not both."
            )

        box_auth = values.get("box_auth")
        token = values.get("box_developer_token")

        if not box_auth and not token:
            # Fall back to the environment so the documented setup
            # (``export BOX_DEVELOPER_TOKEN=...``) works without extra arguments.
            token = os.environ.get("BOX_DEVELOPER_TOKEN")
            if not token:
                raise ValueError(
                    "you must provide box_developer_token or a box_auth "
                    "generated with langchain_box.utilities.BoxAuth"
                )
            values["box_developer_token"] = token

        values["box"] = BoxAPIWrapper(  # type: ignore[call-arg]
            box_developer_token=token,
            box_auth=box_auth,
            character_limit=values.get("character_limit"),
        )

        return values

    def _get_files_from_folder(self, folder_id: str) -> Iterator[Document]:
        """Yield a ``Document`` for every loadable file in ``folder_id``.

        Sub-folders are descended into only when ``self.recursive`` is truthy.
        Items for which the wrapper returns ``None`` are skipped.
        """
        folder_content = self.box.get_folder_items(folder_id)  # type: ignore[union-attr]

        for item in folder_content:
            try:
                if item.type == FileBaseTypeField.FILE:
                    doc = self.box.get_document_by_file_id(item.id)  # type: ignore[union-attr]

                    if doc is not None:
                        yield doc

                elif item.type == "folder" and self.recursive:
                    yield from self._get_files_from_folder(item.id)
            except TypeError:
                # Deliberate best-effort: an item that triggers a TypeError while
                # being processed is skipped instead of aborting the whole load.
                continue

    def lazy_load(self) -> Iterator[Document]:
        """Load documents. Accepts no arguments. Returns `Iterator[Document]`"""
        if self.box_file_ids:
            for file_id in self.box_file_ids:
                try:
                    file = self.box.get_document_by_file_id(file_id)  # type: ignore[union-attr]

                    if file is not None:
                        yield file
                except TypeError:
                    # Best-effort: skip files that cannot be processed.
                    pass
        elif self.box_folder_id:
            try:
                yield from self._get_files_from_folder(self.box_folder_id)
            except TypeError:
                pass
            except Exception as e:
                # NOTE(review): folder loading reports any other error on stdout
                # and stops, whereas the file-id branch lets it propagate.
                print(f"Exception {e}")  # noqa: T201
        else:
            raise ValueError(
                "You must provide either `box_file_ids` or `box_folder_id`"
            )
|
0
libs/partners/box/langchain_box/py.typed
Normal file
0
libs/partners/box/langchain_box/py.typed
Normal file
5
libs/partners/box/langchain_box/utilities/__init__.py
Normal file
5
libs/partners/box/langchain_box/utilities/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Box API Utilities."""
|
||||
|
||||
from langchain_box.utilities.box import BoxAPIWrapper, BoxAuth, BoxAuthType

# Names exported by ``from langchain_box.utilities import *``.
__all__ = ["BoxAuth", "BoxAuthType", "BoxAPIWrapper"]
|
525
libs/partners/box/langchain_box/utilities/box.py
Normal file
525
libs/partners/box/langchain_box/utilities/box.py
Normal file
@@ -0,0 +1,525 @@
|
||||
"""Util that calls Box APIs."""
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import box_sdk_gen # type: ignore
|
||||
import requests
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.pydantic_v1 import BaseModel, root_validator
|
||||
from langchain_core.utils import get_from_dict_or_env
|
||||
|
||||
|
||||
class DocumentFiles(Enum):
    """File extensions treated as documents.

    Member names are the upper-cased extension and values are the lower-cased
    extension, so a file extension can be matched either by name or by value.
    """

    # Office / word-processing / spreadsheet formats
    DOC = "doc"
    DOCX = "docx"
    GDOC = "gdoc"
    GSHEET = "gsheet"
    NUMBERS = "numbers"
    ODS = "ods"
    ODT = "odt"
    PAGES = "pages"
    PDF = "pdf"
    RTF = "rtf"
    WPD = "wpd"
    XLS = "xls"
    XLSM = "xlsm"
    XLSX = "xlsx"
    # Source code, markup and plain-text formats
    AS = "as"
    AS3 = "as3"
    ASM = "asm"
    BAT = "bat"
    C = "c"
    CC = "cc"
    CMAKE = "cmake"
    CPP = "cpp"
    CS = "cs"
    CSS = "css"
    CSV = "csv"
    CXX = "cxx"
    DIFF = "diff"
    ERB = "erb"
    GROOVY = "groovy"
    H = "h"
    HAML = "haml"
    HH = "hh"
    HTM = "htm"
    HTML = "html"
    JAVA = "java"
    JS = "js"
    JSON = "json"
    LESS = "less"
    LOG = "log"
    M = "m"
    MAKE = "make"
    MD = "md"
    ML = "ml"
    MM = "mm"
    MSG = "msg"
    PHP = "php"
    PL = "pl"
    PROPERTIES = "properties"
    PY = "py"
    RB = "rb"
    RST = "rst"
    SASS = "sass"
    SCALA = "scala"
    SCM = "scm"
    SCRIPT = "script"
    SH = "sh"
    SML = "sml"
    SQL = "sql"
    TXT = "txt"
    VI = "vi"
    VIM = "vim"
    WEBDOC = "webdoc"
    XHTML = "xhtml"
    XLSB = "xlsb"
    XML = "xml"
    XSD = "xsd"
    XSL = "xsl"
    YAML = "yaml"
    # Presentation formats
    GSLIDE = "gslide"
    # Misspelled original name, kept as an alias of GSLIDE so existing
    # references keep working. Without the correctly spelled member a lookup
    # by upper-cased extension ("GSLIDE") never matched.
    GSLLIDE = "gslide"
    GSLIDES = "gslides"
    KEY = "key"
    ODP = "odp"
    PPT = "ppt"
    PPTX = "pptx"
    BOXNOTE = "boxnote"
|
||||
|
||||
|
||||
class ImageFiles(Enum):
    """File extensions treated as images.

    Each member is named after the upper-cased extension and carries the
    lower-cased extension as its value.
    """

    # Camera raw and bitmap formats
    ARW = "arw"
    BMP = "bmp"
    CR2 = "cr2"
    # Medical imaging
    DCM = "dcm"
    DICM = "dicm"
    DICOM = "dicom"
    DNG = "dng"
    EPS = "eps"
    EXR = "exr"
    GIF = "gif"
    HEIC = "heic"
    # InDesign family
    INDD = "indd"
    INDML = "indml"
    INDT = "indt"
    INX = "inx"
    JPEG = "jpeg"
    JPG = "jpg"
    NEF = "nef"
    PNG = "png"
    SVG = "svg"
    TIF = "tif"
    TIFF = "tiff"
    TGA = "tga"
    SVS = "svs"
|
||||
|
||||
|
||||
"""
|
||||
BoxAuthType
|
||||
an enum to tell BoxLoader how you wish to authenticate your Box connection.
|
||||
|
||||
Options are:
|
||||
TOKEN - Use a developer token generated from the Box Developer Console.
|
||||
Only recommended for development.
|
||||
Provide `box_developer_token`.
|
||||
CCG - Client Credentials Grant.
|
||||
provide `box_client_id`, `box_client_secret`,
|
||||
and `box_enterprise_id` or optionally `box_user_id`.
|
||||
JWT - Use JWT for authentication. Config should be stored on the file
|
||||
system accessible to your app.
|
||||
provide `box_jwt_path`. Optionally, provide `box_user_id` to
|
||||
act as a specific user
|
||||
"""
|
||||
|
||||
|
||||
class BoxAuthType(Enum):
    """How a Box connection should be authenticated.

    Options are:

    * ``TOKEN`` - a developer token (or a token retrieved from box-sdk-gen);
      provide `box_developer_token`.
    * ``CCG`` - Client Credentials Grant; provide `box_client_id`,
      `box_client_secret`, and `box_enterprise_id` or `box_user_id`.
    * ``JWT`` - JWT bearer token auth; provide `box_jwt_path` and optionally
      `box_user_id`.
    """

    TOKEN = "token"
    """Use a developer token or a token retrieved from box-sdk-gen"""

    CCG = "ccg"
    """Use `client_credentials` type grant"""

    JWT = "jwt"
    """Use JWT bearer token auth"""
|
||||
|
||||
|
||||
"""
|
||||
`BoxAuth` supports the following authentication methods:
|
||||
|
||||
* Token — either a developer token or any token generated through the Box SDK
|
||||
* JWT with a service account
|
||||
* JWT with a specified user
|
||||
* CCG with a service account
|
||||
* CCG with a specified user
|
||||
|
||||
:::note
|
||||
If using JWT authentication, you will need to download the configuration from the
|
||||
Box developer console after generating your public/private key pair. Place this
|
||||
file in your application directory structure somewhere. You will use the path to
|
||||
this file when using the `BoxAuth` helper class.
|
||||
:::
|
||||
|
||||
For more information, learn about how to
|
||||
[set up a Box application](https://developer.box.com/guides/getting-started/first-application/),
|
||||
and check out the
|
||||
[Box authentication guide](https://developer.box.com/guides/authentication/select/)
|
||||
for more about our different authentication options.
|
||||
|
||||
Simple implementation
|
||||
|
||||
To instantiate, you must provide a ``langchain_box.utilities.BoxAuthType``.
|
||||
|
||||
BoxAuthType is an enum to tell BoxLoader how you wish to authenticate your
|
||||
Box connection.
|
||||
|
||||
Options are:
|
||||
TOKEN - Use a developer token generated from the Box Developer Console.
|
||||
Only recommended for development.
|
||||
Provide `box_developer_token`.
|
||||
CCG - Client Credentials Grant.
|
||||
provide `box_client_id`, `box_client_secret`,
|
||||
and `box_enterprise_id` or optionally `box_user_id`.
|
||||
JWT - Use JWT for authentication. Config should be stored on the file
|
||||
system accessible to your app.
|
||||
provide `box_jwt_path`. Optionally, provide `box_user_id` to
|
||||
act as a specific user
|
||||
|
||||
.. code-block:: python
|
||||
from langchain_box.document_loaders import BoxLoader
|
||||
from langchain_box.utilities import BoxAuth, BoxAuthType
|
||||
|
||||
auth = BoxAuth(
|
||||
auth_type=BoxAuthType.TOKEN,
|
||||
box_developer_token=box_developer_token
|
||||
)
|
||||
|
||||
loader = BoxLoader(
|
||||
box_auth=auth,
|
||||
...
|
||||
)
|
||||
|
||||
To see examples for each supported authentication methodology, visit the
|
||||
[Box providers](/docs/integrations/providers/box) page. If you want to
|
||||
use OAuth 2.0 `authorization_code` flow, use
|
||||
[box-sdk-gen](https://github.com/box/box-python-sdk-gen) SDK, get your
|
||||
token, and use `BoxAuthType.TOKEN` type.
|
||||
"""
|
||||
|
||||
|
||||
class BoxAuth(BaseModel):
    """Helper that builds an authenticated ``box_sdk_gen.BoxClient``.

    The authentication flow is selected with ``auth_type``:

    * ``BoxAuthType.TOKEN`` - requires ``box_developer_token``.
    * ``BoxAuthType.JWT`` - requires ``box_jwt_path``; ``box_user_id`` is optional.
    * ``BoxAuthType.CCG`` - requires ``box_client_id``, ``box_client_secret`` and
      either ``box_enterprise_id`` or ``box_user_id``.

    ``box_developer_token``, ``box_jwt_path``, ``box_client_id`` and
    ``box_client_secret`` may instead be supplied through the environment
    variables ``BOX_DEVELOPER_TOKEN``, ``BOX_JWT_PATH``, ``BOX_CLIENT_ID`` and
    ``BOX_CLIENT_SECRET``.

    .. code-block:: python

        from langchain_box.utilities import BoxAuth, BoxAuthType

        auth = BoxAuth(
            auth_type=BoxAuthType.TOKEN,
            box_developer_token=box_developer_token
        )
        client = auth.get_client()
    """

    auth_type: BoxAuthType
    """Authentication type to use. Must pass BoxAuthType enum"""

    box_developer_token: Optional[str] = None
    """If using BoxAuthType.TOKEN, provide your token here"""

    box_jwt_path: Optional[str] = None
    """If using BoxAuthType.JWT, provide local path to your
    JWT configuration file"""

    box_client_id: Optional[str] = None
    """If using BoxAuthType.CCG, provide your app's client ID"""

    box_client_secret: Optional[str] = None
    """If using BoxAuthType.CCG, provide your app's client secret"""

    box_enterprise_id: Optional[str] = None
    """If using BoxAuthType.CCG, provide your enterprise ID.
    Only required if you are not sending `box_user_id`"""

    box_user_id: Optional[str] = None
    """If using BoxAuthType.CCG or BoxAuthType.JWT, providing
    `box_user_id` will act on behalf of a specific user"""

    # Cached client; populated by ``authorize`` on the first ``get_client`` call.
    box_client: Optional[box_sdk_gen.BoxClient] = None
    # Extra headers attached to every client created by this helper.
    custom_header: Dict = dict({"x-box-ai-library": "langchain"})

    class Config:
        arbitrary_types_allowed = True
        use_enum_values = True
        extra = "allow"

    @root_validator()
    def validate_box_auth_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Check that the credentials required by ``auth_type`` are available.

        Values resolved from the environment are written back into ``values`` so
        that ``authorize`` sees them; previously they were validated and dropped.

        Raises:
            ValueError: if ``auth_type`` or a required credential is missing.
        """
        raw_auth_type = values.get("auth_type")
        if not raw_auth_type:
            raise ValueError("Auth type must be set.")

        # ``use_enum_values`` normally yields the plain string; tolerate an
        # enum member as well.
        auth_type = getattr(raw_auth_type, "value", raw_auth_type)

        if auth_type == "token":
            token = get_from_dict_or_env(
                values, "box_developer_token", "BOX_DEVELOPER_TOKEN"
            )
            if not token:
                raise ValueError(f"{auth_type} requires box_developer_token to be set")
            values["box_developer_token"] = token

        if auth_type == "jwt":
            jwt_path = get_from_dict_or_env(values, "box_jwt_path", "BOX_JWT_PATH")
            if not jwt_path:
                raise ValueError(f"{auth_type} requires box_jwt_path to be set")
            values["box_jwt_path"] = jwt_path

        if auth_type == "ccg":
            client_id = get_from_dict_or_env(values, "box_client_id", "BOX_CLIENT_ID")
            client_secret = get_from_dict_or_env(
                values, "box_client_secret", "BOX_CLIENT_SECRET"
            )
            has_subject = bool(
                values.get("box_enterprise_id") or values.get("box_user_id")
            )
            if not client_id or not client_secret or not has_subject:
                raise ValueError(
                    f"{auth_type} requires box_client_id, box_client_secret, "
                    "and either box_enterprise_id or box_user_id."
                )
            values["box_client_id"] = client_id
            values["box_client_secret"] = client_secret

        return values

    def _new_client(self, auth: Any) -> box_sdk_gen.BoxClient:
        """Build a ``BoxClient`` for ``auth`` carrying ``custom_header``."""
        return box_sdk_gen.BoxClient(auth=auth).with_extra_headers(
            extra_headers=self.custom_header
        )

    def authorize(self) -> None:
        """Create ``self.box_client`` using the configured ``auth_type``.

        Raises:
            RuntimeError: if the Box SDK reports an error while building the client.
            ValueError: for any other failure, or if ``auth_type`` is not one of
                token, ccg or jwt.
        """
        # An if/elif chain instead of ``match``: the branches are plain constant
        # comparisons, and this avoids requiring the Python 3.10 ``match`` syntax.
        auth_type = getattr(self.auth_type, "value", self.auth_type)

        if auth_type == "token":
            try:
                token_auth = box_sdk_gen.BoxDeveloperTokenAuth(
                    token=self.box_developer_token
                )
                self.box_client = self._new_client(token_auth)
            except box_sdk_gen.BoxSDKError as bse:
                raise RuntimeError(
                    f"Error getting client from developer token: {bse.message}"
                ) from bse
            except Exception as ex:
                raise ValueError(
                    "Invalid Box developer token. Please verify your "
                    f"token and try again.\n{ex}"
                ) from ex

        elif auth_type == "jwt":
            try:
                jwt_config = box_sdk_gen.JWTConfig.from_config_file(
                    config_file_path=self.box_jwt_path
                )
                jwt_auth = box_sdk_gen.BoxJWTAuth(config=jwt_config)

                # Act on behalf of a specific user when one is configured.
                if self.box_user_id is not None:
                    jwt_auth = jwt_auth.with_user_subject(self.box_user_id)

                self.box_client = self._new_client(jwt_auth)
            except box_sdk_gen.BoxSDKError as bse:
                raise RuntimeError(
                    f"Error getting client from jwt token: {bse.message}"
                ) from bse
            except Exception as ex:
                raise ValueError(
                    "Error authenticating. Please verify your JWT config "
                    "and try again."
                ) from ex

        elif auth_type == "ccg":
            try:
                # A user id takes precedence over the enterprise id.
                if self.box_user_id is not None:
                    ccg_config = box_sdk_gen.CCGConfig(
                        client_id=self.box_client_id,
                        client_secret=self.box_client_secret,
                        user_id=self.box_user_id,
                    )
                else:
                    ccg_config = box_sdk_gen.CCGConfig(
                        client_id=self.box_client_id,
                        client_secret=self.box_client_secret,
                        enterprise_id=self.box_enterprise_id,
                    )
                ccg_auth = box_sdk_gen.BoxCCGAuth(config=ccg_config)

                self.box_client = self._new_client(ccg_auth)
            except box_sdk_gen.BoxSDKError as bse:
                raise RuntimeError(
                    f"Error getting client from ccg token: {bse.message}"
                ) from bse
            except Exception as ex:
                raise ValueError(
                    "Error authenticating. Please verify you are providing a "
                    "valid client id, secret and either a valid user ID or "
                    "enterprise ID."
                ) from ex

        else:
            raise ValueError(
                f"{self.auth_type} is not a valid auth_type. Value must be "
                "TOKEN, CCG, or JWT."
            )

    def get_client(self) -> box_sdk_gen.BoxClient:
        """Instantiate the Box SDK."""
        if self.box_client is None:
            self.authorize()

        return self.box_client
|
||||
|
||||
|
||||
class BoxAPIWrapper(BaseModel):
    """Wrapper for Box API."""

    box_developer_token: Optional[str] = None
    """String containing the Box Developer Token generated in the developer console"""

    box_auth: Optional[BoxAuth] = None
    """Configured langchain_box.utilities.BoxAuth object"""

    character_limit: Optional[int] = -1
    """character_limit is an int that caps the number of characters to
    return per document."""

    # Box client; set by the validator when ``box_auth`` is given, otherwise
    # created lazily by ``get_box_client``.
    box: Optional[box_sdk_gen.BoxClient]
    file_count: int = 0

    class Config:
        arbitrary_types_allowed = True
        use_enum_values = True
        extra = "allow"

    @root_validator(allow_reuse=True)
    def validate_box_api_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Require either ``box_auth`` or a developer token.

        A token found in ``BOX_DEVELOPER_TOKEN`` is written back into ``values``
        so that ``get_box_client`` can use it; previously it was checked and
        then discarded.

        Raises:
            ValueError: if neither credential source is available.
        """
        values["box"] = None

        box_auth = values.get("box_auth")
        if not box_auth:
            token = get_from_dict_or_env(
                values, "box_developer_token", "BOX_DEVELOPER_TOKEN"
            )
            if not token:
                raise ValueError(
                    "You must configure either box_developer_token or box_auth"
                )
            values["box_developer_token"] = token
        else:
            values["box"] = box_auth.get_client()

        return values

    def get_box_client(self) -> box_sdk_gen.BoxClient:
        """Create a token-authenticated client, cache it on ``self.box``, return it."""
        box_auth = BoxAuth(
            auth_type=BoxAuthType.TOKEN, box_developer_token=self.box_developer_token
        )

        self.box = box_auth.get_client()
        # The signature always promised a client; previously nothing was returned.
        return self.box

    def _do_request(self, url: str) -> Any:
        """GET ``url`` with the client's bearer token and return the response body.

        Raises:
            RuntimeError: if the SDK cannot provide an access token.
            requests.exceptions.HTTPError: on a non-success HTTP status.
        """
        try:
            access_token = self.box.auth.retrieve_token().access_token  # type: ignore[union-attr]
        except box_sdk_gen.BoxSDKError as bse:
            raise RuntimeError(f"Error retrieving access token: {bse.message}") from bse

        # NOTE(review): no timeout is set, so a stalled connection blocks
        # indefinitely; consider passing ``timeout=`` once a suitable value is agreed.
        resp = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
        resp.raise_for_status()
        return resp.content

    def get_folder_items(self, folder_id: str) -> box_sdk_gen.Items:
        """Get all the items in a folder. Accepts folder_id as str.
        returns the ``entries`` of the ``box_sdk_gen.Items`` response"""
        if self.box is None:
            self.get_box_client()

        # NOTE(review): a single request is made; presumably large folders are
        # paginated by the API, in which case only the first page is returned.
        try:
            folder_contents = self.box.folders.get_folder_items(  # type: ignore[union-attr]
                folder_id, fields=["id", "type", "name"]
            )
        except box_sdk_gen.BoxAPIError as bae:
            raise RuntimeError(
                f"BoxAPIError: Error getting folder content: {bae.message}"
            ) from bae
        except box_sdk_gen.BoxSDKError as bse:
            raise RuntimeError(
                f"BoxSDKError: Error getting folder content: {bse.message}"
            ) from bse

        return folder_contents.entries

    def get_text_representation(self, file_id: str = "") -> tuple[str, str, str]:
        """Fetch the extracted-text representation of a file.

        Returns:
            ``(file_name, content, url)``, or ``(None, None, None)`` when the file
            has no usable ``extracted_text`` representation or its download fails
            with an HTTP error. ``content`` is text, truncated to
            ``character_limit`` characters when that limit is positive.

        Raises:
            RuntimeError: if the Box SDK reports an error while reading the file.
        """
        if self.box is None:
            self.get_box_client()

        try:
            file = self.box.files.get_file_by_id(  # type: ignore[union-attr]
                file_id,
                x_rep_hints="[extracted_text]",
                fields=["name", "representations", "type"],
            )
        except box_sdk_gen.BoxAPIError as bae:
            raise RuntimeError(
                f"BoxAPIError: Error getting text rep: {bae.message}"
            ) from bae
        except box_sdk_gen.BoxSDKError as bse:
            raise RuntimeError(
                f"BoxSDKError: Error getting text rep: {bse.message}"
            ) from bse
        except Exception:
            return None, None, None  # type: ignore[return-value]

        representations = file.representations
        entries = representations.entries if representations is not None else None

        if not entries:
            return None, None, None  # type: ignore[return-value]

        for entry in entries:
            if entry.representation != "extracted_text":
                continue

            # If the file representation doesn't exist, calling
            # info.url will generate text if possible
            if entry.status.state == "none":
                self._do_request(entry.info.url)

            url = entry.content.url_template.replace("{+asset_path}", "")
            file_name = file.name.replace(".", "_").replace(" ", "_")

            try:
                raw_content = self._do_request(url)
            except requests.exceptions.HTTPError:
                return None, None, None  # type: ignore[return-value]

            # Decode before truncating: slicing the raw bytes could cut a
            # multi-byte character in half, and the limit is in characters.
            if isinstance(raw_content, (bytes, bytearray)):
                content = raw_content.decode("utf-8", errors="replace")
            else:
                content = raw_content

            limit = self.character_limit
            if limit is not None and limit > 0:
                content = content[:limit]

            return file_name, content, url

        return None, None, None  # type: ignore[return-value]

    def get_document_by_file_id(self, file_id: str) -> Optional[Document]:
        """Load a file from a Box id. Accepts file_id as str.
        Returns `Document`, or ``None`` when the item is not a file, its
        extension is not a ``DocumentFiles`` value, or no text is available"""

        if self.box is None:
            self.get_box_client()

        file = self.box.files.get_file_by_id(  # type: ignore[union-attr]
            file_id, fields=["name", "type", "extension"]
        )

        if file.type != "file":
            return None

        # Match on the enum *value* so every listed extension is recognised
        # regardless of how its member is named; a missing extension is skipped.
        try:
            DocumentFiles((file.extension or "").lower())
        except ValueError:
            return None

        file_name, content, url = self.get_text_representation(file_id=file_id)

        if file_name is None or content is None or url is None:
            return None

        metadata = {
            "source": f"{url}",
            "title": f"{file_name}",
        }

        return Document(page_content=content, metadata=metadata)
|
Reference in New Issue
Block a user