langchain/libs/partners/box/langchain_box/document_loaders/box.py
Scott Hurrey 55fd2e2158
box: add langchain box package and DocumentLoader (#25506)
Thank you for contributing to LangChain!

- Description: Adding new package: `langchain-box`:

* `langchain_box.document_loaders.BoxLoader` — DocumentLoader
functionality
* `langchain_box.utilities.BoxAPIWrapper` — Box-specific code
* `langchain_box.utilities.BoxAuth` — Helper class for Box
authentication
* `langchain_box.utilities.BoxAuthType` — enum used by BoxAuth class

- Twitter handle: @boxplatform


- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Erick Friis <erickfriis@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
2024-08-21 02:23:43 +00:00

219 lines
8.4 KiB
Python

from typing import Any, Dict, Iterator, List, Optional
from box_sdk_gen import FileBaseTypeField # type: ignore
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, ConfigDict, root_validator
from langchain_box.utilities import BoxAPIWrapper, BoxAuth
class BoxLoader(BaseLoader, BaseModel):
    """
    BoxLoader

    This class will help you load files from your Box instance. You must have a
    Box account. If you need one, you can sign up for a free developer account.
    You will also need a Box application created in the developer portal, where
    you can select your authorization type.

    If you wish to use either of the Box AI options, you must be on an Enterprise
    Plus plan or above. The free developer account does not have access to Box AI.

    In addition, using the Box AI API requires a few prerequisite steps:

    * Your administrator must enable the Box AI API
    * You must enable the `Manage AI` scope in your app in the developer console.
    * Your administrator must install and enable your application.

    Setup:
        Install ``langchain-box`` and set environment variable ``BOX_DEVELOPER_TOKEN``.

        .. code-block:: bash

            pip install -U langchain-box
            export BOX_DEVELOPER_TOKEN="your-api-key"

        The loader's validator requires credentials on the instance itself: pass
        ``box_developer_token`` or a configured ``box_auth`` when instantiating,
        otherwise a ``ValueError`` is raised.

    This loader returns ``Document`` objects built from text representations of files
    in Box. It will skip any document without a text representation available. You can
    provide either a ``List[str]`` containing Box file IDs, or you can provide a
    ``str`` containing a Box folder ID. If providing a folder ID, you can also enable
    recursive mode to get the full tree under that folder.

    :::info
        A Box instance can contain Petabytes of files, and folders can contain millions
        of files. Be intentional when choosing what folders you choose to index. And we
        recommend never getting all files from folder 0 recursively. Folder ID 0 is your
        root folder.
    :::

    Instantiate:

        Initialization variables
            variable | description | type | required
            ---+---+---+---
            box_developer_token | token to use for auth. | string | no
            box_auth | configured ``BoxAuth`` object | BoxAuth | no
            box_file_ids | Array of Box file Ids to retrieve | array of strings | no
            box_folder_id | Box folder id to retrieve | string | no
            recursive | whether to return subfolders, default False | bool | no
            character_limit | max characters per document, default -1 (no limit) | int | no

        Exactly one of ``box_file_ids`` / ``box_folder_id`` and at least one of
        ``box_developer_token`` / ``box_auth`` must be provided.

    Get files — this method requires you pass the ``box_file_ids`` parameter. This is a
    ``List[str]`` containing the file IDs you wish to index.

        .. code-block:: python

            import os

            from langchain_box.document_loaders import BoxLoader

            box_file_ids = ["1514555423624", "1514553902288"]

            loader = BoxLoader(
                box_developer_token=os.environ["BOX_DEVELOPER_TOKEN"],
                box_file_ids=box_file_ids,
                character_limit=10000  # Optional. Defaults to no limit
            )

    Get files in a folder — this method requires you pass the ``box_folder_id``
    parameter. This is a ``str`` containing the folder ID you wish to index.

        .. code-block:: python

            import os

            from langchain_box.document_loaders import BoxLoader

            box_folder_id = "260932470532"

            loader = BoxLoader(
                box_developer_token=os.environ["BOX_DEVELOPER_TOKEN"],
                box_folder_id=box_folder_id,
                recursive=False  # Optional. return entire tree, defaults to False
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            docs[0]

        .. code-block:: python

            Document(metadata={'source': 'https://dl.boxcloud.com/api/2.0/
            internal_files/1514555423624/versions/1663171610024/representations
            /extracted_text/content/', 'title': 'Invoice-A5555_txt'},
            page_content='Vendor: AstroTech Solutions\nInvoice Number: A5555\n\nLine
            Items:\n    - Gravitational Wave Detector Kit: $800\n    - Exoplanet
            Terrarium: $120\nTotal: $920')

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Document(metadata={'source': 'https://dl.boxcloud.com/api/2.0/
            internal_files/1514555423624/versions/1663171610024/representations
            /extracted_text/content/', 'title': 'Invoice-A5555_txt'},
            page_content='Vendor: AstroTech Solutions\nInvoice Number: A5555\n\nLine
            Items:\n    - Gravitational Wave Detector Kit: $800\n    - Exoplanet
            Terrarium: $120\nTotal: $920')

    """

    # NOTE(review): ``model_config`` is the pydantic v2 configuration name, while
    # this class derives from the pydantic v1 ``BaseModel`` and also declares an
    # inner ``Config``. Kept unchanged; confirm whether it is still needed.
    model_config = ConfigDict(use_enum_values=True)

    box_developer_token: Optional[str] = None
    """String containing the Box Developer Token generated in the developer console"""

    box_auth: Optional[BoxAuth] = None
    """Configured langchain_box.utilities.BoxAuth object"""

    box_file_ids: Optional[List[str]] = None
    """List[str] containing Box file ids"""

    box_folder_id: Optional[str] = None
    """String containing box folder id to load files from"""

    recursive: Optional[bool] = False
    """If getting files by folder id, recursive is a bool to determine if you wish
    to traverse subfolders to return child documents. Default is False"""

    character_limit: Optional[int] = -1
    """character_limit is an int that caps the number of characters to
    return per document."""

    box: Optional[BoxAPIWrapper]
    """``BoxAPIWrapper`` built by ``validate_box_loader_inputs`` from the
    credentials and ``character_limit`` given to the loader."""

    class Config:
        arbitrary_types_allowed = True
        extra = "allow"

    @root_validator(allow_reuse=True)
    def validate_box_loader_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Check the loader's inputs and build the ``BoxAPIWrapper``.

        Args:
            values: Field values collected for the model.

        Returns:
            The same ``values`` mapping with ``values["box"]`` set to a
            ``BoxAPIWrapper`` created from ``box_developer_token``,
            ``box_auth`` and ``character_limit``.

        Raises:
            ValueError: If neither or both of ``box_file_ids`` and
                ``box_folder_id`` are provided, or if neither
                ``box_developer_token`` nor ``box_auth`` is provided.
        """
        has_file_ids = bool(values.get("box_file_ids"))
        has_folder_id = bool(values.get("box_folder_id"))

        # Exactly one document source must be selected.
        if not has_file_ids and not has_folder_id:
            raise ValueError("You must provide box_file_ids or box_folder_id.")

        if has_file_ids and has_folder_id:
            raise ValueError(
                "You must provide either box_file_ids or box_folder_id, not both."
            )

        # At least one way of authenticating is required.
        if not values.get("box_auth") and not values.get("box_developer_token"):
            raise ValueError(
                "you must provide box_developer_token or a box_auth "
                "generated with langchain_box.utilities.BoxAuth"
            )

        values["box"] = BoxAPIWrapper(  # type: ignore[call-arg]
            box_developer_token=values.get("box_developer_token"),
            box_auth=values.get("box_auth"),
            character_limit=values.get("character_limit"),
        )

        return values

    def _get_files_from_folder(self, folder_id: str) -> Iterator[Document]:
        """Yield the documents found in a Box folder.

        Items of type file are fetched with ``get_document_by_file_id`` and
        yielded when a document is returned. Items of type folder are
        descended into only when ``self.recursive`` is truthy.

        Args:
            folder_id: ID of the Box folder whose items are listed.

        Yields:
            One ``Document`` per file for which the wrapper returned a
            document.
        """
        items = self.box.get_folder_items(folder_id)  # type: ignore[union-attr]

        for item in items:
            try:
                if item.type == FileBaseTypeField.FILE:
                    doc = self.box.get_document_by_file_id(  # type: ignore[union-attr]
                        item.id
                    )

                    if doc is not None:
                        yield doc
                elif item.type == "folder" and self.recursive:
                    yield from self._get_files_from_folder(item.id)
            except TypeError:
                # Best effort: a TypeError raised while handling one item
                # (including anywhere inside a recursed sub-folder) skips
                # that item and the listing continues with the next one.
                continue

    def lazy_load(self) -> Iterator[Document]:
        """Load documents. Accepts no arguments. Returns `Iterator[Document]`

        With ``box_file_ids`` set, each ID is fetched in order. Otherwise,
        with ``box_folder_id`` set, the folder's contents are yielded via
        ``_get_files_from_folder``. Files for which the wrapper returns
        ``None`` are skipped.

        Raises:
            ValueError: If neither ``box_file_ids`` nor ``box_folder_id`` is
                set when iteration starts.
        """
        if self.box_file_ids:
            for file_id in self.box_file_ids:
                try:
                    doc = self.box.get_document_by_file_id(  # type: ignore[union-attr]
                        file_id
                    )

                    if doc is not None:
                        yield doc
                except TypeError:
                    # Best effort: skip this file ID and keep going.
                    continue
        elif self.box_folder_id:
            try:
                yield from self._get_files_from_folder(self.box_folder_id)
            except TypeError:
                pass
            except Exception as e:
                # Any other failure ends the folder listing; it is reported
                # on stdout instead of being raised to the caller.
                print(f"Exception {e}")  # noqa: T201
        else:
            raise ValueError(
                "You must provide either `box_file_ids` or `box_folder_id`"
            )