mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-27 03:31:51 +00:00
Thank you for contributing to LangChain! -Description: Adding new package: `langchain-box`: * `langchain_box.document_loaders.BoxLoader` — DocumentLoader functionality * `langchain_box.utilities.BoxAPIWrapper` — Box-specific code * `langchain_box.utilities.BoxAuth` — Helper class for Box authentication * `langchain_box.utilities.BoxAuthType` — enum used by BoxAuth class - Twitter handle: @boxplatform - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Erick Friis <erickfriis@gmail.com> Co-authored-by: Erick Friis <erick@langchain.dev>
219 lines
8.4 KiB
Python
219 lines
8.4 KiB
Python
import logging
from typing import Any, Dict, Iterator, List, Optional

from box_sdk_gen import FileBaseTypeField  # type: ignore
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, ConfigDict, root_validator

from langchain_box.utilities import BoxAPIWrapper, BoxAuth
|
|
|
|
|
|
class BoxLoader(BaseLoader, BaseModel):
    """Load files from Box as ``Document`` objects.

    This class will help you load files from your Box instance. You must have a
    Box account. If you need one, you can sign up for a free developer account.
    You will also need a Box application created in the developer portal, where
    you can select your authorization type.

    If you wish to use either of the Box AI options, you must be on an Enterprise
    Plus plan or above. The free developer account does not have access to Box AI.

    In addition, using the Box AI API requires a few prerequisite steps:

    * Your administrator must enable the Box AI API
    * You must enable the `Manage AI` scope in your app in the developer console.
    * Your administrator must install and enable your application.

    Setup:
        Install ``langchain-box`` and set environment variable ``BOX_DEVELOPER_TOKEN``.

        .. code-block:: bash

            pip install -U langchain-box
            export BOX_DEVELOPER_TOKEN="your-api-key"

    This loader returns ``Document`` objects built from text representations of files
    in Box. It will skip any document without a text representation available. You can
    provide either a ``List[str]`` containing Box file IDs, or you can provide a
    ``str`` containing a Box folder ID. If providing a folder ID, you can also enable
    recursive mode to get the full tree under that folder.

    .. note::

        A Box instance can contain Petabytes of files, and folders can contain millions
        of files. Be intentional when choosing what folders you choose to index. And we
        recommend never getting all files from folder 0 recursively. Folder ID 0 is your
        root folder.

    Instantiate:

        Initialization variables
        variable | description | type | required
        ---+---+---+---
        box_developer_token | token to use for auth. | string | no
        box_auth | configured ``langchain_box.utilities.BoxAuth`` object | BoxAuth | no
        box_file_ids | Array of Box file Ids to retrieve | array of strings | no
        box_folder_id | Box folder id to retrieve | string | no
        recursive | whether to return subfolders, default False | bool | no
        character_limit | max characters to return per document | int | no

    Get files — this method requires you pass the ``box_file_ids`` parameter. This is a
    ``List[str]`` containing the file IDs you wish to index.

        .. code-block:: python

            from langchain_box.document_loaders import BoxLoader

            box_file_ids = ["1514555423624", "1514553902288"]

            loader = BoxLoader(
                box_file_ids=box_file_ids,
                character_limit=10000  # Optional. Defaults to no limit
            )

    Get files in a folder — this method requires you pass the ``box_folder_id``
    parameter. This is a ``str`` containing the folder ID you wish to index.

        .. code-block:: python

            from langchain_box.document_loaders import BoxLoader

            box_folder_id = "260932470532"

            loader = BoxLoader(
                box_folder_id=box_folder_id,
                recursive=False  # Optional. return entire tree, defaults to False
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            docs[0]

        .. code-block:: python

            Document(metadata={'source': 'https://dl.boxcloud.com/api/2.0/
            internal_files/1514555423624/versions/1663171610024/representations
            /extracted_text/content/', 'title': 'Invoice-A5555_txt'},
            page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine
            Items:\\n    - Gravitational Wave Detector Kit: $800\\n    - Exoplanet
            Terrarium: $120\\nTotal: $920')

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Document(metadata={'source': 'https://dl.boxcloud.com/api/2.0/
            internal_files/1514555423624/versions/1663171610024/representations
            /extracted_text/content/', 'title': 'Invoice-A5555_txt'},
            page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine
            Items:\\n    - Gravitational Wave Detector Kit: $800\\n    - Exoplanet
            Terrarium: $120\\nTotal: $920')
    """

    # NOTE(review): both `model_config` and the nested `Config` class below are
    # declared; confirm which one the pydantic version in use actually honors.
    model_config = ConfigDict(use_enum_values=True)

    box_developer_token: Optional[str] = None
    """String containing the Box Developer Token generated in the developer console"""

    box_auth: Optional[BoxAuth] = None
    """Configured langchain_box.utilities.BoxAuth object"""

    box_file_ids: Optional[List[str]] = None
    """List[str] containing Box file ids"""

    box_folder_id: Optional[str] = None
    """String containing box folder id to load files from"""

    recursive: Optional[bool] = False
    """If getting files by folder id, recursive is a bool to determine if you wish
    to traverse subfolders to return child documents. Default is False"""

    character_limit: Optional[int] = -1
    """character_limit is an int that caps the number of characters to
    return per document. It is forwarded unchanged to ``BoxAPIWrapper``."""

    # Set by `validate_box_loader_inputs`, which stores a wrapper built from the
    # authentication settings and `character_limit` under this name.
    box: Optional[BoxAPIWrapper]

    class Config:
        arbitrary_types_allowed = True
        extra = "allow"

    @root_validator(allow_reuse=True)
    def validate_box_loader_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Check the loader inputs and build the ``BoxAPIWrapper`` stored in ``box``.

        Args:
            values: Field values collected for the model.

        Returns:
            ``values`` with ``values["box"]`` set to a new ``BoxAPIWrapper``.

        Raises:
            ValueError: If neither or both of ``box_file_ids`` and
                ``box_folder_id`` are provided, or if neither
                ``box_developer_token`` nor ``box_auth`` is provided.
        """
        # Exactly one document source must be given: file ids or a folder id.
        if not values.get("box_file_ids") and not values.get("box_folder_id"):
            raise ValueError("You must provide box_file_ids or box_folder_id.")

        if values.get("box_file_ids") and values.get("box_folder_id"):
            raise ValueError(
                "You must provide either box_file_ids or box_folder_id, not both."
            )

        # At least one way of authenticating is required.
        if not values.get("box_auth") and not values.get("box_developer_token"):
            raise ValueError(
                "you must provide box_developer_token or a box_auth "
                "generated with langchain_box.utilities.BoxAuth"
            )

        values["box"] = BoxAPIWrapper(  # type: ignore[call-arg]
            box_developer_token=values.get("box_developer_token"),
            box_auth=values.get("box_auth"),
            character_limit=values.get("character_limit"),
        )

        return values

    def _get_files_from_folder(self, folder_id: str) -> Iterator[Document]:
        """Yield a ``Document`` for each file found in the given Box folder.

        Sub-folders are descended into only when ``self.recursive`` is truthy.
        Files for which the wrapper returns ``None`` are skipped.

        Args:
            folder_id: ID of the Box folder whose items are listed.

        Yields:
            Documents returned by ``self.box.get_document_by_file_id``.
        """
        folder_content = self.box.get_folder_items(folder_id)  # type: ignore[union-attr]

        for item in folder_content:
            try:
                if item.type == FileBaseTypeField.FILE:
                    doc = self.box.get_document_by_file_id(item.id)  # type: ignore[union-attr]

                    if doc is not None:
                        yield doc

                elif item.type == "folder" and self.recursive:
                    # The recursive generator runs lazily inside this `try`, so a
                    # TypeError raised while walking the sub-folder is handled
                    # by the single handler below.
                    yield from self._get_files_from_folder(item.id)
            except TypeError:
                # Best-effort: skip the item and continue with the next one.
                # NOTE(review): presumably raised by the wrapper for items it
                # cannot convert — confirm against BoxAPIWrapper.
                continue

    def lazy_load(self) -> Iterator[Document]:
        """Load documents. Accepts no arguments. Returns `Iterator[Document]`

        When ``box_file_ids`` is set, each file id is fetched in turn; otherwise
        the folder given by ``box_folder_id`` is walked. Items that raise
        ``TypeError`` or yield no document are skipped. Any other exception
        raised while walking a folder is logged as a warning and ends iteration
        instead of propagating.

        Raises:
            ValueError: If neither ``box_file_ids`` nor ``box_folder_id`` is set.
        """
        if self.box_file_ids:
            for file_id in self.box_file_ids:
                try:
                    doc = self.box.get_document_by_file_id(file_id)  # type: ignore[union-attr]
                except TypeError:
                    # Best-effort: skip this file id and try the next one.
                    continue

                if doc is not None:
                    yield doc
        elif self.box_folder_id:
            try:
                yield from self._get_files_from_folder(self.box_folder_id)
            except TypeError:
                pass
            except Exception as e:
                # Keep the original best-effort behavior (do not propagate), but
                # report through logging rather than printing to stdout.
                logging.getLogger(__name__).warning(
                    "Failed to load documents from Box folder %s: %s",
                    self.box_folder_id,
                    e,
                )
        else:
            raise ValueError(
                "You must provide either `box_file_ids` or `box_folder_id`"
            )
|