box: add retrievers and fix docs (#25633)

Thank you for contributing to LangChain!


**Description:** Adding `BoxRetriever` for langchain_box. This retriever
handles two use cases:
* Retrieve all documents that match a full-text search
* Retrieve the answer to a Box AI prompt as a Document

**Twitter handle:** @BoxPlatform


- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Scott Hurrey 2024-08-21 18:40:40 -04:00 committed by GitHub
parent 4f347cbcb9
commit fb1d67edf6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 1004 additions and 218 deletions

View File

@ -172,3 +172,14 @@ If you wish to use OAuth2 with the authorization_code flow, please use `BoxAuthT
from langchain_box.document_loaders import BoxLoader
```
## Retrievers
### BoxRetriever
[See usage example](/docs/integrations/retrievers/box)
```python
from langchain_box.retrievers import BoxRetriever
```

File diff suppressed because one or more lines are too long

View File

@ -1,7 +1,8 @@
from importlib import metadata
from langchain_box.document_loaders import BoxLoader
from langchain_box.utilities import BoxAPIWrapper, BoxAuth, BoxAuthType
from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType, _BoxAPIWrapper
try:
__version__ = metadata.version(__package__)
@ -12,8 +13,9 @@ del metadata # optional, avoids polluting the results of dir(__package__)
__all__ = [
"BoxLoader",
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxAPIWrapper",
"_BoxAPIWrapper",
"__version__",
]

View File

@ -3,14 +3,14 @@ from typing import Any, Dict, Iterator, List, Optional
from box_sdk_gen import FileBaseTypeField # type: ignore
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, ConfigDict, root_validator
from langchain_core.pydantic_v1 import BaseModel, root_validator
from langchain_core.utils import get_from_dict_or_env
from langchain_box.utilities import BoxAPIWrapper, BoxAuth
from langchain_box.utilities import BoxAuth, _BoxAPIWrapper
class BoxLoader(BaseLoader, BaseModel):
"""
BoxLoader
"""BoxLoader.
This class will help you load files from your Box instance. You must have a
Box account. If you need one, you can sign up for a free developer account.
@ -33,18 +33,18 @@ class BoxLoader(BaseLoader, BaseModel):
pip install -U langchain-box
export BOX_DEVELOPER_TOKEN="your-api-key"
This loader returns ``Document`` objects built from text representations of files
in Box. It will skip any document without a text representation available. You can
provide either a ``List[str]`` containing Box file IDs, or you can provide a
``str`` containing a Box folder ID. If providing a folder ID, you can also enable
recursive mode to get the full tree under that folder.
:::info
.. note::
A Box instance can contain Petabytes of files, and folders can contain millions
of files. Be intentional when choosing what folders you choose to index. And we
recommend never getting all files from folder 0 recursively. Folder ID 0 is your
root folder.
:::
Instantiate:
@ -121,32 +121,36 @@ class BoxLoader(BaseLoader, BaseModel):
Terrarium: $120\nTotal: $920')
"""
model_config = ConfigDict(use_enum_values=True)
"""String containing the Box Developer Token generated in the developer console"""
box_developer_token: Optional[str] = None
"""Configured langchain_box.utilities.BoxAuth object"""
"""String containing the Box Developer Token generated in the developer console"""
box_auth: Optional[BoxAuth] = None
"""List[str] containing Box file ids"""
"""Configured langchain_box.utilities.BoxAuth object"""
box_file_ids: Optional[List[str]] = None
"""String containing box folder id to load files from"""
"""List[str] containing Box file ids"""
box_folder_id: Optional[str] = None
"""String containing box folder id to load files from"""
recursive: Optional[bool] = False
"""If getting files by folder id, recursive is a bool to determine if you wish
to traverse subfolders to return child documents. Default is False"""
recursive: Optional[bool] = False
character_limit: Optional[int] = -1
"""character_limit is an int that caps the number of characters to
return per document."""
character_limit: Optional[int] = -1
box: Optional[BoxAPIWrapper]
_box: Optional[_BoxAPIWrapper]
class Config:
arbitrary_types_allowed = True
extra = "allow"
use_enum_values = True
@root_validator(allow_reuse=True)
def validate_box_loader_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
box = None
_box = None
"""Validate that has either box_file_ids or box_folder_id."""
if not values.get("box_file_ids") and not values.get("box_folder_id"):
@ -159,19 +163,30 @@ class BoxLoader(BaseLoader, BaseModel):
)
"""Validate that we have either a box_developer_token or box_auth."""
if not values.get("box_auth") and not values.get("box_developer_token"):
raise ValueError(
"you must provide box_developer_token or a box_auth "
"generated with langchain_box.utilities.BoxAuth"
if not values.get("box_auth"):
if not get_from_dict_or_env(
values, "box_developer_token", "BOX_DEVELOPER_TOKEN"
):
raise ValueError(
"you must provide box_developer_token or a box_auth "
"generated with langchain_box.utilities.BoxAuth"
)
else:
token = get_from_dict_or_env(
values, "box_developer_token", "BOX_DEVELOPER_TOKEN"
)
_box = _BoxAPIWrapper( # type: ignore[call-arg]
box_developer_token=token,
character_limit=values.get("character_limit"),
)
else:
_box = _BoxAPIWrapper( # type: ignore[call-arg]
box_auth=values.get("box_auth"),
character_limit=values.get("character_limit"),
)
box = BoxAPIWrapper( # type: ignore[call-arg]
box_developer_token=values.get("box_developer_token"),
box_auth=values.get("box_auth"),
character_limit=values.get("character_limit"),
)
values["box"] = box
values["_box"] = _box
return values
@ -181,7 +196,7 @@ class BoxLoader(BaseLoader, BaseModel):
for file in folder_content:
try:
if file.type == FileBaseTypeField.FILE:
doc = self.box.get_document_by_file_id(file.id)
doc = self._box.get_document_by_file_id(file.id)
if doc is not None:
yield doc
@ -199,7 +214,7 @@ class BoxLoader(BaseLoader, BaseModel):
if self.box_file_ids:
for file_id in self.box_file_ids:
try:
file = self.box.get_document_by_file_id(file_id) # type: ignore[union-attr]
file = self._box.get_document_by_file_id(file_id) # type: ignore[union-attr]
if file is not None:
yield file

View File

@ -0,0 +1,5 @@
"""Box Retrievers."""
from langchain_box.retrievers.box import BoxRetriever
__all__ = ["BoxRetriever"]

View File

@ -0,0 +1,158 @@
from typing import Any, Dict, List, Optional
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
from langchain_core.retrievers import BaseRetriever
from langchain_box.utilities import BoxAuth, _BoxAPIWrapper
class BoxRetriever(BaseRetriever):
    """Box retriever.

    `BoxRetriever` provides the ability to retrieve content from
    your Box instance in a couple of ways.

    1. You can use the Box full-text search to retrieve the
       complete document(s) that match your search query, as
       `List[Document]`
    2. You can use the Box AI Platform API to retrieve the results
       from a Box AI prompt. The answer is returned as a single
       `Document` inside a list.

    The mode is selected by ``box_file_ids``: when it is set, the query is
    sent to Box AI against those files; otherwise the query is run as a
    full-text search. Either ``box_developer_token`` or ``box_auth`` must
    be provided.

    Setup:
        Install ``langchain-box``:

        .. code-block:: bash

            pip install -U langchain-box

    Instantiate:

        To use search:

        .. code-block:: python

            from langchain_box.retrievers import BoxRetriever

            retriever = BoxRetriever(box_developer_token=box_developer_token)

        To use Box AI:

        .. code-block:: python

            from langchain_box.retrievers import BoxRetriever

            file_ids=["12345","67890"]

            retriever = BoxRetriever(
                box_developer_token=box_developer_token, box_file_ids=file_ids
            )

    Usage:
        .. code-block:: python

            retriever = BoxRetriever(box_developer_token=box_developer_token)

            docs = retriever.invoke("victor")
            print(docs[0].page_content[:100])

        .. code-block:: none

            [
                Document(
                    metadata={
                        'source': 'url',
                        'title': 'FIVE_FEET_AND_RISING_by_Peter_Sollett_pdf'
                    },
                    page_content='\\n3/20/23, 5:31 PM F...'
                )
            ]

    Use within a chain:
        .. code-block:: python

            from langchain_core.output_parsers import StrOutputParser
            from langchain_core.prompts import ChatPromptTemplate
            from langchain_core.runnables import RunnablePassthrough
            from langchain_openai import ChatOpenAI

            retriever = BoxRetriever(box_developer_token=box_developer_token, character_limit=10000)
            llm = ChatOpenAI()

            context="You are an actor reading scripts to learn about your role in an upcoming movie."
            question="describe the character Victor"

            prompt = ChatPromptTemplate.from_template(
                \"""Answer the question based only on the context provided.

                Context: {context}

                Question: {question}\"""
            )

            def format_docs(docs):
                return "\\n\\n".join(doc.page_content for doc in docs)

            chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

            chain.invoke("Victor")  # search query to find files in Box

        .. code-block:: none

            'Victor is a skinny 12-year-old with sloppy hair who is seen
            sleeping on his fire escape in the sun. He is hesitant to go to
            the pool with his friend Carlos because he is afraid of getting
            in trouble for not letting his mother cut his hair. Ultimately,
            he decides to go to the pool with Carlos.'
    """  # noqa: E501

    box_developer_token: Optional[str] = None
    """String containing the Box Developer Token generated in the developer console"""

    box_auth: Optional[BoxAuth] = None
    """Configured langchain_box.utilities.BoxAuth object"""

    box_file_ids: Optional[List[str]] = None
    """List[str] containing Box file ids"""

    character_limit: Optional[int] = -1
    """character_limit is an int that caps the number of characters to
       return per document."""

    _box: Optional[_BoxAPIWrapper]

    class Config:
        arbitrary_types_allowed = True
        extra = "allow"

    @root_validator(allow_reuse=True)
    def validate_box_loader_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the auth inputs and build the internal Box API wrapper.

        Args:
            values: The field values supplied to the model.

        Returns:
            ``values`` with a ``_box`` entry holding a ``_BoxAPIWrapper``
            configured from the supplied token/auth and character limit.

        Raises:
            ValueError: If neither ``box_auth`` nor ``box_developer_token``
                was provided.
        """
        if not values.get("box_auth") and not values.get("box_developer_token"):
            raise ValueError(
                "you must provide box_developer_token or a box_auth "
                "generated with langchain_box.utilities.BoxAuth"
            )

        values["_box"] = _BoxAPIWrapper(  # type: ignore[call-arg]
            box_developer_token=values.get("box_developer_token"),
            box_auth=values.get("box_auth"),
            character_limit=values.get("character_limit"),
        )

        return values

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return documents for ``query`` via Box AI or Box search.

        Args:
            query: The Box AI prompt (when ``box_file_ids`` is set) or the
                full-text search string (otherwise).
            run_manager: Callback manager for this retriever run (unused here).

        Returns:
            A list of ``Document`` objects; an empty list when nothing is found.
        """
        if self.box_file_ids:  # If using Box AI
            docs = self._box.ask_box_ai(  # type: ignore[union-attr]
                query=query, box_file_ids=self.box_file_ids
            )
        else:  # If using Search
            docs = self._box.search_box(query=query)  # type: ignore[union-attr]

        # Guard against a None result so callers always receive a list.
        return docs or []

View File

@ -1,5 +1,5 @@
"""Box API Utilities."""
from langchain_box.utilities.box import BoxAPIWrapper, BoxAuth, BoxAuthType
from langchain_box.utilities.box import BoxAuth, BoxAuthType, _BoxAPIWrapper
__all__ = ["BoxAuth", "BoxAuthType", "BoxAPIWrapper"]
__all__ = ["BoxAuth", "BoxAuthType", "_BoxAPIWrapper"]

View File

@ -1,7 +1,7 @@
"""Util that calls Box APIs."""
from enum import Enum
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
import box_sdk_gen # type: ignore
import requests
@ -11,6 +11,13 @@ from langchain_core.utils import get_from_dict_or_env
class DocumentFiles(Enum):
"""DocumentFiles(Enum).
An enum containing all of the supported extensions for files
Box considers Documents. These files should have text
representations.
"""
DOC = "doc"
DOCX = "docx"
GDOC = "gdoc"
@ -89,6 +96,12 @@ class DocumentFiles(Enum):
class ImageFiles(Enum):
"""ImageFiles(Enum).
An enum containing all of the supported extensions for files
Box considers images.
"""
ARW = "arw"
BMP = "bmp"
CR2 = "cr2"
@ -115,8 +128,9 @@ class ImageFiles(Enum):
SVS = "svs"
"""
BoxAuthType
class BoxAuthType(Enum):
"""BoxAuthType(Enum).
an enum to tell BoxLoader how you wish to authenticate your Box connection.
Options are:
@ -128,22 +142,23 @@ class ImageFiles(Enum):
and `box_enterprise_id` or optionally `box_user_id`.
JWT - Use JWT for authentication. Config should be stored on the file
system accessible to your app.
provide `box_jwt_path`. Optionally, provide `box_user_id` to
provide `box_jwt_path`. Optionally, provide `box_user_id` to
act as a specific user
"""
class BoxAuthType(Enum):
"""Use a developer token or a token retrieved from box-sdk-gen"""
"""
TOKEN = "token"
"""Use `client_credentials` type grant"""
"""Use a developer token or a token retrieved from box-sdk-gen"""
CCG = "ccg"
"""Use JWT bearer token auth"""
"""Use `client_credentials` type grant"""
JWT = "jwt"
"""Use JWT bearer token auth"""
"""
class BoxAuth(BaseModel):
"""BoxAuth.
`BoxAuth` supports the following authentication methods:
* Token either a developer token or any token generated through the Box SDK
@ -152,16 +167,15 @@ class BoxAuthType(Enum):
* CCG with a service account
* CCG with a specified user
:::note
If using JWT authentication, you will need to download the configuration from the
Box developer console after generating your public/private key pair. Place this
file in your application directory structure somewhere. You will use the path to
.. note::
If using JWT authentication, you will need to download the configuration from the
Box developer console after generating your public/private key pair. Place this
file in your application directory structure somewhere. You will use the path to
this file when using the `BoxAuth` helper class.
:::
For more information, learn about how to
For more information, learn about how to
[set up a Box application](https://developer.box.com/guides/getting-started/first-application/),
and check out the
and check out the
[Box authentication guide](https://developer.box.com/guides/authentication/select/)
for more about our different authentication options.
@ -169,7 +183,7 @@ class BoxAuthType(Enum):
To instantiate, you must provide a ``langchain_box.utilities.BoxAuthType``.
BoxAuthType is an enum to tell BoxLoader how you wish to autheticate your
BoxAuthType is an enum to tell BoxLoader how you wish to authenticate your
Box connection.
Options are:
@ -181,7 +195,7 @@ class BoxAuthType(Enum):
and `box_enterprise_id` or optionally `box_user_id`.
JWT - Use JWT for authentication. Config should be stored on the file
system accessible to your app.
provide `box_jwt_path`. Optionally, provide `box_user_id` to
provide `box_jwt_path`. Optionally, provide `box_user_id` to
act as a specific user
.. code-block:: python
@ -198,36 +212,40 @@ class BoxAuthType(Enum):
...
)
To see examples for each supported authentication methodology, visit the
[Box providers](/docs/integrations/providers/box) page. If you want to
use OAuth 2.0 `authorization_code` flow, use
[box-sdk-gen](https://github.com/box/box-python-sdk-gen) SDK, get your
To see examples for each supported authentication methodology, visit the
[Box providers](/docs/integrations/providers/box) page. If you want to
use OAuth 2.0 `authorization_code` flow, use
[box-sdk-gen](https://github.com/box/box-python-sdk-gen) SDK, get your
token, and use `BoxAuthType.TOKEN` type.
"""
class BoxAuth(BaseModel):
"""Authentication type to use. Must pass BoxAuthType enum"""
"""
auth_type: BoxAuthType
""" If using BoxAuthType.TOKEN, provide your token here"""
"""langchain_box.utilities.BoxAuthType. Enum describing how to
authenticate against Box"""
box_developer_token: Optional[str] = None
""" If using BoxAuthType.TOKEN, provide your token here"""
box_jwt_path: Optional[str] = None
"""If using BoxAuthType.JWT, provide local path to your
JWT configuration file"""
box_jwt_path: Optional[str] = None
"""If using BoxAuthType.CCG, provide your app's client ID"""
box_client_id: Optional[str] = None
"""If using BoxAuthType.CCG, provide your app's client secret"""
"""If using BoxAuthType.CCG, provide your app's client ID"""
box_client_secret: Optional[str] = None
"""If using BoxAuthType.CCG, provide your app's client secret"""
box_enterprise_id: Optional[str] = None
"""If using BoxAuthType.CCG, provide your enterprise ID.
Only required if you are not sending `box_user_id`"""
box_enterprise_id: Optional[str] = None
box_user_id: Optional[str] = None
"""If using BoxAuthType.CCG or BoxAuthType.JWT, providing
`box_user_id` will act on behalf of a specific user"""
box_user_id: Optional[str] = None
box_client: Optional[box_sdk_gen.BoxClient] = None
custom_header: Dict = dict({"x-box-ai-library": "langchain"})
_box_client: Optional[box_sdk_gen.BoxClient] = None
_custom_header: Dict = dict({"x-box-ai-library": "langchain"})
class Config:
arbitrary_types_allowed = True
@ -276,16 +294,16 @@ class BoxAuth(BaseModel):
return values
def authorize(self) -> None:
def _authorize(self) -> None:
match self.auth_type:
case "token":
try:
auth = box_sdk_gen.BoxDeveloperTokenAuth(
token=self.box_developer_token
)
self.box_client = box_sdk_gen.BoxClient(
self._box_client = box_sdk_gen.BoxClient(
auth=auth
).with_extra_headers(extra_headers=self.custom_header)
).with_extra_headers(extra_headers=self._custom_header)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
@ -304,15 +322,15 @@ class BoxAuth(BaseModel):
)
auth = box_sdk_gen.BoxJWTAuth(config=jwt_config)
self.box_client = box_sdk_gen.BoxClient(
self._box_client = box_sdk_gen.BoxClient(
auth=auth
).with_extra_headers(extra_headers=self.custom_header)
).with_extra_headers(extra_headers=self._custom_header)
if self.box_user_id is not None:
user_auth = auth.with_user_subject(self.box_user_id)
self.box_client = box_sdk_gen.BoxClient(
self._box_client = box_sdk_gen.BoxClient(
auth=user_auth
).with_extra_headers(extra_headers=self.custom_header)
).with_extra_headers(extra_headers=self._custom_header)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
@ -340,9 +358,9 @@ class BoxAuth(BaseModel):
)
auth = box_sdk_gen.BoxCCGAuth(config=ccg_config)
self.box_client = box_sdk_gen.BoxClient(
self._box_client = box_sdk_gen.BoxClient(
auth=auth
).with_extra_headers(extra_headers=self.custom_header)
).with_extra_headers(extra_headers=self._custom_header)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
@ -363,25 +381,26 @@ class BoxAuth(BaseModel):
def get_client(self) -> box_sdk_gen.BoxClient:
"""Instantiate the Box SDK."""
if self.box_client is None:
self.authorize()
if self._box_client is None:
self._authorize()
return self.box_client
return self._box_client
class BoxAPIWrapper(BaseModel):
class _BoxAPIWrapper(BaseModel):
"""Wrapper for Box API."""
"""String containing the Box Developer Token generated in the developer console"""
box_developer_token: Optional[str] = None
"""Configured langchain_box.utilities.BoxAuth object"""
"""String containing the Box Developer Token generated in the developer console"""
box_auth: Optional[BoxAuth] = None
"""Configured langchain_box.utilities.BoxAuth object"""
character_limit: Optional[int] = -1
"""character_limit is an int that caps the number of characters to
return per document."""
character_limit: Optional[int] = -1
box: Optional[box_sdk_gen.BoxClient]
file_count: int = 0
_box: Optional[box_sdk_gen.BoxClient]
class Config:
arbitrary_types_allowed = True
@ -390,7 +409,7 @@ class BoxAPIWrapper(BaseModel):
@root_validator(allow_reuse=True)
def validate_box_api_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
values["box"] = None
values["_box"] = None
"""Validate that TOKEN auth type provides box_developer_token."""
if not values.get("box_auth"):
@ -402,7 +421,7 @@ class BoxAPIWrapper(BaseModel):
)
else:
box_auth = values.get("box_auth")
values["box"] = box_auth.get_client() # type: ignore[union-attr]
values["_box"] = box_auth.get_client() # type: ignore[union-attr]
return values
@ -411,11 +430,11 @@ class BoxAPIWrapper(BaseModel):
auth_type=BoxAuthType.TOKEN, box_developer_token=self.box_developer_token
)
self.box = box_auth.get_client()
self._box = box_auth.get_client()
def _do_request(self, url: str) -> Any:
try:
access_token = self.box.auth.retrieve_token().access_token # type: ignore[union-attr]
access_token = self._box.auth.retrieve_token().access_token # type: ignore[union-attr]
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(f"Error getting client from jwt token: {bse.message}")
@ -423,38 +442,17 @@ class BoxAPIWrapper(BaseModel):
resp.raise_for_status()
return resp.content
def get_folder_items(self, folder_id: str) -> box_sdk_gen.Items:
"""Get all the items in a folder. Accepts folder_id as str.
returns box_sdk_gen.Items"""
if self.box is None:
self.get_box_client()
try:
folder_contents = self.box.folders.get_folder_items( # type: ignore[union-attr]
folder_id, fields=["id", "type", "name"]
)
except box_sdk_gen.BoxAPIError as bae:
raise RuntimeError(
f"BoxAPIError: Error getting folder content: {bae.message}"
)
except box_sdk_gen.BoxSDKError as bse:
raise RuntimeError(
f"BoxSDKError: Error getting folder content: {bse.message}"
)
return folder_contents.entries
def get_text_representation(self, file_id: str = "") -> tuple[str, str, str]:
def _get_text_representation(self, file_id: str = "") -> tuple[str, str, str]:
try:
from box_sdk_gen import BoxAPIError, BoxSDKError
except ImportError:
raise ImportError("You must run `pip install box-sdk-gen`")
if self.box is None:
if self._box is None:
self.get_box_client()
try:
file = self.box.files.get_file_by_id( # type: ignore[union-attr]
file = self._box.files.get_file_by_id( # type: ignore[union-attr]
file_id,
x_rep_hints="[extracted_text]",
fields=["name", "representations", "type"],
@ -486,8 +484,10 @@ class BoxAPIWrapper(BaseModel):
except requests.exceptions.HTTPError:
return None, None, None # type: ignore[return-value]
if self.character_limit > 0: # type: ignore[operator]
content = raw_content[0 : self.character_limit]
if (
self.character_limit is not None and self.character_limit > 0 # type: ignore[operator]
):
content = raw_content[0 : (self.character_limit - 1)]
else:
content = raw_content
@ -499,16 +499,16 @@ class BoxAPIWrapper(BaseModel):
"""Load a file from a Box id. Accepts file_id as str.
Returns `Document`"""
if self.box is None:
if self._box is None:
self.get_box_client()
file = self.box.files.get_file_by_id( # type: ignore[union-attr]
file = self._box.files.get_file_by_id( # type: ignore[union-attr]
file_id, fields=["name", "type", "extension"]
)
if file.type == "file":
if hasattr(DocumentFiles, file.extension.upper()):
file_name, content, url = self.get_text_representation(file_id=file_id)
file_name, content, url = self._get_text_representation(file_id=file_id)
if file_name is None or content is None or url is None:
return None
@ -523,3 +523,95 @@ class BoxAPIWrapper(BaseModel):
return None
return None
def get_folder_items(self, folder_id: str) -> box_sdk_gen.Items:
    """Get all the items in a folder.

    Args:
        folder_id: ID of the Box folder to list.

    Returns:
        The ``entries`` attribute of the SDK's folder-items response. Only
        the ``id``, ``type`` and ``name`` fields are requested per item.

    Raises:
        RuntimeError: If the Box SDK raises ``BoxAPIError`` or
            ``BoxSDKError``; the SDK error is chained as the cause.
    """
    # Lazily create the client on first use.
    if self._box is None:
        self.get_box_client()

    try:
        folder_contents = self._box.folders.get_folder_items(  # type: ignore[union-attr]
            folder_id, fields=["id", "type", "name"]
        )
    except box_sdk_gen.BoxAPIError as bae:
        raise RuntimeError(
            f"BoxAPIError: Error getting folder content: {bae.message}"
        ) from bae
    except box_sdk_gen.BoxSDKError as bse:
        raise RuntimeError(
            f"BoxSDKError: Error getting folder content: {bse.message}"
        ) from bse

    return folder_contents.entries
def search_box(self, query: str) -> List[Document]:
    """Run a Box search and load each matching document.

    Search hits are kept only when they are of type ``"file"`` and their
    extension is a member of ``DocumentFiles``; each kept hit is loaded with
    ``get_document_by_file_id`` and hits that yield no document are skipped.

    Args:
        query: The search string passed to Box.

    Returns:
        The loaded documents; an empty list when the search has no entries.

    Raises:
        RuntimeError: If the Box SDK raises ``BoxAPIError`` or
            ``BoxSDKError``; the SDK error is chained as the cause.
    """
    # Lazily create the client on first use.
    if self._box is None:
        self.get_box_client()

    files = []

    try:
        results = self._box.search.search_for_content(  # type: ignore[union-attr]
            query=query, fields=["id", "type", "extension"]
        )

        # No hits: honour the declared List[Document] return type.
        if not results.entries:
            return []

        for file in results.entries:
            if (
                file is not None
                and file.type == "file"
                # Guard against a missing extension before calling .upper()
                and file.extension
                and hasattr(DocumentFiles, file.extension.upper())
            ):
                doc = self.get_document_by_file_id(file.id)

                if doc is not None:
                    files.append(doc)

        return files
    except box_sdk_gen.BoxAPIError as bae:
        raise RuntimeError(
            f"BoxAPIError: Error getting search results: {bae.message}"
        ) from bae
    except box_sdk_gen.BoxSDKError as bse:
        raise RuntimeError(
            f"BoxSDKError: Error getting search results: {bse.message}"
        ) from bse
def ask_box_ai(self, query: str, box_file_ids: List[str]) -> List[Document]:
    """Send a prompt to Box AI about one or more files.

    Uses single-item QA mode for one file ID and multiple-item QA mode for
    more than one.

    Args:
        query: The prompt to send to Box AI.
        box_file_ids: IDs of the Box files the prompt should be asked against.

    Returns:
        A one-element list holding a ``Document`` whose ``page_content`` is
        the Box AI answer and whose metadata records the source and query.

    Raises:
        ValueError: If ``box_file_ids`` is empty.
        RuntimeError: If the Box SDK raises ``BoxAPIError`` or
            ``BoxSDKError``; the SDK error is chained as the cause.
    """
    # Reject bad input before authenticating or touching the SDK.
    if not box_file_ids:
        raise ValueError("BOX_AI_ASK requires at least one file ID")

    # Lazily create the client on first use.
    if self._box is None:
        self.get_box_client()

    if len(box_file_ids) > 1:
        ai_mode = box_sdk_gen.CreateAiAskMode.MULTIPLE_ITEM_QA.value
    else:
        ai_mode = box_sdk_gen.CreateAiAskMode.SINGLE_ITEM_QA.value

    items = [
        box_sdk_gen.CreateAiAskItems(
            id=file_id, type=box_sdk_gen.CreateAiAskItemsTypeField.FILE.value
        )
        for file_id in box_file_ids
    ]

    try:
        response = self._box.ai.create_ai_ask(ai_mode, query, items)  # type: ignore[union-attr]
    except box_sdk_gen.BoxAPIError as bae:
        raise RuntimeError(
            f"BoxAPIError: Error getting Box AI result: {bae.message}"
        ) from bae
    except box_sdk_gen.BoxSDKError as bse:
        raise RuntimeError(
            f"BoxSDKError: Error getting Box AI result: {bse.message}"
        ) from bse

    content = response.answer

    metadata = {"source": "Box AI", "title": f"Box AI {query}"}

    return [Document(page_content=content, metadata=metadata)]

View File

@ -1,42 +1,3 @@
from langchain_core.documents import Document
from pytest_mock import MockerFixture
from langchain_box.document_loaders import BoxLoader
# test Document retrieval
def test_file_load(mocker: MockerFixture) -> None:
mocker.patch(
"langchain_box.utilities.BoxAPIWrapper.get_document_by_file_id", return_value=[]
)
loader = BoxLoader( # type: ignore[call-arg]
box_developer_token="box_developer_token",
box_file_ids=["box_file_ids"],
)
documents = loader.load()
assert documents
mocker.patch(
"langchain_box.utilities.BoxAPIWrapper.get_document_by_file_id",
return_value=(
Document(
page_content="Test file mode\ndocument contents",
metadata={"title": "Testing Files"},
)
),
)
loader = BoxLoader( # type: ignore[call-arg]
box_developer_token="box_developer_token",
box_file_ids=["box_file_ids"],
)
documents = loader.load()
assert documents == [
Document(
page_content="Test file mode\ndocument contents",
metadata={"title": "Testing Files"},
)
]
"""
TODO: build live integration tests
"""

View File

@ -0,0 +1,3 @@
"""
TODO: build live integration tests
"""

View File

@ -1,47 +1,3 @@
from unittest.mock import Mock
import pytest
from langchain_core.documents import Document
from pytest_mock import MockerFixture
from langchain_box.utilities import BoxAPIWrapper
@pytest.fixture()
def mock_worker(mocker: MockerFixture) -> None:
mocker.patch("langchain_box.utilities.BoxAuth.authorize", return_value=Mock())
mocker.patch("langchain_box.utilities.BoxAuth.get_client", return_value=Mock())
mocker.patch(
"langchain_box.utilities.BoxAPIWrapper.get_text_representation",
return_value=("filename", "content", "url"),
)
def test_get_documents_by_file_ids(mock_worker, mocker: MockerFixture) -> None: # type: ignore[no-untyped-def]
mocker.patch(
"langchain_box.utilities.BoxAPIWrapper.get_document_by_file_id",
return_value=(
Document(
page_content="content", metadata={"source": "url", "title": "filename"}
)
),
)
box = BoxAPIWrapper(box_developer_token="box_developer_token") # type: ignore[call-arg]
documents = box.get_document_by_file_id("box_file_id")
assert documents == Document(
page_content="content", metadata={"source": "url", "title": "filename"}
)
def test_get_documents_by_folder_id(mock_worker, mocker: MockerFixture) -> None: # type: ignore[no-untyped-def]
mocker.patch(
"langchain_box.utilities.BoxAPIWrapper.get_folder_items",
return_value=([{"id": "file_id", "type": "file"}]),
)
box = BoxAPIWrapper(box_developer_token="box_developer_token") # type: ignore[call-arg]
folder_contents = box.get_folder_items("box_folder_id")
assert folder_contents == [{"id": "file_id", "type": "file"}]
"""
TODO: build live integration tests
"""

View File

@ -1,4 +1,6 @@
import pytest
from langchain_core.documents import Document
from pytest_mock import MockerFixture
from langchain_box.document_loaders import BoxLoader
from langchain_box.utilities import BoxAuth, BoxAuthType
@ -56,3 +58,42 @@ def test_failed_initialization_files_and_folders() -> None:
box_folder_id="box_folder_id",
box_file_ids=["box_file_ids"],
)
# test Document retrieval
def test_file_load(mocker: MockerFixture) -> None:
    """BoxLoader.load() skips missing documents and returns the ones it finds."""
    target = "langchain_box.utilities._BoxAPIWrapper.get_document_by_file_id"

    # The loader only yields non-None results, so a file for which the
    # wrapper returns None must produce an empty result.
    mocker.patch(target, return_value=None)

    loader = BoxLoader(  # type: ignore[call-arg]
        box_developer_token="box_developer_token",
        box_file_ids=["box_file_ids"],
    )

    assert loader.load() == []

    expected = Document(
        page_content="Test file mode\ndocument contents",
        metadata={"title": "Testing Files"},
    )
    mocker.patch(target, return_value=expected)

    loader = BoxLoader(  # type: ignore[call-arg]
        box_developer_token="box_developer_token",
        box_file_ids=["box_file_ids"],
    )

    assert loader.load() == [expected]

View File

@ -0,0 +1,89 @@
import pytest
from langchain_core.documents import Document
from pytest_mock import MockerFixture
from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType
# Test auth types
def test_direct_token_initialization() -> None:
    """A developer token plus file IDs is enough to build the retriever."""
    init_kwargs = {
        "box_developer_token": "box_developer_token",
        "box_file_ids": ["box_file_ids"],
    }

    retriever = BoxRetriever(**init_kwargs)  # type: ignore[arg-type]

    # The supplied values are exposed unchanged on the instance.
    assert retriever.box_developer_token == "box_developer_token"
    assert retriever.box_file_ids == ["box_file_ids"]
def test_failed_direct_token_initialization() -> None:
    """Constructing the retriever without a token or BoxAuth must fail."""
    with pytest.raises(ValueError):
        # No binding needed: construction itself is expected to raise.
        BoxRetriever(box_file_ids=["box_file_ids"])  # type: ignore[call-arg]
def test_auth_initialization() -> None:
    """A pre-built BoxAuth object can be used instead of a raw token."""
    token_auth = BoxAuth(
        auth_type=BoxAuthType.TOKEN, box_developer_token="box_developer_token"
    )
    file_ids = ["box_file_ids"]

    retriever = BoxRetriever(box_auth=token_auth, box_file_ids=file_ids)  # type: ignore[call-arg]

    assert retriever.box_file_ids == ["box_file_ids"]
# test search retrieval
def test_search(mocker: MockerFixture) -> None:
    """Without file IDs, invoke() returns whatever the wrapper's search yields."""
    search_hits = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.search_box",
        return_value=search_hits,
    )

    retriever = BoxRetriever(  # type: ignore[call-arg]
        box_developer_token="box_developer_token"
    )

    # Compare against a freshly built Document rather than the patched object.
    assert retriever.invoke("query") == [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
# test ai retrieval
def test_ai(mocker: MockerFixture) -> None:
    """With file IDs set, invoke() returns whatever the wrapper's Box AI call yields."""
    ai_answer = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.ask_box_ai",
        return_value=ai_answer,
    )

    retriever = BoxRetriever(  # type: ignore[call-arg]
        box_developer_token="box_developer_token", box_file_ids=["box_file_ids"]
    )

    # Compare against a freshly built Document rather than the patched object.
    assert retriever.invoke("query") == [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]

View File

@ -2,9 +2,10 @@ from langchain_box import __all__
# Public names expected to be exported by ``langchain_box`` (the hunk header
# above shows ``__all__`` being imported from that package); presumably a test
# outside this view compares the two.
# NOTE(review): both "BoxAPIWrapper" and "_BoxAPIWrapper" are listed. This looks
# like the removed and added lines of a diff shown together; confirm which
# single entry belongs in the real file.
EXPECTED_ALL = [
"BoxLoader",
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxAPIWrapper",
"_BoxAPIWrapper",
"__version__",
]

View File

@ -1,7 +1,21 @@
import pytest
from pydantic.v1.error_wrappers import ValidationError
from unittest.mock import Mock
from langchain_box.utilities import BoxAPIWrapper, BoxAuth, BoxAuthType
import pytest
from langchain_core.documents import Document
from pydantic.v1.error_wrappers import ValidationError
from pytest_mock import MockerFixture
from langchain_box.utilities import BoxAuth, BoxAuthType, _BoxAPIWrapper
@pytest.fixture()
def mock_worker(mocker: MockerFixture) -> None:
    """Stub the Box auth, client and text-extraction hooks used by the wrapper.

    ``BoxAuth._authorize`` and ``BoxAuth.get_client`` each return a ``Mock``,
    and ``_BoxAPIWrapper._get_text_representation`` returns a fixed
    ``("filename", "content", "url")`` tuple.
    """
    auth_targets = (
        "langchain_box.utilities.BoxAuth._authorize",
        "langchain_box.utilities.BoxAuth.get_client",
    )
    for target in auth_targets:
        # A separate Mock per patch, as in the original fixture.
        mocker.patch(target, return_value=Mock())

    canned_text = ("filename", "content", "url")
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper._get_text_representation",
        return_value=canned_text,
    )
# Test auth types
@ -79,7 +93,7 @@ def test_failed_ccg_initialization() -> None:
def test_direct_token_initialization() -> None:
box = BoxAPIWrapper( # type: ignore[call-arg]
box = _BoxAPIWrapper( # type: ignore[call-arg]
box_developer_token="box_developer_token"
)
@ -91,11 +105,126 @@ def test_auth_initialization() -> None:
auth_type=BoxAuthType.TOKEN, box_developer_token="box_developer_token"
)
box = BoxAPIWrapper(box_auth=auth) # type: ignore[call-arg] # noqa: F841
box = _BoxAPIWrapper(box_auth=auth) # type: ignore[call-arg] # noqa: F841
assert auth.box_developer_token == "box_developer_token"
# Expects ``ValidationError`` when the wrapper is constructed with no arguments.
# NOTE(review): both the ``BoxAPIWrapper()`` and ``_BoxAPIWrapper()`` calls
# appear below. They look like the removed and added lines of a diff shown
# together; confirm that only the ``_BoxAPIWrapper()`` call belongs in the file.
def test_failed_initialization_no_auth() -> None:
with pytest.raises(ValidationError):
box = BoxAPIWrapper() # type: ignore[call-arg] # noqa: F841
box = _BoxAPIWrapper() # type: ignore[call-arg] # noqa: F841
def test_get_documents_by_file_ids(mock_worker, mocker: MockerFixture) -> None:  # type: ignore[no-untyped-def]
    """``get_document_by_file_id`` hands back the patched ``Document``.

    NOTE(review): the method being called is itself patched here, so this
    checks the mock wiring rather than the wrapper's own logic.
    """
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.get_document_by_file_id",
        return_value=Document(
            page_content="content",
            metadata={"source": "url", "title": "filename"},
        ),
    )

    wrapper = _BoxAPIWrapper(box_developer_token="box_developer_token")  # type: ignore[call-arg]
    fetched = wrapper.get_document_by_file_id("box_file_id")

    expected = Document(
        page_content="content",
        metadata={"source": "url", "title": "filename"},
    )
    assert fetched == expected
def test_get_documents_by_folder_id(mock_worker, mocker: MockerFixture) -> None:  # type: ignore[no-untyped-def]
    """``get_folder_items`` hands back the patched folder listing.

    NOTE(review): the method being called is itself patched here, so this
    checks the mock wiring rather than the wrapper's own logic.
    """
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.get_folder_items",
        return_value=[{"id": "file_id", "type": "file"}],
    )

    wrapper = _BoxAPIWrapper(box_developer_token="box_developer_token")  # type: ignore[call-arg]
    listing = wrapper.get_folder_items("box_folder_id")

    expected_items = [{"id": "file_id", "type": "file"}]
    assert listing == expected_items
def test_box_search(mock_worker, mocker: MockerFixture) -> None:  # type: ignore[no-untyped-def]
    """``search_box`` hands back the patched list of documents.

    NOTE(review): the method being called is itself patched here, so this
    checks the mock wiring rather than the wrapper's own logic.
    """
    stubbed_hits = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.search_box",
        return_value=stubbed_hits,
    )

    wrapper = _BoxAPIWrapper(box_developer_token="box_developer_token")  # type: ignore[call-arg]
    found = wrapper.search_box("query")

    # Built separately from the stub so the comparison is by value.
    expected = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
    assert found == expected
def test_ask_box_ai_single_file(mock_worker, mocker: MockerFixture) -> None:  # type: ignore[no-untyped-def]
    """``ask_box_ai`` hands back the patched answer for a single file id.

    NOTE(review): the method being called is itself patched here, so this
    checks the mock wiring rather than the wrapper's own logic.
    """
    stubbed_answer = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.ask_box_ai",
        return_value=stubbed_answer,
    )

    wrapper = _BoxAPIWrapper(  # type: ignore[call-arg]
        box_developer_token="box_developer_token", box_file_ids=["box_file_ids"]
    )
    answered = wrapper.ask_box_ai("query")  # type: ignore[call-arg]

    # Built separately from the stub so the comparison is by value.
    expected = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
    assert answered == expected
def test_ask_box_ai_multiple_files(mock_worker, mocker: MockerFixture) -> None:  # type: ignore[no-untyped-def]
    """``ask_box_ai`` hands back the patched answers for two file ids.

    NOTE(review): the method being called is itself patched here, so this
    checks the mock wiring rather than the wrapper's own logic.
    """
    stubbed_answers = [
        Document(
            page_content="Test file 1 mode\ndocument contents",
            metadata={"title": "Test File 1"},
        ),
        Document(
            page_content="Test file 2 mode\ndocument contents",
            metadata={"title": "Test File 2"},
        ),
    ]
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.ask_box_ai",
        return_value=stubbed_answers,
    )

    wrapper = _BoxAPIWrapper(  # type: ignore[call-arg]
        box_developer_token="box_developer_token",
        box_file_ids=["box_file_id 1", "box_file_id 2"],
    )
    answered = wrapper.ask_box_ai("query")  # type: ignore[call-arg]

    # Built separately from the stub so the comparison is by value and order.
    expected = [
        Document(
            page_content="Test file 1 mode\ndocument contents",
            metadata={"title": "Test File 1"},
        ),
        Document(
            page_content="Test file 2 mode\ndocument contents",
            metadata={"title": "Test File 2"},
        ),
    ]
    assert answered == expected