box: Add search options for BoxRetriever, documentation for BoxRetriever as agent tool (#26181)

Thank you for contributing to LangChain!

- [x] **PR title**: "package: description"
- Where "package" is whichever of langchain, community, core,
experimental, etc. is being modified. Use "docs: ..." for purely docs
changes, "templates: ..." for template changes, "infra: ..." for CI
changes.
  - Example: "community: add foobar LLM"


Added search options for BoxRetriever and added documentation to
demonstrate how to use BoxRetriever as an agent tool - @BoxPlatform


- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
This commit is contained in:
Scott Hurrey 2024-09-19 00:00:06 -04:00 committed by GitHub
parent e0c36afc3e
commit acbb4e4701
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 453 additions and 29 deletions

File diff suppressed because one or more lines are too long

View File

@ -2,7 +2,14 @@ from importlib import metadata
from langchain_box.document_loaders import BoxLoader
from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType, _BoxAPIWrapper
from langchain_box.utilities.box import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
_BoxAPIWrapper,
)
try:
__version__ = metadata.version(__package__)
@ -16,6 +23,9 @@ __all__ = [
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
"__version__",
]

View File

@ -6,7 +6,7 @@ from langchain_core.retrievers import BaseRetriever
from pydantic import ConfigDict, model_validator
from typing_extensions import Self
from langchain_box.utilities import BoxAuth, _BoxAPIWrapper
from langchain_box.utilities import BoxAuth, BoxSearchOptions, _BoxAPIWrapper
class BoxRetriever(BaseRetriever):
@ -128,7 +128,10 @@ class BoxRetriever(BaseRetriever):
"""character_limit is an int that caps the number of characters to
return per document."""
_box: Optional[_BoxAPIWrapper] = None
box_search_options: Optional[BoxSearchOptions] = None
"""Search options to configure BoxRetriever to narrow search results."""
_box: Optional[_BoxAPIWrapper]
model_config = ConfigDict(
arbitrary_types_allowed=True,
@ -150,6 +153,7 @@ class BoxRetriever(BaseRetriever):
box_developer_token=self.box_developer_token,
box_auth=self.box_auth,
character_limit=self.character_limit,
box_search_options=self.box_search_options,
)
self._box = _box

View File

@ -1,5 +1,19 @@
"""Box API Utilities."""
from langchain_box.utilities.box import BoxAuth, BoxAuthType, _BoxAPIWrapper
from langchain_box.utilities.box import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
_BoxAPIWrapper,
)
__all__ = ["BoxAuth", "BoxAuthType", "_BoxAPIWrapper"]
__all__ = [
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
]

View File

@ -470,6 +470,128 @@ class BoxAuth(BaseModel):
return self._box_client
class SearchTypeFilter(Enum):
    """Parts of a Box item that a search query can be matched against.

    Use one or more members to limit what is searched, for example only the
    file content or only the item description.
    """

    NAME = "name"
    """The name of the item, as defined by its ``name`` field."""

    DESCRIPTION = "description"
    """The description of the item, as defined by its ``description`` field."""

    FILE_CONTENT = "file_content"
    """The actual content of the file."""

    COMMENTS = "comments"
    """The content of any of the comments on a file or folder."""

    TAGS = "tags"
    """Any tags that are applied to an item, as defined by its ``tags`` field."""
def _validate_date_range(date_range: Optional[List[str]]) -> None:
    """Raise ``ValueError`` if a ``[start, end]`` date range is malformed."""
    if not date_range:
        return

    # Indexing a one-element list would raise IndexError, which pydantic does
    # not turn into a validation error; report it as a ValueError instead.
    if len(date_range) < 2:
        raise ValueError(
            "Date range must contain both a start and an end entry; "
            "use an empty string for an open-ended bound."
        )

    start, end = date_range[0], date_range[1]

    # A missing or empty bound leaves the range open on that side, so there
    # is nothing to compare.
    if not start or not end:
        return

    # NOTE: this is a plain string comparison. It orders timestamps correctly
    # only when both use the same format and UTC offset.
    if start > end:
        raise ValueError("Start date must be before end date.")


class BoxSearchOptions(BaseModel):
    """Optional filters used to narrow the results of a Box search.

    Every field defaults to "no filter" except ``k``, which defaults to 100.
    Values are validated when the model is created: ``k`` must be between
    1 and 200, and for each date range the start must not be later than the
    end when both are given.
    """

    ancestor_folder_ids: Optional[List[str]] = None
    """Limits the search results to items within the given folders, given as
    a list of folder IDs."""

    search_type_filter: Optional[List[SearchTypeFilter]] = None
    """Limits the search results to any items that match the search query for a
    specific part of the file, for example the file description.

    Given as a list of ``SearchTypeFilter`` members. Defaults to ``None``
    (no filter)."""

    created_date_range: Optional[List[str]] = None
    """Limits the search results to any items created within a given date range.

    Given as a two-element list of RFC3339 timestamps, ``[start, end]``.

    If the start date is omitted (``["", "2014-05-17T13:35:01-07:00"]``)
    anything created before the end date will be returned.

    If the end date is omitted (``["2014-05-15T13:35:01-07:00", ""]``) the
    current date will be used as the end date instead."""

    file_extensions: Optional[List[DocumentFiles]] = None
    """Limits the search results to any files that match any of the provided
    file extensions. Given as a list of
    ``langchain_box.utilities.DocumentFiles`` entries."""

    k: Optional[int] = 100
    """Defines the maximum number of items to return. Defaults to 100, maximum
    is 200."""

    size_range: Optional[List[int]] = None
    """Limits the search results to any items with a size within a given file
    size range. This applies to files and folders.

    Given as a list of a lower and an upper byte size limit (inclusive).

    The upper and lower bound can be omitted to create open ranges."""

    updated_date_range: Optional[List[str]] = None
    """Limits the search results to any items updated within a given date range.

    Given as a two-element list of RFC3339 timestamps, ``[start, end]``.

    If the start date is omitted (``["", "2014-05-17T13:35:01-07:00"]``)
    anything updated before the end date will be returned.

    If the end date is omitted (``["2014-05-15T13:35:01-07:00", ""]``) the
    current date will be used as the end date instead."""

    # Same settings as the former nested ``class Config``, expressed in the
    # ``ConfigDict`` form used by the other models in this module.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        use_enum_values=True,
        extra="allow",
    )

    @model_validator(mode="after")
    def validate_search_options(self) -> Self:
        """Validate ``k`` and the two date ranges.

        Returns:
            The validated model instance.

        Raises:
            ValueError: If ``k`` is set and is not between 1 and 200, if a
                date range has fewer than two entries, or if a range's start
                is later than its end.
        """
        # ``k`` is Optional, so only range-check it when a value is present;
        # comparing ``None`` with an int would raise TypeError.
        if self.k is not None and not 1 <= self.k <= 200:
            raise ValueError(
                f"Invalid setting of k {self.k}. Value must be between 1 and 200."
            )

        _validate_date_range(self.created_date_range)
        _validate_date_range(self.updated_date_range)

        return self
class _BoxAPIWrapper(BaseModel):
"""Wrapper for Box API."""
@ -485,7 +607,10 @@ class _BoxAPIWrapper(BaseModel):
"""character_limit is an int that caps the number of characters to
return per document."""
_box: Optional[box_sdk_gen.BoxClient] = None
box_search_options: Optional[BoxSearchOptions] = None
"""Search options to configure BoxRetriever to narrow search results."""
_box: Optional[box_sdk_gen.BoxClient]
model_config = ConfigDict(
arbitrary_types_allowed=True,
@ -636,9 +761,25 @@ class _BoxAPIWrapper(BaseModel):
files = []
try:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query, fields=["id", "type", "extension"]
)
results = None
if self.box_search_options is None:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query, fields=["id", "type", "extension"], type="file"
)
else:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query,
fields=["id", "type", "extension"],
type="file",
ancestor_folder_ids=self.box_search_options.ancestor_folder_ids, # type: ignore[union-attr]
content_types=self.box_search_options.search_type_filter, # type: ignore[union-attr]
created_at_range=self.box_search_options.created_date_range, # type: ignore[union-attr]
file_extensions=self.box_search_options.file_extensions, # type: ignore[union-attr]
limit=self.box_search_options.k, # type: ignore[union-attr]
size_range=self.box_search_options.size_range, # type: ignore[union-attr]
updated_at_range=self.box_search_options.updated_date_range, # type: ignore[union-attr]
)
if results.entries is None or len(results.entries) <= 0:
return None # type: ignore[return-value]

View File

@ -3,7 +3,13 @@ from langchain_core.documents import Document
from pytest_mock import MockerFixture
from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType
from langchain_box.utilities import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
)
# Test auth types
@ -62,6 +68,44 @@ def test_search(mocker: MockerFixture) -> None:
]
# test search options
def test_search_options(mocker: MockerFixture) -> None:
    """BoxRetriever accepts a BoxSearchOptions and returns the search results."""
    # search_box is patched, so no Box API call is made; the test checks that
    # the options object validates and that the retriever accepts it.
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.search_box",
        return_value=(
            [
                Document(
                    page_content="Test file mode\ndocument contents",
                    metadata={"title": "Testing Files"},
                )
            ]
        ),
    )

    box_search_options = BoxSearchOptions(
        ancestor_folder_ids=["box_folder_id"],
        search_type_filter=[SearchTypeFilter.FILE_CONTENT],
        # The end timestamp previously had a stray trailing comma inside the
        # string, which made it an invalid RFC3339 value.
        created_date_range=["2023-01-01T00:00:00-07:00", "2024-08-01T00:00:00-07:00"],
        file_extensions=[DocumentFiles.DOCX, DocumentFiles.PDF],
        k=200,
        size_range=[1, 1000000],
        updated_date_range=None,
    )

    retriever = BoxRetriever(  # type: ignore[call-arg]
        box_developer_token="box_developer_token", box_search_options=box_search_options
    )

    documents = retriever.invoke("query")
    assert documents == [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]
# test ai retrieval
def test_ai(mocker: MockerFixture) -> None:
mocker.patch(

View File

@ -5,6 +5,9 @@ EXPECTED_ALL = [
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
"__version__",
]