mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 13:23:35 +00:00
box: Add searchoptions for BoxRetriever, documentation for BoxRetriever as agent tool (#26181)
Thank you for contributing to LangChain! - [x] **PR title**: "package: description" - Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" Added search options for BoxRetriever and added documentation to demonstrate how to use BoxRetriever as an agent tool - @BoxPlatform - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
This commit is contained in:
parent
e0c36afc3e
commit
acbb4e4701
File diff suppressed because one or more lines are too long
@ -2,7 +2,14 @@ from importlib import metadata
|
|||||||
|
|
||||||
from langchain_box.document_loaders import BoxLoader
|
from langchain_box.document_loaders import BoxLoader
|
||||||
from langchain_box.retrievers import BoxRetriever
|
from langchain_box.retrievers import BoxRetriever
|
||||||
from langchain_box.utilities import BoxAuth, BoxAuthType, _BoxAPIWrapper
|
from langchain_box.utilities.box import (
|
||||||
|
BoxAuth,
|
||||||
|
BoxAuthType,
|
||||||
|
BoxSearchOptions,
|
||||||
|
DocumentFiles,
|
||||||
|
SearchTypeFilter,
|
||||||
|
_BoxAPIWrapper,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
__version__ = metadata.version(__package__)
|
__version__ = metadata.version(__package__)
|
||||||
@ -16,6 +23,9 @@ __all__ = [
|
|||||||
"BoxRetriever",
|
"BoxRetriever",
|
||||||
"BoxAuth",
|
"BoxAuth",
|
||||||
"BoxAuthType",
|
"BoxAuthType",
|
||||||
|
"BoxSearchOptions",
|
||||||
|
"DocumentFiles",
|
||||||
|
"SearchTypeFilter",
|
||||||
"_BoxAPIWrapper",
|
"_BoxAPIWrapper",
|
||||||
"__version__",
|
"__version__",
|
||||||
]
|
]
|
||||||
|
@ -6,7 +6,7 @@ from langchain_core.retrievers import BaseRetriever
|
|||||||
from pydantic import ConfigDict, model_validator
|
from pydantic import ConfigDict, model_validator
|
||||||
from typing_extensions import Self
|
from typing_extensions import Self
|
||||||
|
|
||||||
from langchain_box.utilities import BoxAuth, _BoxAPIWrapper
|
from langchain_box.utilities import BoxAuth, BoxSearchOptions, _BoxAPIWrapper
|
||||||
|
|
||||||
|
|
||||||
class BoxRetriever(BaseRetriever):
|
class BoxRetriever(BaseRetriever):
|
||||||
@ -128,7 +128,10 @@ class BoxRetriever(BaseRetriever):
|
|||||||
"""character_limit is an int that caps the number of characters to
|
"""character_limit is an int that caps the number of characters to
|
||||||
return per document."""
|
return per document."""
|
||||||
|
|
||||||
_box: Optional[_BoxAPIWrapper] = None
|
box_search_options: Optional[BoxSearchOptions] = None
|
||||||
|
"""Search options to configure BoxRetriever to narrow search results."""
|
||||||
|
|
||||||
|
_box: Optional[_BoxAPIWrapper]
|
||||||
|
|
||||||
model_config = ConfigDict(
|
model_config = ConfigDict(
|
||||||
arbitrary_types_allowed=True,
|
arbitrary_types_allowed=True,
|
||||||
@ -150,6 +153,7 @@ class BoxRetriever(BaseRetriever):
|
|||||||
box_developer_token=self.box_developer_token,
|
box_developer_token=self.box_developer_token,
|
||||||
box_auth=self.box_auth,
|
box_auth=self.box_auth,
|
||||||
character_limit=self.character_limit,
|
character_limit=self.character_limit,
|
||||||
|
box_search_options=self.box_search_options,
|
||||||
)
|
)
|
||||||
|
|
||||||
self._box = _box
|
self._box = _box
|
||||||
|
@ -1,5 +1,19 @@
|
|||||||
"""Box API Utilities."""
|
"""Box API Utilities."""
|
||||||
|
|
||||||
from langchain_box.utilities.box import BoxAuth, BoxAuthType, _BoxAPIWrapper
|
from langchain_box.utilities.box import (
|
||||||
|
BoxAuth,
|
||||||
|
BoxAuthType,
|
||||||
|
BoxSearchOptions,
|
||||||
|
DocumentFiles,
|
||||||
|
SearchTypeFilter,
|
||||||
|
_BoxAPIWrapper,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = ["BoxAuth", "BoxAuthType", "_BoxAPIWrapper"]
|
__all__ = [
|
||||||
|
"BoxAuth",
|
||||||
|
"BoxAuthType",
|
||||||
|
"BoxSearchOptions",
|
||||||
|
"DocumentFiles",
|
||||||
|
"SearchTypeFilter",
|
||||||
|
"_BoxAPIWrapper",
|
||||||
|
]
|
||||||
|
@ -470,6 +470,128 @@ class BoxAuth(BaseModel):
|
|||||||
return self._box_client
|
return self._box_client
|
||||||
|
|
||||||
|
|
||||||
|
class SearchTypeFilter(Enum):
    """Parts of a Box item that a search query can be matched against.

    Used as the element type of ``BoxSearchOptions.search_type_filter`` to
    limit which parts of an item are searched.
    """

    NAME = "name"
    """The name of the item, as defined by its ``name`` field."""

    DESCRIPTION = "description"
    """The description of the item, as defined by its ``description`` field."""

    FILE_CONTENT = "file_content"
    """The actual content of the file."""

    COMMENTS = "comments"
    """The content of any of the comments on a file or folder."""

    TAGS = "tags"
    """Any tags that are applied to an item, as defined by its ``tags`` field."""
|
||||||
|
|
||||||
|
|
||||||
|
class BoxSearchOptions(BaseModel):
    """Search options used to narrow the results of a Box search."""

    ancestor_folder_ids: Optional[List[str]] = None
    """Limits the search results to items within the given list of folders,
       defined as a list of folder IDs."""

    search_type_filter: Optional[List[SearchTypeFilter]] = None
    """Limits the search results to any items that match the search query for a
       specific part of the file, for example the file description.

       Allowed values are the members of ``SearchTypeFilter``. Default is all."""

    created_date_range: Optional[List[str]] = None
    """Limits the search results to any items created within a given date range.

       The range is a two-entry list of RFC3339 timestamps: ``[start, end]``.

       If the start date is omitted (``["", "2014-05-17T13:35:01-07:00"]``)
       anything created before the end date will be returned.

       If the end date is omitted (``["2014-05-15T13:35:01-07:00", ""]``) the
       current date will be used as the end date instead."""

    file_extensions: Optional[List[DocumentFiles]] = None
    """Limits the search results to any files that match any of the provided
       file extensions. This is a list of
       ``langchain_box.utilities.DocumentFiles`` entries."""

    k: Optional[int] = 100
    """Defines the maximum number of items to return. Defaults to 100, maximum
       is 200."""

    size_range: Optional[List[int]] = None
    """Limits the search results to any items with a size within a given file
       size range. This applies to files and folders.

       The range is a list of a lower and an upper byte size limit
       (inclusive).

       The upper and lower bound can be omitted to create open ranges."""

    updated_date_range: Optional[List[str]] = None
    """Limits the search results to any items updated within a given date range.

       The range is a two-entry list of RFC3339 timestamps: ``[start, end]``.

       If the start date is omitted (``["", "2014-05-17T13:35:01-07:00"]``)
       anything updated before the end date will be returned.

       If the end date is omitted (``["2014-05-15T13:35:01-07:00", ""]``) the
       current date will be used as the end date instead."""

    # Pydantic v2 configuration, matching the ``ConfigDict`` style used by the
    # other models in this module. ``use_enum_values`` stores enum members as
    # their plain values.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        use_enum_values=True,
        extra="allow",
    )

    @model_validator(mode="after")
    def validate_search_options(self) -> Self:
        """Validate ``k`` and the ordering of the two date ranges.

        Returns:
            The validated instance.

        Raises:
            ValueError: If ``k`` is set and is outside 1..200, if a date range
                is given with fewer than two entries, or if a date range has
                both bounds set and the start sorts after the end.
        """
        # ``k`` is Optional; only range-check it when a value was supplied.
        if self.k is not None and not 1 <= self.k <= 200:
            raise ValueError(
                f"Invalid setting of k {self.k}. Value must be between 1 and 200."
            )

        for field_name in ("created_date_range", "updated_date_range"):
            date_range = getattr(self, field_name)
            if not date_range:
                continue

            if len(date_range) < 2:
                raise ValueError(
                    f"{field_name} must contain a start and an end entry; "
                    "use an empty string for an open bound."
                )

            start, end = date_range[0], date_range[1]
            # A missing or empty bound means an open-ended range, so there is
            # nothing to order. NOTE: the bounds are compared as text, not as
            # parsed timestamps.
            if start and end and start > end:
                raise ValueError("Start date must be before end date.")

        return self
|
||||||
|
|
||||||
|
|
||||||
class _BoxAPIWrapper(BaseModel):
|
class _BoxAPIWrapper(BaseModel):
|
||||||
"""Wrapper for Box API."""
|
"""Wrapper for Box API."""
|
||||||
|
|
||||||
@ -485,7 +607,10 @@ class _BoxAPIWrapper(BaseModel):
|
|||||||
"""character_limit is an int that caps the number of characters to
|
"""character_limit is an int that caps the number of characters to
|
||||||
return per document."""
|
return per document."""
|
||||||
|
|
||||||
_box: Optional[box_sdk_gen.BoxClient] = None
|
box_search_options: Optional[BoxSearchOptions] = None
|
||||||
|
"""Search options to configure BoxRetriever to narrow search results."""
|
||||||
|
|
||||||
|
_box: Optional[box_sdk_gen.BoxClient]
|
||||||
|
|
||||||
model_config = ConfigDict(
|
model_config = ConfigDict(
|
||||||
arbitrary_types_allowed=True,
|
arbitrary_types_allowed=True,
|
||||||
@ -636,9 +761,25 @@ class _BoxAPIWrapper(BaseModel):
|
|||||||
files = []
|
files = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = self._box.search.search_for_content( # type: ignore[union-attr]
|
results = None
|
||||||
query=query, fields=["id", "type", "extension"]
|
|
||||||
)
|
if self.box_search_options is None:
|
||||||
|
results = self._box.search.search_for_content( # type: ignore[union-attr]
|
||||||
|
query=query, fields=["id", "type", "extension"], type="file"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
results = self._box.search.search_for_content( # type: ignore[union-attr]
|
||||||
|
query=query,
|
||||||
|
fields=["id", "type", "extension"],
|
||||||
|
type="file",
|
||||||
|
ancestor_folder_ids=self.box_search_options.ancestor_folder_ids, # type: ignore[union-attr]
|
||||||
|
content_types=self.box_search_options.search_type_filter, # type: ignore[union-attr]
|
||||||
|
created_at_range=self.box_search_options.created_date_range, # type: ignore[union-attr]
|
||||||
|
file_extensions=self.box_search_options.file_extensions, # type: ignore[union-attr]
|
||||||
|
limit=self.box_search_options.k, # type: ignore[union-attr]
|
||||||
|
size_range=self.box_search_options.size_range, # type: ignore[union-attr]
|
||||||
|
updated_at_range=self.box_search_options.updated_date_range, # type: ignore[union-attr]
|
||||||
|
)
|
||||||
|
|
||||||
if results.entries is None or len(results.entries) <= 0:
|
if results.entries is None or len(results.entries) <= 0:
|
||||||
return None # type: ignore[return-value]
|
return None # type: ignore[return-value]
|
||||||
|
@ -3,7 +3,13 @@ from langchain_core.documents import Document
|
|||||||
from pytest_mock import MockerFixture
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
from langchain_box.retrievers import BoxRetriever
|
from langchain_box.retrievers import BoxRetriever
|
||||||
from langchain_box.utilities import BoxAuth, BoxAuthType
|
from langchain_box.utilities import (
|
||||||
|
BoxAuth,
|
||||||
|
BoxAuthType,
|
||||||
|
BoxSearchOptions,
|
||||||
|
DocumentFiles,
|
||||||
|
SearchTypeFilter,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Test auth types
|
# Test auth types
|
||||||
@ -62,6 +68,44 @@ def test_search(mocker: MockerFixture) -> None:
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# test search options
|
||||||
|
def test_search_options(mocker: MockerFixture) -> None:
    """BoxRetriever accepts ``box_search_options`` and returns search results.

    ``_BoxAPIWrapper.search_box`` is patched, so this checks that a retriever
    built with a populated ``BoxSearchOptions`` passes model validation and
    hands back the documents produced by the wrapper.
    """
    expected_documents = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]

    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.search_box",
        return_value=(
            [
                Document(
                    page_content="Test file mode\ndocument contents",
                    metadata={"title": "Testing Files"},
                )
            ]
        ),
    )

    box_search_options = BoxSearchOptions(
        ancestor_folder_ids=["box_folder_id"],
        search_type_filter=[SearchTypeFilter.FILE_CONTENT],
        # Both bounds are plain RFC3339 timestamps; the original end bound had
        # a stray trailing comma inside the string.
        created_date_range=["2023-01-01T00:00:00-07:00", "2024-08-01T00:00:00-07:00"],
        file_extensions=[DocumentFiles.DOCX, DocumentFiles.PDF],
        k=200,
        size_range=[1, 1000000],
        updated_date_range=None,
    )

    retriever = BoxRetriever(  # type: ignore[call-arg]
        box_developer_token="box_developer_token", box_search_options=box_search_options
    )

    documents = retriever.invoke("query")

    assert documents == expected_documents
|
||||||
|
|
||||||
|
|
||||||
# test ai retrieval
|
# test ai retrieval
|
||||||
def test_ai(mocker: MockerFixture) -> None:
|
def test_ai(mocker: MockerFixture) -> None:
|
||||||
mocker.patch(
|
mocker.patch(
|
||||||
|
@ -5,6 +5,9 @@ EXPECTED_ALL = [
|
|||||||
"BoxRetriever",
|
"BoxRetriever",
|
||||||
"BoxAuth",
|
"BoxAuth",
|
||||||
"BoxAuthType",
|
"BoxAuthType",
|
||||||
|
"BoxSearchOptions",
|
||||||
|
"DocumentFiles",
|
||||||
|
"SearchTypeFilter",
|
||||||
"_BoxAPIWrapper",
|
"_BoxAPIWrapper",
|
||||||
"__version__",
|
"__version__",
|
||||||
]
|
]
|
||||||
|
Loading…
Reference in New Issue
Block a user