diff --git a/docs/docs/integrations/retrievers/box.ipynb b/docs/docs/integrations/retrievers/box.ipynb index 8b25c1089e8..a4abf132ac5 100644 --- a/docs/docs/integrations/retrievers/box.ipynb +++ b/docs/docs/integrations/retrievers/box.ipynb @@ -52,18 +52,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "b87a8e8b-9b5a-4e78-97e4-274b6b0dd29f", "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Enter your Box Developer Token: ········\n" - ] - } - ], + "outputs": [], "source": [ "import getpass\n", "import os\n", @@ -81,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a15d341e-3e26-4ca3-830b-5aab30ed66de", "metadata": {}, "outputs": [], @@ -102,10 +94,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "652d6238-1f87-422a-b135-f5abbb8652fc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install -qU langchain-box" ] @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "70cc8e65-2a02-408a-bbc6-8ef649057d82", "metadata": {}, "outputs": [], @@ -146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 6, "id": "97f3ae67", "metadata": {}, "outputs": [ @@ -156,7 +156,7 @@ "[Document(metadata={'source': 'https://dl.boxcloud.com/api/2.0/internal_files/1514555423624/versions/1663171610024/representations/extracted_text/content/', 'title': 'Invoice-A5555_txt'}, page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine Items:\\n - Gravitational Wave Detector Kit: $800\\n - Exoplanet Terrarium: $120\\nTotal: $920')]" ] }, - "execution_count": 33, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -192,7 +192,7 @@ }, { "cell_type": 
"code", - "execution_count": 4, + "execution_count": 7, "id": "ee0e726d-9974-4aa0-9ce1-0057ec3e540a", "metadata": {}, "outputs": [], @@ -216,17 +216,17 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "51a60dbe-9f2e-4e04-bb62-23968f17164a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(metadata={'source': 'Box AI', 'title': 'Box AI What was the most expensive item purchased'}, page_content='The most expensive item purchased was the **Gravitational Wave Detector Kit** from AstroTech Solutions, which cost $800.')]" + "[Document(metadata={'source': 'Box AI', 'title': 'Box AI What was the most expensive item purchased'}, page_content='The most expensive item purchased is the **Gravitational Wave Detector Kit** from AstroTech Solutions, which costs **$800**.')]" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -237,6 +237,80 @@ "retriever.invoke(query)" ] }, + { + "cell_type": "markdown", + "id": "31a59a51", + "metadata": {}, + "source": [ + "## Citations\n", + "\n", + "With Box AI and the `BoxRetriever`, you can return the answer to your prompt, return the citations used by Box to get that answer, or both. No matter how you choose to use Box AI, the retriever returns a `List[Document]` object. We offer this flexibility with two `bool` arguments, `answer` and `citations`. Answer defaults to `True` and citations defaults to `False`, so you can omit both if you just want the answer. 
If you want both, you can just include `citations=True` and if you only want citations, you would include `answer=False` and `citations=True`\n", + "\n", + "### Get both" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2eddc8c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'source': 'Box AI', 'title': 'Box AI What was the most expensive item purchased'}, page_content='The most expensive item purchased is the **Gravitational Wave Detector Kit** from AstroTech Solutions, which costs **$800**.'),\n", + " Document(metadata={'source': 'Box AI What was the most expensive item purchased', 'file_name': 'Invoice-A5555.txt', 'file_id': '1514555423624', 'file_type': 'file'}, page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine Items:\\n - Gravitational Wave Detector Kit: $800\\n - Exoplanet Terrarium: $120\\nTotal: $920')]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever = BoxRetriever(\n", + " box_developer_token=box_developer_token, box_file_ids=box_file_ids, citations=True\n", + ")\n", + "\n", + "retriever.invoke(query)" + ] + }, + { + "cell_type": "markdown", + "id": "d2e93a2e", + "metadata": {}, + "source": [ + "### Citations only" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c1892b07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'source': 'Box AI What was the most expensive item purchased', 'file_name': 'Invoice-A5555.txt', 'file_id': '1514555423624', 'file_type': 'file'}, page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine Items:\\n - Gravitational Wave Detector Kit: $800\\n - Exoplanet Terrarium: $120\\nTotal: $920')]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever = BoxRetriever(\n", + " box_developer_token=box_developer_token,\n", + " 
box_file_ids=box_file_ids,\n", + " answer=False,\n", + " citations=True,\n", + ")\n", + "\n", + "retriever.invoke(query)" + ] + }, { "cell_type": "markdown", "id": "dfe8aad4-8626-4330-98a9-7ea1ca5d2e0e", @@ -260,7 +334,7 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ "Enter your OpenAI key: ········\n" diff --git a/libs/partners/box/langchain_box/retrievers/box.py b/libs/partners/box/langchain_box/retrievers/box.py index b216115af3d..1c8dc550665 100644 --- a/libs/partners/box/langchain_box/retrievers/box.py +++ b/libs/partners/box/langchain_box/retrievers/box.py @@ -3,7 +3,8 @@ from typing import List, Optional from langchain_core.callbacks import CallbackManagerForRetrieverRun from langchain_core.documents import Document from langchain_core.retrievers import BaseRetriever -from pydantic import ConfigDict, model_validator +from langchain_core.utils import from_env +from pydantic import ConfigDict, Field, model_validator from typing_extensions import Self from langchain_box.utilities import BoxAuth, BoxSearchOptions, _BoxAPIWrapper @@ -113,8 +114,9 @@ class BoxRetriever(BaseRetriever): he decides to go to the pool with Carlos.' """ # noqa: E501 - box_developer_token: Optional[str] = None - """String containing the Box Developer Token generated in the developer console""" + box_developer_token: Optional[str] = Field( + default_factory=from_env("BOX_DEVELOPER_TOKEN", default=None) + ) box_auth: Optional[BoxAuth] = None """Configured @@ -131,6 +133,15 @@ class BoxRetriever(BaseRetriever): box_search_options: Optional[BoxSearchOptions] = None """Search options to configure BoxRetriever to narrow search results.""" + answer: Optional[bool] = True + """When using Box AI, return the answer to the prompt as a `Document` + object. Returned as `List[Document]`. Default is `True`.""" + + citations: Optional[bool] = False + """When using Box AI, return the citations for the prompt as + `Document` objects. 
Can be used with `answer`. Returned as `List[Document]`. + Default is `False`.""" + _box: Optional[_BoxAPIWrapper] model_config = ConfigDict( @@ -164,6 +175,11 @@ class BoxRetriever(BaseRetriever): self, query: str, *, run_manager: CallbackManagerForRetrieverRun ) -> List[Document]: if self.box_file_ids: # If using Box AI - return self._box.ask_box_ai(query=query, box_file_ids=self.box_file_ids) # type: ignore[union-attr] + return self._box.ask_box_ai( # type: ignore[union-attr] + query=query, + box_file_ids=self.box_file_ids, + answer=self.answer, # type: ignore[arg-type] + citations=self.citations, # type: ignore[arg-type] + ) else: # If using Search return self._box.search_box(query=query) # type: ignore[union-attr] diff --git a/libs/partners/box/langchain_box/utilities/box.py b/libs/partners/box/langchain_box/utilities/box.py index 758f454a9da..8c87a8dc4cd 100644 --- a/libs/partners/box/langchain_box/utilities/box.py +++ b/libs/partners/box/langchain_box/utilities/box.py @@ -805,7 +805,13 @@ class _BoxAPIWrapper(BaseModel): f"BoxSDKError: Error getting search results: {bse.message}" ) - def ask_box_ai(self, query: str, box_file_ids: List[str]) -> List[Document]: + def ask_box_ai( + self, + query: str, + box_file_ids: List[str], + answer: bool = True, + citations: bool = False, + ) -> List[Document]: if self._box is None: self.get_box_client() @@ -819,13 +825,16 @@ class _BoxAPIWrapper(BaseModel): items = [] for file_id in box_file_ids: - item = box_sdk_gen.CreateAiAskItems( - id=file_id, type=box_sdk_gen.CreateAiAskItemsTypeField.FILE.value + item = box_sdk_gen.AiItemBase( + id=file_id, type=box_sdk_gen.AiItemBaseTypeField.FILE.value ) items.append(item) try: - response = self._box.ai.create_ai_ask(ai_mode, query, items) # type: ignore[union-attr] + response = self._box.ai.create_ai_ask( # type: ignore[union-attr] + mode=ai_mode, prompt=query, items=items, include_citations=citations + ) + except box_sdk_gen.BoxAPIError as bae: raise RuntimeError( f"BoxAPIError: 
Error getting Box AI result: {bae.message}" @@ -835,8 +844,32 @@ class _BoxAPIWrapper(BaseModel): f"BoxSDKError: Error getting Box AI result: {bse.message}" ) - content = response.answer + docs = [] - metadata = {"source": "Box AI", "title": f"Box AI {query}"} + if answer: + content = response.answer + metadata = {"source": "Box AI", "title": f"Box AI {query}"} - return [Document(page_content=content, metadata=metadata)] + document = Document(page_content=content, metadata=metadata) + docs.append(document) + + if citations: + box_citations = response.citations + + for citation in box_citations: + content = citation.content + file_name = citation.name + file_id = citation.id + file_type = citation.type.value + + metadata = { + "source": f"Box AI {query}", + "file_name": file_name, + "file_id": file_id, + "file_type": file_type, + } + + document = Document(page_content=content, metadata=metadata) + docs.append(document) + + return docs diff --git a/libs/partners/box/poetry.lock b/libs/partners/box/poetry.lock index 49e70f6fca4..c09158c9e79 100644 --- a/libs/partners/box/poetry.lock +++ b/libs/partners/box/poetry.lock @@ -13,13 +13,13 @@ files = [ [[package]] name = "anyio" -version = "4.4.0" +version = "4.6.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"}, - {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"}, + {file = "anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a"}, + {file = "anyio-4.6.0.tar.gz", hash = "sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb"}, ] [package.dependencies] @@ -29,19 +29,19 @@ sniffio = ">=1.1" typing-extensions = {version = ">=4.1", markers 
= "python_version < \"3.11\""} [package.extras] -doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (>=0.23)"] +doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.21.0b1)"] +trio = ["trio (>=0.26.1)"] [[package]] name = "box-sdk-gen" -version = "1.5.0" +version = "1.5.1" description = "Official Box Python Generated SDK" optional = false python-versions = "*" files = [ - {file = "box_sdk_gen-1.5.0-py3-none-any.whl", hash = "sha256:37763ace35fcb02ef84e28b4f61a93fe0d74ae22ceb9e1815ec1309b737cc016"}, - {file = "box_sdk_gen-1.5.0.tar.gz", hash = "sha256:49440a5dc4744261a877d149f0cad0a68f6096c2ad2a0c3392fe443269adca5f"}, + {file = "box_sdk_gen-1.5.1-py3-none-any.whl", hash = "sha256:3aba4615940566df86a236781ac34defd33ac127b9027a8a73775997b6a1ef97"}, + {file = "box_sdk_gen-1.5.1.tar.gz", hash = "sha256:2171b5a9b9d93014aecd4a883767459839515ecab18c6358868a5457401d896e"}, ] [package.dependencies] @@ -444,7 +444,7 @@ files = [ [[package]] name = "langchain-core" -version = "0.3.1" +version = "0.3.6" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9,<4.0" @@ -453,7 +453,7 @@ develop = true [package.dependencies] jsonpatch = "^1.33" -langsmith = "^0.1.117" +langsmith = "^0.1.125" packaging = ">=23.2,<25" pydantic = [ {version = ">=2.5.2,<3.0.0", markers = "python_full_version < \"3.12.4\""}, @@ -469,13 +469,13 @@ url = "../../core" [[package]] name = "langsmith" -version = "0.1.122" +version = "0.1.129" description = "Client library to connect to 
the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.1.122-py3-none-any.whl", hash = "sha256:9c9cde442d7321e8557f5c45c14b1b643b8aa28acc3f844d3a0021a9571aad7c"}, - {file = "langsmith-0.1.122.tar.gz", hash = "sha256:56dff727ca529fe8df300e6e4759dc920efe10ab8cd602b4d6b51e33599214e6"}, + {file = "langsmith-0.1.129-py3-none-any.whl", hash = "sha256:31393fbbb17d6be5b99b9b22d530450094fab23c6c37281a6a6efb2143d05347"}, + {file = "langsmith-0.1.129.tar.gz", hash = "sha256:6c3ba66471bef41b9f87da247cc0b493268b3f54656f73648a256a205261b6a0"}, ] [package.dependencies] @@ -1076,4 +1076,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.13" -content-hash = "5fc4a4313b1be5863c4abe9d237ce4eef744c4ad9173a7496e138020db6673de" +content-hash = "fe5ad1ad0e68ef281c8fc11b19ddb9494343dd4931a9b33804bb9698fa5d3b3d" diff --git a/libs/partners/box/pyproject.toml b/libs/partners/box/pyproject.toml index 7487325be3c..af8cd16502b 100644 --- a/libs/partners/box/pyproject.toml +++ b/libs/partners/box/pyproject.toml @@ -14,7 +14,7 @@ license = "MIT" [tool.poetry.dependencies] python = ">=3.9.0,<3.13" langchain-core = "^0.3.1" -box-sdk-gen = { extras = ["jwt"], version = "^1.1.0" } +box-sdk-gen = { extras = ["jwt"], version = "^1.5.0" } pydantic = "^2" [tool.poetry.group.test] diff --git a/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py b/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py index 82a0a304207..054f7ed205c 100644 --- a/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py +++ b/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py @@ -131,3 +131,73 @@ def test_ai(mocker: MockerFixture) -> None: metadata={"title": "Testing Files"}, ) ] + + +# test ai retrieval with answer and citations +def test_ai_answer_citations(mocker: MockerFixture) -> None: + mocker.patch( + 
"langchain_box.utilities._BoxAPIWrapper.ask_box_ai", + return_value=( + [ + Document( + page_content="Test file mode\ndocument contents", + metadata={"title": "Testing Files"}, + ), + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ] + ), + ) + + retriever = BoxRetriever( # type: ignore[call-arg] + box_developer_token="box_developer_token", + box_file_ids=["box_file_ids"], + citations=True, + ) + + documents = retriever.invoke("query") + assert documents == [ + Document( + page_content="Test file mode\ndocument contents", + metadata={"title": "Testing Files"}, + ), + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ] + + +# test ai retrieval with citations only +def test_ai_citations_only(mocker: MockerFixture) -> None: + mocker.patch( + "langchain_box.utilities._BoxAPIWrapper.ask_box_ai", + return_value=( + [ + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ] + ), + ) + + retriever = BoxRetriever( # type: ignore[call-arg] + box_developer_token="box_developer_token", + box_file_ids=["box_file_ids"], + citations=True, + ) + + documents = 
retriever.invoke("query") + assert documents == [ + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ]