From 558fb4d66d0ab40f8053c5870b9b6bb94589a140 Mon Sep 17 00:00:00 2001 From: Scott Hurrey Date: Fri, 4 Oct 2024 14:32:34 -0400 Subject: [PATCH] box: Add citation support to langchain_box.retrievers.BoxRetriever when used with Box AI (#27012) Thank you for contributing to LangChain! **Description:** Box AI can return responses, but it can also be configured to return citations. This change allows the developer to decide if they want the answer, the citations, or both. Regardless of the combination, this is returned as a single List[Document] object. **Dependencies:** Updated to the latest Box Python SDK, v1.5.1 **Twitter handle:** BoxPlatform - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
Co-authored-by: Erick Friis --- docs/docs/integrations/retrievers/box.ipynb | 116 ++++++++++++++---- .../box/langchain_box/retrievers/box.py | 24 +++- .../box/langchain_box/utilities/box.py | 47 +++++-- libs/partners/box/poetry.lock | 32 ++--- libs/partners/box/pyproject.toml | 2 +- .../retrievers/test_box_retriever.py | 70 +++++++++++ 6 files changed, 242 insertions(+), 49 deletions(-) diff --git a/docs/docs/integrations/retrievers/box.ipynb b/docs/docs/integrations/retrievers/box.ipynb index 8b25c1089e8..a4abf132ac5 100644 --- a/docs/docs/integrations/retrievers/box.ipynb +++ b/docs/docs/integrations/retrievers/box.ipynb @@ -52,18 +52,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "b87a8e8b-9b5a-4e78-97e4-274b6b0dd29f", "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Enter your Box Developer Token: ········\n" - ] - } - ], + "outputs": [], "source": [ "import getpass\n", "import os\n", @@ -81,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a15d341e-3e26-4ca3-830b-5aab30ed66de", "metadata": {}, "outputs": [], @@ -102,10 +94,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "652d6238-1f87-422a-b135-f5abbb8652fc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install -qU langchain-box" ] @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "70cc8e65-2a02-408a-bbc6-8ef649057d82", "metadata": {}, "outputs": [], @@ -146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 6, "id": "97f3ae67", "metadata": {}, "outputs": [ @@ -156,7 +156,7 @@ "[Document(metadata={'source': 
'https://dl.boxcloud.com/api/2.0/internal_files/1514555423624/versions/1663171610024/representations/extracted_text/content/', 'title': 'Invoice-A5555_txt'}, page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine Items:\\n - Gravitational Wave Detector Kit: $800\\n - Exoplanet Terrarium: $120\\nTotal: $920')]" ] }, - "execution_count": 33, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -192,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "ee0e726d-9974-4aa0-9ce1-0057ec3e540a", "metadata": {}, "outputs": [], @@ -216,17 +216,17 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "51a60dbe-9f2e-4e04-bb62-23968f17164a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(metadata={'source': 'Box AI', 'title': 'Box AI What was the most expensive item purchased'}, page_content='The most expensive item purchased was the **Gravitational Wave Detector Kit** from AstroTech Solutions, which cost $800.')]" + "[Document(metadata={'source': 'Box AI', 'title': 'Box AI What was the most expensive item purchased'}, page_content='The most expensive item purchased is the **Gravitational Wave Detector Kit** from AstroTech Solutions, which costs **$800**.')]" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -237,6 +237,80 @@ "retriever.invoke(query)" ] }, + { + "cell_type": "markdown", + "id": "31a59a51", + "metadata": {}, + "source": [ + "## Citations\n", + "\n", + "With Box AI and the `BoxRetriever`, you can return the answer to your prompt, return the citations used by Box to get that answer, or both. No matter how you choose to use Box AI, the retriever returns a `List[Document]` object. We offer this flexibility with two `bool` arguments, `answer` and `citations`. Answer defaults to `True` and citations defaults to `False`, so you can omit both if you just want the answer. 
If you want both, you can just include `citations=True` and if you only want citations, you would include `answer=False` and `citations=True`\n", + "\n", + "### Get both" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2eddc8c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'source': 'Box AI', 'title': 'Box AI What was the most expensive item purchased'}, page_content='The most expensive item purchased is the **Gravitational Wave Detector Kit** from AstroTech Solutions, which costs **$800**.'),\n", + " Document(metadata={'source': 'Box AI What was the most expensive item purchased', 'file_name': 'Invoice-A5555.txt', 'file_id': '1514555423624', 'file_type': 'file'}, page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine Items:\\n - Gravitational Wave Detector Kit: $800\\n - Exoplanet Terrarium: $120\\nTotal: $920')]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever = BoxRetriever(\n", + " box_developer_token=box_developer_token, box_file_ids=box_file_ids, citations=True\n", + ")\n", + "\n", + "retriever.invoke(query)" + ] + }, + { + "cell_type": "markdown", + "id": "d2e93a2e", + "metadata": {}, + "source": [ + "### Citations only" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c1892b07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'source': 'Box AI What was the most expensive item purchased', 'file_name': 'Invoice-A5555.txt', 'file_id': '1514555423624', 'file_type': 'file'}, page_content='Vendor: AstroTech Solutions\\nInvoice Number: A5555\\n\\nLine Items:\\n - Gravitational Wave Detector Kit: $800\\n - Exoplanet Terrarium: $120\\nTotal: $920')]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever = BoxRetriever(\n", + " box_developer_token=box_developer_token,\n", + " 
box_file_ids=box_file_ids,\n", + " answer=False,\n", + " citations=True,\n", + ")\n", + "\n", + "retriever.invoke(query)" + ] + }, { "cell_type": "markdown", "id": "dfe8aad4-8626-4330-98a9-7ea1ca5d2e0e", @@ -260,7 +334,7 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ "Enter your OpenAI key: ········\n" diff --git a/libs/partners/box/langchain_box/retrievers/box.py b/libs/partners/box/langchain_box/retrievers/box.py index b216115af3d..1c8dc550665 100644 --- a/libs/partners/box/langchain_box/retrievers/box.py +++ b/libs/partners/box/langchain_box/retrievers/box.py @@ -3,7 +3,8 @@ from typing import List, Optional from langchain_core.callbacks import CallbackManagerForRetrieverRun from langchain_core.documents import Document from langchain_core.retrievers import BaseRetriever -from pydantic import ConfigDict, model_validator +from langchain_core.utils import from_env +from pydantic import ConfigDict, Field, model_validator from typing_extensions import Self from langchain_box.utilities import BoxAuth, BoxSearchOptions, _BoxAPIWrapper @@ -113,8 +114,9 @@ class BoxRetriever(BaseRetriever): he decides to go to the pool with Carlos.' """ # noqa: E501 - box_developer_token: Optional[str] = None - """String containing the Box Developer Token generated in the developer console""" + box_developer_token: Optional[str] = Field( + default_factory=from_env("BOX_DEVELOPER_TOKEN", default=None) + ) box_auth: Optional[BoxAuth] = None """Configured @@ -131,6 +133,15 @@ class BoxRetriever(BaseRetriever): box_search_options: Optional[BoxSearchOptions] = None """Search options to configure BoxRetriever to narrow search results.""" + answer: Optional[bool] = True + """When using Box AI, return the answer to the prompt as a `Document` + object. Returned as `List[Document]`. Default is `True`.""" + + citations: Optional[bool] = False + """When using Box AI, return the citations for the prompt as + `Document` objects. 
Can be used with answer. Returned as `List[Document]`. + Default is `False`.""" + _box: Optional[_BoxAPIWrapper] model_config = ConfigDict( @@ -164,6 +175,11 @@ class BoxRetriever(BaseRetriever): self, query: str, *, run_manager: CallbackManagerForRetrieverRun ) -> List[Document]: if self.box_file_ids: # If using Box AI - return self._box.ask_box_ai(query=query, box_file_ids=self.box_file_ids) # type: ignore[union-attr] + return self._box.ask_box_ai( # type: ignore[union-attr] + query=query, + box_file_ids=self.box_file_ids, + answer=self.answer, # type: ignore[arg-type] + citations=self.citations, # type: ignore[arg-type] + ) else: # If using Search return self._box.search_box(query=query) # type: ignore[union-attr] diff --git a/libs/partners/box/langchain_box/utilities/box.py b/libs/partners/box/langchain_box/utilities/box.py index 758f454a9da..8c87a8dc4cd 100644 --- a/libs/partners/box/langchain_box/utilities/box.py +++ b/libs/partners/box/langchain_box/utilities/box.py @@ -805,7 +805,13 @@ class _BoxAPIWrapper(BaseModel): f"BoxSDKError: Error getting search results: {bse.message}" ) - def ask_box_ai(self, query: str, box_file_ids: List[str]) -> List[Document]: + def ask_box_ai( + self, + query: str, + box_file_ids: List[str], + answer: bool = True, + citations: bool = False, + ) -> List[Document]: if self._box is None: self.get_box_client() @@ -819,13 +825,16 @@ class _BoxAPIWrapper(BaseModel): items = [] for file_id in box_file_ids: - item = box_sdk_gen.CreateAiAskItems( - id=file_id, type=box_sdk_gen.CreateAiAskItemsTypeField.FILE.value + item = box_sdk_gen.AiItemBase( + id=file_id, type=box_sdk_gen.AiItemBaseTypeField.FILE.value ) items.append(item) try: - response = self._box.ai.create_ai_ask(ai_mode, query, items) # type: ignore[union-attr] + response = self._box.ai.create_ai_ask( # type: ignore[union-attr] + mode=ai_mode, prompt=query, items=items, include_citations=citations + ) + except box_sdk_gen.BoxAPIError as bae: raise RuntimeError( f"BoxAPIError: 
Error getting Box AI result: {bae.message}" @@ -835,8 +844,32 @@ class _BoxAPIWrapper(BaseModel): f"BoxSDKError: Error getting Box AI result: {bse.message}" ) - content = response.answer + docs = [] - metadata = {"source": "Box AI", "title": f"Box AI {query}"} + if answer: + content = response.answer + metadata = {"source": "Box AI", "title": f"Box AI {query}"} - return [Document(page_content=content, metadata=metadata)] + document = Document(page_content=content, metadata=metadata) + docs.append(document) + + if citations: + box_citations = response.citations + + for citation in box_citations: + content = citation.content + file_name = citation.name + file_id = citation.id + file_type = citation.type.value + + metadata = { + "source": f"Box AI {query}", + "file_name": file_name, + "file_id": file_id, + "file_type": file_type, + } + + document = Document(page_content=content, metadata=metadata) + docs.append(document) + + return docs diff --git a/libs/partners/box/poetry.lock b/libs/partners/box/poetry.lock index 49e70f6fca4..c09158c9e79 100644 --- a/libs/partners/box/poetry.lock +++ b/libs/partners/box/poetry.lock @@ -13,13 +13,13 @@ files = [ [[package]] name = "anyio" -version = "4.4.0" +version = "4.6.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"}, - {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"}, + {file = "anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a"}, + {file = "anyio-4.6.0.tar.gz", hash = "sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb"}, ] [package.dependencies] @@ -29,19 +29,19 @@ sniffio = ">=1.1" typing-extensions = {version = ">=4.1", markers 
= "python_version < \"3.11\""} [package.extras] -doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (>=0.23)"] +doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.21.0b1)"] +trio = ["trio (>=0.26.1)"] [[package]] name = "box-sdk-gen" -version = "1.5.0" +version = "1.5.1" description = "Official Box Python Generated SDK" optional = false python-versions = "*" files = [ - {file = "box_sdk_gen-1.5.0-py3-none-any.whl", hash = "sha256:37763ace35fcb02ef84e28b4f61a93fe0d74ae22ceb9e1815ec1309b737cc016"}, - {file = "box_sdk_gen-1.5.0.tar.gz", hash = "sha256:49440a5dc4744261a877d149f0cad0a68f6096c2ad2a0c3392fe443269adca5f"}, + {file = "box_sdk_gen-1.5.1-py3-none-any.whl", hash = "sha256:3aba4615940566df86a236781ac34defd33ac127b9027a8a73775997b6a1ef97"}, + {file = "box_sdk_gen-1.5.1.tar.gz", hash = "sha256:2171b5a9b9d93014aecd4a883767459839515ecab18c6358868a5457401d896e"}, ] [package.dependencies] @@ -444,7 +444,7 @@ files = [ [[package]] name = "langchain-core" -version = "0.3.1" +version = "0.3.6" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9,<4.0" @@ -453,7 +453,7 @@ develop = true [package.dependencies] jsonpatch = "^1.33" -langsmith = "^0.1.117" +langsmith = "^0.1.125" packaging = ">=23.2,<25" pydantic = [ {version = ">=2.5.2,<3.0.0", markers = "python_full_version < \"3.12.4\""}, @@ -469,13 +469,13 @@ url = "../../core" [[package]] name = "langsmith" -version = "0.1.122" +version = "0.1.129" description = "Client library to connect to 
the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.1.122-py3-none-any.whl", hash = "sha256:9c9cde442d7321e8557f5c45c14b1b643b8aa28acc3f844d3a0021a9571aad7c"}, - {file = "langsmith-0.1.122.tar.gz", hash = "sha256:56dff727ca529fe8df300e6e4759dc920efe10ab8cd602b4d6b51e33599214e6"}, + {file = "langsmith-0.1.129-py3-none-any.whl", hash = "sha256:31393fbbb17d6be5b99b9b22d530450094fab23c6c37281a6a6efb2143d05347"}, + {file = "langsmith-0.1.129.tar.gz", hash = "sha256:6c3ba66471bef41b9f87da247cc0b493268b3f54656f73648a256a205261b6a0"}, ] [package.dependencies] @@ -1076,4 +1076,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.13" -content-hash = "5fc4a4313b1be5863c4abe9d237ce4eef744c4ad9173a7496e138020db6673de" +content-hash = "fe5ad1ad0e68ef281c8fc11b19ddb9494343dd4931a9b33804bb9698fa5d3b3d" diff --git a/libs/partners/box/pyproject.toml b/libs/partners/box/pyproject.toml index 7487325be3c..af8cd16502b 100644 --- a/libs/partners/box/pyproject.toml +++ b/libs/partners/box/pyproject.toml @@ -14,7 +14,7 @@ license = "MIT" [tool.poetry.dependencies] python = ">=3.9.0,<3.13" langchain-core = "^0.3.1" -box-sdk-gen = { extras = ["jwt"], version = "^1.1.0" } +box-sdk-gen = { extras = ["jwt"], version = "^1.5.0" } pydantic = "^2" [tool.poetry.group.test] diff --git a/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py b/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py index 82a0a304207..054f7ed205c 100644 --- a/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py +++ b/libs/partners/box/tests/unit_tests/retrievers/test_box_retriever.py @@ -131,3 +131,73 @@ def test_ai(mocker: MockerFixture) -> None: metadata={"title": "Testing Files"}, ) ] + + +# test ai retrieval with answer and citations +def test_ai_answer_citations(mocker: MockerFixture) -> None: + mocker.patch( + 
"langchain_box.utilities._BoxAPIWrapper.ask_box_ai", + return_value=( + [ + Document( + page_content="Test file mode\ndocument contents", + metadata={"title": "Testing Files"}, + ), + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ] + ), + ) + + retriever = BoxRetriever( # type: ignore[call-arg] + box_developer_token="box_developer_token", + box_file_ids=["box_file_ids"], + citations=True, + ) + + documents = retriever.invoke("query") + assert documents == [ + Document( + page_content="Test file mode\ndocument contents", + metadata={"title": "Testing Files"}, + ), + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ] + + +# test ai retrieval with citations only +def test_ai_citations_only(mocker: MockerFixture) -> None: + mocker.patch( + "langchain_box.utilities._BoxAPIWrapper.ask_box_ai", + return_value=( + [ + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ] + ), + ) + + retriever = BoxRetriever( # type: ignore[call-arg] + box_developer_token="box_developer_token", + box_file_ids=["box_file_ids"], + citations=True, + ) + + documents = 
retriever.invoke("query") + assert documents == [ + Document(page_content="citation 1", metadata={"source": "source 1"}), + Document(page_content="citation 2", metadata={"source": "source 2"}), + Document(page_content="citation 3", metadata={"source": "source 3"}), + Document(page_content="citation 4", metadata={"source": "source 4"}), + Document(page_content="citation 5", metadata={"source": "source 5"}), + ]