From 2460f977c5c20073b41803c41fd08945be34cd60 Mon Sep 17 00:00:00 2001
From: QIAN Zifei
Date: Fri, 22 Dec 2023 08:40:27 +0800
Subject: [PATCH] community[minor]: Azure DocumentIntelligenceLoader/Parser
 support update with latest SDK (#14389)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- **Description:** Add DocumentIntelligenceLoader & DocumentIntelligenceParser
  implementations using the latest Azure Document Intelligence SDK, with
  markdown support. The core logic resides in DocumentIntelligenceParser;
  DocumentIntelligenceLoader is a thin wrapper around the parser. The parser
  takes api_endpoint and api_key and creates a DocumentIntelligenceClient for
  the user. 4 parsing modes are supported:
  1. Markdown (default)
  2. Single
  3. Page
  4. Object

  Unit tests and the notebook are updated accordingly.
- **Dependencies:** Azure Document Intelligence SDK:
  [azure-sdk-for-python/sdk/documentintelligence/azure-ai-documentintelligence at 7c42462ac662522a6fd21b17d2a20f4cd40d0356 · Azure/azure-sdk-for-python (github.com)](https://github.com/Azure/azure-sdk-for-python/tree/7c42462ac662522a6fd21b17d2a20f4cd40d0356/sdk/documentintelligence/azure-ai-documentintelligence).
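A minimal usage sketch of the new loader (the endpoint, key, and file path
below are placeholders; see the updated notebook for the full walkthrough):

```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# Placeholder values -- substitute a real Document Intelligence endpoint,
# key, and input file.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    file_path="<path/to/file>",
    api_model="prebuilt-layout",
    mode="markdown",  # one of "markdown", "single", "page", "object"
)
documents = loader.load()
```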
---------

Co-authored-by: Erick Friis
---
 .../azure_document_intelligence.ipynb         |  97 ++++++------
 .../document_loaders/__init__.py              |   4 +
 .../document_loaders/doc_intelligence.py      |  89 +++++++++++++
 .../document_loaders/parsers/__init__.py      |   4 +
 .../parsers/doc_intelligence.py               | 122 ++++++++++++++++++
 .../document_loaders/parsers/pdf.py           |  10 +-
 libs/community/poetry.lock                    |  55 +++++++-
 libs/community/pyproject.toml                 |   2 +
 .../parsers/test_doc_intelligence.py          |  27 ++++
 .../parsers/test_public_api.py                |   1 +
 .../document_loaders/test_imports.py          |   1 +
 11 files changed, 367 insertions(+), 45 deletions(-)
 create mode 100644 libs/community/langchain_community/document_loaders/doc_intelligence.py
 create mode 100644 libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
 create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py

diff --git a/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb b/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb
index 390db4b5abb..dc68783d3d9 100644
--- a/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb
+++ b/docs/docs/integrations/document_loaders/azure_document_intelligence.ipynb
@@ -5,7 +5,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Azure Document Intelligence"
+    "# Azure AI Document Intelligence"
    ]
   },
   {
@@ -13,7 +13,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Azure Document Intelligence (formerly known as Azure Forms Recognizer) is machine-learning \n",
+    "Azure AI Document Intelligence (formerly known as Azure Form Recognizer) is a machine-learning \n",
     "based service that extracts text (including handwriting), tables or key-value-pairs from\n",
     "scanned documents or images.\n",
     "\n",
@@ -21,7 +21,7 @@
     "\n",
     "Document Intelligence supports PDF, JPEG, PNG, BMP, or TIFF.\n",
     "\n",
-    "Further documentation is available at https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/?view=doc-intel-3.1.0.\n"
+    "Further documentation is available at https://aka.ms/doc-intelligence.\n"
    ]
   },
   {
@@ -30,7 +30,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install langchain azure-ai-formrecognizer -q"
+    "%pip install langchain langchain-community azure-ai-documentintelligence -q"
    ]
   },
   {
@@ -46,23 +46,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The first example uses a local file which will be sent to Azure Document Intelligence.\n",
-    "\n",
-    "First, an instance of a DocumentAnalysisClient is created with endpoint and key for the Azure service. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from azure.ai.formrecognizer import DocumentAnalysisClient\n",
-    "from azure.core.credentials import AzureKeyCredential\n",
-    "\n",
-    "document_analysis_client = DocumentAnalysisClient(\n",
-    "    endpoint=\"\", credential=AzureKeyCredential(\"\")\n",
-    ")"
+    "The first example uses a local file which will be sent to Azure AI Document Intelligence."
    ]
   },
   {
@@ -75,15 +59,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain.document_loaders.pdf import DocumentIntelligenceLoader\n",
+    "from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
     "\n",
-    "loader = DocumentIntelligenceLoader(\n",
-    "    \"\", client=document_analysis_client, model=\"\"\n",
-    ") # e.g. prebuilt-document\n",
+    "file_path = \"\"\n",
+    "endpoint = \"\"\n",
+    "key = \"\"\n",
+    "loader = AzureAIDocumentIntelligenceLoader(\n",
+    "    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model=\"prebuilt-layout\"\n",
+    ")\n",
     "\n",
     "documents = loader.load()"
    ]
   },
@@ -93,25 +80,45 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The output contains each page of the source document as a LangChain document: "
+    "The default output contains one LangChain document with markdown-formatted content: "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[Document(page_content='...', metadata={'source': '...', 'page': 1})]"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "documents"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 2\n",
+    "The input file can also be a URL path."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url_path = \"\"\n", + "loader = AzureAIDocumentIntelligenceLoader(\n", + " api_endpoint=endpoint, api_key=key, url_path=url_path, api_model=\"prebuilt-layout\"\n", + ")\n", + "\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "documents" ] @@ -124,8 +131,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.9.5" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" }, "vscode": { "interpreter": { diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 48143a30b66..ca295e538eb 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -77,6 +77,9 @@ from langchain_community.document_loaders.dataframe import DataFrameLoader from langchain_community.document_loaders.diffbot import DiffbotLoader from langchain_community.document_loaders.directory import DirectoryLoader from langchain_community.document_loaders.discord import DiscordChatLoader +from langchain_community.document_loaders.doc_intelligence import ( + AzureAIDocumentIntelligenceLoader, +) from langchain_community.document_loaders.docugami import DocugamiLoader from langchain_community.document_loaders.docusaurus import DocusaurusLoader from langchain_community.document_loaders.dropbox import DropboxLoader @@ -247,6 +250,7 @@ __all__ = [ "AssemblyAIAudioTranscriptLoader", "AsyncHtmlLoader", "AzureAIDataLoader", + "AzureAIDocumentIntelligenceLoader", "AzureBlobStorageContainerLoader", "AzureBlobStorageFileLoader", "BSHTMLLoader", diff --git a/libs/community/langchain_community/document_loaders/doc_intelligence.py b/libs/community/langchain_community/document_loaders/doc_intelligence.py new file mode 100644 index 00000000000..bad2130cecd --- /dev/null +++ b/libs/community/langchain_community/document_loaders/doc_intelligence.py @@ -0,0 +1,89 @@ +from typing import Iterator, List, Optional + +from langchain_core.documents import Document + +from langchain_community.document_loaders.base import BaseLoader +from langchain_community.document_loaders.blob_loaders import Blob +from langchain_community.document_loaders.parsers import ( + AzureAIDocumentIntelligenceParser, +) + + +class AzureAIDocumentIntelligenceLoader(BaseLoader): + """Loads a PDF with Azure Document Intelligence""" + + def __init__( + self, + api_endpoint: str, + api_key: str, + file_path: Optional[str] = None, + url_path: Optional[str] = None, + api_version: Optional[str] = None, + api_model: str = "prebuilt-layout", + mode: str = "markdown", + ) -> None: + """ + Initialize the object for file processing with Azure Document Intelligence + (formerly Form Recognizer). + + This constructor initializes a AzureAIDocumentIntelligenceParser object to be + used for parsing files using the Azure Document Intelligence API. The load + method generates Documents whose content representations are determined by the + mode parameter. + + Parameters: + ----------- + api_endpoint: str + The API endpoint to use for DocumentIntelligenceClient construction. 
+        api_key: str
+            The API key to use for DocumentIntelligenceClient construction.
+        file_path : Optional[str]
+            The path to the file that needs to be loaded.
+            Either file_path or url_path must be specified.
+        url_path : Optional[str]
+            The URL to the file that needs to be loaded.
+            Either file_path or url_path must be specified.
+        api_version: Optional[str]
+            The API version for DocumentIntelligenceClient. Leave as None to use
+            the default value from the SDK.
+        api_model: str
+            The model name or ID to be used for form recognition in Azure.
+        mode: str
+            The content representation of the returned Documents. One of
+            "markdown" (default), "single", "page", or "object".
+
+        Examples:
+        ---------
+        >>> obj = AzureAIDocumentIntelligenceLoader(
+        ...     file_path="path/to/file",
+        ...     api_endpoint="https://endpoint.azure.com",
+        ...     api_key="APIKEY",
+        ...     api_version="2023-10-31-preview",
+        ...     api_model="prebuilt-document"
+        ... )
+        """
+
+        assert (
+            file_path is not None or url_path is not None
+        ), "file_path or url_path must be provided"
+        self.file_path = file_path
+        self.url_path = url_path
+
+        self.parser = AzureAIDocumentIntelligenceParser(
+            api_endpoint=api_endpoint,
+            api_key=api_key,
+            api_version=api_version,
+            api_model=api_model,
+            mode=mode,
+        )
+
+    def load(self) -> List[Document]:
+        """Load given path as pages."""
+        return list(self.lazy_load())
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazy load given path as pages."""
+        if self.file_path is not None:
+            blob = Blob.from_path(self.file_path)
+            yield from self.parser.parse(blob)
+        else:
+            yield from self.parser.parse_url(self.url_path)
diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py
index c7bd6d73dff..9d01c3df2f5 100644
--- a/libs/community/langchain_community/document_loaders/parsers/__init__.py
+++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@@ -1,4 +1,7 @@
 from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
+from langchain_community.document_loaders.parsers.doc_intelligence import (
+    AzureAIDocumentIntelligenceParser,
+)
 from langchain_community.document_loaders.parsers.docai import DocAIParser
 from langchain_community.document_loaders.parsers.grobid import GrobidParser
 from langchain_community.document_loaders.parsers.html import BS4HTMLParser
@@ -12,6 +15,7 @@ from langchain_community.document_loaders.parsers.pdf import (
 )
 
 __all__ = [
+    "AzureAIDocumentIntelligenceParser",
     "BS4HTMLParser",
     "DocAIParser",
     "GrobidParser",
diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
new file mode 100644
index 00000000000..40645cea7d7
--- /dev/null
+++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py
@@ -0,0 +1,122 @@
+from typing import Any, Iterator, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
+
+
+class AzureAIDocumentIntelligenceParser(BaseBlobParser):
+    """Loads a PDF with Azure Document Intelligence
+    (formerly Form Recognizer)."""
+
+    def __init__(
+        self,
+        api_endpoint: str,
+        api_key: str,
+        api_version: Optional[str] = None,
+        api_model: str = "prebuilt-layout",
+        mode: str = "markdown",
+    ):
+        from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.core.credentials import AzureKeyCredential
+
+        kwargs = {}
+        if api_version is not None:
+            kwargs["api_version"] = api_version
+        self.client = DocumentIntelligenceClient(
+            endpoint=api_endpoint,
+            credential=AzureKeyCredential(api_key),
+            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
+            **kwargs,
+        )
+        self.api_model = api_model
+        self.mode = mode
+        assert self.mode in ["single", "page", "object", "markdown"]
+
+    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
+        for p in result.pages:
+            content = " ".join([line.content for line in p.lines])
+
+            d = Document(
+                page_content=content,
+                metadata={
+                    "page": p.page_number,
+                },
+            )
+            yield d
+
+    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
+        yield Document(page_content=result.content, metadata={})
+
+    def _generate_docs_object(self, result: Any) -> Iterator[Document]:
+        # record relationship between page id and span offset
+        page_offset = []
+        for page in result.pages:
+            # assume that spans only contain one element (to be verified)
+            page_offset.append(page.spans[0]["offset"])
+
+        # paragraph
+        # warning: paragraph content is overlapping with table content
+        for para in result.paragraphs:
+            yield Document(
+                page_content=para.content,
+                metadata={
+                    "role": para.role,
+                    "page": para.bounding_regions[0].page_number,
+                    "bounding_box": para.bounding_regions[0].polygon,
+                    "type": "paragraph",
+                },
+            )
+
+        # table
+        for table in result.tables:
+            yield Document(
+                page_content=table.cells,  # json object
+                metadata={
+                    "footnote": table.footnotes,
+                    "caption": table.caption,
+                    "page": table.bounding_regions[0].page_number,
+                    "bounding_box": table.bounding_regions[0].polygon,
+                    "row_count": table.row_count,
+                    "column_count": table.column_count,
+                    "type": "table",
+                },
+            )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+
+        with blob.as_bytes_io() as file_obj:
+            poller = self.client.begin_analyze_document(
+                self.api_model,
+                file_obj,
+                content_type="application/octet-stream",
+                output_content_format="markdown" if self.mode == "markdown" else "text",
+            )
+            result = poller.result()
+
+            if self.mode in ["single", "markdown"]:
+                yield from self._generate_docs_single(result)
+            elif self.mode == "page":
+                yield from self._generate_docs_page(result)
+            else:
+                yield from self._generate_docs_object(result)
+
+    def parse_url(self, url: str) -> Iterator[Document]:
+        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+
+        poller = self.client.begin_analyze_document(
+            self.api_model,
+            AnalyzeDocumentRequest(url_source=url),
+            # content_type="application/octet-stream",
+            output_content_format="markdown" if self.mode == "markdown" else "text",
+        )
+        result = poller.result()
+
+        if self.mode in ["single", "markdown"]:
+            yield from self._generate_docs_single(result)
+        elif self.mode == "page":
+            yield from self._generate_docs_page(result)
+        else:
+            yield from self._generate_docs_object(result)
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 93ba53527b9..5ef03f3f696 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -542,9 +542,17 @@ class AmazonTextractPDFParser(BaseBlobParser):
 
 class DocumentIntelligenceParser(BaseBlobParser):
     """Loads a PDF with Azure Document Intelligence
-    (formerly Forms Recognizer) and chunks at character level."""
+    (formerly Form Recognizer) and chunks at character level."""
 
     def __init__(self, client: Any, model: str):
+        warnings.warn(
+            "langchain.document_loaders.parsers.pdf.DocumentIntelligenceParser "
+            "and langchain.document_loaders.pdf.DocumentIntelligenceLoader"
+            " are deprecated. Please upgrade to "
+            "langchain.document_loaders.AzureAIDocumentIntelligenceLoader "
+            "for any file parsing purpose using Azure Document Intelligence "
+            "service."
+        )
         self.client = client
         self.model = model
diff --git a/libs/community/poetry.lock b/libs/community/poetry.lock
index 0f41b5941be..84df7d0c9dc 100644
--- a/libs/community/poetry.lock
+++ b/libs/community/poetry.lock
@@ -531,6 +531,41 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-
 tests = ["attrs[tests-no-zope]", "zope-interface"]
 tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
 
+[[package]]
+name = "azure-ai-documentintelligence"
+version = "1.0.0b1"
+description = "Microsoft Azure AI Document Intelligence Client Library for Python"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "azure-ai-documentintelligence-1.0.0b1.tar.gz", hash = "sha256:b0acedc50489cc63aac44190e32a3a04e5c50c98a1e4ed39bcb910f51fbf5207"},
+    {file = "azure_ai_documentintelligence-1.0.0b1-py3-none-any.whl", hash = "sha256:db81ea7c8c30e070b5b424a45f9c43c4111159ab6b3c2994c1346b3d3b01f682"},
+]
+
+[package.dependencies]
+azure-core = ">=1.28.0,<2.0.0"
+isodate = ">=0.6.1,<1.0.0"
+
+[[package]]
+name = "azure-core"
+version = "1.29.6"
+description = "Microsoft Azure Core Library for Python"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "azure-core-1.29.6.tar.gz", hash = "sha256:13b485252ecd9384ae624894fe51cfa6220966207264c360beada239f88b738a"},
+    {file = "azure_core-1.29.6-py3-none-any.whl", hash = "sha256:604a005bce6a49ba661bb7b2be84a9b169047e52fcfcd0a4e4770affab4178f7"},
+]
+
+[package.dependencies]
+requests = ">=2.21.0"
+six = ">=1.11.0"
+typing-extensions = ">=4.6.0"
+
+[package.extras]
+aio = ["aiohttp (>=3.0)"]
+
 [[package]]
 name = "babel"
 version = "2.13.1"
@@ -3167,6 +3202,20 @@ widgetsnbextension = ">=4.0.9,<4.1.0"
 [package.extras]
 test = ["ipykernel", "jsonschema", "pytest (>=3.6.0)", "pytest-cov", "pytz"]
 
+[[package]]
+name = "isodate"
+version = "0.6.1"
+description = "An ISO 8601 date/time/duration parser and formatter"
+optional = true
+python-versions = "*"
+files = [
+    {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"},
+    {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "isoduration"
 version = "20.11.0"
@@ -3821,7 +3870,7 @@ files = [
 
 [[package]]
 name = "langchain-core"
-version = "0.1.1"
+version = "0.1.3"
 description = "Building applications with LLMs through composability"
 optional = false
 python-versions = ">=3.8.1,<4.0"
@@ -9062,9 +9111,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 
 [extras]
 cli = ["typer"]
-extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
"jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] +extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "00b69a8316c2748362f1f135e229950230be0401e7c307c0ce27a8309f947816" +content-hash = "9094149705a405904c268b09c7dddae98fa466f67b2606defb5c6e3661b36602" diff --git a/libs/community/pyproject.toml b/libs/community/pyproject.toml index b668fb96c88..bc75d721876 100644 --- a/libs/community/pyproject.toml +++ b/libs/community/pyproject.toml @@ -84,6 +84,7 @@ msal = {version = "^1.25.0", optional = true} databricks-vectorsearch = {version = "^0.21", optional = true} dgml-utils = {version = "^0.3.0", optional = true} datasets = {version = "^2.15.0", optional = true} +azure-ai-documentintelligence = {version = "^1.0.0b1", optional = true} oracle-ads = {version = "^2.9.1", optional = true} [tool.poetry.group.test] @@ -244,6 +245,7 @@ extended_testing = [ "databricks-vectorsearch", "dgml-utils", "cohere", + "azure-ai-documentintelligence", "oracle-ads", ] diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py new file mode 100644 index 00000000000..1f1651e3063 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_doc_intelligence.py @@ -0,0 +1,27 @@ +"""Tests for the Google Cloud DocAI parser.""" +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_community.document_loaders.parsers import ( + AzureAIDocumentIntelligenceParser, +) + + +@pytest.mark.requires("azure", "azure.ai", "azure.ai.documentintelligence") +@patch("azure.ai.documentintelligence.DocumentIntelligenceClient") +@patch("azure.core.credentials.AzureKeyCredential") +def test_doc_intelligence(mock_credential: MagicMock, mock_client: MagicMock) -> None: + endpoint = "endpoint" + key = "key" + + parser = AzureAIDocumentIntelligenceParser(api_endpoint=endpoint, api_key=key) + mock_credential.assert_called_once_with(key) + mock_client.assert_called_once_with( + endpoint=endpoint, + credential=mock_credential(), + headers={"x-ms-useragent": "langchain-parser/1.0.0"}, + ) 
+    assert parser.client == mock_client()
+    assert parser.api_model == "prebuilt-layout"
+    assert parser.mode == "markdown"
diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
index a0b62f8531f..3dc19adb310 100644
--- a/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -4,6 +4,7 @@ from langchain_community.document_loaders.parsers import __all__
 def test_parsers_public_api_correct() -> None:
     """Test public API of parsers for breaking changes."""
     assert set(__all__) == {
+        "AzureAIDocumentIntelligenceParser",
         "BS4HTMLParser",
         "DocAIParser",
         "GrobidParser",
diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py
index 69f20546d04..a2101c8830d 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py
@@ -23,6 +23,7 @@ EXPECTED_ALL = [
     "AssemblyAIAudioTranscriptLoader",
     "AsyncHtmlLoader",
     "AzureAIDataLoader",
+    "AzureAIDocumentIntelligenceLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
     "BSHTMLLoader",