community[patch]: Microsoft Azure Document Intelligence updates (#16932)
- **Description:** Update Azure Document Intelligence implementation by the Microsoft team and RAG cookbook with Azure AI Search
---------
Co-authored-by: Lu Zhang (AI) <luzhan@microsoft.com>
Co-authored-by: Yateng Hong <yatengh@microsoft.com>
Co-authored-by: teethache <hongyateng2006@126.com>
Co-authored-by: Lu Zhang <44625949+luzhang06@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent cd79305eb9
commit f12cb0bea4

cookbook/rag_semantic_chunking_azureaidocintelligence.ipynb (new file, 274 lines)
File diff suppressed because one or more lines are too long
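The new cookbook's diff is suppressed above, but per the commit description it wires Document Intelligence's markdown output into semantic chunking and Azure AI Search. A rough sketch of that flow, with all endpoints, keys, and deployment names as placeholders and the exact cookbook code assumed rather than shown:

```python
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings

# 1. Parse the source file into markdown with Document Intelligence.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    file_path="<filepath>",
    api_model="prebuilt-layout",
)
docs = loader.load()

# 2. Chunk semantically on the headings Document Intelligence detected.
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
)
chunks = splitter.split_text(docs[0].page_content)

# 3. Index the chunks in Azure AI Search (embedding deployment name assumed;
#    Azure OpenAI endpoint/key are read from the usual environment variables).
embeddings = AzureOpenAIEmbeddings(azure_deployment="<embedding-deployment>")
vector_store = AzureSearch(
    azure_search_endpoint="<search-endpoint>",
    azure_search_key="<search-key>",
    index_name="<index-name>",
    embedding_function=embeddings.embed_query,
)
vector_store.add_documents(documents=chunks)
```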
@ -14,32 +14,30 @@
"metadata": {},
"source": [
">[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is a machine-learning \n",
">based service that extracts text (including handwriting), tables or key-value-pairs from\n",
">scanned documents or images.\n",
">based service that extracts text (including handwriting), tables, document structures (e.g., titles, section headings) and key-value pairs from\n",
">digital or scanned PDFs, images, Office and HTML files.\n",
">\n",
">Document Intelligence supports `PDF`, `JPEG`, `PNG`, `BMP`, or `TIFF`.\n",
">Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.\n",
"\n",
"This current implementation of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents.\n"
"The current implementation of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents. The default output format is markdown, which can be easily chained with `MarkdownHeaderTextSplitter` for semantic document chunking. You can also use `mode=\"single\"` or `mode=\"page\"` to return plain text as a single document or split by page.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisite\n",
"\n",
"An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one. You will be passing `<endpoint>` and `<key>` as parameters to the loader."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence -q"
"%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence"
]
},
{
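To make the mode options above concrete, a short sketch contrasting the default markdown output with `mode="single"` (placeholder credentials and file path assumed):

```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# Default: the whole file as one markdown document,
# ready for MarkdownHeaderTextSplitter.
md_loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    file_path="<filepath>",
    api_model="prebuilt-layout",
)
md_docs = md_loader.load()

# mode="single" returns the plain-text content as one document instead.
text_loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    file_path="<filepath>",
    api_model="prebuilt-layout",
    mode="single",
)
text_docs = text_loader.load()
```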
@ -106,7 +104,7 @@
"metadata": {},
"source": [
"## Example 2\n",
"The input file can also be URL path."
"The input file can also be a public URL path, e.g., https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/rest-api/layout.png."
]
},
{
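Per the loader's docstring below (either `file_path` or `url_path` must be specified), a URL can be passed directly; a minimal sketch using the sample image above:

```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

url_path = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/rest-api/layout.png"
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    url_path=url_path,  # fetched by the service instead of read from disk
    api_model="prebuilt-layout",
)
documents = loader.load()
```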
@ -123,6 +121,101 @@
"documents = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documents"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example 3\n",
"You can also specify `mode=\"page\"` to load the document by pages."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
"\n",
"file_path = \"<filepath>\"\n",
"endpoint = \"<endpoint>\"\n",
"key = \"<key>\"\n",
"loader = AzureAIDocumentIntelligenceLoader(\n",
"    api_endpoint=endpoint,\n",
"    api_key=key,\n",
"    file_path=file_path,\n",
"    api_model=\"prebuilt-layout\",\n",
"    mode=\"page\",\n",
")\n",
"\n",
"documents = loader.load()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The output stores each page as a separate document in the list:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for document in documents:\n",
"    print(f\"Page Content: {document.page_content}\")\n",
"    print(f\"Metadata: {document.metadata}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example 4\n",
"You can also specify `analysis_features=[\"ocrHighResolution\"]` to enable add-on capabilities. For more information, see: https://aka.ms/azsdk/python/documentintelligence/analysisfeature."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
"\n",
"file_path = \"<filepath>\"\n",
"endpoint = \"<endpoint>\"\n",
"key = \"<key>\"\n",
"analysis_features = [\"ocrHighResolution\"]\n",
"loader = AzureAIDocumentIntelligenceLoader(\n",
"    api_endpoint=endpoint,\n",
"    api_key=key,\n",
"    file_path=file_path,\n",
"    api_model=\"prebuilt-layout\",\n",
"    analysis_features=analysis_features,\n",
")\n",
"\n",
"documents = loader.load()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The output contains the LangChain document recognized with the high-resolution add-on capability:"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -43,13 +43,60 @@
"docs[0]"
]
},
{
"cell_type": "markdown",
"id": "729ab1a2",
"metadata": {},
"source": [
"## Using Azure AI Document Intelligence\n",
"\n",
">[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is a machine-learning \n",
">based service that extracts text (including handwriting), tables, document structures (e.g., titles, section headings) and key-value pairs from\n",
">digital or scanned PDFs, images, Office and HTML files.\n",
">\n",
">Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.\n",
"\n",
"The current implementation of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents. The default output format is markdown, which can be easily chained with `MarkdownHeaderTextSplitter` for semantic document chunking. You can also use `mode=\"single\"` or `mode=\"page\"` to return plain text as a single document or split by page.\n"
]
},
{
"cell_type": "markdown",
"id": "fbe5c77d",
"metadata": {},
"source": [
"### Prerequisite\n",
"\n",
"An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one. You will be passing `<endpoint>` and `<key>` as parameters to the loader."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ab94bde",
"id": "fda529f8",
"metadata": {},
"outputs": [],
"source": []
"source": [
"%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa008547",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
"\n",
"file_path = \"<filepath>\"\n",
"endpoint = \"<endpoint>\"\n",
"key = \"<key>\"\n",
"loader = AzureAIDocumentIntelligenceLoader(\n",
"    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model=\"prebuilt-layout\"\n",
")\n",
"\n",
"documents = loader.load()"
]
}
],
"metadata": {
@ -76,7 +76,7 @@
"id": "525d6b67",
"metadata": {},
"source": [
"## Retain Elements\n",
"### Retain Elements\n",
"\n",
"Under the hood, `Unstructured` creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
@ -124,13 +124,60 @@
"data[0]"
]
},
{
"cell_type": "markdown",
"id": "b97180c2",
"metadata": {},
"source": [
"## Using Azure AI Document Intelligence\n",
"\n",
">[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is a machine-learning \n",
">based service that extracts text (including handwriting), tables, document structures (e.g., titles, section headings) and key-value pairs from\n",
">digital or scanned PDFs, images, Office and HTML files.\n",
">\n",
">Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.\n",
"\n",
"The current implementation of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents. The default output format is markdown, which can be easily chained with `MarkdownHeaderTextSplitter` for semantic document chunking. You can also use `mode=\"single\"` or `mode=\"page\"` to return plain text as a single document or split by page.\n"
]
},
{
"cell_type": "markdown",
"id": "11851fd0",
"metadata": {},
"source": [
"## Prerequisite\n",
"\n",
"An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one. You will be passing `<endpoint>` and `<key>` as parameters to the loader."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "381d4139",
"metadata": {},
"outputs": [],
"source": []
"source": [
"%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "077525b8",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
"\n",
"file_path = \"<filepath>\"\n",
"endpoint = \"<endpoint>\"\n",
"key = \"<key>\"\n",
"loader = AzureAIDocumentIntelligenceLoader(\n",
"    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model=\"prebuilt-layout\"\n",
")\n",
"\n",
"documents = loader.load()"
]
}
],
"metadata": {
@ -147,7 +147,7 @@
"id": "525d6b67",
"metadata": {},
"source": [
"## Retain Elements\n",
"### Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
@ -192,6 +192,59 @@
"source": [
"data[0]"
]
},
{
"cell_type": "markdown",
"id": "c1f3b83f",
"metadata": {},
"source": [
"## Using Azure AI Document Intelligence\n",
"\n",
">[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is a machine-learning \n",
">based service that extracts text (including handwriting), tables, document structures (e.g., titles, section headings) and key-value pairs from\n",
">digital or scanned PDFs, images, Office and HTML files.\n",
">\n",
">Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.\n",
"\n",
"The current implementation of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents. The default output format is markdown, which can be easily chained with `MarkdownHeaderTextSplitter` for semantic document chunking. You can also use `mode=\"single\"` or `mode=\"page\"` to return plain text as a single document or split by page.\n"
]
},
{
"cell_type": "markdown",
"id": "a5bd47c2",
"metadata": {},
"source": [
"## Prerequisite\n",
"\n",
"An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one. You will be passing `<endpoint>` and `<key>` as parameters to the loader."
]
},
{
"cell_type": "markdown",
"id": "71cbdfe0",
"metadata": {},
"source": [
"%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "691bd9e8",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
"\n",
"file_path = \"<filepath>\"\n",
"endpoint = \"<endpoint>\"\n",
"key = \"<key>\"\n",
"loader = AzureAIDocumentIntelligenceLoader(\n",
"    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model=\"prebuilt-layout\"\n",
")\n",
"\n",
"documents = loader.load()"
]
}
],
"metadata": {
@ -84,10 +84,11 @@ from langchain.document_loaders import AzureAIDataLoader

>[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known
> as `Azure Form Recognizer`) is a machine-learning
> based service that extracts text (including handwriting), tables or key-value-pairs
> from scanned documents or images.
> based service that extracts text (including handwriting), tables, document structures,
> and key-value pairs
> from digital or scanned PDFs, images, Office and HTML files.
>
>Document Intelligence supports `PDF`, `JPEG`, `PNG`, `BMP`, or `TIFF`.
> Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.

First, you need to install a Python package.
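The package is presumably installed the same way as in the notebooks above; a minimal sketch:

```python
%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence
```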
@ -54,3 +54,30 @@ data
```

</CodeOutputBlock>

## Loading HTML with AzureAIDocumentIntelligenceLoader

[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is a machine-learning
based service that extracts text (including handwriting), tables, document structures (e.g., titles, section headings) and key-value pairs from
digital or scanned PDFs, images, Office and HTML files. Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.

The [current implementation](https://aka.ms/di-langchain) of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents. The default output format is markdown, which can be easily chained with `MarkdownHeaderTextSplitter` for semantic document chunking. You can also use `mode="single"` or `mode="page"` to return plain text as a single document or split by page.

### Prerequisite

An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one. You will be passing `<endpoint>` and `<key>` as parameters to the loader.

```python
%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence

from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

file_path = "<filepath>"
endpoint = "<endpoint>"
key = "<key>"
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model="prebuilt-layout"
)

documents = loader.load()
```
@ -0,0 +1,33 @@
# Microsoft Office

>[The Microsoft Office](https://www.office.com/) suite of productivity software includes Microsoft Word, Microsoft Excel, Microsoft PowerPoint, Microsoft Outlook, and Microsoft OneNote. It is available for Microsoft Windows and macOS operating systems. It is also available on Android and iOS.

This covers how to load commonly used file formats, including `DOCX`, `XLSX` and `PPTX` documents, into a document format that we can use downstream.

## Loading DOCX, XLSX, PPTX with AzureAIDocumentIntelligenceLoader

[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is a machine-learning
based service that extracts text (including handwriting), tables, document structures (e.g., titles, section headings) and key-value pairs from
digital or scanned PDFs, images, Office and HTML files. Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.

The [current implementation](https://aka.ms/di-langchain) of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents. The default output format is markdown, which can be easily chained with `MarkdownHeaderTextSplitter` for semantic document chunking. You can also use `mode="single"` or `mode="page"` to return plain text as a single document or split by page.

### Prerequisite

An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one. You will be passing `<endpoint>` and `<key>` as parameters to the loader.

```python
%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence

from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

file_path = "<filepath>"
endpoint = "<endpoint>"
key = "<key>"
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model="prebuilt-layout"
)

documents = loader.load()
```
@ -435,5 +435,32 @@ Besides the AWS configuration, it is very similar to the other PDF loaders, whil
```python
from langchain_community.document_loaders import AmazonTextractPDFLoader
loader = AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
documents = loader.load()
```

## Using AzureAIDocumentIntelligenceLoader

[Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is a machine-learning
based service that extracts text (including handwriting), tables, document structures (e.g., titles, section headings) and key-value pairs from
digital or scanned PDFs, images, Office and HTML files. Document Intelligence supports `PDF`, `JPEG/JPG`, `PNG`, `BMP`, `TIFF`, `HEIF`, `DOCX`, `XLSX`, `PPTX` and `HTML`.

The [current implementation](https://aka.ms/di-langchain) of a loader using `Document Intelligence` can incorporate content page-wise and turn it into LangChain documents. The default output format is markdown, which can be easily chained with `MarkdownHeaderTextSplitter` for semantic document chunking. You can also use `mode="single"` or `mode="page"` to return plain text as a single document or split by page.

### Prerequisite

An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one. You will be passing `<endpoint>` and `<key>` as parameters to the loader.

```python
%pip install --upgrade --quiet langchain langchain-community azure-ai-documentintelligence

from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

file_path = "<filepath>"
endpoint = "<endpoint>"
key = "<key>"
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model="prebuilt-layout"
)

documents = loader.load()
```
@ -1,4 +1,4 @@
from typing import Iterator, Optional
from typing import Iterator, List, Optional

from langchain_core.documents import Document

@ -21,6 +21,8 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        *,
        analysis_features: Optional[List[str]] = None,
    ) -> None:
        """
        Initialize the object for file processing with Azure Document Intelligence
@ -45,11 +47,18 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
            Either file_path or url_path must be specified.
        api_version: Optional[str]
            The API version for DocumentIntelligenceClient. Setting None to use
            the default value from SDK.
            the default value from the `azure-ai-documentintelligence` package.
        api_model: str
            The model name or ID to be used for form recognition in Azure.
            Unique document model name. Default value is "prebuilt-layout".
            Note that overriding this default value may result in unsupported
            behavior.
        mode: Optional[str]
            The type of content representation of the generated Documents.
            Use either "single", "page", or "markdown". Default value is "markdown".
        analysis_features: Optional[List[str]]
            List of optional analysis features; each feature should be passed
            as a str that conforms to the enum `DocumentAnalysisFeature` in the
            `azure-ai-documentintelligence` package. Default value is None.

        Examples:
        ---------
@ -58,7 +67,7 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
            ...     api_endpoint="https://endpoint.azure.com",
            ...     api_key="APIKEY",
            ...     api_version="2023-10-31-preview",
            ...     model="prebuilt-document",
            ...     api_model="prebuilt-layout",
            ...     mode="markdown"
            ... )
        """
@ -75,6 +84,7 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
            api_version=api_version,
            api_model=api_model,
            mode=mode,
            analysis_features=analysis_features,
        )

    def lazy_load(
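The new keyword-only parameter in use, as a sketch with placeholder credentials and file path:

```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# analysis_features is keyword-only (note the bare `*` in the signature above),
# so it must be passed by name.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    file_path="<filepath>",
    api_model="prebuilt-layout",
    analysis_features=["ocrHighResolution"],
)

# lazy_load() streams Documents one at a time instead of building the full list.
for doc in loader.lazy_load():
    print(doc.metadata)
```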
@ -1,10 +1,13 @@
from typing import Any, Iterator, Optional
import logging
from typing import Any, Iterator, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
@ -17,22 +20,43 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
        api_version: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        analysis_features: Optional[List[str]] = None,
    ):
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import DocumentAnalysisFeature
        from azure.core.credentials import AzureKeyCredential

        kwargs = {}
        if api_version is not None:
            kwargs["api_version"] = api_version

        if analysis_features is not None:
            _SUPPORTED_FEATURES = [
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,
            ]

            analysis_features = [
                DocumentAnalysisFeature(feature) for feature in analysis_features
            ]
            if any(
                [feature not in _SUPPORTED_FEATURES for feature in analysis_features]
            ):
                logger.warning(
                    f"The current supported features are: "
                    f"{[f.value for f in _SUPPORTED_FEATURES]}. "
                    "Using other features may result in unexpected behavior."
                )

        self.client = DocumentIntelligenceClient(
            endpoint=api_endpoint,
            credential=AzureKeyCredential(api_key),
            headers={"x-ms-useragent": "langchain-parser/1.0.0"},
            features=analysis_features,
            **kwargs,
        )
        self.api_model = api_model
        self.mode = mode
        assert self.mode in ["single", "page", "object", "markdown"]
        assert self.mode in ["single", "page", "markdown"]

    def _generate_docs_page(self, result: Any) -> Iterator[Document]:
        for p in result.pages:
@ -49,41 +73,6 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
    def _generate_docs_single(self, result: Any) -> Iterator[Document]:
        yield Document(page_content=result.content, metadata={})

    def _generate_docs_object(self, result: Any) -> Iterator[Document]:
        # record relationship between page id and span offset
        page_offset = []
        for page in result.pages:
            # assume that spans only contain 1 element, to double check
            page_offset.append(page.spans[0]["offset"])

        # paragraph
        # warning: paragraph content is overlapping with table content
        for para in result.paragraphs:
            yield Document(
                page_content=para.content,
                metadata={
                    "role": para.role,
                    "page": para.bounding_regions[0].page_number,
                    "bounding_box": para.bounding_regions[0].polygon,
                    "type": "paragraph",
                },
            )

        # table
        for table in result.tables:
            yield Document(
                page_content=table.cells,  # json object
                metadata={
                    "footnote": table.footnotes,
                    "caption": table.caption,
                    "page": para.bounding_regions[0].page_number,
                    "bounding_box": para.bounding_regions[0].polygon,
                    "row_count": table.row_count,
                    "column_count": table.column_count,
                    "type": "table",
                },
            )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

@ -101,7 +90,7 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            yield from self._generate_docs_object(result)
            raise ValueError(f"Invalid mode: {self.mode}")

    def parse_url(self, url: str) -> Iterator[Document]:
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
@ -119,4 +108,4 @@ class AzureAIDocumentIntelligenceParser(BaseBlobParser):
        elif self.mode in ["page"]:
            yield from self._generate_docs_page(result)
        else:
            yield from self._generate_docs_object(result)
            raise ValueError(f"Invalid mode: {self.mode}")
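A quick sketch of how the new feature validation behaves, grounded in the diff above and the unit test below (credentials are placeholders): a value that is a valid `DocumentAnalysisFeature` but outside `_SUPPORTED_FEATURES`, such as "barcodes", only logs a warning, while a string that is not a valid enum value raises `ValueError` during construction.

```python
from langchain_community.document_loaders.parsers import (
    AzureAIDocumentIntelligenceParser,
)

# "barcodes" is a valid DocumentAnalysisFeature value but not in
# _SUPPORTED_FEATURES: the parser is still constructed and a warning is logged.
parser = AzureAIDocumentIntelligenceParser(
    api_endpoint="<endpoint>",
    api_key="<key>",
    analysis_features=["barcodes"],
)

# "invalid" is not a DocumentAnalysisFeature value at all:
# DocumentAnalysisFeature("invalid") raises ValueError inside __init__.
try:
    AzureAIDocumentIntelligenceParser(
        api_endpoint="<endpoint>",
        api_key="<key>",
        analysis_features=["invalid"],
    )
except ValueError as err:
    print(err)
```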
@ -20,8 +20,44 @@ def test_doc_intelligence(mock_credential: MagicMock, mock_client: MagicMock) ->
    mock_client.assert_called_once_with(
        endpoint=endpoint,
        credential=mock_credential(),
        headers={"x-ms-useragent": "langchain-parser/1.0.0"},
        headers={
            "x-ms-useragent": "langchain-parser/1.0.0",
        },
        features=None,
    )
    assert parser.client == mock_client()
    assert parser.api_model == "prebuilt-layout"
    assert parser.mode == "markdown"


@pytest.mark.requires("azure", "azure.ai", "azure.ai.documentintelligence")
@patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
@patch("azure.core.credentials.AzureKeyCredential")
def test_doc_intelligence_with_analysis_features(
    mock_credential: MagicMock, mock_client: MagicMock
) -> None:
    endpoint = "endpoint"
    key = "key"

    analysis_features = ["ocrHighResolution", "barcodes"]
    parser = AzureAIDocumentIntelligenceParser(
        api_endpoint=endpoint, api_key=key, analysis_features=analysis_features
    )
    mock_credential.assert_called_once_with(key)
    mock_client.assert_called_once_with(
        endpoint=endpoint,
        credential=mock_credential(),
        headers={
            "x-ms-useragent": "langchain-parser/1.0.0",
        },
        features=analysis_features,
    )
    assert parser.client == mock_client()
    assert parser.api_model == "prebuilt-layout"
    assert parser.mode == "markdown"

    with pytest.raises(ValueError):
        analysis_features = ["invalid"]
        parser = AzureAIDocumentIntelligenceParser(
            api_endpoint=endpoint, api_key=key, analysis_features=analysis_features
        )