community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
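Because the move is backwards compatible, existing imports from `langchain` keep working while the new `langchain_community` paths become the canonical ones. A minimal sketch of the before/after import paths (the chat model class is just an illustrative example):

```python
# New canonical location after the split
from langchain_community.chat_models import ChatOpenAI

# Old location still resolves for backwards compatibility (re-exported from langchain)
from langchain.chat_models import ChatOpenAI  # noqa: F811
```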
@@ -0,0 +1,573 @@
"""Module contains common parsers for PDFs."""
from __future__ import annotations

import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)
from urllib.parse import urlparse

import numpy as np
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
    import fitz.fitz
    import pdfminer.layout
    import pdfplumber.page
    import pypdf._page
    import pypdfium2._helpers.page


_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
_PDF_FILTER_WITHOUT_LOSS = [
    "LZWDecode",
    "LZW",
    "FlateDecode",
    "Fl",
    "ASCII85Decode",
    "A85",
    "ASCIIHexDecode",
    "AHx",
    "RunLengthDecode",
    "RL",
    "CCITTFaxDecode",
    "CCF",
    "JBIG2Decode",
]


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            result = [text[1] for text in result]
            text += "\n".join(result)
    return text
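A minimal usage sketch for the helper above, not part of the module itself. The image path is hypothetical, and `rapidocr-onnxruntime` must be installed:

```python
from langchain_community.document_loaders.parsers.pdf import (
    extract_from_images_with_rapidocr,
)

# Hypothetical example: OCR a single image passed as raw bytes.
with open("example_data/receipt.png", "rb") as f:
    image_bytes = f.read()

text = extract_from_images_with_rapidocr([image_bytes])
print(text)
```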

class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`"""

    def __init__(
        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
    ):
        self.password = password
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            yield from [
                Document(
                    page_content=page.extract_text()
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},
                )
                for page_number, page in enumerate(pdf_reader.pages)
            ]

    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images or "/XObject" not in page["/Resources"].keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore
        images = []
        for obj in xObject:
            if xObject[obj]["/Subtype"] == "/Image":
                if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]

                    images.append(
                        np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
                            height, width, -1
                        )
                    )
                elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
                    images.append(xObject[obj].get_data())
                else:
                    warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
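A minimal usage sketch for `PyPDFParser`, not part of the module. The PDF path is a hypothetical example, and `pypdf` must be installed:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PyPDFParser

parser = PyPDFParser(extract_images=False)
blob = Blob.from_path("example_data/layout-parser-paper.pdf")  # hypothetical path
for doc in parser.lazy_parse(blob):
    print(doc.metadata["page"], len(doc.page_content))
```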

class PDFMinerParser(BaseBlobParser):
    """Parse `PDF` using `PDFMiner`."""

    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
        """Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page.
        """
        self.extract_images = extract_images
        self.concatenate_pages = concatenate_pages

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        if not self.extract_images:
            from pdfminer.high_level import extract_text

            with blob.as_bytes_io() as pdf_file_obj:
                if self.concatenate_pages:
                    text = extract_text(pdf_file_obj)
                    metadata = {"source": blob.source}
                    yield Document(page_content=text, metadata=metadata)
                else:
                    from pdfminer.pdfpage import PDFPage

                    pages = PDFPage.get_pages(pdf_file_obj)
                    for i, _ in enumerate(pages):
                        text = extract_text(pdf_file_obj, page_numbers=[i])
                        metadata = {"source": blob.source, "page": str(i)}
                        yield Document(page_content=text, metadata=metadata)
        else:
            import io

            from pdfminer.converter import PDFPageAggregator, TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            text_io = io.StringIO()
            with blob.as_bytes_io() as pdf_file_obj:
                pages = PDFPage.get_pages(pdf_file_obj)
                rsrcmgr = PDFResourceManager()
                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
                for i, page in enumerate(pages):
                    interpreter_for_text.process_page(page)
                    interpreter_for_image.process_page(page)
                    content = text_io.getvalue() + self._extract_images_from_page(
                        device_for_image.get_result()
                    )
                    text_io.truncate(0)
                    text_io.seek(0)
                    metadata = {"source": blob.source, "page": str(i)}
                    yield Document(page_content=content, metadata=metadata)

    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        import pdfminer

        def get_image(layout_object: Any) -> Any:
            if isinstance(layout_object, pdfminer.layout.LTImage):
                return layout_object
            if isinstance(layout_object, pdfminer.layout.LTContainer):
                for child in layout_object:
                    return get_image(child)
            else:
                return None

        images = []

        for img in list(filter(bool, map(get_image, page))):
            if img.stream["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                        img.stream["Height"], img.stream["Width"], -1
                    )
                )
            elif img.stream["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img.stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
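A sketch of the `concatenate_pages` switch described above (not part of the module; the path is hypothetical and `pdfminer.six` must be installed). The default `True` yields a single Document; `False` yields one Document per page:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PDFMinerParser

blob = Blob.from_path("example_data/layout-parser-paper.pdf")  # hypothetical path

single_doc = list(PDFMinerParser().lazy_parse(blob))  # one Document for the whole PDF
per_page = list(PDFMinerParser(concatenate_pages=False).lazy_parse(blob))  # one per page
```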

class PyMuPDFParser(BaseBlobParser):
    """Parse `PDF` using `PyMuPDF`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
            extract_images: Whether to extract images from the PDF and OCR them.
        """
        self.text_kwargs = text_kwargs or {}
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import fitz

        with blob.as_bytes_io() as file_path:
            if blob.data is None:
                doc = fitz.open(file_path)
            else:
                doc = fitz.open(stream=file_path, filetype="pdf")

            yield from [
                Document(
                    page_content=page.get_text(**self.text_kwargs)
                    + self._extract_images_from_page(doc, page),
                    metadata=dict(
                        {
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.number,
                            "total_pages": len(doc),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc
            ]

    def _extract_images_from_page(
        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
    ) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""
        import fitz

        img_list = page.get_images()
        imgs = []
        for img in img_list:
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            imgs.append(
                np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, -1
                )
            )
        return extract_from_images_with_rapidocr(imgs)
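A sketch showing `text_kwargs` being forwarded to `fitz.Page.get_text()` (not part of the module; the `sort` flag is an assumed PyMuPDF option used purely as an illustration, and the path is hypothetical):

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

# Keyword arguments here are passed straight through to fitz.Page.get_text().
parser = PyMuPDFParser(text_kwargs={"sort": True})
docs = parser.parse(Blob.from_path("example_data/layout-parser-paper.pdf"))
```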

class PyPDFium2Parser(BaseBlobParser):
    """Parse `PDF` with `PyPDFium2`."""

    def __init__(self, extract_images: bool = False) -> None:
        """Initialize the parser."""
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things,
        # if done incorrectly creates seg faults.
        with blob.as_bytes_io() as file_path:
            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for page_number, page in enumerate(pdf_reader):
                    text_page = page.get_textpage()
                    content = text_page.get_text_range()
                    text_page.close()
                    content += "\n" + self._extract_images_from_page(page)
                    page.close()
                    metadata = {"source": blob.source, "page": page_number}
                    yield Document(page_content=content, metadata=metadata)
            finally:
                pdf_reader.close()

    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        import pypdfium2.raw as pdfium_c

        images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))

        images = list(map(lambda x: x.get_bitmap().to_numpy(), images))
        return extract_from_images_with_rapidocr(images)

class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoid duplicate characters if `dedupe=True`.
            extract_images: Whether to extract images from the PDF and OCR them.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pdfplumber

        with blob.as_bytes_io() as file_path:
            doc = pdfplumber.open(file_path)  # open document

            yield from [
                Document(
                    page_content=self._process_page_content(page)
                    + "\n"
                    + self._extract_images_from_page(page),
                    metadata=dict(
                        {
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.page_number - 1,
                            "total_pages": len(doc.pages),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc.pages
            ]

    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        if self.dedupe:
            return page.dedupe_chars().extract_text(**self.text_kwargs)
        return page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
                        img["stream"]["Height"], img["stream"]["Width"], -1
                    )
                )
            elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img["stream"].get_data())
            else:
                warnings.warn("Unknown PDF Filter!")

        return extract_from_images_with_rapidocr(images)
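A sketch of the `dedupe` option (not part of the module; the path is hypothetical and `pdfplumber` must be installed). When `dedupe=True`, extraction goes through `page.dedupe_chars()`, which can fix PDFs that render each character twice:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PDFPlumberParser

# dedupe=True routes extraction through page.dedupe_chars() before extract_text().
parser = PDFPlumberParser(dedupe=True)
docs = parser.parse(Blob.from_path("example_data/layout-parser-paper.pdf"))
```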

class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
        """

        try:
            import textractcaller as tc
            import textractor.entities.document as textractor

            self.tc = tc
            self.textractor = textractor

            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(f) for f in textract_features
                ]
            else:
                self.textract_features = []
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            )

        if not client:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers. For multi-page documents, blob.path
        has to be set to the S3 URI; for single-page docs the blob.data is used.
        """

        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        linearizer_config = self.textractor.TextLinearizationConfig(
            hide_figure_layout=True,
            title_prefix="# ",
            section_header_prefix="## ",
            list_element_prefix="*",
        )
        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=linearizer_config),
                metadata={"source": blob.source, "page": idx + 1},
            )
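Since multi-page PDFs must reside on S3 (per the docstring above), here is a hedged sketch of that path using the documented loader wrapper. The bucket, key, and feature list are made-up examples, and AWS credentials must be configured:

```python
from langchain_community.document_loaders import AmazonTextractPDFLoader

# Hypothetical S3 URI: multi-page PDFs are read by Textract directly from S3.
loader = AmazonTextractPDFLoader(
    "s3://my-example-bucket/reports/annual-report.pdf",
    textract_features=["LAYOUT", "TABLES"],
)
documents = loader.load()  # one Document per page
```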

class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level."""

    def __init__(self, client: Any, model: str):
        self.client = client
        self.model = model

    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])

            d = Document(
                page_content=content,
                metadata={
                    "source": blob.source,
                    "page": p.page_number,
                },
            )
            yield d

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(self.model, file_obj)
            result = poller.result()

            docs = self._generate_docs(blob, result)

            yield from docs
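A minimal sketch of wiring this parser to an Azure client (not part of the module). The endpoint, key, and file path are placeholders, and the `prebuilt-layout` model name is an assumption; the parser only requires a client exposing `begin_analyze_document`:

```python
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import DocumentIntelligenceParser

client = DocumentAnalysisClient(
    endpoint="https://<resource-name>.cognitiveservices.azure.com/",  # placeholder
    credential=AzureKeyCredential("<api-key>"),  # placeholder
)
parser = DocumentIntelligenceParser(client=client, model="prebuilt-layout")
docs = parser.parse(Blob.from_path("example_data/layout-parser-paper.pdf"))
```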