mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 13:23:35 +00:00
integration test for DocAI parser (#11424)
- **Description:** added an integration test - **Issue:** #11407 @baskaryan
This commit is contained in:
parent
2abbdc6ecb
commit
e4a46747dc
@ -0,0 +1,36 @@
|
|||||||
|
"""Test Google Cloud DocAI parser.
|
||||||
|
|
||||||
|
You need to create a processor and enable the Document AI API before running this test:
|
||||||
|
|
||||||
|
https://cloud.google.com/document-ai/docs/setup
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
from langchain.document_loaders.parsers import DocAIParser
|
||||||
|
from langchain.schema import Document
|
||||||
|
|
||||||
|
|
||||||
|
def test_docai_parser() -> None:
    """Parse a blob with DocAIParser and check every document it yields.

    Configuration comes from three environment variables: the input blob
    path to parse, the output path for DocAI to store parsing results, and
    the processor name.

    Example:
        export BLOB_PATH=gs://...
        export GCS_OUTPUT_PATH=gs://...
        export PROCESSOR_NAME=projects/.../locations/us/processors/...
    """
    # Read the settings in a fixed order; a missing variable raises KeyError.
    source_path = os.environ["BLOB_PATH"]
    output_path = os.environ["GCS_OUTPUT_PATH"]
    processor = os.environ["PROCESSOR_NAME"]

    docai = DocAIParser(
        location="us",
        processor_name=processor,
        gcs_output_path=output_path,
    )
    # Materialize the lazy iterator so the result can be checked as a whole.
    parsed = list(docai.lazy_parse(Blob(path=source_path)))

    assert parsed
    # The test expects page numbers in the metadata to start at 1.
    for page_number, document in enumerate(parsed, start=1):
        assert isinstance(document, Document)
        assert document.page_content
        assert document.metadata["source"] == source_path
        assert document.metadata["page"] == page_number
|
Loading…
Reference in New Issue
Block a user