This commit is contained in:
Harrison Chase
2023-04-10 21:22:16 -07:00
parent a0cd0175a8
commit 80bb3206da

View File

@@ -1,12 +1,10 @@
import os
from io import BytesIO
from typing import Any, List, Optional
from typing import List
import boto3
from PIL import Image
from langchain.docstore.document import Document
from langchain.utils import get_from_dict_or_env
class AwsTextractExtraction:
@@ -14,8 +12,8 @@ class AwsTextractExtraction:
self,
aws_region_name: str,
aws_secret_key: str,
aws_access_key,
aws_session_token,
aws_access_key: str,
aws_session_token: str,
file_path: str,
):
self.aws_region_name = aws_region_name
@@ -24,7 +22,7 @@ class AwsTextractExtraction:
self.aws_session_token = aws_session_token
self.file_path = file_path
try:
import boto3
import boto3 # noqa: F401
except ImportError:
raise ValueError(
@@ -37,11 +35,15 @@ class AwsTextractExtraction:
page_no = 0
textract_client = boto3.client(
"textract", region_name, aws_access_key, aws_secret_key, aws_session_token
"textract",
self.aws_region_name,
self.aws_access_key,
self.aws_access_key,
self.aws_session_token,
)
Pil_Image_obj = Image.open(self.file_path)
pil_image_obj = Image.open(self.file_path)
buf = BytesIO()
Pil_Image_obj.save(buf, format="PNG")
pil_image_obj.save(buf, format="PNG")
image_bytes = buf.getvalue()
response = textract_client.detect_document_text(Document={"Bytes": image_bytes})