Mirror of https://github.com/hwchase17/langchain.git (synced 2025-09-13 21:47:12 +00:00)
upstage: Add Upstage partner package LA and GC (#20651)
Co-authored-by: Sean <chosh0615@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Sean Cho <sean@upstage.ai>
libs/partners/upstage/langchain_upstage/__init__.py

@@ -1,4 +1,13 @@
 from langchain_upstage.chat_models import ChatUpstage
 from langchain_upstage.embeddings import UpstageEmbeddings
+from langchain_upstage.layout_analysis import UpstageLayoutAnalysisLoader
+from langchain_upstage.layout_analysis_parsers import UpstageLayoutAnalysisParser
+from langchain_upstage.tools.groundedness_check import GroundednessCheck
 
-__all__ = ["ChatUpstage", "UpstageEmbeddings"]
+__all__ = [
+    "ChatUpstage",
+    "UpstageEmbeddings",
+    "UpstageLayoutAnalysisLoader",
+    "UpstageLayoutAnalysisParser",
+    "GroundednessCheck",
+]
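A quick sanity check of the expanded public API, mirroring the EXPECTED_ALL unit test later in this diff:

    from langchain_upstage import (
        ChatUpstage,
        GroundednessCheck,
        UpstageEmbeddings,
        UpstageLayoutAnalysisLoader,
        UpstageLayoutAnalysisParser,
    )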
190  libs/partners/upstage/langchain_upstage/layout_analysis.py  Normal file

@@ -0,0 +1,190 @@
import os
from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union

from langchain_core.document_loaders import BaseLoader, Blob
from langchain_core.documents import Document

from .layout_analysis_parsers import UpstageLayoutAnalysisParser

DEFAULT_PAGE_BATCH_SIZE = 10

OutputType = Literal["text", "html"]
SplitType = Literal["none", "element", "page"]


def validate_api_key(api_key: str) -> None:
    """
    Validates the provided API key.

    Args:
        api_key (str): The API key to be validated.

    Raises:
        ValueError: If the API key is empty or None.

    Returns:
        None
    """
    if not api_key:
        raise ValueError("API Key is required for Upstage Document Loader")


def validate_file_path(file_path: Union[str, Path, List[str], List[Path]]) -> None:
    """
    Validates if a file exists at the given file path.

    Args:
        file_path (Union[str, Path, List[str], List[Path]]): The file path(s) to be
            validated.

    Raises:
        FileNotFoundError: If the file or any of the files in the list do not exist.
    """
    if isinstance(file_path, list):
        for path in file_path:
            validate_file_path(path)
        return
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")


def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Get a value from a param or an environment variable."""
    if param is not None:
        return param
    elif env_key and env_key in os.environ and os.environ[env_key]:
        return os.environ[env_key]
    elif default is not None:
        return default
    else:
        raise ValueError(
            f"Did not find {key}, please add an environment variable"
            f" `{env_key}` which contains it, or pass"
            f" `{key}` as a named parameter."
        )


class UpstageLayoutAnalysisLoader(BaseLoader):
    """Upstage Layout Analysis.

    To use, you should have the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisLoader

            file_path = "/PATH/TO/YOUR/FILE.pdf"
            loader = UpstageLayoutAnalysisLoader(
                file_path, split="page", output_type="text"
            )
    """

    def __init__(
        self,
        file_path: Union[str, Path, List[str], List[Path]],
        output_type: Union[OutputType, dict] = "text",
        split: SplitType = "none",
        api_key: Optional[str] = None,
        use_ocr: bool = False,
    ):
        """
        Initializes an instance of the Upstage document loader.

        Args:
            file_path (Union[str, Path, List[str], List[Path]]): The path to the
                document to be loaded.
            output_type (Union[OutputType, dict], optional): The type of output to
                be generated by the parser. Defaults to "text".
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            api_key (str, optional): The API key for accessing the Upstage API.
                Defaults to None, in which case it will be fetched from the
                environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`.
            use_ocr (bool, optional): Extract text from images in the document.
                Defaults to False. (Use text info in PDF file)
        """
        self.file_path = file_path
        self.output_type = output_type
        self.split = split
        self.api_key = get_from_param_or_env(
            "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY"
        )
        self.use_ocr = use_ocr

        validate_file_path(self.file_path)
        validate_api_key(self.api_key)

    def load(self) -> List[Document]:
        """
        Loads and parses the document using the UpstageLayoutAnalysisParser.

        Returns:
            A list of Document objects representing the parsed layout analysis.
        """
        if isinstance(self.file_path, list):
            result = []

            for file_path in self.file_path:
                blob = Blob.from_path(file_path)

                parser = UpstageLayoutAnalysisParser(
                    self.api_key,
                    split=self.split,
                    output_type=self.output_type,
                    use_ocr=self.use_ocr,
                )
                result.extend(list(parser.lazy_parse(blob, is_batch=True)))

            return result
        else:
            blob = Blob.from_path(self.file_path)

            parser = UpstageLayoutAnalysisParser(
                self.api_key,
                split=self.split,
                output_type=self.output_type,
                use_ocr=self.use_ocr,
            )
            return list(parser.lazy_parse(blob, is_batch=True))

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily loads and parses the document using the UpstageLayoutAnalysisParser.

        Returns:
            An iterator of Document objects representing the parsed layout analysis.
        """
        if isinstance(self.file_path, list):
            for file_path in self.file_path:
                blob = Blob.from_path(file_path)

                parser = UpstageLayoutAnalysisParser(
                    self.api_key,
                    split=self.split,
                    output_type=self.output_type,
                    use_ocr=self.use_ocr,
                )
                yield from parser.lazy_parse(blob, is_batch=True)
        else:
            blob = Blob.from_path(self.file_path)

            parser = UpstageLayoutAnalysisParser(
                self.api_key,
                split=self.split,
                output_type=self.output_type,
                use_ocr=self.use_ocr,
            )
            yield from parser.lazy_parse(blob)
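A minimal usage sketch for the loader above; the file path and API key are placeholders, and lazy_load is the lighter option since it yields Documents as each page batch is parsed:

    from langchain_upstage import UpstageLayoutAnalysisLoader

    # Placeholder path and key; the key can instead come from the
    # UPSTAGE_DOCUMENT_AI_API_KEY environment variable.
    loader = UpstageLayoutAnalysisLoader(
        "/PATH/TO/YOUR/FILE.pdf",
        split="page",
        output_type="text",
        api_key="YOUR_API_KEY",
    )
    for doc in loader.lazy_load():
        print(doc.metadata["page"], doc.page_content[:80])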
libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py

@@ -0,0 +1,375 @@
import io
import json
import os
from typing import Dict, Iterator, List, Literal, Optional, Union

import fitz  # type: ignore
import requests
from fitz import Document as fitzDocument
from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document

LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis"

DEFAULT_NUMBER_OF_PAGE = 10

OutputType = Literal["text", "html"]
SplitType = Literal["none", "element", "page"]


def validate_api_key(api_key: str) -> None:
    """
    Validates the provided API key.

    Args:
        api_key (str): The API key to be validated.

    Raises:
        ValueError: If the API key is empty or None.

    Returns:
        None
    """
    if not api_key:
        raise ValueError("API Key is required for Upstage Document Loader")


def validate_file_path(file_path: str) -> None:
    """
    Validates if a file exists at the given file path.

    Args:
        file_path (str): The path to the file.

    Raises:
        FileNotFoundError: If the file does not exist at the given file path.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")


def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str:
    """
    Parse the output data based on the specified output type.

    Args:
        data (dict): The data to be parsed.
        output_type (Union[OutputType, dict]): The output type to parse the
            element data into.

    Returns:
        str: The parsed output.

    Raises:
        ValueError: If the output type is invalid.
    """
    if isinstance(output_type, dict):
        if data["category"] in output_type:
            return data[output_type[data["category"]]]
        else:
            return data["text"]
    elif isinstance(output_type, str):
        if output_type == "text":
            return data["text"]
        elif output_type == "html":
            return data["html"]
        else:
            raise ValueError(f"Invalid output type: {output_type}")
    else:
        raise ValueError(f"Invalid output type: {output_type}")


def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Get a value from a param or an environment variable."""
    if param is not None:
        return param
    elif env_key and env_key in os.environ and os.environ[env_key]:
        return os.environ[env_key]
    elif default is not None:
        return default
    else:
        raise ValueError(
            f"Did not find {key}, please add an environment variable"
            f" `{env_key}` which contains it, or pass"
            f" `{key}` as a named parameter."
        )


class UpstageLayoutAnalysisParser(BaseBlobParser):
    """Upstage Layout Analysis Parser.

    To use, you should have the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisParser

            loader = UpstageLayoutAnalysisParser(split="page", output_type="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        output_type: Union[OutputType, dict] = "text",
        split: SplitType = "none",
        use_ocr: bool = False,
    ):
        """
        Initializes an instance of the Upstage class.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                Defaults to None, in which case it will be fetched from the
                environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`.
            output_type (Union[OutputType, dict], optional): The type of output to
                be generated by the parser. Defaults to "text".
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            use_ocr (bool, optional): Extract text from images in the document.
                Defaults to False. (Use text info in PDF file)
        """
        self.api_key = get_from_param_or_env(
            "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY"
        )

        self.output_type = output_type
        self.split = split
        self.use_ocr = use_ocr

        validate_api_key(self.api_key)

    def _get_response(self, files: Dict) -> Dict:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the response.

        Args:
            files (dict): A dictionary containing the files to be sent in the
                request.

        Returns:
            dict: The JSON response from the API.

        Raises:
            ValueError: If there is an error in the API call.
        """
        try:
            headers = {"Authorization": f"Bearer {self.api_key}"}
            options = {"ocr": self.use_ocr}
            response = requests.post(
                LAYOUT_ANALYSIS_URL, headers=headers, files=files, json=options
            )
            response.raise_for_status()

            result = response.json()
        except requests.RequestException as req_err:
            # Handle any request-related exceptions
            print(f"Request Exception: {req_err}")
            raise ValueError(f"Failed to send request: {req_err}")
        except json.JSONDecodeError as json_err:
            # Handle JSON decode errors
            print(f"JSON Decode Error: {json_err}")
            raise ValueError(f"Failed to decode JSON response: {json_err}")

        return result

    def _split_and_request(
        self,
        full_docs: fitzDocument,
        start_page: int,
        num_pages: int = DEFAULT_NUMBER_OF_PAGE,
    ) -> Dict:
        """
        Splits the full pdf document into partial pages and sends a request to the
        server.

        Args:
            full_docs (fitzDocument): The full document to be split and requested.
            start_page (int): The starting page number for splitting the document.
            num_pages (int, optional): The number of pages to split the document
                into. Defaults to DEFAULT_NUMBER_OF_PAGE.

        Returns:
            response: The response from the server.
        """
        with fitz.open() as chunk_pdf:
            chunk_pdf.insert_pdf(
                full_docs,
                from_page=start_page,
                to_page=start_page + num_pages - 1,
            )
            pdf_bytes = chunk_pdf.write()

        with io.BytesIO(pdf_bytes) as f:
            response = self._get_response({"document": f})

        return response

    def _element_document(self, elements: Dict) -> Document:
        """
        Converts an element into a Document object.

        Args:
            elements: The element to convert.

        Returns:
            A Document object for the element.
        """
        return Document(
            page_content=(parse_output(elements, self.output_type)),
            metadata={
                "page": elements["page"],
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
            },
        )

    def _page_document(self, elements: List[Dict]) -> List[Document]:
        """
        Combines elements with the same page number into a single Document object.

        Args:
            elements (List[Dict]): A list of elements containing page numbers.

        Returns:
            List[Document]: A list of Document objects, each representing a page
                with its content and metadata.
        """
        _docs = []
        pages = sorted(set(map(lambda x: x["page"], elements)))

        page_group = [
            [element for element in elements if element["page"] == x] for x in pages
        ]

        for group in page_group:
            page_content = " ".join(
                [parse_output(element, self.output_type) for element in group]
            )

            _docs.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "page": group[0]["page"],
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            )

        return _docs

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the specified
        split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to parse the document in batches.
                Defaults to False (single page parsing).

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If an invalid split type is provided.
        """
        if is_batch:
            num_pages = DEFAULT_NUMBER_OF_PAGE
        else:
            num_pages = 1

        full_docs = fitz.open(blob.path)
        number_of_pages = full_docs.page_count

        if self.split == "none":
            if full_docs.is_pdf:
                result = ""
                start_page = 0
                num_pages = DEFAULT_NUMBER_OF_PAGE
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    result += parse_output(response, self.output_type)

                    start_page += num_pages
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")
                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})
                result = parse_output(response, self.output_type)

            yield Document(
                page_content=result,
                metadata={
                    "total_pages": number_of_pages,
                    "type": self.output_type,
                    "split": self.split,
                },
            )

        elif self.split == "element":
            if full_docs.is_pdf:
                start_page = 0
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    for element in response["elements"]:
                        yield self._element_document(element)

                    start_page += num_pages
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")
                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})

                for element in response["elements"]:
                    yield self._element_document(element)

        elif self.split == "page":
            if full_docs.is_pdf:
                start_page = 0
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    elements = response["elements"]
                    yield from self._page_document(elements)

                    start_page += num_pages
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")
                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})

                elements = response["elements"]

                yield from self._page_document(elements)

        else:
            raise ValueError(f"Invalid split type: {self.split}")
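The dict form of output_type accepted by parse_output above maps an element category to the field used for rendering, with every other category falling back to the element's "text"; a small illustration with a made-up element:

    from langchain_upstage.layout_analysis_parsers import parse_output

    # Hypothetical element, shaped like the entries in the API's "elements" list.
    element = {"category": "table", "text": "a b", "html": "<table>...</table>"}

    # Render tables as HTML, every other category as plain text.
    assert parse_output(element, {"table": "html"}) == "<table>...</table>"
    assert parse_output(element, "text") == "a b"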
libs/partners/upstage/langchain_upstage/tools/groundedness_check.py

@@ -0,0 +1,91 @@
import os
from typing import Literal, Optional, Type, Union

from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr
from langchain_core.tools import BaseTool

from langchain_upstage import ChatUpstage


class GroundednessCheckInput(BaseModel):
    """Input for the Groundedness Check tool."""

    context: str = Field(description="context in which the answer should be verified")
    query: str = Field(
        description="assistant's reply or a text that is subject to groundedness check"
    )


class GroundednessCheck(BaseTool):
    """Tool that checks the groundedness of a context and an assistant message.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import GroundednessCheck

            tool = GroundednessCheck()
    """

    name: str = "groundedness_check"
    description: str = (
        "A tool that checks the groundedness of an assistant response "
        "to user-provided context. GroundednessCheck ensures that "
        "the assistant’s response is not only relevant but also "
        "precisely aligned with the user's initial context, "
        "promoting a more reliable and context-aware interaction. "
        "When using retrieval-augmented generation (RAG), "
        "the Groundedness Check can be used to determine whether "
        "the assistant's message is grounded in the provided context."
    )
    upstage_api_key: Optional[SecretStr] = Field(default=None, alias="api_key")
    api_wrapper: ChatUpstage

    args_schema: Type[BaseModel] = GroundednessCheckInput

    def __init__(self, upstage_api_key: Optional[SecretStr] = None):
        if not upstage_api_key:
            upstage_api_key = SecretStr(os.getenv("UPSTAGE_API_KEY", ""))
        if (
            not upstage_api_key
            or not upstage_api_key.get_secret_value()
            or upstage_api_key.get_secret_value() == ""
        ):
            raise ValueError("UPSTAGE_API_KEY must be set or passed")

        api_wrapper = ChatUpstage(
            model_name="solar-1-mini-answer-verification",
            upstage_api_key=upstage_api_key.get_secret_value(),
        )
        super().__init__(upstage_api_key=upstage_api_key, api_wrapper=api_wrapper)

    def _run(
        self,
        context: str,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> Union[str, Literal["grounded", "notGrounded", "notSure"]]:
        """Use the tool."""
        response = self.api_wrapper.invoke([HumanMessage(context), AIMessage(query)])
        return str(response.content)

    async def _arun(
        self,
        context: str,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> Union[str, Literal["grounded", "notGrounded", "notSure"]]:
        response = await self.api_wrapper.ainvoke(
            [HumanMessage(context), AIMessage(query)]
        )
        return str(response.content)
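A usage sketch for the tool above; it needs a real UPSTAGE_API_KEY in the environment, and the strings are placeholders (the integration test below exercises the same call shape):

    from langchain_upstage import GroundednessCheck

    tool = GroundednessCheck()  # reads UPSTAGE_API_KEY from the environment
    verdict = tool.run(
        {
            "context": "Mauna Kea is an inactive volcano on the island of Hawai'i.",
            "query": "Mauna Kea is 5,207.3 meters tall.",
        }
    )
    print(verdict)  # expected to be "grounded", "notGrounded", or "notSure"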
86  libs/partners/upstage/poetry.lock  generated
@@ -223,13 +223,13 @@ test = ["pytest (>=6)"]
 
 [[package]]
 name = "freezegun"
-version = "1.4.0"
+version = "1.5.0"
 description = "Let your Python tests travel through time"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "freezegun-1.4.0-py3-none-any.whl", hash = "sha256:55e0fc3c84ebf0a96a5aa23ff8b53d70246479e9a68863f1fcac5a3e52f19dd6"},
-    {file = "freezegun-1.4.0.tar.gz", hash = "sha256:10939b0ba0ff5adaecf3b06a5c2f73071d9678e507c5eaedb23c761d56ac774b"},
+    {file = "freezegun-1.5.0-py3-none-any.whl", hash = "sha256:ec3f4ba030e34eb6cf7e1e257308aee2c60c3d038ff35996d7475760c9ff3719"},
+    {file = "freezegun-1.5.0.tar.gz", hash = "sha256:200a64359b363aa3653d8aac289584078386c7c3da77339d257e46a01fb5c77c"},
 ]
 
 [package.dependencies]

@@ -340,7 +340,7 @@ files = [
 
 [[package]]
 name = "langchain-core"
-version = "0.1.44"
+version = "0.1.45"
 description = "Building applications with LLMs through composability"
 optional = false
 python-versions = ">=3.8.1,<4.0"

@@ -399,13 +399,13 @@ url = "../../standard-tests"
 
 [[package]]
 name = "langsmith"
-version = "0.1.49"
+version = "0.1.50"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "langsmith-0.1.49-py3-none-any.whl", hash = "sha256:cf0db7474c0dfb22015c22bf97f62e850898c3c6af9564dd111c2df225acc1c8"},
-    {file = "langsmith-0.1.49.tar.gz", hash = "sha256:5aee8537763f9d62b3368d79d7bfef881e2bfaa28639011d8d7328770cbd6419"},
+    {file = "langsmith-0.1.50-py3-none-any.whl", hash = "sha256:a81e9809fcaa277bfb314d729e58116554f186d1478fcfdf553b1c2ccce54b85"},
+    {file = "langsmith-0.1.50.tar.gz", hash = "sha256:9fd22df8c689c044058536ea5af66f5302067e7551b60d7a335fede8d479572b"},
 ]
 
 [package.dependencies]

@@ -548,13 +548,13 @@ files = [
 
 [[package]]
 name = "openai"
-version = "1.23.1"
+version = "1.23.3"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-1.23.1-py3-none-any.whl", hash = "sha256:7941c1bc6fcdb1b6b889dfcfabff775ca52558a79d57dd1b9e15b463de1b3a4c"},
-    {file = "openai-1.23.1.tar.gz", hash = "sha256:6df937e2a1ad64494951ea3614f5516db4d67c3fcc0b751b8e5edf1bc57e2d3d"},
+    {file = "openai-1.23.3-py3-none-any.whl", hash = "sha256:6eef764a8870095d256d59e6be243acf560a21227e9e3588b508972818929ef7"},
+    {file = "openai-1.23.3.tar.gz", hash = "sha256:6730b8468a0235e5f289dfdfacaa374001645099c4ad1740b58eab378bcf7723"},
 ]
 
 [package.dependencies]

@@ -642,13 +642,13 @@ files = [
 
 [[package]]
 name = "pluggy"
-version = "1.4.0"
+version = "1.5.0"
 description = "plugin and hook calling mechanisms for python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"},
-    {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"},
+    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
+    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
 ]
 
 [package.extras]

@@ -722,6 +722,64 @@ files = [
 plugins = ["importlib-metadata"]
 windows-terminal = ["colorama (>=0.4.6)"]
 
+[[package]]
+name = "pymupdf"
+version = "1.24.2"
+description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "PyMuPDF-1.24.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:5faed2bbdfbea80db1bbaa5944888f27a672f2e10e16e61f7d4ff73429a00506"},
+    {file = "PyMuPDF-1.24.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:24c398e43a14e0e11f3515ea57875b5b0ee1a37d6dc59f921f69d8d16e881cb8"},
+    {file = "PyMuPDF-1.24.2-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:569336fe3c5f81f28aa9658861597e43e5716cbfa5b8d2602431095df76e0d7c"},
+    {file = "PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:8fe58a024629c23847423b3294f0f160c72c72f953af53d183bd3328f954593a"},
+    {file = "PyMuPDF-1.24.2-cp310-none-win32.whl", hash = "sha256:49224a558736303ed980252a704646fe9347c74bf70d0ad32530c62b8e0bfe33"},
+    {file = "PyMuPDF-1.24.2-cp310-none-win_amd64.whl", hash = "sha256:a32c94c7ee45f2bfee766e5b957bdfe08c96b21fd9adbfb546c141621af0ca85"},
+    {file = "PyMuPDF-1.24.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:815d9e10faa43a149d8c9928d7cefda83fd91a1f637dfb3a295620175a0af95c"},
+    {file = "PyMuPDF-1.24.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:b583add37141a9337935d014d4e1913b10e22d17f3fd656fdc5f0c0c2e65a33e"},
+    {file = "PyMuPDF-1.24.2-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:d6a4d4ad8cc698db25a31026311f03fd351c2db9bfd41d898494cd0baff3b679"},
+    {file = "PyMuPDF-1.24.2-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:7b5acb936203bdaef5945f211af8a5fb40f07059be1ba69a728425f6d522e60f"},
+    {file = "PyMuPDF-1.24.2-cp311-none-win32.whl", hash = "sha256:d01d348a35438f8a1647334428ef23c6d947acae875fa61cac2be3a65b15e4f5"},
+    {file = "PyMuPDF-1.24.2-cp311-none-win_amd64.whl", hash = "sha256:909ab62c752be80c3c130a9774fc27fb863d26149ba880129e0a2cf0d53cebde"},
+    {file = "PyMuPDF-1.24.2-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:6a3c1f2e99a4ca43c97b1f43fdd1aed739910e25ee5bd7fe73cd4eaf59841ae3"},
+    {file = "PyMuPDF-1.24.2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:3effff62943ceebbbe32a08ce4aa9c8ed4fa18fd8a462cf6130c78818c47822d"},
+    {file = "PyMuPDF-1.24.2-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:f3964783bf81f2ec94db4f9fa536052be3b7457824c9e9d21edb91f3a489a377"},
+    {file = "PyMuPDF-1.24.2-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:04af266755d4c250b46a3311062aec36ea94cecc4470a53ab79d9de56e5a753d"},
+    {file = "PyMuPDF-1.24.2-cp312-none-win32.whl", hash = "sha256:3bd7bdda4c4e4f98989ce84a7b2c08033639a8be1b46fb064fdd65b20a7e7d03"},
+    {file = "PyMuPDF-1.24.2-cp312-none-win_amd64.whl", hash = "sha256:ec2544f35088b29730210decfb0bdb750e0c3d2652ee470897f6d2e4a6dc1950"},
+    {file = "PyMuPDF-1.24.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:d4fd3957fd507affbcae4536092cb3e3726e91d484be16972603c5cacae7848a"},
+    {file = "PyMuPDF-1.24.2-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:4290273dfcc58a2c0b1f207f5e25479b868f59e9ea6ac9241740506fa0c03c0a"},
+    {file = "PyMuPDF-1.24.2-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:8f52f27d1f5968b6dda4d803e7f5246626a45ab68f0626509a9e17fadcebfb69"},
+    {file = "PyMuPDF-1.24.2-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:db650840eb3efbdc97df94210d0400042c863b08348d67037495d221ec4e8b7f"},
+    {file = "PyMuPDF-1.24.2-cp38-none-win32.whl", hash = "sha256:423217223741f55f9bb7622475a94c2934495e8a843246c582c78f3680914a80"},
+    {file = "PyMuPDF-1.24.2-cp38-none-win_amd64.whl", hash = "sha256:ca493fbb91d81a43d68d3547194d0c86083db49d4dd98e8f41aa5a77a26ff8fe"},
+    {file = "PyMuPDF-1.24.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:9783b67f63e7f9b397f119de996ea8214498d163531b9371d8ea7e374cdd45cd"},
+    {file = "PyMuPDF-1.24.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:4db161926d636c0bff016ac7591edbe6b30712507079f7008cefb0fdf58055dc"},
+    {file = "PyMuPDF-1.24.2-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:537cc7bef86514a6fa248eeb14b588f51699388628372cf31bae7839283aa628"},
+    {file = "PyMuPDF-1.24.2-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:a124b360898d24b730fe3be0e0bca438789c1568ceaad854387eee1886bb788c"},
+    {file = "PyMuPDF-1.24.2-cp39-none-win32.whl", hash = "sha256:007586883fbc8acb900d46aa95520aaeb8943d05a956b26c54053ddb58dbdd5f"},
+    {file = "PyMuPDF-1.24.2-cp39-none-win_amd64.whl", hash = "sha256:d89cbb1a093dbf042f503f5c7fc368d0718a652418512a7a42a2965cba27713d"},
+    {file = "PyMuPDF-1.24.2.tar.gz", hash = "sha256:cdaca48b7677a0c1dc827413b90c8fe4517f789f74c6ac0fb47f6051368246bb"},
+]
+
+[package.dependencies]
+PyMuPDFb = "1.24.1"
+
+[[package]]
+name = "pymupdfb"
+version = "1.24.1"
+description = "MuPDF shared libraries for PyMuPDF."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "PyMuPDFb-1.24.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:37179e363bf69ce9be637937c5469957b96968341dabe3ce8f4b690a82e9ad92"},
+    {file = "PyMuPDFb-1.24.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:17444ea7d6897c27759880ad76af537d19779f901de82ae9548598a70f614558"},
+    {file = "PyMuPDFb-1.24.1-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:490f7fff4dbe362bc895cefdfc5030d712311d024d357a1388d64816eb215d34"},
+    {file = "PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0fbcc0d2a9ce79fa38eb4e8bb5c959b582f7a49938874e9f61d1a6f5eeb1e4b8"},
+    {file = "PyMuPDFb-1.24.1-py3-none-win32.whl", hash = "sha256:ae67736058882cdd9459810a4aae9ac2b2e89ac2e916cb5fefb0f651c9739e9e"},
+    {file = "PyMuPDFb-1.24.1-py3-none-win_amd64.whl", hash = "sha256:01c8b7f0ce9166310eb28c7aebcb8d5fe12a4bc082f9b00d580095eebeaf0af5"},
+]
+
 [[package]]
 name = "pytest"
 version = "7.4.4"

@@ -1270,4 +1328,4 @@ watchmedo = ["PyYAML (>=3.10)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "98a8d67be9138240d5190eb4774b93f671fbd8069839ad239d005c753bdbae0d"
+content-hash = "1bb654e8a4f60cca5f0562ade5477a2f2e852ed2a361c7e9162208fbeb445309"
libs/partners/upstage/pyproject.toml

@@ -14,6 +14,8 @@ license = "MIT"
 python = ">=3.8.1,<4.0"
 langchain-core = "^0.1.44"
 langchain-openai = "^0.1.3"
+pymupdf = "^1.24.1"
+requests = "^2.31.0"
 
 [tool.poetry.group.test]
 optional = true

@@ -50,6 +52,7 @@ ruff = "^0.1.5"
 
 [tool.poetry.group.typing.dependencies]
 mypy = "^0.991"
+types-requests = ">=2.31.0"
 langchain-core = { path = "../../core", develop = true }
 
 [tool.poetry.group.dev]
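The two new runtime dependencies back the imports in layout_analysis_parsers.py; a quick check that they resolve after a fresh install:

    import fitz  # installed via the new "pymupdf" dependency
    import requests  # installed via the new "requests" dependency

    print(fitz.__doc__)  # PyMuPDF banner, including its version
    print(requests.__version__)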
BIN  libs/partners/upstage/tests/examples/solar.pdf  Normal file
Binary file not shown.
@@ -0,0 +1,17 @@
from langchain_upstage import GroundednessCheck


def test_langchain_upstage_groundedness_check() -> None:
    """Test Upstage Groundedness Check."""
    tool = GroundednessCheck()
    output = tool.run({"context": "foo bar", "query": "bar foo"})

    assert output in ["grounded", "notGrounded", "notSure"]


async def test_langchain_upstage_groundedness_check_async() -> None:
    """Test Upstage Groundedness Check asynchronously."""
    tool = GroundednessCheck()
    output = await tool.arun({"context": "foo bar", "query": "bar foo"})

    assert output in ["grounded", "notGrounded", "notSure"]
@@ -0,0 +1,10 @@
import os

from langchain_upstage import GroundednessCheck

os.environ["UPSTAGE_API_KEY"] = "foo"


def test_initialization() -> None:
    """Test groundedness check tool initialization."""
    GroundednessCheck()
@@ -3,6 +3,9 @@ from langchain_upstage import __all__
 EXPECTED_ALL = [
     "ChatUpstage",
     "UpstageEmbeddings",
+    "UpstageLayoutAnalysisLoader",
+    "UpstageLayoutAnalysisParser",
+    "GroundednessCheck",
 ]
 
 
200  libs/partners/upstage/tests/unit_tests/test_layout_analysis.py  Normal file

@@ -0,0 +1,200 @@
from pathlib import Path
from typing import Any, Dict, get_args
from unittest.mock import MagicMock, Mock, patch

from langchain_upstage import UpstageLayoutAnalysisLoader
from langchain_upstage.layout_analysis import OutputType, SplitType

MOCK_RESPONSE_JSON: Dict[str, Any] = {
    "api": "1.0",
    "billed_pages": 1,
    "elements": [
        {
            "bounding_box": [
                {"x": 74, "y": 906},
                {"x": 148, "y": 906},
                {"x": 148, "y": 2338},
                {"x": 74, "y": 2338},
            ],
            "category": "header",
            "html": "2021arXiv:2103.15348v2",
            "id": 0,
            "page": 1,
            "text": "arXiv:2103.15348v2",
        },
        {
            "bounding_box": [
                {"x": 654, "y": 474},
                {"x": 1912, "y": 474},
                {"x": 1912, "y": 614},
                {"x": 654, "y": 614},
            ],
            "category": "paragraph",
            "html": "LayoutParser Toolkit",
            "id": 1,
            "page": 1,
            "text": "LayoutParser Toolkit",
        },
    ],
    "html": "<header id='0'>arXiv:2103.15348v2</header>"
    + "<p id='1'>LayoutParser Toolkit</p>",
    "mimetype": "multipart/form-data",
    "model": "layout-analyzer-0.1.0",
    "text": "arXiv:2103.15348v2LayoutParser Toolkit",
}

EXAMPLE_PDF_PATH = Path(__file__).parent.parent / "examples/solar.pdf"


def test_initialization() -> None:
    """Test layout analysis document loader initialization."""
    UpstageLayoutAnalysisLoader(file_path=EXAMPLE_PDF_PATH, api_key="bar")


def test_layout_analysis_param() -> None:
    for output_type in get_args(OutputType):
        for split in get_args(SplitType):
            loader = UpstageLayoutAnalysisLoader(
                file_path=EXAMPLE_PDF_PATH,
                api_key="bar",
                output_type=output_type,
                split=split,
            )
            assert loader.output_type == output_type
            assert loader.split == split
            assert loader.api_key == "bar"
            assert loader.file_path == EXAMPLE_PDF_PATH


@patch("requests.post")
def test_none_split_text_output(mock_post: Mock) -> None:
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="none",
        api_key="valid_api_key",
    )
    documents = loader.load()

    assert len(documents) == 1
    assert documents[0].page_content == MOCK_RESPONSE_JSON["text"]
    assert documents[0].metadata["total_pages"] == 1
    assert documents[0].metadata["type"] == "text"
    assert documents[0].metadata["split"] == "none"


@patch("requests.post")
def test_element_split_text_output(mock_post: Mock) -> None:
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="element",
        api_key="valid_api_key",
    )
    documents = loader.load()

    assert len(documents) == 2

    for i, document in enumerate(documents):
        assert document.page_content == MOCK_RESPONSE_JSON["elements"][i]["text"]
        assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"]
        assert document.metadata["id"] == MOCK_RESPONSE_JSON["elements"][i]["id"]
        assert document.metadata["type"] == "text"
        assert document.metadata["split"] == "element"


@patch("requests.post")
def test_page_split_text_output(mock_post: Mock) -> None:
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="page",
        api_key="valid_api_key",
    )
    documents = loader.load()

    assert len(documents) == 1

    for i, document in enumerate(documents):
        assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"]
        assert document.metadata["type"] == "text"
        assert document.metadata["split"] == "page"


@patch("requests.post")
def test_none_split_html_output(mock_post: Mock) -> None:
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="none",
        api_key="valid_api_key",
    )
    documents = loader.load()

    assert len(documents) == 1
    assert documents[0].page_content == MOCK_RESPONSE_JSON["html"]
    assert documents[0].metadata["total_pages"] == 1
    assert documents[0].metadata["type"] == "html"
    assert documents[0].metadata["split"] == "none"


@patch("requests.post")
def test_element_split_html_output(mock_post: Mock) -> None:
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="element",
        api_key="valid_api_key",
    )
    documents = loader.load()

    assert len(documents) == 2

    for i, document in enumerate(documents):
        assert document.page_content == MOCK_RESPONSE_JSON["elements"][i]["html"]
        assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"]
        assert document.metadata["id"] == MOCK_RESPONSE_JSON["elements"][i]["id"]
        assert document.metadata["type"] == "html"
        assert document.metadata["split"] == "element"


@patch("requests.post")
def test_page_split_html_output(mock_post: Mock) -> None:
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="page",
        api_key="valid_api_key",
    )
    documents = loader.load()

    assert len(documents) == 1

    for i, document in enumerate(documents):
        assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"]
        assert document.metadata["type"] == "html"
        assert document.metadata["split"] == "page"