upstage: Add Upstage partner package LA and GC (#20651)

---------

Co-authored-by: Sean <chosh0615@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Sean Cho <sean@upstage.ai>
This commit is contained in:
junkeon
2024-04-25 07:17:20 +09:00
committed by GitHub
parent 5ecebf168c
commit c8fd51e8c8
18 changed files with 1471 additions and 19 deletions

View File

@@ -1,4 +1,13 @@
from langchain_upstage.chat_models import ChatUpstage
from langchain_upstage.embeddings import UpstageEmbeddings
from langchain_upstage.layout_analysis import UpstageLayoutAnalysisLoader
from langchain_upstage.layout_analysis_parsers import UpstageLayoutAnalysisParser
from langchain_upstage.tools.groundedness_check import GroundednessCheck
# Public API of the langchain_upstage package.
# (The original text contained a stale, immediately-overwritten first
# assignment of __all__; only the final, complete list is kept.)
__all__ = [
    "ChatUpstage",
    "UpstageEmbeddings",
    "UpstageLayoutAnalysisLoader",
    "UpstageLayoutAnalysisParser",
    "GroundednessCheck",
]

View File

@@ -0,0 +1,190 @@
import os
from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union
from langchain_core.document_loaders import BaseLoader, Blob
from langchain_core.documents import Document
from .layout_analysis_parsers import UpstageLayoutAnalysisParser
# Number of pages submitted to the layout-analysis API per request when batching.
DEFAULT_PAGE_BATCH_SIZE = 10
# Output format for parsed content: plain text or HTML markup.
OutputType = Literal["text", "html"]
# Split granularity of the resulting Documents: whole file, per element, or per page.
SplitType = Literal["none", "element", "page"]
def validate_api_key(api_key: str) -> None:
    """Ensure a non-empty API key was supplied.

    Args:
        api_key (str): The API key to check.

    Raises:
        ValueError: If the key is empty or ``None``.
    """
    if api_key:
        return
    raise ValueError("API Key is required for Upstage Document Loader")
def validate_file_path(file_path: Union[str, Path, List[str], List[Path]]) -> None:
    """Check that the given path — or each path in a list — exists on disk.

    Args:
        file_path (Union[str, Path, List[str], List[Path]]): The file path(s)
            to be validated.

    Raises:
        FileNotFoundError: If the file or any of the files in the list do not
            exist.
    """
    candidates = file_path if isinstance(file_path, list) else [file_path]
    for candidate in candidates:
        if not os.path.exists(candidate):
            raise FileNotFoundError(f"File not found: {candidate}")
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a configuration value.

    Resolution order: explicit ``param``, then a non-empty environment
    variable ``env_key``, then ``default``. Raises when nothing matches.
    """
    if param is not None:
        return param
    if env_key:
        env_value = os.environ.get(env_key)
        if env_value:
            return env_value
    if default is not None:
        return default
    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageLayoutAnalysisLoader(BaseLoader):
    """Upstage Layout Analysis document loader.

    To use, you should have the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisLoader

            file_path = "/PATH/TO/YOUR/FILE.pdf"
            loader = UpstageLayoutAnalysisLoader(
                file_path, split="page", output_type="text"
            )
    """

    def __init__(
        self,
        file_path: Union[str, Path, List[str], List[Path]],
        output_type: Union[OutputType, dict] = "text",
        split: SplitType = "none",
        api_key: Optional[str] = None,
        use_ocr: bool = False,
    ):
        """
        Initializes an instance of the Upstage document loader.

        Args:
            file_path (Union[str, Path, List[str], List[Path]]): The path(s) to
                the document(s) to be loaded.
            output_type (Union[OutputType, dict], optional): The type of output
                to be generated by the parser. Defaults to "text".
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            api_key (str, optional): The API key for accessing the Upstage API.
                Defaults to None, in which case it will be fetched from the
                environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`.
            use_ocr (bool, optional): Extract text from images in the document.
                Defaults to False. (Use text info in PDF file)

        Raises:
            FileNotFoundError: If any of the given paths does not exist.
            ValueError: If no API key could be resolved.
        """
        self.file_path = file_path
        self.output_type = output_type
        self.split = split
        self.api_key = get_from_param_or_env(
            "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY"
        )
        self.use_ocr = use_ocr

        validate_file_path(self.file_path)
        validate_api_key(self.api_key)

    def _create_parser(self) -> UpstageLayoutAnalysisParser:
        """Build a parser configured with this loader's settings.

        Extracted to remove the four identical constructor calls the original
        load()/lazy_load() implementations contained.
        """
        return UpstageLayoutAnalysisParser(
            self.api_key,
            split=self.split,
            output_type=self.output_type,
            use_ocr=self.use_ocr,
        )

    def load(self) -> List[Document]:
        """
        Loads and parses the document using the UpstageLayoutAnalysisParser.

        Returns:
            A list of Document objects representing the parsed layout analysis.
        """
        if isinstance(self.file_path, list):
            result: List[Document] = []
            for file_path in self.file_path:
                blob = Blob.from_path(file_path)
                result.extend(self._create_parser().lazy_parse(blob, is_batch=True))
            return result

        blob = Blob.from_path(self.file_path)
        return list(self._create_parser().lazy_parse(blob, is_batch=True))

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily loads and parses the document using the UpstageLayoutAnalysisParser.

        Returns:
            An iterator of Document objects representing the parsed layout analysis.
        """
        if isinstance(self.file_path, list):
            for file_path in self.file_path:
                blob = Blob.from_path(file_path)
                yield from self._create_parser().lazy_parse(blob, is_batch=True)
        else:
            blob = Blob.from_path(self.file_path)
            # NOTE(review): unlike load(), the single-file path parses without
            # batching (is_batch defaults to False) — preserved from the original.
            yield from self._create_parser().lazy_parse(blob)

View File

@@ -0,0 +1,375 @@
import io
import json
import os
from typing import Dict, Iterator, List, Literal, Optional, Union
import fitz # type: ignore
import requests
from fitz import Document as fitzDocument
from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document
# Endpoint of the Upstage document-ai layout-analysis API.
LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis"
# Number of PDF pages sent to the API in a single request when batching.
DEFAULT_NUMBER_OF_PAGE = 10
# Output format for parsed content: plain text or HTML markup.
OutputType = Literal["text", "html"]
# Split granularity of the resulting Documents: whole file, per element, or per page.
SplitType = Literal["none", "element", "page"]
def validate_api_key(api_key: str) -> None:
    """Reject a missing or empty API key.

    Args:
        api_key (str): The API key to be validated.

    Raises:
        ValueError: If the API key is empty or ``None``.
    """
    if api_key:
        return
    raise ValueError("API Key is required for Upstage Document Loader")
def validate_file_path(file_path: str) -> None:
    """Raise unless a file exists at ``file_path``.

    Args:
        file_path (str): The path to the file.

    Raises:
        FileNotFoundError: If the file does not exist at the given file path.
    """
    if os.path.exists(file_path):
        return
    raise FileNotFoundError(f"File not found: {file_path}")
def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str:
    """
    Parse the output data based on the specified output type.

    Args:
        data (dict): The data to be parsed.
        output_type (Union[OutputType, dict]): The output type to parse the
            element data into. A dict maps element categories to the field
            that should be extracted; unmapped categories fall back to "text".

    Returns:
        str: The parsed output.

    Raises:
        ValueError: If the output type is invalid.
    """
    if isinstance(output_type, dict):
        # Per-category field selection with "text" as the fallback field.
        field = output_type.get(data["category"], "text")
        return data[field]
    if isinstance(output_type, str):
        if output_type == "text":
            return data["text"]
        if output_type == "html":
            return data["html"]
        raise ValueError(f"Invalid output type: {output_type}")
    raise ValueError(f"Invalid output type: {output_type}")
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Get a value from a param or an environment variable.

    Precedence: explicit ``param``, then a non-empty ``os.environ[env_key]``,
    then ``default``; raises ``ValueError`` when none is available.
    """
    if param is not None:
        return param

    from_env = os.environ.get(env_key) if env_key else None
    if from_env:
        return from_env

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageLayoutAnalysisParser(BaseBlobParser):
    """Upstage Layout Analysis Parser.

    To use, you should have the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisParser

            loader = UpstageLayoutAnalysisParser(split="page", output_type="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        output_type: Union[OutputType, dict] = "text",
        split: SplitType = "none",
        use_ocr: bool = False,
    ):
        """
        Initializes an instance of the Upstage layout-analysis parser.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                Defaults to None, in which case it will be fetched from the
                environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`.
            output_type (Union[OutputType, dict], optional): The type of output
                to be generated by the parser. Defaults to "text".
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            use_ocr (bool, optional): Extract text from images in the document.
                Defaults to False. (Use text info in PDF file)
        """
        self.api_key = get_from_param_or_env(
            "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY"
        )
        self.output_type = output_type
        self.split = split
        self.use_ocr = use_ocr

        validate_api_key(self.api_key)

    def _get_response(self, files: Dict) -> Dict:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the parsed JSON response.

        Args:
            files (dict): A dictionary containing the files to be sent in the
                request.

        Returns:
            dict: The JSON response from the API.

        Raises:
            ValueError: If the request fails or the response body is not valid
                JSON.
        """
        try:
            headers = {"Authorization": f"Bearer {self.api_key}"}
            options = {"ocr": self.use_ocr}
            response = requests.post(
                LAYOUT_ANALYSIS_URL, headers=headers, files=files, json=options
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as req_err:
            # BUGFIX: the original printed the error, swallowed it, and then
            # executed `return result`, raising UnboundLocalError because
            # `result` was never assigned. Propagate the failure instead.
            raise ValueError(
                f"Failed to send request to layout analysis API: {req_err}"
            ) from req_err
        except json.JSONDecodeError as json_err:
            raise ValueError(
                f"Failed to decode JSON response: {json_err}"
            ) from json_err

    def _split_and_request(
        self,
        full_docs: fitzDocument,
        start_page: int,
        num_pages: int = DEFAULT_NUMBER_OF_PAGE,
    ) -> Dict:
        """
        Splits the full pdf document into partial pages and sends a request to
        the server.

        Args:
            full_docs (fitzDocument): The full document to be split and requested.
            start_page (int): The starting page number for splitting the document.
            num_pages (int, optional): The number of pages to split the document
                into. Defaults to DEFAULT_NUMBER_OF_PAGE.

        Returns:
            response: The response from the server.
        """
        # Copy the requested page range into a fresh in-memory PDF.
        with fitz.open() as chunk_pdf:
            chunk_pdf.insert_pdf(
                full_docs,
                from_page=start_page,
                to_page=start_page + num_pages - 1,
            )
            pdf_bytes = chunk_pdf.write()

        with io.BytesIO(pdf_bytes) as f:
            response = self._get_response({"document": f})

        return response

    def _element_document(self, elements: Dict) -> Document:
        """
        Converts a single layout element into a Document object.

        Args:
            elements (Dict): The element to convert.

        Returns:
            Document: A Document holding the element's parsed content and
                metadata.
        """
        return Document(
            page_content=(parse_output(elements, self.output_type)),
            metadata={
                "page": elements["page"],
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
            },
        )

    def _page_document(self, elements: List[Dict]) -> List[Document]:
        """
        Combines elements with the same page number into a single Document
        object.

        Args:
            elements (List[Dict]): A list of elements containing page numbers.

        Returns:
            List[Document]: A list of Document objects, each representing a page
                with its content and metadata.
        """
        _docs = []
        # Group elements by page, preserving ascending page order.
        pages = sorted(set(map(lambda x: x["page"], elements)))

        page_group = [
            [element for element in elements if element["page"] == x] for x in pages
        ]

        for group in page_group:
            page_content = " ".join(
                [parse_output(element, self.output_type) for element in group]
            )

            _docs.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "page": group[0]["page"],
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            )

        return _docs

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the
        specified split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to send multiple pages per API
                request. Defaults to False (one page per request).

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If an invalid split type is provided.
        """
        if is_batch:
            num_pages = DEFAULT_NUMBER_OF_PAGE
        else:
            num_pages = 1

        full_docs = fitz.open(blob.path)
        number_of_pages = full_docs.page_count

        if self.split == "none":
            if full_docs.is_pdf:
                result = ""
                start_page = 0
                # With no splitting the whole file is concatenated anyway, so
                # always request the maximum batch size regardless of is_batch.
                num_pages = DEFAULT_NUMBER_OF_PAGE
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    result += parse_output(response, self.output_type)

                    start_page += num_pages
            else:
                # Non-PDF files are uploaded whole in a single request.
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")

                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})
                    result = parse_output(response, self.output_type)

            yield Document(
                page_content=result,
                metadata={
                    "total_pages": number_of_pages,
                    "type": self.output_type,
                    "split": self.split,
                },
            )

        elif self.split == "element":
            if full_docs.is_pdf:
                start_page = 0
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    for element in response["elements"]:
                        yield self._element_document(element)

                    start_page += num_pages
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")

                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})
                    for element in response["elements"]:
                        yield self._element_document(element)

        elif self.split == "page":
            if full_docs.is_pdf:
                start_page = 0
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    elements = response["elements"]
                    yield from self._page_document(elements)

                    start_page += num_pages
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")

                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})
                    elements = response["elements"]
                    yield from self._page_document(elements)

        else:
            raise ValueError(f"Invalid split type: {self.split}")

View File

@@ -0,0 +1,91 @@
import os
from typing import Literal, Optional, Type, Union
from langchain_core.callbacks import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr
from langchain_core.tools import BaseTool
from langchain_upstage import ChatUpstage
class GroundednessCheckInput(BaseModel):
    """Input for the Groundedness Check tool."""

    # Reference text that the reply must be supported by.
    context: str = Field(description="context in which the answer should be verified")
    # The assistant reply (or arbitrary text) whose groundedness is checked.
    query: str = Field(
        description="assistant's reply or a text that is subject to groundedness check"
    )
class GroundednessCheck(BaseTool):
    """Tool that checks the groundedness of a context and an assistant message.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import GroundednessCheck

            tool = GroundednessCheck()
    """

    name: str = "groundedness_check"
    description: str = (
        "A tool that checks the groundedness of an assistant response "
        "to user-provided context. GroundednessCheck ensures that "
        "the assistants response is not only relevant but also "
        "precisely aligned with the user's initial context, "
        "promoting a more reliable and context-aware interaction. "
        "When using retrieval-augmented generation (RAG), "
        "the Groundedness Check can be used to determine whether "
        "the assistant's message is grounded in the provided context."
    )

    # API key for the Upstage API; resolved from UPSTAGE_API_KEY when not given.
    upstage_api_key: Optional[SecretStr] = Field(default=None, alias="api_key")
    # Chat model that performs the verification.
    api_wrapper: ChatUpstage

    args_schema: Type[BaseModel] = GroundednessCheckInput

    def __init__(self, upstage_api_key: Optional[SecretStr] = None):
        """Initialize the tool, resolving the API key from the argument or the
        `UPSTAGE_API_KEY` environment variable.

        Raises:
            ValueError: If no non-empty API key could be resolved.
        """
        # (The original contained a dead `else: upstage_api_key = upstage_api_key`
        # branch and a redundant triple emptiness check; both removed.)
        if not upstage_api_key:
            # An unset environment variable yields SecretStr("").
            upstage_api_key = SecretStr(os.getenv("UPSTAGE_API_KEY", ""))

        if not upstage_api_key.get_secret_value():
            raise ValueError("UPSTAGE_API_KEY must be set or passed")

        api_wrapper = ChatUpstage(
            model_name="solar-1-mini-answer-verification",
            upstage_api_key=upstage_api_key.get_secret_value(),
        )
        super().__init__(upstage_api_key=upstage_api_key, api_wrapper=api_wrapper)

    def _run(
        self,
        context: str,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> Union[str, Literal["grounded", "notGrounded", "notSure"]]:
        """Use the tool: ask the model whether `query` is grounded in `context`."""
        response = self.api_wrapper.invoke([HumanMessage(context), AIMessage(query)])
        return str(response.content)

    async def _arun(
        self,
        context: str,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> Union[str, Literal["grounded", "notGrounded", "notSure"]]:
        """Async variant of :meth:`_run`."""
        response = await self.api_wrapper.ainvoke(
            [HumanMessage(context), AIMessage(query)]
        )
        return str(response.content)

View File

@@ -223,13 +223,13 @@ test = ["pytest (>=6)"]
[[package]]
name = "freezegun"
version = "1.4.0"
version = "1.5.0"
description = "Let your Python tests travel through time"
optional = false
python-versions = ">=3.7"
files = [
{file = "freezegun-1.4.0-py3-none-any.whl", hash = "sha256:55e0fc3c84ebf0a96a5aa23ff8b53d70246479e9a68863f1fcac5a3e52f19dd6"},
{file = "freezegun-1.4.0.tar.gz", hash = "sha256:10939b0ba0ff5adaecf3b06a5c2f73071d9678e507c5eaedb23c761d56ac774b"},
{file = "freezegun-1.5.0-py3-none-any.whl", hash = "sha256:ec3f4ba030e34eb6cf7e1e257308aee2c60c3d038ff35996d7475760c9ff3719"},
{file = "freezegun-1.5.0.tar.gz", hash = "sha256:200a64359b363aa3653d8aac289584078386c7c3da77339d257e46a01fb5c77c"},
]
[package.dependencies]
@@ -340,7 +340,7 @@ files = [
[[package]]
name = "langchain-core"
version = "0.1.44"
version = "0.1.45"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.8.1,<4.0"
@@ -399,13 +399,13 @@ url = "../../standard-tests"
[[package]]
name = "langsmith"
version = "0.1.49"
version = "0.1.50"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
optional = false
python-versions = "<4.0,>=3.8.1"
files = [
{file = "langsmith-0.1.49-py3-none-any.whl", hash = "sha256:cf0db7474c0dfb22015c22bf97f62e850898c3c6af9564dd111c2df225acc1c8"},
{file = "langsmith-0.1.49.tar.gz", hash = "sha256:5aee8537763f9d62b3368d79d7bfef881e2bfaa28639011d8d7328770cbd6419"},
{file = "langsmith-0.1.50-py3-none-any.whl", hash = "sha256:a81e9809fcaa277bfb314d729e58116554f186d1478fcfdf553b1c2ccce54b85"},
{file = "langsmith-0.1.50.tar.gz", hash = "sha256:9fd22df8c689c044058536ea5af66f5302067e7551b60d7a335fede8d479572b"},
]
[package.dependencies]
@@ -548,13 +548,13 @@ files = [
[[package]]
name = "openai"
version = "1.23.1"
version = "1.23.3"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.7.1"
files = [
{file = "openai-1.23.1-py3-none-any.whl", hash = "sha256:7941c1bc6fcdb1b6b889dfcfabff775ca52558a79d57dd1b9e15b463de1b3a4c"},
{file = "openai-1.23.1.tar.gz", hash = "sha256:6df937e2a1ad64494951ea3614f5516db4d67c3fcc0b751b8e5edf1bc57e2d3d"},
{file = "openai-1.23.3-py3-none-any.whl", hash = "sha256:6eef764a8870095d256d59e6be243acf560a21227e9e3588b508972818929ef7"},
{file = "openai-1.23.3.tar.gz", hash = "sha256:6730b8468a0235e5f289dfdfacaa374001645099c4ad1740b58eab378bcf7723"},
]
[package.dependencies]
@@ -642,13 +642,13 @@ files = [
[[package]]
name = "pluggy"
version = "1.4.0"
version = "1.5.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.8"
files = [
{file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"},
{file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"},
{file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
{file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
]
[package.extras]
@@ -722,6 +722,64 @@ files = [
plugins = ["importlib-metadata"]
windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pymupdf"
version = "1.24.2"
description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
optional = false
python-versions = ">=3.8"
files = [
{file = "PyMuPDF-1.24.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:5faed2bbdfbea80db1bbaa5944888f27a672f2e10e16e61f7d4ff73429a00506"},
{file = "PyMuPDF-1.24.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:24c398e43a14e0e11f3515ea57875b5b0ee1a37d6dc59f921f69d8d16e881cb8"},
{file = "PyMuPDF-1.24.2-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:569336fe3c5f81f28aa9658861597e43e5716cbfa5b8d2602431095df76e0d7c"},
{file = "PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:8fe58a024629c23847423b3294f0f160c72c72f953af53d183bd3328f954593a"},
{file = "PyMuPDF-1.24.2-cp310-none-win32.whl", hash = "sha256:49224a558736303ed980252a704646fe9347c74bf70d0ad32530c62b8e0bfe33"},
{file = "PyMuPDF-1.24.2-cp310-none-win_amd64.whl", hash = "sha256:a32c94c7ee45f2bfee766e5b957bdfe08c96b21fd9adbfb546c141621af0ca85"},
{file = "PyMuPDF-1.24.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:815d9e10faa43a149d8c9928d7cefda83fd91a1f637dfb3a295620175a0af95c"},
{file = "PyMuPDF-1.24.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:b583add37141a9337935d014d4e1913b10e22d17f3fd656fdc5f0c0c2e65a33e"},
{file = "PyMuPDF-1.24.2-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:d6a4d4ad8cc698db25a31026311f03fd351c2db9bfd41d898494cd0baff3b679"},
{file = "PyMuPDF-1.24.2-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:7b5acb936203bdaef5945f211af8a5fb40f07059be1ba69a728425f6d522e60f"},
{file = "PyMuPDF-1.24.2-cp311-none-win32.whl", hash = "sha256:d01d348a35438f8a1647334428ef23c6d947acae875fa61cac2be3a65b15e4f5"},
{file = "PyMuPDF-1.24.2-cp311-none-win_amd64.whl", hash = "sha256:909ab62c752be80c3c130a9774fc27fb863d26149ba880129e0a2cf0d53cebde"},
{file = "PyMuPDF-1.24.2-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:6a3c1f2e99a4ca43c97b1f43fdd1aed739910e25ee5bd7fe73cd4eaf59841ae3"},
{file = "PyMuPDF-1.24.2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:3effff62943ceebbbe32a08ce4aa9c8ed4fa18fd8a462cf6130c78818c47822d"},
{file = "PyMuPDF-1.24.2-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:f3964783bf81f2ec94db4f9fa536052be3b7457824c9e9d21edb91f3a489a377"},
{file = "PyMuPDF-1.24.2-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:04af266755d4c250b46a3311062aec36ea94cecc4470a53ab79d9de56e5a753d"},
{file = "PyMuPDF-1.24.2-cp312-none-win32.whl", hash = "sha256:3bd7bdda4c4e4f98989ce84a7b2c08033639a8be1b46fb064fdd65b20a7e7d03"},
{file = "PyMuPDF-1.24.2-cp312-none-win_amd64.whl", hash = "sha256:ec2544f35088b29730210decfb0bdb750e0c3d2652ee470897f6d2e4a6dc1950"},
{file = "PyMuPDF-1.24.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:d4fd3957fd507affbcae4536092cb3e3726e91d484be16972603c5cacae7848a"},
{file = "PyMuPDF-1.24.2-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:4290273dfcc58a2c0b1f207f5e25479b868f59e9ea6ac9241740506fa0c03c0a"},
{file = "PyMuPDF-1.24.2-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:8f52f27d1f5968b6dda4d803e7f5246626a45ab68f0626509a9e17fadcebfb69"},
{file = "PyMuPDF-1.24.2-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:db650840eb3efbdc97df94210d0400042c863b08348d67037495d221ec4e8b7f"},
{file = "PyMuPDF-1.24.2-cp38-none-win32.whl", hash = "sha256:423217223741f55f9bb7622475a94c2934495e8a843246c582c78f3680914a80"},
{file = "PyMuPDF-1.24.2-cp38-none-win_amd64.whl", hash = "sha256:ca493fbb91d81a43d68d3547194d0c86083db49d4dd98e8f41aa5a77a26ff8fe"},
{file = "PyMuPDF-1.24.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:9783b67f63e7f9b397f119de996ea8214498d163531b9371d8ea7e374cdd45cd"},
{file = "PyMuPDF-1.24.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:4db161926d636c0bff016ac7591edbe6b30712507079f7008cefb0fdf58055dc"},
{file = "PyMuPDF-1.24.2-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:537cc7bef86514a6fa248eeb14b588f51699388628372cf31bae7839283aa628"},
{file = "PyMuPDF-1.24.2-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:a124b360898d24b730fe3be0e0bca438789c1568ceaad854387eee1886bb788c"},
{file = "PyMuPDF-1.24.2-cp39-none-win32.whl", hash = "sha256:007586883fbc8acb900d46aa95520aaeb8943d05a956b26c54053ddb58dbdd5f"},
{file = "PyMuPDF-1.24.2-cp39-none-win_amd64.whl", hash = "sha256:d89cbb1a093dbf042f503f5c7fc368d0718a652418512a7a42a2965cba27713d"},
{file = "PyMuPDF-1.24.2.tar.gz", hash = "sha256:cdaca48b7677a0c1dc827413b90c8fe4517f789f74c6ac0fb47f6051368246bb"},
]
[package.dependencies]
PyMuPDFb = "1.24.1"
[[package]]
name = "pymupdfb"
version = "1.24.1"
description = "MuPDF shared libraries for PyMuPDF."
optional = false
python-versions = ">=3.8"
files = [
{file = "PyMuPDFb-1.24.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:37179e363bf69ce9be637937c5469957b96968341dabe3ce8f4b690a82e9ad92"},
{file = "PyMuPDFb-1.24.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:17444ea7d6897c27759880ad76af537d19779f901de82ae9548598a70f614558"},
{file = "PyMuPDFb-1.24.1-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:490f7fff4dbe362bc895cefdfc5030d712311d024d357a1388d64816eb215d34"},
{file = "PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0fbcc0d2a9ce79fa38eb4e8bb5c959b582f7a49938874e9f61d1a6f5eeb1e4b8"},
{file = "PyMuPDFb-1.24.1-py3-none-win32.whl", hash = "sha256:ae67736058882cdd9459810a4aae9ac2b2e89ac2e916cb5fefb0f651c9739e9e"},
{file = "PyMuPDFb-1.24.1-py3-none-win_amd64.whl", hash = "sha256:01c8b7f0ce9166310eb28c7aebcb8d5fe12a4bc082f9b00d580095eebeaf0af5"},
]
[[package]]
name = "pytest"
version = "7.4.4"
@@ -1270,4 +1328,4 @@ watchmedo = ["PyYAML (>=3.10)"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "98a8d67be9138240d5190eb4774b93f671fbd8069839ad239d005c753bdbae0d"
content-hash = "1bb654e8a4f60cca5f0562ade5477a2f2e852ed2a361c7e9162208fbeb445309"

View File

@@ -14,6 +14,8 @@ license = "MIT"
python = ">=3.8.1,<4.0"
langchain-core = "^0.1.44"
langchain-openai = "^0.1.3"
pymupdf = "^1.24.1"
requests = "^2.31.0"
[tool.poetry.group.test]
optional = true
@@ -50,6 +52,7 @@ ruff = "^0.1.5"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
types-requests = ">=2.31.0"
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.dev]

Binary file not shown.

View File

@@ -0,0 +1,17 @@
from langchain_upstage import GroundednessCheck
def test_langchain_upstage_groundedness_check() -> None:
    """Test Upstage Groundedness Check against the live API."""
    tool = GroundednessCheck()
    result = tool.run({"context": "foo bar", "query": "bar foo"})
    assert result in ("grounded", "notGrounded", "notSure")
async def test_langchain_upstage_groundedness_check_async() -> None:
    """Test Upstage Groundedness Check asynchronously against the live API."""
    tool = GroundednessCheck()
    result = await tool.arun({"context": "foo bar", "query": "bar foo"})
    assert result in ("grounded", "notGrounded", "notSure")

View File

@@ -0,0 +1,10 @@
import os
from langchain_upstage import GroundednessCheck
# Provide a dummy key so construction does not require real credentials.
os.environ["UPSTAGE_API_KEY"] = "foo"


def test_initialization() -> None:
    """Test Groundedness Check tool initialization."""
    # (Original docstring said "embedding model" — a copy-paste mistake; this
    # test constructs the GroundednessCheck tool.)
    GroundednessCheck()

View File

@@ -3,6 +3,9 @@ from langchain_upstage import __all__
# Public names the langchain_upstage package is expected to export.
EXPECTED_ALL = [
    "ChatUpstage",
    "UpstageEmbeddings",
    "UpstageLayoutAnalysisLoader",
    "UpstageLayoutAnalysisParser",
    "GroundednessCheck",
]

View File

@@ -0,0 +1,200 @@
from pathlib import Path
from typing import Any, Dict, get_args
from unittest.mock import MagicMock, Mock, patch
from langchain_upstage import UpstageLayoutAnalysisLoader
from langchain_upstage.layout_analysis import OutputType, SplitType
# Canned layout-analysis API response used by the mocked tests: one page with
# two elements (a header and a paragraph) plus document-level "html" and
# "text" renderings.
MOCK_RESPONSE_JSON: Dict[str, Any] = {
    "api": "1.0",
    "billed_pages": 1,
    "elements": [
        {
            "bounding_box": [
                {"x": 74, "y": 906},
                {"x": 148, "y": 906},
                {"x": 148, "y": 2338},
                {"x": 74, "y": 2338},
            ],
            "category": "header",
            "html": "2021arXiv:2103.15348v2",
            "id": 0,
            "page": 1,
            "text": "arXiv:2103.15348v2",
        },
        {
            "bounding_box": [
                {"x": 654, "y": 474},
                {"x": 1912, "y": 474},
                {"x": 1912, "y": 614},
                {"x": 654, "y": 614},
            ],
            "category": "paragraph",
            "html": "LayoutParser Toolkit",
            "id": 1,
            "page": 1,
            "text": "LayoutParser Toolkit",
        },
    ],
    "html": "<header id='0'>arXiv:2103.15348v2</header>"
    + "<p id='1'>LayoutParser Toolkit</p>",
    "mimetype": "multipart/form-data",
    "model": "layout-analyzer-0.1.0",
    "text": "arXiv:2103.15348v2LayoutParser Toolkit",
}
# Sample PDF shipped with the test suite.
EXAMPLE_PDF_PATH = Path(__file__).parent.parent / "examples/solar.pdf"
def test_initialization() -> None:
    """The loader can be constructed from a file path and an explicit API key."""
    UpstageLayoutAnalysisLoader(file_path=EXAMPLE_PDF_PATH, api_key="bar")
def test_layout_analysis_param() -> None:
    """Every (output_type, split) combination is stored on the loader as-is."""
    for out_type in get_args(OutputType):
        for split_type in get_args(SplitType):
            loader = UpstageLayoutAnalysisLoader(
                file_path=EXAMPLE_PDF_PATH,
                api_key="bar",
                output_type=out_type,
                split=split_type,
            )
            assert loader.file_path == EXAMPLE_PDF_PATH
            assert loader.api_key == "bar"
            assert loader.output_type == out_type
            assert loader.split == split_type
@patch("requests.post")
def test_none_split_text_output(mock_post: Mock) -> None:
    """With split="none" and text output, the whole file becomes one Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="none",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    doc = docs[0]
    assert doc.page_content == MOCK_RESPONSE_JSON["text"]
    assert doc.metadata["total_pages"] == 1
    assert doc.metadata["type"] == "text"
    assert doc.metadata["split"] == "none"
@patch("requests.post")
def test_element_split_text_output(mock_post: Mock) -> None:
    """With split="element" and text output, each element becomes a Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="element",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 2
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.page_content == element["text"]
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["id"] == element["id"]
        assert doc.metadata["type"] == "text"
        assert doc.metadata["split"] == "element"
@patch("requests.post")
def test_page_split_text_output(mock_post: Mock) -> None:
    """With split="page" and text output, elements are merged into per-page docs."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="page",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["type"] == "text"
        assert doc.metadata["split"] == "page"
@patch("requests.post")
def test_none_split_html_output(mock_post: Mock) -> None:
    """With split="none" and html output, the whole file becomes one Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="none",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    doc = docs[0]
    assert doc.page_content == MOCK_RESPONSE_JSON["html"]
    assert doc.metadata["total_pages"] == 1
    assert doc.metadata["type"] == "html"
    assert doc.metadata["split"] == "none"
@patch("requests.post")
def test_element_split_html_output(mock_post: Mock) -> None:
    """With split="element" and html output, each element becomes a Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="element",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 2
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.page_content == element["html"]
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["id"] == element["id"]
        assert doc.metadata["type"] == "html"
        assert doc.metadata["split"] == "element"
@patch("requests.post")
def test_page_split_html_output(mock_post: Mock) -> None:
    """With split="page" and html output, elements are merged into per-page docs."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="page",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["type"] == "html"
        assert doc.metadata["split"] == "page"