upstage: Add Upstage partner package LA and GC (#20651)

---------

Co-authored-by: Sean <chosh0615@gmail.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Sean Cho <sean@upstage.ai>
This commit is contained in:
junkeon
2024-04-25 07:17:20 +09:00
committed by GitHub
parent 5ecebf168c
commit c8fd51e8c8
18 changed files with 1471 additions and 19 deletions

View File

@@ -1,4 +1,13 @@
from langchain_upstage.chat_models import ChatUpstage
from langchain_upstage.embeddings import UpstageEmbeddings
from langchain_upstage.layout_analysis import UpstageLayoutAnalysisLoader
from langchain_upstage.layout_analysis_parsers import UpstageLayoutAnalysisParser
from langchain_upstage.tools.groundedness_check import GroundednessCheck
# Public API of the langchain_upstage package.
# (The original text contained a stale, immediately-overwritten first
# assignment of __all__; only the final, complete list is kept.)
__all__ = [
    "ChatUpstage",
    "UpstageEmbeddings",
    "UpstageLayoutAnalysisLoader",
    "UpstageLayoutAnalysisParser",
    "GroundednessCheck",
]

View File

@@ -0,0 +1,190 @@
import os
from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union
from langchain_core.document_loaders import BaseLoader, Blob
from langchain_core.documents import Document
from .layout_analysis_parsers import UpstageLayoutAnalysisParser
# Number of pages submitted to the layout-analysis API per request when batching.
DEFAULT_PAGE_BATCH_SIZE = 10
# Output format for parsed content: plain text or HTML markup.
OutputType = Literal["text", "html"]
# Split granularity of the resulting Documents: whole file, per element, or per page.
SplitType = Literal["none", "element", "page"]
def validate_api_key(api_key: str) -> None:
    """Ensure a non-empty API key was supplied.

    Args:
        api_key (str): The API key to check.

    Raises:
        ValueError: If the key is empty or ``None``.
    """
    if api_key:
        return
    raise ValueError("API Key is required for Upstage Document Loader")
def validate_file_path(file_path: Union[str, Path, List[str], List[Path]]) -> None:
    """Check that the given path — or each path in a list — exists on disk.

    Args:
        file_path (Union[str, Path, List[str], List[Path]]): The file path(s)
            to be validated.

    Raises:
        FileNotFoundError: If the file or any of the files in the list do not
            exist.
    """
    candidates = file_path if isinstance(file_path, list) else [file_path]
    for candidate in candidates:
        if not os.path.exists(candidate):
            raise FileNotFoundError(f"File not found: {candidate}")
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a configuration value.

    Resolution order: explicit ``param``, then a non-empty environment
    variable ``env_key``, then ``default``. Raises when nothing matches.
    """
    if param is not None:
        return param
    if env_key:
        env_value = os.environ.get(env_key)
        if env_value:
            return env_value
    if default is not None:
        return default
    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageLayoutAnalysisLoader(BaseLoader):
    """Upstage Layout Analysis document loader.

    To use, you should have the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisLoader

            file_path = "/PATH/TO/YOUR/FILE.pdf"
            loader = UpstageLayoutAnalysisLoader(
                file_path, split="page", output_type="text"
            )
    """

    def __init__(
        self,
        file_path: Union[str, Path, List[str], List[Path]],
        output_type: Union[OutputType, dict] = "text",
        split: SplitType = "none",
        api_key: Optional[str] = None,
        use_ocr: bool = False,
    ):
        """
        Initializes an instance of the Upstage document loader.

        Args:
            file_path (Union[str, Path, List[str], List[Path]]): The path(s) to
                the document(s) to be loaded.
            output_type (Union[OutputType, dict], optional): The type of output
                to be generated by the parser. Defaults to "text".
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            api_key (str, optional): The API key for accessing the Upstage API.
                Defaults to None, in which case it will be fetched from the
                environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`.
            use_ocr (bool, optional): Extract text from images in the document.
                Defaults to False. (Use text info in PDF file)

        Raises:
            FileNotFoundError: If any of the given paths does not exist.
            ValueError: If no API key could be resolved.
        """
        self.file_path = file_path
        self.output_type = output_type
        self.split = split
        self.api_key = get_from_param_or_env(
            "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY"
        )
        self.use_ocr = use_ocr

        validate_file_path(self.file_path)
        validate_api_key(self.api_key)

    def _create_parser(self) -> UpstageLayoutAnalysisParser:
        """Build a parser configured with this loader's settings.

        Extracted to remove the four identical constructor calls the original
        load()/lazy_load() implementations contained.
        """
        return UpstageLayoutAnalysisParser(
            self.api_key,
            split=self.split,
            output_type=self.output_type,
            use_ocr=self.use_ocr,
        )

    def load(self) -> List[Document]:
        """
        Loads and parses the document using the UpstageLayoutAnalysisParser.

        Returns:
            A list of Document objects representing the parsed layout analysis.
        """
        if isinstance(self.file_path, list):
            result: List[Document] = []
            for file_path in self.file_path:
                blob = Blob.from_path(file_path)
                result.extend(self._create_parser().lazy_parse(blob, is_batch=True))
            return result

        blob = Blob.from_path(self.file_path)
        return list(self._create_parser().lazy_parse(blob, is_batch=True))

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily loads and parses the document using the UpstageLayoutAnalysisParser.

        Returns:
            An iterator of Document objects representing the parsed layout analysis.
        """
        if isinstance(self.file_path, list):
            for file_path in self.file_path:
                blob = Blob.from_path(file_path)
                yield from self._create_parser().lazy_parse(blob, is_batch=True)
        else:
            blob = Blob.from_path(self.file_path)
            # NOTE(review): unlike load(), the single-file path parses without
            # batching (is_batch defaults to False) — preserved from the original.
            yield from self._create_parser().lazy_parse(blob)

View File

@@ -0,0 +1,375 @@
import io
import json
import os
from typing import Dict, Iterator, List, Literal, Optional, Union
import fitz # type: ignore
import requests
from fitz import Document as fitzDocument
from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document
# Endpoint of the Upstage document-ai layout-analysis API.
LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis"
# Number of PDF pages sent to the API in a single request when batching.
DEFAULT_NUMBER_OF_PAGE = 10
# Output format for parsed content: plain text or HTML markup.
OutputType = Literal["text", "html"]
# Split granularity of the resulting Documents: whole file, per element, or per page.
SplitType = Literal["none", "element", "page"]
def validate_api_key(api_key: str) -> None:
    """Reject a missing or empty API key.

    Args:
        api_key (str): The API key to be validated.

    Raises:
        ValueError: If the API key is empty or ``None``.
    """
    if api_key:
        return
    raise ValueError("API Key is required for Upstage Document Loader")
def validate_file_path(file_path: str) -> None:
    """Raise unless a file exists at ``file_path``.

    Args:
        file_path (str): The path to the file.

    Raises:
        FileNotFoundError: If the file does not exist at the given file path.
    """
    if os.path.exists(file_path):
        return
    raise FileNotFoundError(f"File not found: {file_path}")
def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str:
    """
    Parse the output data based on the specified output type.

    Args:
        data (dict): The data to be parsed.
        output_type (Union[OutputType, dict]): The output type to parse the
            element data into. A dict maps element categories to the field
            that should be extracted; unmapped categories fall back to "text".

    Returns:
        str: The parsed output.

    Raises:
        ValueError: If the output type is invalid.
    """
    if isinstance(output_type, dict):
        # Per-category field selection with "text" as the fallback field.
        field = output_type.get(data["category"], "text")
        return data[field]
    if isinstance(output_type, str):
        if output_type == "text":
            return data["text"]
        if output_type == "html":
            return data["html"]
        raise ValueError(f"Invalid output type: {output_type}")
    raise ValueError(f"Invalid output type: {output_type}")
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Get a value from a param or an environment variable.

    Precedence: explicit ``param``, then a non-empty ``os.environ[env_key]``,
    then ``default``; raises ``ValueError`` when none is available.
    """
    if param is not None:
        return param

    from_env = os.environ.get(env_key) if env_key else None
    if from_env:
        return from_env

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageLayoutAnalysisParser(BaseBlobParser):
    """Upstage Layout Analysis Parser.

    To use, you should have the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisParser

            loader = UpstageLayoutAnalysisParser(split="page", output_type="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        output_type: Union[OutputType, dict] = "text",
        split: SplitType = "none",
        use_ocr: bool = False,
    ):
        """
        Initializes an instance of the Upstage layout-analysis parser.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                Defaults to None, in which case it will be fetched from the
                environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`.
            output_type (Union[OutputType, dict], optional): The type of output
                to be generated by the parser. Defaults to "text".
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            use_ocr (bool, optional): Extract text from images in the document.
                Defaults to False. (Use text info in PDF file)
        """
        self.api_key = get_from_param_or_env(
            "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY"
        )
        self.output_type = output_type
        self.split = split
        self.use_ocr = use_ocr

        validate_api_key(self.api_key)

    def _get_response(self, files: Dict) -> Dict:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the parsed JSON response.

        Args:
            files (dict): A dictionary containing the files to be sent in the
                request.

        Returns:
            dict: The JSON response from the API.

        Raises:
            ValueError: If the request fails or the response body is not valid
                JSON.
        """
        try:
            headers = {"Authorization": f"Bearer {self.api_key}"}
            options = {"ocr": self.use_ocr}
            response = requests.post(
                LAYOUT_ANALYSIS_URL, headers=headers, files=files, json=options
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as req_err:
            # BUGFIX: the original printed the error, swallowed it, and then
            # executed `return result`, raising UnboundLocalError because
            # `result` was never assigned. Propagate the failure instead.
            raise ValueError(
                f"Failed to send request to layout analysis API: {req_err}"
            ) from req_err
        except json.JSONDecodeError as json_err:
            raise ValueError(
                f"Failed to decode JSON response: {json_err}"
            ) from json_err

    def _split_and_request(
        self,
        full_docs: fitzDocument,
        start_page: int,
        num_pages: int = DEFAULT_NUMBER_OF_PAGE,
    ) -> Dict:
        """
        Splits the full pdf document into partial pages and sends a request to
        the server.

        Args:
            full_docs (fitzDocument): The full document to be split and requested.
            start_page (int): The starting page number for splitting the document.
            num_pages (int, optional): The number of pages to split the document
                into. Defaults to DEFAULT_NUMBER_OF_PAGE.

        Returns:
            response: The response from the server.
        """
        # Copy the requested page range into a fresh in-memory PDF.
        with fitz.open() as chunk_pdf:
            chunk_pdf.insert_pdf(
                full_docs,
                from_page=start_page,
                to_page=start_page + num_pages - 1,
            )
            pdf_bytes = chunk_pdf.write()

        with io.BytesIO(pdf_bytes) as f:
            response = self._get_response({"document": f})

        return response

    def _element_document(self, elements: Dict) -> Document:
        """
        Converts a single layout element into a Document object.

        Args:
            elements (Dict): The element to convert.

        Returns:
            Document: A Document holding the element's parsed content and
                metadata.
        """
        return Document(
            page_content=(parse_output(elements, self.output_type)),
            metadata={
                "page": elements["page"],
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
            },
        )

    def _page_document(self, elements: List[Dict]) -> List[Document]:
        """
        Combines elements with the same page number into a single Document
        object.

        Args:
            elements (List[Dict]): A list of elements containing page numbers.

        Returns:
            List[Document]: A list of Document objects, each representing a page
                with its content and metadata.
        """
        _docs = []
        # Group elements by page, preserving ascending page order.
        pages = sorted(set(map(lambda x: x["page"], elements)))

        page_group = [
            [element for element in elements if element["page"] == x] for x in pages
        ]

        for group in page_group:
            page_content = " ".join(
                [parse_output(element, self.output_type) for element in group]
            )

            _docs.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "page": group[0]["page"],
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            )

        return _docs

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the
        specified split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to send multiple pages per API
                request. Defaults to False (one page per request).

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If an invalid split type is provided.
        """
        if is_batch:
            num_pages = DEFAULT_NUMBER_OF_PAGE
        else:
            num_pages = 1

        full_docs = fitz.open(blob.path)
        number_of_pages = full_docs.page_count

        if self.split == "none":
            if full_docs.is_pdf:
                result = ""
                start_page = 0
                # With no splitting the whole file is concatenated anyway, so
                # always request the maximum batch size regardless of is_batch.
                num_pages = DEFAULT_NUMBER_OF_PAGE
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    result += parse_output(response, self.output_type)

                    start_page += num_pages
            else:
                # Non-PDF files are uploaded whole in a single request.
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")

                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})
                    result = parse_output(response, self.output_type)

            yield Document(
                page_content=result,
                metadata={
                    "total_pages": number_of_pages,
                    "type": self.output_type,
                    "split": self.split,
                },
            )

        elif self.split == "element":
            if full_docs.is_pdf:
                start_page = 0
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    for element in response["elements"]:
                        yield self._element_document(element)

                    start_page += num_pages
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")

                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})
                    for element in response["elements"]:
                        yield self._element_document(element)

        elif self.split == "page":
            if full_docs.is_pdf:
                start_page = 0
                for _ in range(number_of_pages):
                    if start_page >= number_of_pages:
                        break

                    response = self._split_and_request(full_docs, start_page, num_pages)
                    elements = response["elements"]
                    yield from self._page_document(elements)

                    start_page += num_pages
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")

                with open(blob.path, "rb") as f:
                    response = self._get_response({"document": f})
                    elements = response["elements"]
                    yield from self._page_document(elements)

        else:
            raise ValueError(f"Invalid split type: {self.split}")

View File

@@ -0,0 +1,91 @@
import os
from typing import Literal, Optional, Type, Union
from langchain_core.callbacks import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr
from langchain_core.tools import BaseTool
from langchain_upstage import ChatUpstage
class GroundednessCheckInput(BaseModel):
    """Input for the Groundedness Check tool."""

    # Reference text that the reply must be supported by.
    context: str = Field(description="context in which the answer should be verified")
    # The assistant reply (or arbitrary text) whose groundedness is checked.
    query: str = Field(
        description="assistant's reply or a text that is subject to groundedness check"
    )
class GroundednessCheck(BaseTool):
    """Tool that checks the groundedness of a context and an assistant message.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import GroundednessCheck

            tool = GroundednessCheck()
    """

    name: str = "groundedness_check"
    description: str = (
        "A tool that checks the groundedness of an assistant response "
        "to user-provided context. GroundednessCheck ensures that "
        "the assistants response is not only relevant but also "
        "precisely aligned with the user's initial context, "
        "promoting a more reliable and context-aware interaction. "
        "When using retrieval-augmented generation (RAG), "
        "the Groundedness Check can be used to determine whether "
        "the assistant's message is grounded in the provided context."
    )

    # API key for the Upstage API; resolved from UPSTAGE_API_KEY when not given.
    upstage_api_key: Optional[SecretStr] = Field(default=None, alias="api_key")
    # Chat model that performs the verification.
    api_wrapper: ChatUpstage

    args_schema: Type[BaseModel] = GroundednessCheckInput

    def __init__(self, upstage_api_key: Optional[SecretStr] = None):
        """Initialize the tool, resolving the API key from the argument or the
        `UPSTAGE_API_KEY` environment variable.

        Raises:
            ValueError: If no non-empty API key could be resolved.
        """
        # (The original contained a dead `else: upstage_api_key = upstage_api_key`
        # branch and a redundant triple emptiness check; both removed.)
        if not upstage_api_key:
            # An unset environment variable yields SecretStr("").
            upstage_api_key = SecretStr(os.getenv("UPSTAGE_API_KEY", ""))

        if not upstage_api_key.get_secret_value():
            raise ValueError("UPSTAGE_API_KEY must be set or passed")

        api_wrapper = ChatUpstage(
            model_name="solar-1-mini-answer-verification",
            upstage_api_key=upstage_api_key.get_secret_value(),
        )
        super().__init__(upstage_api_key=upstage_api_key, api_wrapper=api_wrapper)

    def _run(
        self,
        context: str,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> Union[str, Literal["grounded", "notGrounded", "notSure"]]:
        """Use the tool: ask the model whether `query` is grounded in `context`."""
        response = self.api_wrapper.invoke([HumanMessage(context), AIMessage(query)])
        return str(response.content)

    async def _arun(
        self,
        context: str,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> Union[str, Literal["grounded", "notGrounded", "notSure"]]:
        """Async variant of :meth:`_run`."""
        response = await self.api_wrapper.ainvoke(
            [HumanMessage(context), AIMessage(query)]
        )
        return str(response.content)

View File

@@ -223,13 +223,13 @@ test = ["pytest (>=6)"]
[[package]]
name = "freezegun"
version = "1.4.0"
version = "1.5.0"
description = "Let your Python tests travel through time"
optional = false
python-versions = ">=3.7"
files = [
{file = "freezegun-1.4.0-py3-none-any.whl", hash = "sha256:55e0fc3c84ebf0a96a5aa23ff8b53d70246479e9a68863f1fcac5a3e52f19dd6"},
{file = "freezegun-1.4.0.tar.gz", hash = "sha256:10939b0ba0ff5adaecf3b06a5c2f73071d9678e507c5eaedb23c761d56ac774b"},
{file = "freezegun-1.5.0-py3-none-any.whl", hash = "sha256:ec3f4ba030e34eb6cf7e1e257308aee2c60c3d038ff35996d7475760c9ff3719"},
{file = "freezegun-1.5.0.tar.gz", hash = "sha256:200a64359b363aa3653d8aac289584078386c7c3da77339d257e46a01fb5c77c"},
]
[package.dependencies]
@@ -340,7 +340,7 @@ files = [
[[package]]
name = "langchain-core"
version = "0.1.44"
version = "0.1.45"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.8.1,<4.0"
@@ -399,13 +399,13 @@ url = "../../standard-tests"
[[package]]
name = "langsmith"
version = "0.1.49"
version = "0.1.50"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
optional = false
python-versions = "<4.0,>=3.8.1"
files = [
{file = "langsmith-0.1.49-py3-none-any.whl", hash = "sha256:cf0db7474c0dfb22015c22bf97f62e850898c3c6af9564dd111c2df225acc1c8"},
{file = "langsmith-0.1.49.tar.gz", hash = "sha256:5aee8537763f9d62b3368d79d7bfef881e2bfaa28639011d8d7328770cbd6419"},
{file = "langsmith-0.1.50-py3-none-any.whl", hash = "sha256:a81e9809fcaa277bfb314d729e58116554f186d1478fcfdf553b1c2ccce54b85"},
{file = "langsmith-0.1.50.tar.gz", hash = "sha256:9fd22df8c689c044058536ea5af66f5302067e7551b60d7a335fede8d479572b"},
]
[package.dependencies]
@@ -548,13 +548,13 @@ files = [
[[package]]
name = "openai"
version = "1.23.1"
version = "1.23.3"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.7.1"
files = [
{file = "openai-1.23.1-py3-none-any.whl", hash = "sha256:7941c1bc6fcdb1b6b889dfcfabff775ca52558a79d57dd1b9e15b463de1b3a4c"},
{file = "openai-1.23.1.tar.gz", hash = "sha256:6df937e2a1ad64494951ea3614f5516db4d67c3fcc0b751b8e5edf1bc57e2d3d"},
{file = "openai-1.23.3-py3-none-any.whl", hash = "sha256:6eef764a8870095d256d59e6be243acf560a21227e9e3588b508972818929ef7"},
{file = "openai-1.23.3.tar.gz", hash = "sha256:6730b8468a0235e5f289dfdfacaa374001645099c4ad1740b58eab378bcf7723"},
]
[package.dependencies]
@@ -642,13 +642,13 @@ files = [
[[package]]
name = "pluggy"
version = "1.4.0"
version = "1.5.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.8"
files = [
{file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"},
{file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"},
{file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
{file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
]
[package.extras]
@@ -722,6 +722,64 @@ files = [
plugins = ["importlib-metadata"]
windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pymupdf"
version = "1.24.2"
description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
optional = false
python-versions = ">=3.8"
files = [
{file = "PyMuPDF-1.24.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:5faed2bbdfbea80db1bbaa5944888f27a672f2e10e16e61f7d4ff73429a00506"},
{file = "PyMuPDF-1.24.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:24c398e43a14e0e11f3515ea57875b5b0ee1a37d6dc59f921f69d8d16e881cb8"},
{file = "PyMuPDF-1.24.2-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:569336fe3c5f81f28aa9658861597e43e5716cbfa5b8d2602431095df76e0d7c"},
{file = "PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:8fe58a024629c23847423b3294f0f160c72c72f953af53d183bd3328f954593a"},
{file = "PyMuPDF-1.24.2-cp310-none-win32.whl", hash = "sha256:49224a558736303ed980252a704646fe9347c74bf70d0ad32530c62b8e0bfe33"},
{file = "PyMuPDF-1.24.2-cp310-none-win_amd64.whl", hash = "sha256:a32c94c7ee45f2bfee766e5b957bdfe08c96b21fd9adbfb546c141621af0ca85"},
{file = "PyMuPDF-1.24.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:815d9e10faa43a149d8c9928d7cefda83fd91a1f637dfb3a295620175a0af95c"},
{file = "PyMuPDF-1.24.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:b583add37141a9337935d014d4e1913b10e22d17f3fd656fdc5f0c0c2e65a33e"},
{file = "PyMuPDF-1.24.2-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:d6a4d4ad8cc698db25a31026311f03fd351c2db9bfd41d898494cd0baff3b679"},
{file = "PyMuPDF-1.24.2-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:7b5acb936203bdaef5945f211af8a5fb40f07059be1ba69a728425f6d522e60f"},
{file = "PyMuPDF-1.24.2-cp311-none-win32.whl", hash = "sha256:d01d348a35438f8a1647334428ef23c6d947acae875fa61cac2be3a65b15e4f5"},
{file = "PyMuPDF-1.24.2-cp311-none-win_amd64.whl", hash = "sha256:909ab62c752be80c3c130a9774fc27fb863d26149ba880129e0a2cf0d53cebde"},
{file = "PyMuPDF-1.24.2-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:6a3c1f2e99a4ca43c97b1f43fdd1aed739910e25ee5bd7fe73cd4eaf59841ae3"},
{file = "PyMuPDF-1.24.2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:3effff62943ceebbbe32a08ce4aa9c8ed4fa18fd8a462cf6130c78818c47822d"},
{file = "PyMuPDF-1.24.2-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:f3964783bf81f2ec94db4f9fa536052be3b7457824c9e9d21edb91f3a489a377"},
{file = "PyMuPDF-1.24.2-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:04af266755d4c250b46a3311062aec36ea94cecc4470a53ab79d9de56e5a753d"},
{file = "PyMuPDF-1.24.2-cp312-none-win32.whl", hash = "sha256:3bd7bdda4c4e4f98989ce84a7b2c08033639a8be1b46fb064fdd65b20a7e7d03"},
{file = "PyMuPDF-1.24.2-cp312-none-win_amd64.whl", hash = "sha256:ec2544f35088b29730210decfb0bdb750e0c3d2652ee470897f6d2e4a6dc1950"},
{file = "PyMuPDF-1.24.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:d4fd3957fd507affbcae4536092cb3e3726e91d484be16972603c5cacae7848a"},
{file = "PyMuPDF-1.24.2-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:4290273dfcc58a2c0b1f207f5e25479b868f59e9ea6ac9241740506fa0c03c0a"},
{file = "PyMuPDF-1.24.2-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:8f52f27d1f5968b6dda4d803e7f5246626a45ab68f0626509a9e17fadcebfb69"},
{file = "PyMuPDF-1.24.2-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:db650840eb3efbdc97df94210d0400042c863b08348d67037495d221ec4e8b7f"},
{file = "PyMuPDF-1.24.2-cp38-none-win32.whl", hash = "sha256:423217223741f55f9bb7622475a94c2934495e8a843246c582c78f3680914a80"},
{file = "PyMuPDF-1.24.2-cp38-none-win_amd64.whl", hash = "sha256:ca493fbb91d81a43d68d3547194d0c86083db49d4dd98e8f41aa5a77a26ff8fe"},
{file = "PyMuPDF-1.24.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:9783b67f63e7f9b397f119de996ea8214498d163531b9371d8ea7e374cdd45cd"},
{file = "PyMuPDF-1.24.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:4db161926d636c0bff016ac7591edbe6b30712507079f7008cefb0fdf58055dc"},
{file = "PyMuPDF-1.24.2-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:537cc7bef86514a6fa248eeb14b588f51699388628372cf31bae7839283aa628"},
{file = "PyMuPDF-1.24.2-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:a124b360898d24b730fe3be0e0bca438789c1568ceaad854387eee1886bb788c"},
{file = "PyMuPDF-1.24.2-cp39-none-win32.whl", hash = "sha256:007586883fbc8acb900d46aa95520aaeb8943d05a956b26c54053ddb58dbdd5f"},
{file = "PyMuPDF-1.24.2-cp39-none-win_amd64.whl", hash = "sha256:d89cbb1a093dbf042f503f5c7fc368d0718a652418512a7a42a2965cba27713d"},
{file = "PyMuPDF-1.24.2.tar.gz", hash = "sha256:cdaca48b7677a0c1dc827413b90c8fe4517f789f74c6ac0fb47f6051368246bb"},
]
[package.dependencies]
PyMuPDFb = "1.24.1"
[[package]]
name = "pymupdfb"
version = "1.24.1"
description = "MuPDF shared libraries for PyMuPDF."
optional = false
python-versions = ">=3.8"
files = [
{file = "PyMuPDFb-1.24.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:37179e363bf69ce9be637937c5469957b96968341dabe3ce8f4b690a82e9ad92"},
{file = "PyMuPDFb-1.24.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:17444ea7d6897c27759880ad76af537d19779f901de82ae9548598a70f614558"},
{file = "PyMuPDFb-1.24.1-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:490f7fff4dbe362bc895cefdfc5030d712311d024d357a1388d64816eb215d34"},
{file = "PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0fbcc0d2a9ce79fa38eb4e8bb5c959b582f7a49938874e9f61d1a6f5eeb1e4b8"},
{file = "PyMuPDFb-1.24.1-py3-none-win32.whl", hash = "sha256:ae67736058882cdd9459810a4aae9ac2b2e89ac2e916cb5fefb0f651c9739e9e"},
{file = "PyMuPDFb-1.24.1-py3-none-win_amd64.whl", hash = "sha256:01c8b7f0ce9166310eb28c7aebcb8d5fe12a4bc082f9b00d580095eebeaf0af5"},
]
[[package]]
name = "pytest"
version = "7.4.4"
@@ -1270,4 +1328,4 @@ watchmedo = ["PyYAML (>=3.10)"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "98a8d67be9138240d5190eb4774b93f671fbd8069839ad239d005c753bdbae0d"
content-hash = "1bb654e8a4f60cca5f0562ade5477a2f2e852ed2a361c7e9162208fbeb445309"

View File

@@ -14,6 +14,8 @@ license = "MIT"
python = ">=3.8.1,<4.0"
langchain-core = "^0.1.44"
langchain-openai = "^0.1.3"
pymupdf = "^1.24.1"
requests = "^2.31.0"
[tool.poetry.group.test]
optional = true
@@ -50,6 +52,7 @@ ruff = "^0.1.5"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
types-requests = ">=2.31.0"
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.dev]

Binary file not shown.

View File

@@ -0,0 +1,17 @@
from langchain_upstage import GroundednessCheck
def test_langchain_upstage_groundedness_check() -> None:
    """Test Upstage Groundedness Check against the live API."""
    tool = GroundednessCheck()
    result = tool.run({"context": "foo bar", "query": "bar foo"})
    assert result in ("grounded", "notGrounded", "notSure")
async def test_langchain_upstage_groundedness_check_async() -> None:
    """Test Upstage Groundedness Check asynchronously against the live API."""
    tool = GroundednessCheck()
    result = await tool.arun({"context": "foo bar", "query": "bar foo"})
    assert result in ("grounded", "notGrounded", "notSure")

View File

@@ -0,0 +1,10 @@
import os
from langchain_upstage import GroundednessCheck
# Provide a dummy key so construction does not require real credentials.
os.environ["UPSTAGE_API_KEY"] = "foo"


def test_initialization() -> None:
    """Test Groundedness Check tool initialization."""
    # (Original docstring said "embedding model" — a copy-paste mistake; this
    # test constructs the GroundednessCheck tool.)
    GroundednessCheck()

View File

@@ -3,6 +3,9 @@ from langchain_upstage import __all__
# Public names the langchain_upstage package is expected to export.
EXPECTED_ALL = [
    "ChatUpstage",
    "UpstageEmbeddings",
    "UpstageLayoutAnalysisLoader",
    "UpstageLayoutAnalysisParser",
    "GroundednessCheck",
]

View File

@@ -0,0 +1,200 @@
from pathlib import Path
from typing import Any, Dict, get_args
from unittest.mock import MagicMock, Mock, patch
from langchain_upstage import UpstageLayoutAnalysisLoader
from langchain_upstage.layout_analysis import OutputType, SplitType
# Canned layout-analysis API response used by the mocked tests: one page with
# two elements (a header and a paragraph) plus document-level "html" and
# "text" renderings.
MOCK_RESPONSE_JSON: Dict[str, Any] = {
    "api": "1.0",
    "billed_pages": 1,
    "elements": [
        {
            "bounding_box": [
                {"x": 74, "y": 906},
                {"x": 148, "y": 906},
                {"x": 148, "y": 2338},
                {"x": 74, "y": 2338},
            ],
            "category": "header",
            "html": "2021arXiv:2103.15348v2",
            "id": 0,
            "page": 1,
            "text": "arXiv:2103.15348v2",
        },
        {
            "bounding_box": [
                {"x": 654, "y": 474},
                {"x": 1912, "y": 474},
                {"x": 1912, "y": 614},
                {"x": 654, "y": 614},
            ],
            "category": "paragraph",
            "html": "LayoutParser Toolkit",
            "id": 1,
            "page": 1,
            "text": "LayoutParser Toolkit",
        },
    ],
    "html": "<header id='0'>arXiv:2103.15348v2</header>"
    + "<p id='1'>LayoutParser Toolkit</p>",
    "mimetype": "multipart/form-data",
    "model": "layout-analyzer-0.1.0",
    "text": "arXiv:2103.15348v2LayoutParser Toolkit",
}
# Sample PDF shipped with the test suite.
EXAMPLE_PDF_PATH = Path(__file__).parent.parent / "examples/solar.pdf"
def test_initialization() -> None:
    """The loader can be constructed from a file path and an explicit API key."""
    UpstageLayoutAnalysisLoader(file_path=EXAMPLE_PDF_PATH, api_key="bar")
def test_layout_analysis_param() -> None:
    """Every (output_type, split) combination is stored on the loader as-is."""
    for out_type in get_args(OutputType):
        for split_type in get_args(SplitType):
            loader = UpstageLayoutAnalysisLoader(
                file_path=EXAMPLE_PDF_PATH,
                api_key="bar",
                output_type=out_type,
                split=split_type,
            )
            assert loader.file_path == EXAMPLE_PDF_PATH
            assert loader.api_key == "bar"
            assert loader.output_type == out_type
            assert loader.split == split_type
@patch("requests.post")
def test_none_split_text_output(mock_post: Mock) -> None:
    """With split="none" and text output, the whole file becomes one Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="none",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    doc = docs[0]
    assert doc.page_content == MOCK_RESPONSE_JSON["text"]
    assert doc.metadata["total_pages"] == 1
    assert doc.metadata["type"] == "text"
    assert doc.metadata["split"] == "none"
@patch("requests.post")
def test_element_split_text_output(mock_post: Mock) -> None:
    """With split="element" and text output, each element becomes a Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="element",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 2
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.page_content == element["text"]
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["id"] == element["id"]
        assert doc.metadata["type"] == "text"
        assert doc.metadata["split"] == "element"
@patch("requests.post")
def test_page_split_text_output(mock_post: Mock) -> None:
    """With split="page" and text output, elements are merged into per-page docs."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="text",
        split="page",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["type"] == "text"
        assert doc.metadata["split"] == "page"
@patch("requests.post")
def test_none_split_html_output(mock_post: Mock) -> None:
    """With split="none" and html output, the whole file becomes one Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="none",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    doc = docs[0]
    assert doc.page_content == MOCK_RESPONSE_JSON["html"]
    assert doc.metadata["total_pages"] == 1
    assert doc.metadata["type"] == "html"
    assert doc.metadata["split"] == "none"
@patch("requests.post")
def test_element_split_html_output(mock_post: Mock) -> None:
    """With split="element" and html output, each element becomes a Document."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="element",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 2
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.page_content == element["html"]
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["id"] == element["id"]
        assert doc.metadata["type"] == "html"
        assert doc.metadata["split"] == "element"
@patch("requests.post")
def test_page_split_html_output(mock_post: Mock) -> None:
    """With split="page" and html output, elements are merged into per-page docs."""
    mock_post.return_value = MagicMock(
        status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON)
    )

    loader = UpstageLayoutAnalysisLoader(
        file_path=EXAMPLE_PDF_PATH,
        output_type="html",
        split="page",
        api_key="valid_api_key",
    )
    docs = loader.load()

    assert len(docs) == 1
    for doc, element in zip(docs, MOCK_RESPONSE_JSON["elements"]):
        assert doc.metadata["page"] == element["page"]
        assert doc.metadata["type"] == "html"
        assert doc.metadata["split"] == "page"