From 8a0951d934a3bb945a3d181f1368b7b69deb066a Mon Sep 17 00:00:00 2001
From: Chad Norvell <chadnorvellx@gmail.com>
Date: Sun, 3 Dec 2023 10:36:49 -0800
Subject: [PATCH] Fix Mathpix PDF loader integration (#13949)

- **Description:** Fixes the Mathpix PDF loader API integration.
Specifically, ensures that Mathpix auth headers are provided for every
request, and ensures that we recognize all errors that can occur during
a request. Also, the option to provide API keys as kwargs never actually
worked before, but now that's fixed too.
  - **Issue:** #11249
  - **Dependencies:** None
---
 .../langchain/document_loaders/pdf.py         | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py
index 6b5e1dbef95..e056d9caa8b 100644
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@@ -403,6 +403,11 @@ class MathpixPDFLoader(BasePDFLoader):
         self.mathpix_api_id = get_from_dict_or_env(
             kwargs, "mathpix_api_id", "MATHPIX_API_ID"
         )
+
+        # The base class isn't expecting these and doesn't collect **kwargs
+        kwargs.pop("mathpix_api_key", None)
+        kwargs.pop("mathpix_api_id", None)
+
         super().__init__(file_path, **kwargs)
         self.processed_file_format = processed_file_format
         self.extra_request_data = (
@@ -434,6 +439,8 @@ class MathpixPDFLoader(BasePDFLoader):
                 self.url, headers=self._mathpix_headers, files=files, data=self.data
             )
         response_data = response.json()
+        if "error" in response_data:
+            raise ValueError(f"Mathpix request failed: {response_data['error']}")
         if "pdf_id" in response_data:
             pdf_id = response_data["pdf_id"]
             return pdf_id
@@ -450,13 +457,21 @@ class MathpixPDFLoader(BasePDFLoader):
         """
         url = self.url + "/" + pdf_id
         for _ in range(0, self.max_wait_time_seconds, 5):
-            response = requests.get(url, headers=self.headers)
+            response = requests.get(url, headers=self._mathpix_headers)
             response_data = response.json()
+
+            # This indicates an error with the request (e.g. auth problems)
+            error = response_data.get("error", None)
+
+            if error is not None:
+                raise ValueError(f"Unable to retrieve PDF from Mathpix: {error}")
+
             status = response_data.get("status", None)
 
             if status == "completed":
                 return
             elif status == "error":
+                # This indicates an error with the PDF processing
                 raise ValueError("Unable to retrieve PDF from Mathpix")
             else:
                 print(f"Status: {status}, waiting for processing to complete")
@@ -466,7 +481,7 @@ class MathpixPDFLoader(BasePDFLoader):
     def get_processed_pdf(self, pdf_id: str) -> str:
         self.wait_for_processing(pdf_id)
         url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
-        response = requests.get(url, headers=self.headers)
+        response = requests.get(url, headers=self._mathpix_headers)
         return response.content.decode("utf-8")
 
     def clean_pdf(self, contents: str) -> str: