From 8a0951d934a3bb945a3d181f1368b7b69deb066a Mon Sep 17 00:00:00 2001 From: Chad Norvell Date: Sun, 3 Dec 2023 10:36:49 -0800 Subject: [PATCH] Fix Mathpix PDF loader integration (#13949) - **Description:** Fixes the Mathpix PDF loader API integration. Specifically, ensures that Mathpix auth headers are provided for every request, and ensures that we recognize all errors that can occur during a request. Also, the option to provide API keys as kwargs never actually worked before, but now that's fixed too. - **Issue:** #11249 - **Dependencies:** None --- .../langchain/document_loaders/pdf.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index 6b5e1dbef95..e056d9caa8b 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -403,6 +403,11 @@ class MathpixPDFLoader(BasePDFLoader): self.mathpix_api_id = get_from_dict_or_env( kwargs, "mathpix_api_id", "MATHPIX_API_ID" ) + + # The base class isn't expecting these and doesn't collect **kwargs + kwargs.pop("mathpix_api_key", None) + kwargs.pop("mathpix_api_id", None) + super().__init__(file_path, **kwargs) self.processed_file_format = processed_file_format self.extra_request_data = ( @@ -434,6 +439,8 @@ class MathpixPDFLoader(BasePDFLoader): self.url, headers=self._mathpix_headers, files=files, data=self.data ) response_data = response.json() + if "error" in response_data: + raise ValueError(f"Mathpix request failed: {response_data['error']}") if "pdf_id" in response_data: pdf_id = response_data["pdf_id"] return pdf_id @@ -450,13 +457,21 @@ class MathpixPDFLoader(BasePDFLoader): """ url = self.url + "/" + pdf_id for _ in range(0, self.max_wait_time_seconds, 5): - response = requests.get(url, headers=self.headers) + response = requests.get(url, headers=self._mathpix_headers) response_data = response.json() + + # This indicates an error with the request (e.g. auth problems) + error = response_data.get("error", None) + + if error is not None: + raise ValueError(f"Unable to retrieve PDF from Mathpix: {error}") + status = response_data.get("status", None) if status == "completed": return elif status == "error": + # This indicates an error with the PDF processing raise ValueError("Unable to retrieve PDF from Mathpix") else: print(f"Status: {status}, waiting for processing to complete") @@ -466,7 +481,7 @@ class MathpixPDFLoader(BasePDFLoader): def get_processed_pdf(self, pdf_id: str) -> str: self.wait_for_processing(pdf_id) url = f"{self.url}/{pdf_id}.{self.processed_file_format}" - response = requests.get(url, headers=self.headers) + response = requests.get(url, headers=self._mathpix_headers) return response.content.decode("utf-8") def clean_pdf(self, contents: str) -> str: