mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-08 14:05:16 +00:00
Fix Mathpix PDF loader integration (#13949)
- **Description:** Fixes the Mathpix PDF loader API integration. Specifically, it ensures that the Mathpix auth headers are provided with every request, and that all errors that can occur during a request are recognized. Also, the option to provide API keys as kwargs never actually worked before; that is now fixed too.
- **Issue:** #11249
- **Dependencies:** None
This commit is contained in:
parent
32d4bb4590
commit
8a0951d934
@ -403,6 +403,11 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
self.mathpix_api_id = get_from_dict_or_env(
|
self.mathpix_api_id = get_from_dict_or_env(
|
||||||
kwargs, "mathpix_api_id", "MATHPIX_API_ID"
|
kwargs, "mathpix_api_id", "MATHPIX_API_ID"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# The base class isn't expecting these and doesn't collect **kwargs
|
||||||
|
kwargs.pop("mathpix_api_key", None)
|
||||||
|
kwargs.pop("mathpix_api_id", None)
|
||||||
|
|
||||||
super().__init__(file_path, **kwargs)
|
super().__init__(file_path, **kwargs)
|
||||||
self.processed_file_format = processed_file_format
|
self.processed_file_format = processed_file_format
|
||||||
self.extra_request_data = (
|
self.extra_request_data = (
|
||||||
@ -434,6 +439,8 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
self.url, headers=self._mathpix_headers, files=files, data=self.data
|
self.url, headers=self._mathpix_headers, files=files, data=self.data
|
||||||
)
|
)
|
||||||
response_data = response.json()
|
response_data = response.json()
|
||||||
|
if "error" in response_data:
|
||||||
|
raise ValueError(f"Mathpix request failed: {response_data['error']}")
|
||||||
if "pdf_id" in response_data:
|
if "pdf_id" in response_data:
|
||||||
pdf_id = response_data["pdf_id"]
|
pdf_id = response_data["pdf_id"]
|
||||||
return pdf_id
|
return pdf_id
|
||||||
@ -450,13 +457,21 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
"""
|
"""
|
||||||
url = self.url + "/" + pdf_id
|
url = self.url + "/" + pdf_id
|
||||||
for _ in range(0, self.max_wait_time_seconds, 5):
|
for _ in range(0, self.max_wait_time_seconds, 5):
|
||||||
response = requests.get(url, headers=self.headers)
|
response = requests.get(url, headers=self._mathpix_headers)
|
||||||
response_data = response.json()
|
response_data = response.json()
|
||||||
|
|
||||||
|
# This indicates an error with the request (e.g. auth problems)
|
||||||
|
error = response_data.get("error", None)
|
||||||
|
|
||||||
|
if error is not None:
|
||||||
|
raise ValueError(f"Unable to retrieve PDF from Mathpix: {error}")
|
||||||
|
|
||||||
status = response_data.get("status", None)
|
status = response_data.get("status", None)
|
||||||
|
|
||||||
if status == "completed":
|
if status == "completed":
|
||||||
return
|
return
|
||||||
elif status == "error":
|
elif status == "error":
|
||||||
|
# This indicates an error with the PDF processing
|
||||||
raise ValueError("Unable to retrieve PDF from Mathpix")
|
raise ValueError("Unable to retrieve PDF from Mathpix")
|
||||||
else:
|
else:
|
||||||
print(f"Status: {status}, waiting for processing to complete")
|
print(f"Status: {status}, waiting for processing to complete")
|
||||||
@ -466,7 +481,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
def get_processed_pdf(self, pdf_id: str) -> str:
|
def get_processed_pdf(self, pdf_id: str) -> str:
|
||||||
self.wait_for_processing(pdf_id)
|
self.wait_for_processing(pdf_id)
|
||||||
url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
|
url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
|
||||||
response = requests.get(url, headers=self.headers)
|
response = requests.get(url, headers=self._mathpix_headers)
|
||||||
return response.content.decode("utf-8")
|
return response.content.decode("utf-8")
|
||||||
|
|
||||||
def clean_pdf(self, contents: str) -> str:
|
def clean_pdf(self, contents: str) -> str:
|
||||||
|
Loading…
Reference in New Issue
Block a user