langchain[patch]: Mathpix PDF loader supports arbitrary extra params (#13950)

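- **Description:** Add an `extra_request_data` argument to `MathpixPDFLoader`
  so that callers can include arbitrary additional options in the Mathpix PDF
  API request. The extra entries are merged into the `options_json` payload
  alongside `conversion_formats`.
- **Issue:** #12773
- **Dependencies:** None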

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Chad Norvell 2023-11-29 02:12:32 -08:00 committed by GitHub
parent 9e2ae866c4
commit 1c4bfb8c5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -372,6 +372,7 @@ class MathpixPDFLoader(BasePDFLoader):
processed_file_format: str = "md",
max_wait_time_seconds: int = 500,
should_clean_pdf: bool = False,
extra_request_data: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> None:
"""Initialize with a file path.
@ -382,6 +383,7 @@ class MathpixPDFLoader(BasePDFLoader):
max_wait_time_seconds: a maximum time to wait for the response from
the server. Default is 500.
should_clean_pdf: a flag to clean the PDF file. Default is False.
extra_request_data: Additional request data.
**kwargs: additional keyword arguments.
"""
self.mathpix_api_key = get_from_dict_or_env(
@ -392,6 +394,9 @@ class MathpixPDFLoader(BasePDFLoader):
)
super().__init__(file_path, **kwargs)
self.processed_file_format = processed_file_format
self.extra_request_data = (
extra_request_data if extra_request_data is not None else {}
)
self.max_wait_time_seconds = max_wait_time_seconds
self.should_clean_pdf = should_clean_pdf
@ -405,7 +410,10 @@ class MathpixPDFLoader(BasePDFLoader):
@property
def data(self) -> dict:
    """Return the options payload for the Mathpix API request.

    The options always request conversion to ``self.processed_file_format``.
    Entries from ``self.extra_request_data`` are merged in afterwards, so a
    key that also appears there takes precedence over the default.

    Returns:
        A dict with a single ``"options_json"`` key whose value is the
        JSON-encoded options.
    """
    # Extras are unpacked last so they can add to, or override, the defaults.
    options = {
        "conversion_formats": {self.processed_file_format: True},
        **self.extra_request_data,
    }
    return {"options_json": json.dumps(options)}
def send_pdf(self) -> str: