mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 20:58:25 +00:00
Fix PyMuPDFLoader kwargs (#11434)
- **Description:** Fix `PyMuPDFLoader` so that it accepts keyword arguments at initialization (for example, those supplied through the document loader's `loader_kwargs` option) and forwards them to the parser as text-extraction options. This provides more flexibility in formatting the output from documents.
- **Issue:** The `loader_kwargs` were not passed through to the `load` method by the document loader, which limited configuration options.
- **Dependencies:** None

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
e4a46747dc
commit
1655ff2ded
@ -298,7 +298,9 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
|||||||
class PyMuPDFLoader(BasePDFLoader):
|
class PyMuPDFLoader(BasePDFLoader):
|
||||||
"""Load `PDF` files using `PyMuPDF`."""
|
"""Load `PDF` files using `PyMuPDF`."""
|
||||||
|
|
||||||
def __init__(self, file_path: str, *, headers: Optional[Dict] = None) -> None:
|
def __init__(
|
||||||
|
self, file_path: str, *, headers: Optional[Dict] = None, **kwargs: Any
|
||||||
|
) -> None:
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
try:
|
try:
|
||||||
import fitz # noqa:F401
|
import fitz # noqa:F401
|
||||||
@ -307,13 +309,19 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
"`PyMuPDF` package not found, please install it with "
|
"`PyMuPDF` package not found, please install it with "
|
||||||
"`pip install pymupdf`"
|
"`pip install pymupdf`"
|
||||||
)
|
)
|
||||||
|
|
||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
|
self.text_kwargs = kwargs
|
||||||
|
|
||||||
def load(self, **kwargs: Optional[Any]) -> List[Document]:
|
def load(self, **kwargs: Any) -> List[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
|
if kwargs:
|
||||||
|
logger.warning(
|
||||||
|
f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
|
||||||
|
f" is deprecated. Please pass arguments during initialization instead."
|
||||||
|
)
|
||||||
|
|
||||||
parser = PyMuPDFParser(text_kwargs=kwargs)
|
text_kwargs = {**self.text_kwargs, **kwargs}
|
||||||
|
parser = PyMuPDFParser(text_kwargs=text_kwargs)
|
||||||
blob = Blob.from_path(self.file_path)
|
blob = Blob.from_path(self.file_path)
|
||||||
return parser.parse(blob)
|
return parser.parse(blob)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user