Fix review

2025-08-26 13:01:55 +00:00 · 2025-03-07 16:25:33 +01:00 · 2025-03-07 16:25:33 +01:00 · 496a933f68
commit 496a933f68
parent b76e9bd579
1 changed files with 14 additions and 20 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -1470,6 +1470,17 @@ class PDFPlumberParser(BaseBlobParser):

        return extract_from_images_with_rapidocr(images)

+_map_extract_tables: Dict[Literal["markdown", "html", None], str] = {
+    "markdown": "",
+    "html": "But, use html syntax for convert all tables. ",
+}
+_map_extract_images = {
+    RapidOCRBlobParser: "",
+    TesseractBlobParser: "",
+    LLMImageBlobParser: "If you come across a picture, "
+    "diagram or other illustration, "
+    "describe it. ",
+}

 class ZeroxPDFParser(BaseBlobParser):
    """Parse a blob from a PDF using `py-zerox` library.
@ -1520,25 +1531,8 @@ class ZeroxPDFParser(BaseBlobParser):
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    """
-
-    warnings.filterwarnings(
-        "ignore",
-        module=r"^pyzerox.models.modellitellm$",
-        message=r"\s*Custom system prompt was provided which.*",
-    )
    _warn_images_to_text = False
    _warn_creator = False
-    _map_extract_tables: Dict[Literal["markdown", "html", None], str] = {
-        "markdown": "",
-        "html": "But, use html syntax for convert all tables. ",
-    }
-    _map_extract_images = {
-        RapidOCRBlobParser: "",
-        TesseractBlobParser: "",
-        LLMImageBlobParser: "If you come across a picture, "
-        "diagram or other illustration, "
-        "describe it. ",
-    }
    _prompt = (
        "Convert the following PDF page to markdown. "
        "{prompt_tables}"
@ -1680,10 +1674,10 @@ class ZeroxPDFParser(BaseBlobParser):
            zerox_prompt = self.custom_system_prompt

            if not zerox_prompt and self.images_parser or self.extract_tables:
-                prompt_tables = ZeroxPDFParser._map_extract_tables[self.extract_tables]
+                prompt_tables = _map_extract_tables[self.extract_tables]
                clazz = self.images_parser.__class__
-                if clazz in ZeroxPDFParser._map_extract_images:
-                    prompt_images = ZeroxPDFParser._map_extract_images[clazz]
+                if clazz in _map_extract_images:
+                    prompt_images = _map_extract_images[clazz]
                else:
                    if not ZeroxPDFParser._warn_creator:
                        ZeroxPDFParser._warn_creator = True