Fix review

This commit is contained in:
Philippe Prados 2025-03-05 14:47:37 +01:00
parent 0fd062fa6d
commit fa47539b60

View File

@ -1650,9 +1650,7 @@ class PDFPlumberParser(BaseBlobParser):
"keep_blank_chars": True, "keep_blank_chars": True,
# "use_text_flow": True, # "use_text_flow": True,
"presorted": True, "presorted": True,
"layout_bbox": kwargs.get("layout_bbox") "layout_bbox": kwargs.get("layout_bbox") or page.cropbox,
# or geometry.objects_to_bbox(page.chars),
or page.cropbox,
} }
) )
chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars
@ -1740,7 +1738,7 @@ class PDFPlumberParser(BaseBlobParser):
def _extract_tables_bbox_from_page( def _extract_tables_bbox_from_page(
self, self,
page: pdfplumber.page.Page, page: pdfplumber.page.Page,
) -> list[tuple]: ) -> list[tuple[float, float, float, float]]:
"""Extract bounding boxes of tables from a PDF page. """Extract bounding boxes of tables from a PDF page.
Args: Args:
@ -1805,15 +1803,13 @@ class PDFPlumberParser(BaseBlobParser):
Returns: Returns:
The table content as a string in CSV format. The table content as a string in CSV format.
Replace "\n" with " ".
""" """
if not table: if not table:
return "" return ""
output = ["\n\n"] output = ["\n\n"]
# skip first row in details if header is part of the table
# j = 0 if self.header.external else 1
# iterate over detail rows # iterate over detail rows
for row in table: for row in table:
line = "" line = ""
@ -1861,6 +1857,7 @@ class PDFPlumberParser(BaseBlobParser):
Returns: Returns:
The table content as a string in Markdown format. The table content as a string in Markdown format.
Replace "-" with "&#45;" and "\n" with " ".
""" """
clean = False clean = False
if not table: if not table:
@ -1871,7 +1868,6 @@ class PDFPlumberParser(BaseBlobParser):
output += "|" + "|".join("---" for i in range(col_count)) + "|\n" output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
# skip first row in details if header is part of the table # skip first row in details if header is part of the table
# j = 0 if self.header.external else 1
# iterate over detail rows # iterate over detail rows
for row in table: for row in table: