mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-17 00:17:47 +00:00
Fix review
This commit is contained in:
parent
0fd062fa6d
commit
fa47539b60
@ -1650,9 +1650,7 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
"keep_blank_chars": True,
|
||||
# "use_text_flow": True,
|
||||
"presorted": True,
|
||||
"layout_bbox": kwargs.get("layout_bbox")
|
||||
# or geometry.objects_to_bbox(page.chars),
|
||||
or page.cropbox,
|
||||
"layout_bbox": kwargs.get("layout_bbox") or page.cropbox,
|
||||
}
|
||||
)
|
||||
chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars
|
||||
@ -1740,7 +1738,7 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
def _extract_tables_bbox_from_page(
|
||||
self,
|
||||
page: pdfplumber.page.Page,
|
||||
) -> list[tuple]:
|
||||
) -> list[tuple[float, float, float, float]]:
|
||||
"""Extract bounding boxes of tables from a PDF page.
|
||||
|
||||
Args:
|
||||
@ -1805,15 +1803,13 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
|
||||
Returns:
|
||||
The table content as a string in CSV format.
|
||||
Replace "\n" with " ".
|
||||
"""
|
||||
if not table:
|
||||
return ""
|
||||
|
||||
output = ["\n\n"]
|
||||
|
||||
# skip first row in details if header is part of the table
|
||||
# j = 0 if self.header.external else 1
|
||||
|
||||
# iterate over detail rows
|
||||
for row in table:
|
||||
line = ""
|
||||
@ -1861,6 +1857,7 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
|
||||
Returns:
|
||||
The table content as a string in Markdown format.
|
||||
Replace "-" with "&#45;" and "\n" with " ".
|
||||
"""
|
||||
clean = False
|
||||
if not table:
|
||||
@ -1871,7 +1868,6 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
|
||||
|
||||
# skip first row in details if header is part of the table
|
||||
# j = 0 if self.header.external else 1
|
||||
|
||||
# iterate over detail rows
|
||||
for row in table:
|
||||
|
Loading…
Reference in New Issue
Block a user