mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-17 00:17:47 +00:00
Fix review
This commit is contained in:
parent
0fd062fa6d
commit
fa47539b60
@ -1650,9 +1650,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
"keep_blank_chars": True,
|
"keep_blank_chars": True,
|
||||||
# "use_text_flow": True,
|
# "use_text_flow": True,
|
||||||
"presorted": True,
|
"presorted": True,
|
||||||
"layout_bbox": kwargs.get("layout_bbox")
|
"layout_bbox": kwargs.get("layout_bbox") or page.cropbox,
|
||||||
# or geometry.objects_to_bbox(page.chars),
|
|
||||||
or page.cropbox,
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars
|
chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars
|
||||||
@ -1740,7 +1738,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
def _extract_tables_bbox_from_page(
|
def _extract_tables_bbox_from_page(
|
||||||
self,
|
self,
|
||||||
page: pdfplumber.page.Page,
|
page: pdfplumber.page.Page,
|
||||||
) -> list[tuple]:
|
) -> list[tuple[float, float, float, float]]:
|
||||||
"""Extract bounding boxes of tables from a PDF page.
|
"""Extract bounding boxes of tables from a PDF page.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -1805,15 +1803,13 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The table content as a string in CSV format.
|
The table content as a string in CSV format.
|
||||||
|
Replace "\n" with " ".
|
||||||
"""
|
"""
|
||||||
if not table:
|
if not table:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
output = ["\n\n"]
|
output = ["\n\n"]
|
||||||
|
|
||||||
# skip first row in details if header is part of the table
|
|
||||||
# j = 0 if self.header.external else 1
|
|
||||||
|
|
||||||
# iterate over detail rows
|
# iterate over detail rows
|
||||||
for row in table:
|
for row in table:
|
||||||
line = ""
|
line = ""
|
||||||
@ -1861,6 +1857,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The table content as a string in Markdown format.
|
The table content as a string in Markdown format.
|
||||||
|
Replace "-" with "&#45;" and "\n" with " ".
|
||||||
"""
|
"""
|
||||||
clean = False
|
clean = False
|
||||||
if not table:
|
if not table:
|
||||||
@ -1871,7 +1868,6 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
|
output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
|
||||||
|
|
||||||
# skip first row in details if header is part of the table
|
# skip first row in details if header is part of the table
|
||||||
# j = 0 if self.header.external else 1
|
|
||||||
|
|
||||||
# iterate over detail rows
|
# iterate over detail rows
|
||||||
for row in table:
|
for row in table:
|
||||||
|
Loading…
Reference in New Issue
Block a user