Refactor PDFPlumber

2025-06-22 23:00:00 +00:00 · 2025-03-04 09:40:17 +01:00 · 2025-03-04 09:40:17 +01:00 · abf2909d43
commit abf2909d43
parent bd3a24f2d1
1 changed file with 1 addition and 11 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@ -1519,7 +1519,7 @@ class PDFPlumberParser(BaseBlobParser):

        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
            doc = pdfplumber.open(file_path, password=self.password)  # open document
-            from pdfplumber.utils import geometry  # import WordExctractor, TextMap
+            from pdfplumber.utils import geometry

            contents = []
            doc_metadata = doc.metadata | _purge_metadata(
@ -1595,13 +1595,6 @@ class PDFPlumberParser(BaseBlobParser):
                    )
                else:
                    contents.append(all_text)
-                # "tables_as_html": [self._convert_table_to_html(table)
-                #                    for
-                #                    table in tables_content],
-                # "images": images_content,
-                # tables_as_html.extend([self._convert_table(table)
-                #                        for
-                #                        table in tables_content])
            if self.mode == "single":
                yield Document(
                    page_content=self.pages_delimiter.join(contents),
@ -1671,13 +1664,11 @@ class PDFPlumberParser(BaseBlobParser):
        extract_wordmaps: list[Any] = []
        used_arrays = [False] * len(tables_bbox)
        for word, o in wordmap.tuples:
-            # print(f"  Try with '{word['text']}' ...")
            is_table = False
            word_bbox = geometry.obj_to_bbox(word)
            for i, table_bbox in enumerate(tables_bbox):
                if geometry.get_bbox_overlap(word_bbox, table_bbox):
                    # Find a world in a table
-                    # print("  Find in an array")
                    is_table = True
                    if not used_arrays[i]:
                        # First time I see a word in this array
@ -1691,7 +1682,6 @@ class PDFPlumberParser(BaseBlobParser):
                                    if k in kwargs
                                }
                            )
-                            # print(f"yield {new_textmap.to_string()}")
                            yield new_textmap.to_string()
                            extract_wordmaps.clear()
                        # and yield the table