From abf2909d436472ea7137602f7f66aee85d383896 Mon Sep 17 00:00:00 2001
From: Philippe Prados <github@prados.fr>
Date: Tue, 4 Mar 2025 09:40:17 +0100
Subject: [PATCH] Refactor PDFPlumber

---
 .../document_loaders/parsers/pdf.py                  | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 83ff2bf3a19..a518038e82c 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -1519,7 +1519,7 @@ class PDFPlumberParser(BaseBlobParser):
 
         with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
             doc = pdfplumber.open(file_path, password=self.password)  # open document
-            from pdfplumber.utils import geometry  # import WordExctractor, TextMap
+            from pdfplumber.utils import geometry
 
             contents = []
             doc_metadata = doc.metadata | _purge_metadata(
@@ -1595,13 +1595,6 @@ class PDFPlumberParser(BaseBlobParser):
                     )
                 else:
                     contents.append(all_text)
-                # "tables_as_html": [self._convert_table_to_html(table)
-                #                    for
-                #                    table in tables_content],
-                # "images": images_content,
-                # tables_as_html.extend([self._convert_table(table)
-                #                        for
-                #                        table in tables_content])
             if self.mode == "single":
                 yield Document(
                     page_content=self.pages_delimiter.join(contents),
@@ -1671,13 +1664,11 @@ class PDFPlumberParser(BaseBlobParser):
         extract_wordmaps: list[Any] = []
         used_arrays = [False] * len(tables_bbox)
         for word, o in wordmap.tuples:
-            # print(f"  Try with '{word['text']}' ...")
             is_table = False
             word_bbox = geometry.obj_to_bbox(word)
             for i, table_bbox in enumerate(tables_bbox):
                 if geometry.get_bbox_overlap(word_bbox, table_bbox):
                     # Find a world in a table
-                    # print("  Find in an array")
                     is_table = True
                     if not used_arrays[i]:
                         # First time I see a word in this array
@@ -1691,7 +1682,6 @@ class PDFPlumberParser(BaseBlobParser):
                                     if k in kwargs
                                 }
                             )
-                            # print(f"yield {new_textmap.to_string()}")
                             yield new_textmap.to_string()
                             extract_wordmaps.clear()
                         # and yield the table