Fixed the confluence loader ".csv" files loading issue (#7195)

- Description: Sometimes there are CSV attachments with the media type "application/vnd.ms-excel". These files fail to load via the xlrd library, which raises a corrupted-file error. I fixed this by processing CSV files separately using pandas; Excel files are still processed with xlrd, just like before.
- Dependencies: pandas, os, io

---------

Co-authored-by: Chathura <chathurar@yaalalabs.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-08-14 07:07:34 +00:00 · 2023-07-07 02:51:43 +05:30 · 2023-07-07 02:51:43 +05:30 · ec10787bc7
commit ec10787bc7
parent b21c2f8704
1 changed file with 31 additions and 7 deletions
--- a/langchain/document_loaders/confluence.py
+++ b/langchain/document_loaders/confluence.py
@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader):
        return docx2txt.process(file_data)
    def process_xls(self, link: str) -> str:
        import io
        import os
        try:
            import xlrd  # noqa: F401
        except ImportError:
            raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "`pandas` package not found, please run `pip install pandas`"
            )
        response = self.confluence.request(path=link, absolute=True)
        text = ""
@ -604,14 +616,26 @@ class ConfluenceLoader(BaseLoader):
        ):
            return text
-        workbook = xlrd.open_workbook(file_contents=response.content)
+        filename = os.path.basename(link)
-        for sheet in workbook.sheets():
+        # Getting the whole content of the url after filename,
-            text += f"{sheet.name}:\n"
+        # Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
-            for row in range(sheet.nrows):
+        file_extension = os.path.splitext(filename)[1]
-                for col in range(sheet.ncols):
+
-                    text += f"{sheet.cell_value(row, col)}\t"
+        if file_extension.startswith(
            ".csv"
        ):  # if the extension found in the url is ".csv"
            content_string = response.content.decode("utf-8")
            df = pd.read_csv(io.StringIO(content_string))
            text += df.to_string(index=False, header=False) + "\n\n"
        else:
            workbook = xlrd.open_workbook(file_contents=response.content)
            for sheet in workbook.sheets():
                text += f"{sheet.name}:\n"
                for row in range(sheet.nrows):
                    for col in range(sheet.ncols):
                        text += f"{sheet.cell_value(row, col)}\t"
                    text += "\n"
                text += "\n"
            text += "\n"
        return text