mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 06:39:52 +00:00
Fixed the confluence loader ".csv" files loading issue (#7195)
- Description: Confluence sometimes serves CSV attachments with the media type "application/vnd.ms-excel". Loading these files through the xlrd library failed with a corrupted-file error. This is fixed by processing CSV files separately using pandas; Excel files are processed with xlrd just as before. - Dependencies: pandas, os, io --------- Co-authored-by: Chathura <chathurar@yaalalabs.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
b21c2f8704
commit
ec10787bc7
@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
return docx2txt.process(file_data)
|
return docx2txt.process(file_data)
|
||||||
|
|
||||||
def process_xls(self, link: str) -> str:
|
def process_xls(self, link: str) -> str:
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import xlrd # noqa: F401
|
import xlrd # noqa: F401
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
|
raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"`pandas` package not found, please run `pip install pandas`"
|
||||||
|
)
|
||||||
|
|
||||||
response = self.confluence.request(path=link, absolute=True)
|
response = self.confluence.request(path=link, absolute=True)
|
||||||
text = ""
|
text = ""
|
||||||
|
|
||||||
@ -604,14 +616,26 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
):
|
):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
workbook = xlrd.open_workbook(file_contents=response.content)
|
filename = os.path.basename(link)
|
||||||
for sheet in workbook.sheets():
|
# Get everything from the file extension to the end of the URL (query string included),
|
||||||
text += f"{sheet.name}:\n"
|
# Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
|
||||||
for row in range(sheet.nrows):
|
file_extension = os.path.splitext(filename)[1]
|
||||||
for col in range(sheet.ncols):
|
|
||||||
text += f"{sheet.cell_value(row, col)}\t"
|
if file_extension.startswith(
|
||||||
|
".csv"
|
||||||
|
): # if the extension found in the url is ".csv"
|
||||||
|
content_string = response.content.decode("utf-8")
|
||||||
|
df = pd.read_csv(io.StringIO(content_string))
|
||||||
|
text += df.to_string(index=False, header=False) + "\n\n"
|
||||||
|
else:
|
||||||
|
workbook = xlrd.open_workbook(file_contents=response.content)
|
||||||
|
for sheet in workbook.sheets():
|
||||||
|
text += f"{sheet.name}:\n"
|
||||||
|
for row in range(sheet.nrows):
|
||||||
|
for col in range(sheet.ncols):
|
||||||
|
text += f"{sheet.cell_value(row, col)}\t"
|
||||||
|
text += "\n"
|
||||||
text += "\n"
|
text += "\n"
|
||||||
text += "\n"
|
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user