Tweaks for Excel to Markdown conversion (#3022)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
Jared Van Bortel 2024-10-04 14:25:00 -04:00 committed by GitHub
parent dc82f883f8
commit b850e7c867
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 49 additions and 42 deletions

2
.gitmodules vendored
View File

@ -16,4 +16,4 @@
url = https://github.com/nomic-ai/DuckX.git url = https://github.com/nomic-ai/DuckX.git
[submodule "gpt4all-chat/deps/QXlsx"] [submodule "gpt4all-chat/deps/QXlsx"]
path = gpt4all-chat/deps/QXlsx path = gpt4all-chat/deps/QXlsx
url = https://github.com/QtExcel/QXlsx.git url = https://github.com/nomic-ai/QXlsx.git

View File

@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004)) - Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004))
- Simplify chatmodel to get rid of unnecessary field and bump chat version ([#3016](https://github.com/nomic-ai/gpt4all/pull/3016)) - Simplify chatmodel to get rid of unnecessary field and bump chat version ([#3016](https://github.com/nomic-ai/gpt4all/pull/3016))
- Allow ChatLLM to have direct access to ChatModel for restoring state from text ([#3018](https://github.com/nomic-ai/gpt4all/pull/3018)) - Allow ChatLLM to have direct access to ChatModel for restoring state from text ([#3018](https://github.com/nomic-ai/gpt4all/pull/3018))
- Improvements to XLSX conversion and UI fix ([#3022](https://github.com/nomic-ai/gpt4all/pull/3022))
### Fixed ### Fixed
- Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995)) - Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995))

@ -1 +1 @@
Subproject commit fda6b806e2ceebd81c01cdded07ae84c94f5879c Subproject commit 29e81b369128525749dcb6516195b6b062eda955

View File

@ -939,6 +939,7 @@ Rectangle {
Text { Text {
id: attachmentFileText id: attachmentFileText
width: 295
height: 40 height: 40
text: modelData.file text: modelData.file
color: theme.textColor color: theme.textColor
@ -947,6 +948,7 @@ Rectangle {
font.pixelSize: theme.fontSizeMedium font.pixelSize: theme.fontSizeMedium
font.bold: true font.bold: true
wrapMode: Text.WrapAnywhere wrapMode: Text.WrapAnywhere
elide: Qt.ElideRight
} }
} }
} }
@ -1971,6 +1973,7 @@ Rectangle {
Text { Text {
id: attachmentFileText2 id: attachmentFileText2
width: 265
height: 40 height: 40
text: model.file text: model.file
color: theme.textColor color: theme.textColor
@ -1979,6 +1982,7 @@ Rectangle {
font.pixelSize: theme.fontSizeMedium font.pixelSize: theme.fontSizeMedium
font.bold: true font.bold: true
wrapMode: Text.WrapAnywhere wrapMode: Text.WrapAnywhere
elide: Qt.ElideRight
} }
} }

View File

@ -10,8 +10,10 @@
#include <QDateTime> #include <QDateTime>
#include <QDebug> #include <QDebug>
#include <QList> #include <QList>
#include <QRegularExpression>
#include <QString> #include <QString>
#include <QStringList> #include <QStringList>
#include <QStringView>
#include <QVariant> #include <QVariant>
#include <QtGlobal> #include <QtGlobal>
#include <QtLogging> #include <QtLogging>
@ -33,7 +35,7 @@ static QString formatCellText(const QXlsx::Cell *cell)
if (cell->isDateTime()) { if (cell->isDateTime()) {
// Handle DateTime // Handle DateTime
QDateTime dateTime = cell->dateTime().toDateTime(); QDateTime dateTime = cell->dateTime().toDateTime();
cellText = dateTime.isValid() ? dateTime.toString("yyyy-MM-dd") : value.toString(); cellText = dateTime.isValid() ? dateTime.toString(QStringView(u"yyyy-MM-dd")) : value.toString();
} else { } else {
cellText = value.toString(); cellText = value.toString();
} }
@ -41,23 +43,32 @@ static QString formatCellText(const QXlsx::Cell *cell)
if (cellText.isEmpty()) if (cellText.isEmpty())
return QString(); return QString();
// Apply Markdown and HTML formatting based on font styles // Escape special characters
QString formattedText = cellText; static QRegularExpression special(
QStringLiteral(
if (format.fontBold() && format.fontItalic()) R"(()([\\`*_[\]<>()!|])|)" // special characters
formattedText = "***" + formattedText + "***"; R"(^(\s*)(#+(?:\s|$))|)" // headings
else if (format.fontBold()) R"(^(\s*[0-9])(\.(?:\s|$))|)" // ordered lists ("1. a")
formattedText = "**" + formattedText + "**"; R"(^(\s*)([+-](?:\s|$)))" // unordered lists ("- a")
else if (format.fontItalic()) ),
formattedText = "*" + formattedText + "*"; QRegularExpression::MultilineOption
);
cellText.replace(special, uR"(\1\\2)"_s);
cellText.replace(u'&', "&amp;"_L1);
cellText.replace(u'<', "&lt;"_L1);
cellText.replace(u'>', "&gt;"_L1);
// Apply Markdown formatting based on font styles
if (format.fontUnderline())
cellText = u"_%1_"_s.arg(cellText);
if (format.fontBold())
cellText = u"**%1**"_s.arg(cellText);
if (format.fontItalic())
cellText = u"*%1*"_s.arg(cellText);
if (format.fontStrikeOut()) if (format.fontStrikeOut())
formattedText = "~~" + formattedText + "~~"; cellText = u"~~%1~~"_s.arg(cellText);
// Escape pipe characters to prevent Markdown table issues return cellText;
formattedText.replace("|", "\\|");
return formattedText;
} }
static QString getCellValue(QXlsx::Worksheet *sheet, int row, int col) static QString getCellValue(QXlsx::Worksheet *sheet, int row, int col)
@ -124,44 +135,35 @@ QString XLSXToMD::toMarkdown(QIODevice *xlsxDevice)
if (firstRow > lastRow || firstCol > lastCol) { if (firstRow > lastRow || firstCol > lastCol) {
qWarning() << "Sheet" << sheetName << "is empty."; qWarning() << "Sheet" << sheetName << "is empty.";
markdown += "*No data available.*\n\n"; markdown += QStringView(u"*No data available.*\n\n");
continue; continue;
} }
// Assume the first row is the header auto appendRow = [&markdown](auto &list) { markdown += u"|%1|\n"_s.arg(list.join(u'|')); };
int headerRow = firstRow;
// Collect headers // Empty header
static QString header(u' ');
static QString separator(u'-');
QStringList headers; QStringList headers;
for (int col = firstCol; col <= lastCol; ++col) {
QString header = getCellValue(sheet, headerRow, col);
headers << header;
}
// Create Markdown header row
QString headerRowMarkdown = "|" + headers.join("|") + "|";
markdown += headerRowMarkdown + "\n";
// Create Markdown separator row
QStringList separators; QStringList separators;
for (int i = 0; i < headers.size(); ++i) for (int col = firstCol; col <= lastCol; ++col) {
separators << "---"; headers << header;
QString separatorRow = "|" + separators.join("|") + "|"; separators << separator;
markdown += separatorRow + "\n"; }
appendRow(headers);
appendRow(separators);
// Iterate through data rows (starting from the row after header) // Iterate through data rows
for (int row = headerRow + 1; row <= lastRow; ++row) { for (int row = firstRow; row <= lastRow; ++row) {
QStringList rowData; QStringList rowData;
for (int col = firstCol; col <= lastCol; ++col) { for (int col = firstCol; col <= lastCol; ++col) {
QString cellText = getCellValue(sheet, row, col); QString cellText = getCellValue(sheet, row, col);
rowData << cellText; rowData << (cellText.isEmpty() ? u" "_s : cellText);
} }
appendRow(rowData);
QString dataRow = "|" + rowData.join("|") + "|";
markdown += dataRow + "\n";
} }
markdown += "\n"; // Add an empty line between sheets markdown += u'\n'; // Add an empty line between sheets
} }
return markdown; return markdown;
} }