Fixup docx parsing (#3140)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
This commit is contained in:
AT 2024-10-28 13:32:16 -04:00 committed by GitHub
parent b19db6c20d
commit 861453c4d7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 24 additions and 9 deletions

View File

@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
### Fixed
- Fix bug in GUI when localdocs encounters binary data ([#3137](https://github.com/nomic-ai/gpt4all/pull/3137))
- Fix LocalDocs bugs that prevented some docx files from fully chunking ([#3140](https://github.com/nomic-ai/gpt4all/pull/3140))
## [3.4.2] - 2024-10-16

View File

@ -1208,11 +1208,14 @@ protected:
qsizetype wordEnd = wordStart + 1; qsizetype wordEnd = wordStart + 1;
while (wordEnd >= m_buffer.size() || !m_buffer[wordEnd].isSpace()) { while (wordEnd >= m_buffer.size() || !m_buffer[wordEnd].isSpace()) {
if (wordEnd >= m_buffer.size() && !fillBuffer()) if (wordEnd >= m_buffer.size() && !fillBuffer())
return std::nullopt; break;
if (!m_buffer[wordEnd].isSpace()) if (!m_buffer[wordEnd].isSpace())
++wordEnd; ++wordEnd;
} }
if (wordStart == wordEnd)
return std::nullopt;
auto size = wordEnd - wordStart; auto size = wordEnd - wordStart;
QString word = std::move(m_buffer); QString word = std::move(m_buffer);
m_buffer = word.sliced(wordStart + size); m_buffer = word.sliced(wordStart + size);
@ -1220,7 +1223,6 @@ protected:
word.resize(size); word.resize(size);
else else
word = word.sliced(wordStart, size); word = word.sliced(wordStart, size);
return word; return word;
} }
@ -1232,18 +1234,30 @@ protected:
// try next paragraph // try next paragraph
if (!m_paragraph->has_next()) if (!m_paragraph->has_next())
return false; return false;
m_paragraph->next(); m_paragraph->next();
m_buffer += u'\n'; m_buffer += u'\n';
} }
bool foundText = false;
auto &run = m_run->get_node(); auto &run = m_run->get_node();
const char *text = run.child("w:t").text().get(); for (auto node = run.first_child(); node; node = node.next_sibling()) {
if (!*text && run.child("w:tab")) std::string node_name = node.name();
text = "\t"; if (node_name == "w:t") {
m_run->next(); const char *text = node.text().get();
if (*text) { if (*text) {
m_buffer += QUtf8StringView(text); foundText = true;
return true; m_buffer += QUtf8StringView(text);
}
} else if (node_name == "w:br") {
m_buffer += u'\n';
} else if (node_name == "w:tab") {
m_buffer += u'\t';
}
} }
m_run->next();
if (foundText) return true;
} }
} }