mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-07-07 04:20:59 +00:00
Fixup docx parsing (#3140)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
This commit is contained in:
parent
b19db6c20d
commit
861453c4d7
@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
||||
|
||||
### Fixed
|
||||
- Fix bug in GUI when LocalDocs encounters binary data ([#3137](https://github.com/nomic-ai/gpt4all/pull/3137))
|
||||
- Fix LocalDocs bugs that prevented some docx files from fully chunking ([#3140](https://github.com/nomic-ai/gpt4all/pull/3140))
|
||||
|
||||
## [3.4.2] - 2024-10-16
|
||||
|
||||
|
@ -1208,11 +1208,14 @@ protected:
|
||||
qsizetype wordEnd = wordStart + 1;
|
||||
while (wordEnd >= m_buffer.size() || !m_buffer[wordEnd].isSpace()) {
|
||||
if (wordEnd >= m_buffer.size() && !fillBuffer())
|
||||
return std::nullopt;
|
||||
break;
|
||||
if (!m_buffer[wordEnd].isSpace())
|
||||
++wordEnd;
|
||||
}
|
||||
|
||||
if (wordStart == wordEnd)
|
||||
return std::nullopt;
|
||||
|
||||
auto size = wordEnd - wordStart;
|
||||
QString word = std::move(m_buffer);
|
||||
m_buffer = word.sliced(wordStart + size);
|
||||
@ -1220,7 +1223,6 @@ protected:
|
||||
word.resize(size);
|
||||
else
|
||||
word = word.sliced(wordStart, size);
|
||||
|
||||
return word;
|
||||
}
|
||||
|
||||
@ -1232,18 +1234,30 @@ protected:
|
||||
// try next paragraph
|
||||
if (!m_paragraph->has_next())
|
||||
return false;
|
||||
|
||||
m_paragraph->next();
|
||||
m_buffer += u'\n';
|
||||
}
|
||||
|
||||
bool foundText = false;
|
||||
auto &run = m_run->get_node();
|
||||
const char *text = run.child("w:t").text().get();
|
||||
if (!*text && run.child("w:tab"))
|
||||
text = "\t";
|
||||
m_run->next();
|
||||
if (*text) {
|
||||
m_buffer += QUtf8StringView(text);
|
||||
return true;
|
||||
for (auto node = run.first_child(); node; node = node.next_sibling()) {
|
||||
std::string node_name = node.name();
|
||||
if (node_name == "w:t") {
|
||||
const char *text = node.text().get();
|
||||
if (*text) {
|
||||
foundText = true;
|
||||
m_buffer += QUtf8StringView(text);
|
||||
}
|
||||
} else if (node_name == "w:br") {
|
||||
m_buffer += u'\n';
|
||||
} else if (node_name == "w:tab") {
|
||||
m_buffer += u'\t';
|
||||
}
|
||||
}
|
||||
|
||||
m_run->next();
|
||||
if (foundText) return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user