From 861453c4d72eebd4dee423f8840b852482f57ce2 Mon Sep 17 00:00:00 2001 From: AT Date: Mon, 28 Oct 2024 13:32:16 -0400 Subject: [PATCH] Fixup docx parsing (#3140) Signed-off-by: Adam Treat --- gpt4all-chat/CHANGELOG.md | 1 + gpt4all-chat/src/database.cpp | 32 +++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/gpt4all-chat/CHANGELOG.md b/gpt4all-chat/CHANGELOG.md index 44c7aa80..bf34d424 100644 --- a/gpt4all-chat/CHANGELOG.md +++ b/gpt4all-chat/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Fixed - Fix bug in GUI when localdocs encounters binary data ([#3137](https://github.com/nomic-ai/gpt4all/pull/3137)) +- Fix LocalDocs bugs that prevented some docx files from fully chunking ([#3140](https://github.com/nomic-ai/gpt4all/pull/3140)) ## [3.4.2] - 2024-10-16 diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index 5519c31e..5ea2fd17 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -1208,11 +1208,14 @@ protected: qsizetype wordEnd = wordStart + 1; while (wordEnd >= m_buffer.size() || !m_buffer[wordEnd].isSpace()) { if (wordEnd >= m_buffer.size() && !fillBuffer()) - return std::nullopt; + break; if (!m_buffer[wordEnd].isSpace()) ++wordEnd; } + if (wordStart == wordEnd) + return std::nullopt; + auto size = wordEnd - wordStart; QString word = std::move(m_buffer); m_buffer = word.sliced(wordStart + size); @@ -1220,7 +1223,6 @@ protected: word.resize(size); else word = word.sliced(wordStart, size); - return word; } @@ -1232,18 +1234,30 @@ protected: // try next paragraph if (!m_paragraph->has_next()) return false; + m_paragraph->next(); m_buffer += u'\n'; } + + bool foundText = false; auto &run = m_run->get_node(); - const char *text = run.child("w:t").text().get(); - if (!*text && run.child("w:tab")) - text = "\t"; - m_run->next(); - if (*text) { - m_buffer += QUtf8StringView(text); - return true; + for (auto node = run.first_child(); node; node = node.next_sibling()) { + std::string node_name = node.name(); + if (node_name == "w:t") { + const char *text = node.text().get(); + if (*text) { + foundText = true; + m_buffer += QUtf8StringView(text); + } + } else if (node_name == "w:br") { + m_buffer += u'\n'; + } else if (node_name == "w:tab") { + m_buffer += u'\t'; + } } + + m_run->next(); + if (foundText) return true; } }