feat(KnowledgeBase): Add Word97-2003 (.doc) Binary File parsing module (#2544)
@@ -7,9 +7,12 @@
 "USERNAME": "${localEnv:USER}"
 },
 "options": [
-"--no-cache"
+"--no-cache",
+"--network=host"
 ]
 },
+"updateRemoteUserUID": false,
+"remoteUser": "${localEnv:USER}",
 "initializeCommand": ".devcontainer/init_env.sh",
 "name": "dbgpt",
 "workspaceFolder": "/app",
@@ -1,8 +1,9 @@
-FROM eosphorosai/dbgpt:latest
+FROM eosphorosai/dbgpt-full:latest
 ARG PYTHON_VERSION=3.11
-ARG PIP_INDEX_URL="https://mirrors.aliyun.com/pypi/simple"
+ARG PIP_INDEX_URL="https://pypi.tuna.tsinghua.edu.cn/simple"
 ARG USERNAME
-ARG DEFAULT_VEN=/opt/.uv.venv
+ARG EXTRAS="base,proxy_openai,rag,storage_chromadb, storage_elasticsearch,cuda121,hf,quant_bnb,dbgpts"
+ARG DEFAULT_VENV=/opt/.uv.venv
 WORKDIR /app
 COPY . .
 USER root
@@ -11,30 +12,38 @@ USER root
 # between the container user (root) and the host user,
 # and to resolve the issue of the host user lacking write permissions.
 RUN . .devcontainer/.env && \
-groupadd -g $USER_GID $USERNAME && \
+groupadd -g $USER_GID $GROUPNAME && \
 useradd -u $USER_UID -g $USER_GID -m $USERNAME && \
 chown -R $USER_UID:$USER_GID /app
 RUN apt-get update && apt-get install -y \
 git \
 curl \
 wget \
+python${PYTHON_VERSION}-dev \
+default-libmysqlclient-dev \
 ssh zsh autojump curl git-flow vim sudo \
 && python${PYTHON_VERSION} -m pip install --upgrade pip \
 && python${PYTHON_VERSION} -m pip install --upgrade pipx \
 && pipx install -i $PIP_INDEX_URL uv --global \
-&& chown -R $USERNAME:$USERNAME $DEFAULT_VEN \
+&& chown -R $USERNAME:$GROUPNAME $DEFAULT_VENV \
 && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \
 && chmod 0440 /etc/sudoers.d/$USERNAME
 USER $USERNAME
 ENV UV_LINK_MODE=copy \
 PIP_INDEX_URL=$PIP_INDEX_URL \
-VIRTUAL_ENV=$DEFAULT_VEN \
-UV_PROJECT_ENVIRONMENT=$DEFAULT_VEN \
-UV_PYTHON=$DEFAULT_VEN/bin/python3
+VIRTUAL_ENV=$DEFAULT_VENV \
+UV_PROJECT_ENVIRONMENT=$DEFAULT_VENV \
+UV_PYTHON=$DEFAULT_VENV/bin/python3 \
+UV_INDEX=$PIP_INDEX_URL \
+UV_DEFAULT_INDEX=$PIP_INDEX_URL

-RUN . $DEFAULT_VEN/bin/activate && \
-uv pip install --prefix $VIRTUAL_ENV -r pyproject.toml --all-extras --index-url=$PIP_INDEX_URL && \
-uv pip install --prefix $VIRTUAL_ENV -r requirements/dev-requirements.txt --index-url=$PIP_INDEX_URL && \
-uv pip install --prefix $VIRTUAL_ENV -r requirements/lint-requirements.txt --index-url=$PIP_INDEX_URL && \
+RUN sed -i "s|/app/\.venv|${FINAL_VENV_NAME}|g" /${DEFAULT_VENV}/bin/activate && \
+pip config set global.index-url $PIP_INDEX_URL && \
+pip config set global.trusted-host $(echo "$PIP_INDEX_URL" | sed -E 's|^https?://([^/]+).*|\1|') && \
+. $DEFAULT_VENV/bin/activate && \
+extras=$(echo $EXTRAS | tr ',' '\n' | while read extra; do echo "--extra $extra"; done | tr '\n' ' ') && \
+uv sync -v --active --all-packages $extras --default-index $PIP_INDEX_URL && \
+uv pip -v install --prefix $VIRTUAL_ENV -r requirements/dev-requirements.txt && \
+uv pip -v install --prefix $VIRTUAL_ENV -r requirements/lint-requirements.txt && \
 cp .devcontainer/dbgpt.pth /opt/.uv.venv/lib/python${PYTHON_VERSION}/site-packages/dbgpt.pth && \
 python -c "import dbgpt; print(dbgpt.__version__)"
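The reworked RUN step installs the project with `uv sync`, expanding the comma-separated `EXTRAS` build argument into repeated `--extra` flags. A minimal Python sketch of that shell transformation (an illustrative helper, not part of the Dockerfile):

```python
def expand_extras(extras: str) -> str:
    """Mimic the shell pipeline: turn "a,b,c" into "--extra a --extra b --extra c"."""
    return " ".join(f"--extra {e.strip()}" for e in extras.split(",") if e.strip())


# The stray space in the EXTRAS default is harmless because each item is stripped,
# just as `read` strips it in the shell pipeline.
print(expand_extras("base,proxy_openai,rag, storage_elasticsearch"))
# --extra base --extra proxy_openai --extra rag --extra storage_elasticsearch
```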
.devcontainer/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
# Developing inside a Container
Use VS Code's Dev Containers extension to build a containerized development environment, leveraging the eosphorosai/dbgpt:latest image to avoid repeated dependency installation and improve development efficiency.
NOTE: **Compatible with Linux and Windows Subsystem for Linux (WSL) environments only.**
# Setup

- Follow the guide [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers) to set up the Dev Container:
  - Install the **Dev Containers** extension.

- Before the first launch, execute the `.devcontainer/init_env.sh` script from the project root directory on the **host**.
- Create a `models` directory in the project root and download text2vec-large-chinese into `models/text2vec-large-chinese`.
- Use the shortcut `Ctrl+Shift+P` to open the command palette, then enter `Dev Containers: Open Folder in Container`.

# Develop
After the Dev Container has started successfully, open a terminal.

- Activate the virtual environment
```bash
. /opt/.uv.venv/bin/activate
```

- Customize the configuration file

You can copy the configuration file into the `.devcontainer` directory and rename it to `dev.toml` to avoid committing your personal configuration to the repository.
```bash
cp configs/dbgpt-app-config.example.toml .devcontainer/dev.toml
```

- Start the service

```bash
dbgpt start webserver --config .devcontainer/dev.toml
```

# Create A Pull Request

Please refer to [CONTRIBUTING.md](../CONTRIBUTING.md). Before running the make script or `git commit`, remember to deactivate the current virtual environment inside the development environment.
@@ -20,10 +20,19 @@ printf "OS=%s\nUSERNAME=%s\nUSER_UID=%s\nGROUPNAME=%s\nUSER_GID=%s\n" \

 # sharing-git-credentials see https://code.visualstudio.com/remote/advancedcontainers/sharing-git-credentials
 init_ssh_agent(){
+if [[ -z "$SSH_AUTH_SOCK" || ! -S "$SSH_AUTH_SOCK" ]]; then
+RUNNING_AGENT="$(ps -ax | grep '''ssh-agent -s''' | grep -v grep | wc -l)"
+if [ "$RUNNING_AGENT" = "0" ]; then
+ssh-agent -s &> $HOME/.ssh/ssh-agent
+fi
+eval $(cat $HOME/.ssh/ssh-agent) > /dev/null
+ssh-add 2> /dev/null
+echo $SSH_AUTH_SOCK
+fi
 # Define code block to insert (with unique identifier comment)
 SSH_AGENT_CODE='# SSH Agent Auto Management[ID:ssh_agent_v1]
-if [ -z "$SSH_AUTH_SOCK" ]; then
-RUNNING_AGENT="$(ps -ax | grep '\''ssh-agent -s'\'' | grep -v grep | wc -l | tr -d '\''[:space:]'\'')"
+if [[ -z "$SSH_AUTH_SOCK" || ! -S "$SSH_AUTH_SOCK" ]]; then
+RUNNING_AGENT="$(ps -ax | grep '\''ssh-agent -s'\'' | grep -v grep | wc -l)"
 if [ "$RUNNING_AGENT" = "0" ]; then
 ssh-agent -s &> $HOME/.ssh/ssh-agent
 fi
@@ -32,11 +41,7 @@ if [ -z "$SSH_AUTH_SOCK" ]; then
 fi
 # END_SSH_AGENT_CODE'

-# Auto detect shell type
 TARGET_FILE="$HOME/.bashrc"
-if [[ "$SHELL" == *"zsh"* ]]; then
-TARGET_FILE="$HOME/.zshrc"
-fi

 # Create .ssh directory if not exists
 mkdir -p "$HOME/.ssh"
@@ -45,6 +50,9 @@ mkdir -p "$HOME/.ssh"
 if ! grep -q 'END_SSH_AGENT_CODE' "$TARGET_FILE"; then
 echo "Adding SSH agent management code to ${TARGET_FILE}..."
 echo "$SSH_AGENT_CODE" >> "$TARGET_FILE"
+if [[ "$SHELL" == *"zsh"* ]]; then
+echo "$SSH_AGENT_CODE" >> "$HOME/.zshrc"
+fi
 echo "Code added successfully. Please run source ${TARGET_FILE} to apply changes immediately"
 else
 echo "Existing SSH agent code detected, no need to add again"
@@ -35,8 +35,8 @@ fi

 # Configuration section remains the same...
 # Apply custom configuration
-if [ -f /workspace/.devcontainer/zshrc-config ]; then
-cp /workspace/.devcontainer/zshrc-config ~/.zshrc
+if [ -f /app/.devcontainer/zshrc-config ]; then
+cp /app/.devcontainer/zshrc-config ~/.zshrc
 else
 # Generate basic .zshrc if no custom configuration exists
 cat << EOF >> ~/.zshrc
.gitignore (vendored)
@@ -186,4 +186,6 @@ thirdparty
 /examples/**/*.gv.pdf
 /i18n/locales/**/**/*_ai_translated.po
 /i18n/locales/**/**/*~
 configs/my
+.devcontainer/dev.toml
+test_docs
@@ -118,3 +118,6 @@ Write necessary information about your changes and click "Create pull request".

 That's it you made it 🐣⭐⭐

+# Developing inside a Container
+
+If you are using VS Code as your IDE for development, you can refer to the [configuration here](.devcontainer/README.md) to set up the Dev Containers development environment.
@@ -118,6 +118,8 @@ COPY . .
 # Fix the shebang of the dbgpt script
 RUN sed -i "s|^#\!/app/\.venv/bin/python[0-9.]*|#!/${FINAL_VENV_NAME}/bin/python${PYTHON_VERSION}|" /${FINAL_VENV_NAME}/bin/dbgpt
 RUN sed -i "s|^#\!/app/\.venv/bin/python[0-9.]*|#!/${FINAL_VENV_NAME}/bin/python${PYTHON_VERSION}|" /${FINAL_VENV_NAME}/bin/pip
+RUN sed -i "s|/app/\.venv|${FINAL_VENV_NAME}|g" /${FINAL_VENV_NAME}/bin/activate
+
 ENV PATH="${FINAL_VENV_NAME}/bin:$PATH" \
 VIRTUAL_ENV="${FINAL_VENV_NAME}"
 # Default command
@@ -27,6 +27,7 @@ class DocumentType(Enum):
     HTML = "html"
     DATASOURCE = "datasource"
     EXCEL = "xlsx"
+    DOC = "doc"


 class KnowledgeType(Enum):
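The new `DOC` member registers the legacy `.doc` suffix as its own document type. A small sanity check against the enum as extended here (the import path matches what the new `doc.py` module below uses):

```python
from dbgpt.rag.knowledge.base import DocumentType

# Member added by this commit; Word97DocKnowledge.document_type() returns it.
assert DocumentType.DOC.value == "doc"
# Neighbouring members shown in the hunk keep their values.
assert DocumentType.EXCEL.value == "xlsx"
assert DocumentType.DATASOURCE.value == "datasource"
```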
@@ -31,6 +31,7 @@ rag = [
 "bs4",
 "python-pptx",
 "python-docx",
+"olefile",
 "pypdf",
 "pdfplumber",
 ]
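`olefile` is the one new runtime dependency in the `rag` extra; the parser below uses it to open the OLE compound file and read the `WordDocument` and `0Table`/`1Table` streams. A minimal sketch of that access pattern, where "example.doc" is a placeholder path:

```python
import olefile

ole = olefile.OleFileIO("example.doc")  # placeholder path to a Word 97-2003 file
print(ole.listdir())                    # stream names, e.g. [['1Table'], ['WordDocument'], ...]
if ole.exists("WordDocument"):
    stream = ole.openstream("WordDocument")
    fib_base = stream.read(32)          # FibBase, the first structure read_fib() parses
    print(fib_base[:4].hex())
ole.close()
```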
@@ -19,6 +19,7 @@ def __getattr__(name: str):
     "CSVKnowledge": "csv",
     "DatasourceKnowledge": "datasource",
     "DocxKnowledge": "docx",
+    "Word97DocKnowledge": "doc",
     "HTMLKnowledge": "html",
     "MarkdownKnowledge": "markdown",
     "PDFKnowledge": "pdf",
@@ -43,6 +44,7 @@ __all__ = [
     "CSVKnowledge",
     "DatasourceKnowledge",
     "DocxKnowledge",
+    "Word97DocKnowledge",
     "HTMLKnowledge",
     "MarkdownKnowledge",
     "PDFKnowledge",
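With the lazy-import table and `__all__` updated, the new class resolves through the package's module-level `__getattr__` hook on first access. A short sketch of the resulting import, assuming this hunk is `dbgpt_ext/rag/knowledge/__init__.py` (the test at the end of this diff imports the sibling module `..doc`):

```python
# Resolved lazily through the "Word97DocKnowledge": "doc" mapping added above.
from dbgpt_ext.rag.knowledge import Word97DocKnowledge

knowledge = Word97DocKnowledge(file_path="example.doc")  # placeholder path
print(type(knowledge).__module__)  # expected: dbgpt_ext.rag.knowledge.doc
```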
packages/dbgpt-ext/src/dbgpt_ext/rag/knowledge/doc.py (new file, 627 lines)
@@ -0,0 +1,627 @@
import struct
from typing import Any, Dict, List, Optional, Union

import olefile

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
    ChunkStrategy,
    DocumentType,
    Knowledge,
    KnowledgeType,
)


class Word97DocParser:
    """Parser for Microsoft Word 97-2003 (.doc) binary files.

    This module implements a parser for the legacy Word 97-2003 binary format (.doc),
    based on the official Microsoft [MS-DOC] specification.

    Specification Reference:
        [MS-DOC]: Word (.doc) Binary File Format
        https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-doc/ccd7b486-7881-484c-a137-51170af7cc22

    Example:
        >>> from doc import Word97DocParser
        >>> with Word97DocParser("example.doc") as parser:
        ...     paragraphs = parser.extract_text_by_paragraphs()
        ...     for i, para in enumerate(paragraphs, 1):
        ...         print(f"\nParagraph {i}:")
        ...         print(para)
    """

    # Mapping of special ANSI characters to Unicode
    ANSI_TO_UNICODE = {
        0x82: 0x201A,
        0x83: 0x0192,
        0x84: 0x201E,
        0x85: 0x2026,
        0x86: 0x2020,
        0x87: 0x2021,
        0x88: 0x02C6,
        0x89: 0x2030,
        0x8A: 0x0160,
        0x8B: 0x2039,
        0x8C: 0x0152,
        0x91: 0x2018,
        0x92: 0x2019,
        0x93: 0x201C,
        0x94: 0x201D,
        0x95: 0x2022,
        0x96: 0x2013,
        0x97: 0x2014,
        0x98: 0x02DC,
        0x99: 0x2122,
        0x9A: 0x0161,
        0x9B: 0x203A,
        0x9C: 0x0153,
        0x9F: 0x0178,
    }

    # Mapping of nFib values to cbRgFcLcb sizes
    FIB_VERSIONS = {
        0x00C1: 0x005D,  # Word 97
        0x00D9: 0x006C,  # Word 2000
        0x0101: 0x0088,  # Word 2002
        0x010C: 0x00A4,  # Word 2003
        0x0112: 0x00B7,  # Word 2007
    }

    def __init__(self, doc_path):
        """Initialize with path to Word document"""
        self.doc_path = doc_path
        self.ole = None
        self.word_doc_stream = None
        self.table_stream = None
        self.fib_info = None
        self.plc_pcd = None

    def __enter__(self):
        """Context manager entry"""
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()

    def open(self):
        """Open the OLE file and required streams"""
        self.ole = olefile.OleFileIO(self.doc_path)

        if not self.ole.exists("WordDocument"):
            raise ValueError("WordDocument stream not found")

        self.word_doc_stream = self.ole.openstream("WordDocument")

        # Determine table stream name
        table_stream_name = "0Table" if self.ole.exists("0Table") else "1Table"
        self.table_stream = self.ole.openstream(table_stream_name)

    def close(self):
        """Close all open resources"""
        if self.ole:
            self.ole.close()
            self.ole = None
            self.word_doc_stream = None
            self.table_stream = None

    def read_fib(self):
        """Read the File Information Block (FIB) from the WordDocument stream (2.5)"""
        # Read FibBase (32 bytes)
        fib_base = self.word_doc_stream.read(32)

        # Unpack nFib (offset 2, size 2 bytes)
        nFib = struct.unpack("<H", fib_base[2:4])[0]
        if nFib not in self.FIB_VERSIONS:
            raise ValueError(f"Unsupported nFib version: 0x{nFib:04X}")

        # Skip csw (2 bytes at offset 32)
        self.word_doc_stream.read(2)

        # Skip FibRgW97 (28 bytes) and cslw (2 bytes)
        self.word_doc_stream.read(28 + 2)

        # Read FibRgLw97 (88 bytes)
        fib_rg_lw = self.word_doc_stream.read(88)
        ccpText = struct.unpack("<I", fib_rg_lw[12:16])[0]  # Total character count

        # Read cbRgFcLcb (2 bytes)
        cb_rg_fc_lcb = struct.unpack("<H", self.word_doc_stream.read(2))[0]

        # Read FibRgFcLcbBlob (variable size)
        fib_rg_fc_lcb_blob = self.word_doc_stream.read(cb_rg_fc_lcb * 8)

        # Skip cswNew (2 bytes) and FibRgCswNew (variable size)
        csw_new = struct.unpack("<H", self.word_doc_stream.read(2))[0]
        if csw_new > 0:
            self.word_doc_stream.read(csw_new * 2)

        # Extract fcClx and lcbClx from FibRgFcLcb97
        fc_clx = struct.unpack("<I", fib_rg_fc_lcb_blob[0x108:0x10C])[0]
        lcb_clx = struct.unpack("<I", fib_rg_fc_lcb_blob[0x10C:0x110])[0]
        fc_plcf_bte_papx = struct.unpack("<I", fib_rg_fc_lcb_blob[0x68:0x6C])[0]
        lcb_plcf_bte_papx = struct.unpack("<I", fib_rg_fc_lcb_blob[0x6C:0x70])[0]
        self.fib_info = {
            "nFib": nFib,
            "fcClx": fc_clx,
            "lcbClx": lcb_clx,
            "ccpText": ccpText,
            "fcPlcfBtePapx": fc_plcf_bte_papx,
            "lcbPlcfBtePapx": lcb_plcf_bte_papx,
        }
        return self.fib_info

    def read_clx(self, fc_clx, lcb_clx):
        """Read the CLX structure from the Table stream"""
        self.table_stream.seek(fc_clx)
        clx_data = self.table_stream.read(lcb_clx)

        # For simplicity, we assume the data starts with Pcdt (0x02)
        if clx_data[0] != 0x02:
            raise ValueError("Expected Pcdt structure not found in CLX data")

        return clx_data

    def parse_plc_pcd(self, pcdt_data):
        """Parse the PLC structure containing PCDs(2.9.177)"""
        # Get the size of PlcPcd structure
        lcb = struct.unpack("<I", pcdt_data[1:5])[0]
        plc_pcd_bytes = pcdt_data[5 : 5 + lcb]

        # Calculate number of PCDs: n = (lcb - 4) // 12
        n = (lcb - 4) // 12

        # Parse aCP array (n+1 CPs, each 4 bytes)
        aCP = [
            struct.unpack("<I", plc_pcd_bytes[i * 4 : (i + 1) * 4])[0]
            for i in range(n + 1)
        ]

        # Parse aPcd array (n PCDs, each 8 bytes)
        aPcd = []
        for i in range(n):
            start = (n + 1) * 4 + i * 8
            pcd_bytes = plc_pcd_bytes[start : start + 8]

            # Extract fc (bytes 2-6) and compression flag
            fc_bytes = pcd_bytes[2:6]
            fc = int.from_bytes(fc_bytes, byteorder="little", signed=False)
            fcompressed = (fc >> 1) & 0x1

            aPcd.append(
                {
                    "fc": fc,
                    "start": aCP[i],
                    "end": aCP[i + 1],
                    "f_compressed": fcompressed,
                }
            )
        self.plc_pcd = {"aCP": aCP, "aPcd": aPcd}
        return self.plc_pcd

    def _find_pcd_index(self, cp):
        """Find the index of the PCD containing the given character position"""
        aCP = self.plc_pcd["aCP"]
        for i in range(len(aCP) - 1):
            if aCP[i] <= cp < aCP[i + 1]:
                return i
        return None

    def get_paragraph_boundaries(self, cp):
        """Find paragraph boundaries for a given character position (2.4.2)"""
        if not self.fib_info or not self.plc_pcd:
            raise RuntimeError("Must read FIB and PLC/PCD first")

        # Find the PCD containing this cp
        i = self._find_pcd_index(cp)
        if i is None:
            return None

        pcd = self.plc_pcd["aPcd"][i]
        start_cp = self._find_paragraph_start(cp, i, pcd)
        end_cp = self._find_paragraph_end(cp, i, pcd)

        return (start_cp, end_cp)

    def _find_paragraph_start(self, cp, i, pcd):
        """Find the start of the paragraph containing cp (algorithm 2.4.2)"""
        # Step 3: Calculate fc and fc_pcd
        fc_pcd = pcd["fc"]
        # Let fcPcd be Pcd.fc.fc. Let fc be fcPcd + 2(cp – PlcPcd.aCp[i]).
        # If Pcd.fc.fCompressed is one, set fc to fc / 2, and set fcPcd to fcPcd/2.
        fc = fc_pcd + 2 * (cp - self.plc_pcd["aCP"][i])
        if pcd["f_compressed"]:
            fc = fc // 2
            fc_pcd = fc_pcd // 2

        # Step 4: Read PlcBtePapx
        self.table_stream.seek(self.fib_info["fcPlcfBtePapx"])
        plcf_bte_papx_data = self.table_stream.read(self.fib_info["lcbPlcfBtePapx"])
        a_fc, a_pn = self._parse_plcf_bte_papx(plcf_bte_papx_data)

        # Handle case where a_fc is empty
        if not a_fc:
            return None

        fc_last = a_fc[-1]

        # Step 4 continued: Check fcLast
        if fc_last <= fc:
            if fc_last < fc_pcd:
                # Step 8: Check if at beginning of document
                if self.plc_pcd["aCP"][i] == 0:
                    return 0
                # Step 9: Recurse with previous cp
                return self._find_paragraph_start(
                    self.plc_pcd["aCP"][i], i - 1, self.plc_pcd["aPcd"][i - 1]
                )
            # Adjust fc and fc_last if needed
            fc = fc_last
            if pcd["f_compressed"]:
                fc_last = fc_last // 2
            fc_first = fc_last
        else:
            # Step 5: Find largest j where a_fc[j] <= fc
            j = self._find_largest_index_le(a_fc, fc)
            if j is None:
                return None  # Invalid cp

            # Read PapxFkp
            papx_fkp = self._read_papx_fkp(a_pn[j])
            # print(f"papx_fkp:{papx_fkp}, j:{j}")
            if not papx_fkp or not papx_fkp.get("rgfc"):
                return None  # Invalid data

            # Step 6: Find largest k where rgfc[k] <= fc
            k = self._find_largest_index_le(papx_fkp["rgfc"], fc)
            if k is None:
                return None  # Invalid cp

            # Check if cp is outside document range
            if papx_fkp["rgfc"][-1] <= fc:
                return None

            fc_first = papx_fkp["rgfc"][k]

        # Step 7: Calculate paragraph start
        if fc_first > fc_pcd:
            dfc = fc_first - fc_pcd
            if not pcd["f_compressed"]:
                dfc = dfc // 2
            return self.plc_pcd["aCP"][i] + dfc

        # Step 8: Check if at beginning of document
        if self.plc_pcd["aCP"][i] == 0:
            return 0

        # Step 9: Recurse with previous cp
        return self._find_paragraph_start(
            self.plc_pcd["aCP"][i], i - 1, self.plc_pcd["aPcd"][i - 1]
        )

    def _find_paragraph_end(self, cp, i, pcd):
        """Find the end of the paragraph containing cp (algorithm 2.4.2)"""
        fc_pcd = pcd["fc"]
        fc = fc_pcd + 2 * (cp - self.plc_pcd["aCP"][i])
        fc_mac = fc_pcd + 2 * (self.plc_pcd["aCP"][i + 1] - self.plc_pcd["aCP"][i])

        if pcd["f_compressed"]:
            fc = fc // 2
            fc_pcd = fc_pcd // 2
            fc_mac = fc_mac // 2

        # Read PlcBtePapx
        self.table_stream.seek(self.fib_info["fcPlcfBtePapx"])
        plcf_bte_papx_data = self.table_stream.read(self.fib_info["lcbPlcfBtePapx"])
        a_fc, a_pn = self._parse_plcf_bte_papx(plcf_bte_papx_data)

        # Find largest j where a_fc[j] <= fc
        j = self._find_largest_index_le(a_fc, fc)
        if j is None or (a_fc and fc >= a_fc[-1]):
            return self._find_paragraph_end(
                self.plc_pcd["aCP"][i + 1], i + 1, self.plc_pcd["aPcd"][i + 1]
            )

        # Read PapxFkp
        papx_fkp = self._read_papx_fkp(a_pn[j])
        if not papx_fkp:
            return None

        # Find largest k where rgfc[k] <= fc
        k = self._find_largest_index_le(papx_fkp["rgfc"], fc)
        if k is None or (papx_fkp["rgfc"] and fc >= papx_fkp["rgfc"][-1]):
            return None

        fc_lim = papx_fkp["rgfc"][k + 1] if k + 1 < len(papx_fkp["rgfc"]) else fc_mac

        if fc_lim <= fc_mac:
            dfc = fc_lim - fc_pcd
            if not pcd["f_compressed"]:
                dfc = dfc // 2
            return self.plc_pcd["aCP"][i] + dfc - 1

        return self._find_paragraph_end(
            self.plc_pcd["aCP"][i + 1], i + 1, self.plc_pcd["aPcd"][i + 1]
        )

    def _parse_plcf_bte_papx(self, data):
        """Parse PlcBtePapx structure (2.8.6)

        Args:
            data: Raw bytes of PlcBtePapx structure.

        Returns:
            (a_fc, a_pn): Tuple of two lists:
                - a_fc: List of unsigned 4-byte integers (FC offsets).
                - a_pn: List of unsigned 4-byte integers (PnFkpPapx entries).

        Raises:
            ValueError: If data is malformed or aFC is not sorted/unique.
        """
        if len(data) < 12:  # Minimum: 8 (aFC[0..1]) + 4 (aPnBtePapx[0])
            return [], []

        # Calculate number of aPnBtePapx entries (n)
        n = (len(data) - 4) // 8
        if (2 * n + 1) * 4 != len(data):
            raise ValueError("Invalid PlcBtePapx size")

        a_fc = []
        a_pn = []

        # Parse aFC (n+1 entries, each 4 bytes)
        for i in range(n + 1):
            offset = i * 4
            fc = struct.unpack("<I", data[offset : offset + 4])[0]
            a_fc.append(fc)

        # Parse aPnBtePapx (n entries, each 4 bytes, starting after last aFC)
        pn_offset = (n + 1) * 4
        for i in range(n):
            offset = pn_offset + i * 4
            pn = struct.unpack("<I", data[offset : offset + 4])[0]
            a_pn.append(pn)

        # Validate aFC is strictly increasing (sorted and unique)
        for i in range(len(a_fc) - 1):
            if a_fc[i] >= a_fc[i + 1]:
                raise ValueError("aFC must be strictly increasing")

        return a_fc, a_pn

    def _read_papx_fkp(self, pn):
        """Read PapxFkp structure from WordDocument stream.

        Args:
            pn: Page number (PnFkpPapx), offset = pn * 512.

        Returns:
            Dict with keys:
                - "rgfc": List of FC offsets (4-byte unsigned integers).
                - "rgbx": List of BxPap (1-byte integers).
                - "papx_in_fkp": List of PapxInFkp raw bytes.

        Raises:
            ValueError: If FKP data is invalid.
        """
        offset = pn * 512
        self.word_doc_stream.seek(offset)
        fkp_data = self.word_doc_stream.read(512)

        if len(fkp_data) != 512:
            raise ValueError("FKP size must be 512 bytes")

        cpara = fkp_data[511]  # Number of paragraphs (1 ≤ cpara ≤ 0x1D)
        if not 1 <= cpara <= 0x1D:
            raise ValueError(f"Invalid cpara: {cpara} (must be 1 ≤ cpara ≤ 29)")

        # Parse rgfc (cpara + 1 entries, each 4 bytes)
        rgfc = []
        for i in range(cpara + 1):
            fc_offset = i * 4
            fc = struct.unpack("<I", fkp_data[fc_offset : fc_offset + 4])[0]
            rgfc.append(fc)

        # Parse rgbx (cpara entries, each 1 byte)
        rgbx_start = (cpara + 1) * 4
        rgbx = list(fkp_data[rgbx_start : rgbx_start + cpara])

        # Parse PapxInFkp (variable size, located after rgbx)
        papx_in_fkp_start = rgbx_start + cpara
        papx_in_fkp_end = 511  # cpara is the last byte
        papx_in_fkp = fkp_data[papx_in_fkp_start:papx_in_fkp_end]

        return {
            "rgfc": rgfc,
            "rgbx": rgbx,
            "papx_in_fkp": papx_in_fkp,
        }

    def _find_largest_index_le(self, array, value):
        """Find largest index where array[index] <= value"""
        for i in reversed(range(len(array))):
            if array[i] <= value:
                return i
        return None

    def extract_text_by_paragraphs(self):
        """Extract text organized by paragraphs"""
        self.word_doc_stream.seek(0)
        self.table_stream.seek(0)
        self.read_fib()
        clx_data = self.read_clx(self.fib_info["fcClx"], self.fib_info["lcbClx"])
        self.parse_plc_pcd(clx_data)

        paragraphs = []
        current_cp = 0
        total_chars = self.fib_info["ccpText"]

        while current_cp < total_chars:
            boundaries = self.get_paragraph_boundaries(current_cp)
            if not boundaries:
                break

            start, end = boundaries
            if start > end:
                break

            paragraph_text = self._extract_text_range(start, end)
            paragraphs.append(paragraph_text)
            current_cp = end + 1

        return paragraphs

    def _extract_text_range(self, start_cp, end_cp):
        """Extract text between two character positions"""
        text_chars = []
        i = self._find_pcd_index(start_cp)

        while i is not None and start_cp <= end_cp:
            pcd = self.plc_pcd["aPcd"][i]
            pcd_start = self.plc_pcd["aCP"][i]
            pcd_end = self.plc_pcd["aCP"][i + 1]

            # Determine range within this PCD
            range_start = max(start_cp, pcd_start)
            range_end = min(end_cp, pcd_end - 1)

            if range_start > range_end:
                i += 1
                continue

            fc = pcd["fc"]
            compressed = pcd["f_compressed"]

            for cp in range(range_start, range_end + 1):
                if compressed:
                    offset = fc + (cp - pcd_start)
                    self.word_doc_stream.seek(offset)
                    char_byte = self.word_doc_stream.read(1)
                    char_code = char_byte[0]
                    char = chr(self.ANSI_TO_UNICODE.get(char_code, char_code))
                else:
                    offset = fc + 2 * (cp - pcd_start)
                    self.word_doc_stream.seek(offset)
                    char_bytes = self.word_doc_stream.read(2)
                    char = char_bytes.decode("utf-16-le")

                text_chars.append(char)

            start_cp = range_end + 1
            i += 1

        return "".join(text_chars)

    def extract_text(self):
        """Main method to extract text from the document"""
        self.word_doc_stream.seek(0)
        self.table_stream.seek(0)
        fib_info = self.read_fib()
        clx_data = self.read_clx(fib_info["fcClx"], fib_info["lcbClx"])
        pcd_array = self.parse_plc_pcd(clx_data)

        full_text = []
        for pcd in pcd_array["aPcd"]:
            start_cp, end_cp = pcd["start"], pcd["end"]
            char_count = end_cp - start_cp

            if char_count == 0:
                continue

            fc = pcd["fc"]
            compressed = pcd["f_compressed"]
            text_chars = []

            for cp in range(start_cp, end_cp):
                offset = (
                    fc + (cp - start_cp) if compressed else fc + 2 * (cp - start_cp)
                )
                self.word_doc_stream.seek(offset)

                if compressed:
                    char_byte = self.word_doc_stream.read(1)
                    char_code = char_byte[0]
                    char = chr(self.ANSI_TO_UNICODE.get(char_code, char_code))
                else:
                    char_bytes = self.word_doc_stream.read(2)
                    # decode char
                    char = char_bytes.decode("utf-16-le")
                text_chars.append(char)
            full_text.append("".join(text_chars))

        return "".join(full_text)


class Word97DocKnowledge(Knowledge):
    """Microsoft Word 97-2003 (.doc)."""

    def __init__(
        self,
        file_path: Optional[str] = None,
        knowledge_type: Any = KnowledgeType.DOCUMENT,
        encoding: Optional[str] = "utf-16-le",
        loader: Optional[Any] = None,
        metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
        **kwargs: Any,
    ) -> None:
        """Create Microsoft Word 97-2003 (.doc) Knowledge with Knowledge arguments.

        Args:
            file_path(str, optional): file path
            knowledge_type(KnowledgeType, optional): knowledge type
            encoding(str, optional): .doc encoding
            loader(Any, optional): loader
        """
        super().__init__(
            path=file_path,
            knowledge_type=knowledge_type,
            data_loader=loader,
            metadata=metadata,
            **kwargs,
        )
        self._encoding = encoding

    def _load(self) -> List[Document]:
        """Load doc document from loader."""
        if self._loader:
            documents = self._loader.load()
        else:
            docs = []
            content = []
            with Word97DocParser(self._path) as parser:
                paragraphs = parser.extract_text_by_paragraphs()
                for i, para in enumerate(paragraphs):
                    content.append(para)

            metadata = {"source": self._path}
            if self._metadata:
                metadata.update(self._metadata)  # type: ignore
            docs.append(Document(content="\n".join(content), metadata=metadata))
            return docs
        return [Document.langchain2doc(lc_document) for lc_document in documents]

    @classmethod
    def support_chunk_strategy(cls) -> List[ChunkStrategy]:
        """Return support chunk strategy."""
        return [
            ChunkStrategy.CHUNK_BY_SIZE,
            ChunkStrategy.CHUNK_BY_PARAGRAPH,
            ChunkStrategy.CHUNK_BY_SEPARATOR,
        ]

    @classmethod
    def default_chunk_strategy(cls) -> ChunkStrategy:
        """Return default chunk strategy."""
        return ChunkStrategy.CHUNK_BY_SIZE

    @classmethod
    def type(cls) -> KnowledgeType:
        """Return knowledge type."""
        return KnowledgeType.DOCUMENT

    @classmethod
    def document_type(cls) -> DocumentType:
        """Return document type."""
        return DocumentType.DOC
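A short usage sketch for the new module, following the class docstring above and the unit test at the end of this diff; "example.doc" is a placeholder path:

```python
from dbgpt_ext.rag.knowledge.doc import Word97DocKnowledge, Word97DocParser

# Low level: stream paragraphs straight out of the binary file.
with Word97DocParser("example.doc") as parser:
    for i, para in enumerate(parser.extract_text_by_paragraphs(), 1):
        print(f"Paragraph {i}: {para!r}")

# High level: wrap the file as a Knowledge source and load it as Documents,
# the same way the unit test below exercises it.
knowledge = Word97DocKnowledge(file_path="example.doc")
documents = knowledge._load()
print(documents[0].metadata["source"])      # example.doc
print(documents[0].content[:80])            # start of the extracted text
print(knowledge.default_chunk_strategy())   # ChunkStrategy.CHUNK_BY_SIZE
```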
@@ -171,6 +171,7 @@ class KnowledgeFactory:

    from .csv import CSVKnowledge  # noqa: F401
    from .datasource import DatasourceKnowledge  # noqa: F401
+   from .doc import Word97DocKnowledge  # noqa: F401
    from .docx import DocxKnowledge  # noqa: F401
    from .excel import ExcelKnowledge  # noqa: F401
    from .html import HTMLKnowledge  # noqa: F401
Binary file not shown.
@@ -0,0 +1,20 @@
from pathlib import Path

import pytest

from ..doc import Word97DocKnowledge


@pytest.fixture
def mock_file_path():
    file_path = Path(__file__).parent / "data" / "test_mock.doc"
    return file_path.as_posix()


def test_load_from_docx(mock_file_path):
    knowledge = Word97DocKnowledge(file_path=mock_file_path)
    documents = knowledge._load()
    actual = documents[0].content.replace("\r", "\n")
    assert len(documents) == 1
    assert actual == "This is the first paragraph.\n\nThis is the second paragraph.\n"
    assert documents[0].metadata["source"] == mock_file_path