Mirror of https://github.com/csunny/DB-GPT.git (synced 2025-07-28 14:27:20 +00:00)

feat(ChatKnowledge): pdf loader support table extraction (#2147)

parent: 0bc478b7b5
commit: 43c86caebf
@@ -1,6 +1,11 @@
 """PDF Knowledge."""
+import json
+import os
+import re
+from collections import defaultdict
 from typing import Any, Dict, List, Optional, Union
 
+from dbgpt.component import logger
 from dbgpt.core import Document
 from dbgpt.rag.knowledge.base import (
     ChunkStrategy,
@@ -38,44 +43,213 @@ class PDFKnowledge(Knowledge):
             **kwargs,
         )
         self._language = language
+        self._pdf_processor = PDFProcessor(filepath=self._path)
+        self.all_title: List[dict] = []
+        self.all_text: List[dict] = []
+
+    def process_text_data(self):
+        """Text data processing to level 1 and level 2 titles."""
+        for i, data in enumerate(self.all_text):
+            # data = self.all_text[i]
+            inside_content = data.get("inside")
+            content_type = data.get("type")
+            if content_type == "text":
+                # use regex to match the first level title
+                first_level_match = re.match(
+                    r"§(\d+)+([\u4e00-\u9fa5]+)", inside_content.strip()
+                )
+                second_level_match = re.match(
+                    r"(\d+\.\d+)([\u4e00-\u9fa5]+)", inside_content.strip()
+                )
+                first_num_match = re.match(r"^§(\d+)$", inside_content.strip())
+                # get all level 1 titles
+                title_name = [
+                    dictionary["first_title"]
+                    for dictionary in self.all_title
+                    if "first_title" in dictionary
+                ]
+                if first_level_match:
+                    first_title_text = first_level_match.group(2)
+                    first_title_num = first_level_match.group(1)
+                    first_title = first_title_num + first_title_text
+                    # if the title does not contain "..." and is not in the title
+                    # list, add it to the title list
+                    if first_title not in title_name and (
+                        int(first_title_num) == 1
+                        or int(first_title_num) - int(self.all_title[-1]["id"]) == 1
+                    ):
+                        current_entry = {
+                            "id": first_title_num,
+                            "first_title": first_title,
+                            "second_title": [],
+                            "table": [],
+                        }
+                        self.all_title.append(current_entry)
+
+                elif second_level_match:
+                    second_title_name = second_level_match.group(0)
+                    second_title = second_level_match.group(1)
+                    first_title = second_title.split(".")[0]
+                    if (int(first_title) - 1 >= len(self.all_title)) or int(
+                        first_title
+                    ) - 1 < 0:
+                        continue
+                    else:
+                        titles = [
+                            sub_item["title"]
+                            for sub_item in self.all_title[int(first_title) - 1][
+                                "second_title"
+                            ]
+                        ]
+                        if second_title_name not in titles:
+                            self.all_title[int(first_title) - 1]["second_title"].append(
+                                {"title": second_title_name, "table": []}
+                            )
+                elif first_num_match:
+                    first_num = first_num_match.group(1)
+                    first_text = self.all_text[i + 1].get("inside")
+                    first_title = first_num_match.group(1) + first_text
+                    # if the title does not contain "..." and is not in the title list
+                    if (
+                        "..." not in first_text
+                        and first_title not in title_name
+                        and (
+                            int(first_num) == 1
+                            or int(first_num) - int(self.all_title[-1]["id"]) == 1
+                        )
+                    ):
+                        current_entry = {
+                            "id": first_num,
+                            "first_title": first_title,
+                            "second_title": [],
+                            "table": [],
+                        }
+                        self.all_title.append(current_entry)
+
     def _load(self) -> List[Document]:
         """Load pdf document from loader."""
         if self._loader:
             documents = self._loader.load()
         else:
-            import pypdf
-
-            pages = []
-            documents = []
-            if not self._path:
-                raise ValueError("file path is required")
-            with open(self._path, "rb") as file:
-                reader = pypdf.PdfReader(file)
-                for page_num in range(len(reader.pages)):
-                    _page = reader.pages[page_num]
-                    pages.append((_page.extract_text(), page_num))
-
-            # cleaned_pages = []
-            for page, page_num in pages:
-                lines = page.splitlines()
-
-                cleaned_lines = []
-                for line in lines:
-                    if self._language == "en":
-                        words = list(line)  # noqa: F841
-                    else:
-                        words = line.split()  # noqa: F841
-                    cleaned_lines.append(line)
-                page = "\n".join(cleaned_lines)
-                # cleaned_pages.append(page)
-                metadata = {"source": self._path, "page": page_num}
-                if self._metadata:
-                    metadata.update(self._metadata)  # type: ignore
-                # text = "\f".join(cleaned_pages)
-                document = Document(content=page, metadata=metadata)
-                documents.append(document)
-                return documents
+            self._pdf_processor.pdf_to_json()
+            file_title = self.file_path.rsplit("/", 1)[-1].replace(".pdf", "")
+            self.all_text = list(self._pdf_processor.all_text.values())
+            self.process_text_data()
+            temp_table = []
+            temp_title = None
+            page_documents = []
+            merged_data = {}  # type ignore
+            for i, data in enumerate(self.all_text):
+                content_type = data.get("type")
+                inside_content = data.get("inside")
+                page = data.get("page")
+
+                if content_type == "excel":
+                    temp_table.append(inside_content)
+                    if temp_title is None:
+                        for j in range(i - 1, -1, -1):
+                            if self.all_text[j]["type"] == "excel":
+                                break
+                            if self.all_text[j]["type"] == "text":
+                                content = self.all_text[j]["inside"]
+                                if re.match(
+                                    r"^\d+\.\d+", content
+                                ) or content.startswith("§"):
+                                    temp_title = content.strip()
+                                    break
+                                else:
+                                    temp_title = content.strip()
+                                    break
+                elif content_type == "text":
+                    if page in merged_data:
+                        # page merge
+                        merged_data[page]["inside_content"] += " " + inside_content
+                    else:
+                        merged_data[page] = {
+                            "inside_content": inside_content,
+                            "type": "text",
+                        }
+
+                    # merge excel table
+                    if temp_table:
+                        table_meta = {
+                            "title": temp_title or temp_table[0],
+                            "type": "excel",
+                        }
+                        self.all_title.append(table_meta)
+
+                        # markdown format
+                        markdown_tables = []
+                        if temp_table:
+                            header = eval(temp_table[0])
+                            markdown_tables.append(header)
+                            for entry in temp_table[1:]:
+                                row = eval(entry)
+                                markdown_tables.append(row)
+                            markdown_output = "| " + " | ".join(header) + " |\n"
+                            markdown_output += (
+                                "| " + " | ".join(["---"] * len(header)) + " |\n"
+                            )
+                            for row in markdown_tables[1:]:
+                                markdown_output += "| " + " | ".join(row) + " |\n"
+
+                        # merged content
+                        merged_data[page]["excel_content"] = temp_table
+                        merged_data[page]["markdown_output"] = markdown_output
+
+                        temp_title = None
+                        temp_table = []
+
+            # deal last excel
+            if temp_table:
+                table_meta = {
+                    "title": temp_title or temp_table[0],
+                    "table": temp_table,
+                    "type": "excel",
+                }
+                self.all_title.append(table_meta)
+                # markdown format
+                markdown_tables = []
+                if temp_table:
+                    header = eval(temp_table[0])
+                    markdown_tables.append(header)
+                    for entry in temp_table[1:]:
+                        row = eval(entry)
+                        markdown_tables.append(row)
+                    markdown_output = "| " + " | ".join(header) + " |\n"
+                    markdown_output += "| " + " | ".join(["---"] * len(header)) + " |\n"
+                    for row in markdown_tables[1:]:
+                        markdown_output += "| " + " | ".join(row) + " |\n"
+                # merged content
+                merged_data[page]["excel_content"] = temp_table
+                merged_data[page]["markdown_output"] = markdown_output
+
+            for page, content in merged_data.items():
+                inside_content = content["inside_content"]
+                if "markdown_output" in content:
+                    markdown_content = content["markdown_output"]
+                    content_metadata = {
+                        "page": page,
+                        "type": "excel",
+                        "title": file_title,
+                    }
+                    page_documents.append(
+                        Document(
+                            content=inside_content + "\n" + markdown_content,
+                            metadata=content_metadata,
+                        )
+                    )
+                else:
+                    content_metadata = {
+                        "page": page,
+                        "type": "text",
+                        "title": file_title,
+                    }
+                    page_documents.append(
+                        Document(content=inside_content, metadata=content_metadata)
+                    )
+
+            return page_documents
         return [Document.langchain2doc(lc_document) for lc_document in documents]
 
     @classmethod
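
Note: PDFProcessor stores each extracted table row as a stringified Python list ("inside": str(row)), and _load above rebuilds a Markdown table from those strings (the diff itself uses eval for this). A minimal standalone sketch of that conversion, with made-up rows and ast.literal_eval substituted for eval:

# Illustration only: rebuild a Markdown table from rows stored as str(list) entries,
# mirroring the logic in _load above (hypothetical sample rows).
import ast

temp_table = ['["Year", "Revenue"]', '["2023", "100"]', '["2024", "120"]']
header = ast.literal_eval(temp_table[0])
rows = [ast.literal_eval(entry) for entry in temp_table[1:]]
markdown_output = "| " + " | ".join(header) + " |\n"
markdown_output += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in rows:
    markdown_output += "| " + " | ".join(row) + " |\n"
print(markdown_output)
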
@@ -101,3 +275,236 @@ class PDFKnowledge(Knowledge):
     def document_type(cls) -> DocumentType:
         """Document type of PDF."""
         return DocumentType.PDF
+
+
+class PDFProcessor:
+    """PDFProcessor class."""
+
+    def __init__(self, filepath):
+        """Initialize PDFProcessor class."""
+        self.filepath = filepath
+        try:
+            import pdfplumber  # type: ignore
+        except ImportError:
+            raise ImportError("Please install pdfplumber first.")
+        self.pdf = pdfplumber.open(filepath)
+        self.all_text = defaultdict(dict)
+        self.allrow = 0
+        self.last_num = 0
+
+    def check_lines(self, page, top, buttom):
+        """Check lines."""
+        lines = page.extract_words()[::]
+        text = ""
+        last_top = 0
+        last_check = 0
+        for line in range(len(lines)):
+            each_line = lines[line]
+            check_re = (
+                "(?:。|;|单位:人民币元|金额单位:人民币元|单位:万元|币种:人民币|\d|"
+                "报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$"
+            )
+            if top == "" and buttom == "":
+                if abs(last_top - each_line["top"]) <= 2 or (
+                    last_check > 0
+                    and (page.height * 0.9 - each_line["top"]) > 0
+                    and not re.search(check_re, text)
+                ):
+                    text = text + each_line["text"]
+                else:
+                    text = text + "\n" + each_line["text"]
+            elif top == "":
+                if each_line["top"] > buttom:
+                    if abs(last_top - each_line["top"]) <= 2 or (
+                        last_check > 0
+                        and (page.height * 0.85 - each_line["top"]) > 0
+                        and not re.search(check_re, text)
+                    ):
+                        text = text + each_line["text"]
+                    else:
+                        text = text + "\n" + each_line["text"]
+            else:
+                if each_line["top"] < top and each_line["top"] > buttom:
+                    if abs(last_top - each_line["top"]) <= 2 or (
+                        last_check > 0
+                        and (page.height * 0.85 - each_line["top"]) > 0
+                        and not re.search(check_re, text)
+                    ):
+                        text = text + each_line["text"]
+                    else:
+                        text = text + "\n" + each_line["text"]
+            last_top = each_line["top"]
+            last_check = each_line["x1"] - page.width * 0.85
+
+        return text
+
+    def drop_empty_cols(self, data):
+        """Delete empty column."""
+        transposed_data = list(map(list, zip(*data)))
+        filtered_data = [
+            col for col in transposed_data if not all(cell == "" for cell in col)
+        ]
+        result = list(map(list, zip(*filtered_data)))
+        return result
+
+    def extract_text_and_tables(self, page):
+        """Extract text and tables."""
+        buttom = 0
+        tables = page.find_tables()
+        if len(tables) >= 1:
+            count = len(tables)
+            for table in tables:
+                # process text before table
+                if table.bbox[3] < buttom:
+                    pass
+                else:
+                    count -= 1
+                    # process text before table
+                    top = table.bbox[1]
+                    text = self.check_lines(page, top, buttom)
+                    text_list = text.split("\n")
+                    for _t in range(len(text_list)):
+                        self.all_text[self.allrow] = {
+                            "page": page.page_number,
+                            "allrow": self.allrow,
+                            "type": "text",
+                            "inside": text_list[_t],
+                        }
+                        self.allrow += 1
+
+                    # process table
+                    buttom = table.bbox[3]
+                    new_table = table.extract()
+                    r_count = 0
+                    for r in range(len(new_table)):
+                        row = new_table[r]
+                        if row[0] is None:
+                            r_count += 1
+                            for c in range(len(row)):
+                                if row[c] is not None and row[c] not in ["", " "]:
+                                    if new_table[r - r_count][c] is None:
+                                        new_table[r - r_count][c] = row[c]
+                                    else:
+                                        new_table[r - r_count][c] += row[c]
+                                    new_table[r][c] = None
+                        else:
+                            r_count = 0
+
+                    end_table = []
+                    for row in new_table:
+                        if row[0] is not None:
+                            cell_list = []
+                            cell_check = False
+                            for cell in row:
+                                if cell is not None:
+                                    cell = cell.replace("\n", "")
+                                else:
+                                    cell = ""
+                                if cell != "":
+                                    cell_check = True
+                                cell_list.append(cell)
+                            if cell_check:
+                                end_table.append(cell_list)
+
+                    end_table = self.drop_empty_cols(end_table)
+
+                    # process when column name is empty
+                    if len(end_table) > 0:
+                        for i in range(len(end_table[0])):
+                            if end_table[0][i] == "":
+                                if 0 < i < len(end_table[0]) - 1:
+                                    # left column name
+                                    left_column = end_table[0][i - 1]
+                                    # right column name
+                                    right_column = end_table[0][i + 1]
+                                    # current name = left name + right name
+                                    end_table[0][i] = left_column + right_column
+                                else:
+                                    # if current column is empty and is the first
+                                    # column, assign the right column name.
+                                    # if current column is empty and is the
+                                    # last column, assign the left column name.
+                                    end_table[0][i] = (
+                                        end_table[0][i - 1]
+                                        if i == len(end_table[0]) - 1
+                                        else end_table[0][i + 1]
+                                    )
+
+                    # if the first row is empty, assign the value of the previous row
+                    for i in range(1, len(end_table)):
+                        for j in range(len(end_table[i])):
+                            if end_table[i][j] == "":
+                                end_table[i][j] = end_table[i][j - 1]
+
+                    for row in end_table:
+                        self.all_text[self.allrow] = {
+                            "page": page.page_number,
+                            "allrow": self.allrow,
+                            "type": "excel",
+                            "inside": str(row),
+                        }
+                        self.allrow += 1
+
+                    if count == 0:
+                        text = self.check_lines(page, "", buttom)
+                        text_list = text.split("\n")
+                        for _t in range(len(text_list)):
+                            self.all_text[self.allrow] = {
+                                "page": page.page_number,
+                                "allrow": self.allrow,
+                                "type": "text",
+                                "inside": text_list[_t],
+                            }
+                            self.allrow += 1
+
+        else:
+            text = self.check_lines(page, "", "")
+            text_list = text.split("\n")
+            for _t in range(len(text_list)):
+                self.all_text[self.allrow] = {
+                    "page": page.page_number,
+                    "allrow": self.allrow,
+                    "type": "text",
+                    "inside": text_list[_t],
+                }
+                self.allrow += 1
+
+        first_re = "[^计](?:报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$"
+        end_re = "^(?:\d|\\|\/|第|共|页|-|_| ){1,}"
+        if self.last_num == 0:
+            try:
+                first_text = str(self.all_text[1]["inside"])
+                end_text = str(self.all_text[len(self.all_text) - 1]["inside"])
+                if re.search(first_re, first_text) and "[" not in end_text:
+                    self.all_text[1]["type"] = "页眉"
+                if re.search(end_re, end_text) and "[" not in end_text:
+                    self.all_text[len(self.all_text) - 1]["type"] = "页脚"
+            except Exception:
+                print(page.page_number)
+        else:
+            try:
+                first_text = str(self.all_text[self.last_num + 2]["inside"])
+                end_text = str(self.all_text[len(self.all_text) - 1]["inside"])
+                if re.search(first_re, first_text) and "[" not in end_text:
+                    self.all_text[self.last_num + 2]["type"] = "页眉"
+                if re.search(end_re, end_text) and "[" not in end_text:
+                    self.all_text[len(self.all_text) - 1]["type"] = "页脚"
+            except Exception:
+                print(page.page_number)
+
+        self.last_num = len(self.all_text) - 1
+
+    def pdf_to_json(self):
+        """Process pdf."""
+        for i in range(len(self.pdf.pages)):
+            self.extract_text_and_tables(self.pdf.pages[i])
+            logger.info(f"{self.filepath} page {i} extract text success")
+
+    def save_all_text(self, path):
+        """Save all text."""
+        directory = os.path.dirname(path)
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+        for key in self.all_text.keys():
+            with open(path, "a+", encoding="utf-8") as file:
+                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + "\n")
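
Note: the new PDFProcessor can also be driven directly. A minimal usage sketch, assuming pdfplumber is installed, that the class lives in dbgpt.rag.knowledge.pdf (the same module as PDFKnowledge), and that the input and output paths below are hypothetical:

# Usage sketch: extract per-row text/table entries from a PDF and dump them as JSON lines.
from dbgpt.rag.knowledge.pdf import PDFProcessor

processor = PDFProcessor(filepath="report.pdf")  # hypothetical input file
processor.pdf_to_json()  # fills processor.all_text with one entry per extracted row
for row in list(processor.all_text.values())[:5]:
    print(row["page"], row["type"], row["inside"])
processor.save_all_text("./output/report.jsonl")  # one JSON object per line
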
@@ -5,8 +5,8 @@ import pytest
 from dbgpt.rag.knowledge.pdf import PDFKnowledge
 
 MOCK_PDF_PAGES = [
-    ("This is the content of the first page.", 0),
-    ("This is the content of the second page.", 1),
+    ("", 0),
+    ("", 1),
 ]
 
 
@@ -19,19 +19,19 @@ def mock_pdf_open_and_reader():
         for page in MOCK_PDF_PAGES
     ]
     with patch("builtins.open", mock_pdf_file):
-        with patch("pypdf.PdfReader", return_value=mock_reader) as mock:
+        with patch("pdfplumber.open", return_value=mock_reader) as mock:
             yield mock
 
 
 def test_load_from_pdf(mock_pdf_open_and_reader):
-    file_path = "test_document.pdf"
+    file_path = "test_document"
     knowledge = PDFKnowledge(file_path=file_path)
     documents = knowledge._load()
 
     assert len(documents) == len(MOCK_PDF_PAGES)
     for i, document in enumerate(documents):
         assert MOCK_PDF_PAGES[i][0] in document.content
-        assert document.metadata["source"] == file_path
-        assert document.metadata["page"] == MOCK_PDF_PAGES[i][1]
+        assert document.metadata["title"] == file_path
+        assert document.metadata["type"] == "text"
 
 #
@@ -58,9 +58,15 @@ class TextSplitter(ABC):
         _metadatas = metadatas or [{}] * len(texts)
         chunks = []
         for i, text in enumerate(texts):
-            for chunk in self.split_text(text, separator=separator, **kwargs):
-                new_doc = Chunk(content=chunk, metadata=copy.deepcopy(_metadatas[i]))
-                chunks.append(new_doc)
+            if _metadatas[i].get("type") == "excel":
+                table_chunk = Chunk(content=text, metadata=copy.deepcopy(_metadatas[i]))
+                chunks.append(table_chunk)
+            else:
+                for chunk in self.split_text(text, separator=separator, **kwargs):
+                    new_doc = Chunk(
+                        content=chunk, metadata=copy.deepcopy(_metadatas[i])
+                    )
+                    chunks.append(new_doc)
         return chunks
 
     def split_documents(self, documents: Iterable[Document], **kwargs) -> List[Chunk]:
@@ -489,11 +495,15 @@ class MarkdownHeaderTextSplitter(TextSplitter):
         _metadatas = metadatas or [{}] * len(texts)
         chunks = []
         for i, text in enumerate(texts):
-            for chunk in self.split_text(text, separator, **kwargs):
-                metadata = chunk.metadata or {}
-                metadata.update(_metadatas[i])
-                new_doc = Chunk(content=chunk.content, metadata=metadata)
-                chunks.append(new_doc)
+            if _metadatas[i].get("type") == "excel":
+                table_chunk = Chunk(content=text, metadata=copy.deepcopy(_metadatas[i]))
+                chunks.append(table_chunk)
+            else:
+                for chunk in self.split_text(text, separator, **kwargs):
+                    metadata = chunk.metadata or {}
+                    metadata.update(_metadatas[i])
+                    new_doc = Chunk(content=chunk.content, metadata=metadata)
+                    chunks.append(new_doc)
         return chunks
 
     def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Chunk]:
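
Note: the effect of both splitter changes is that a document whose metadata carries "type": "excel" is emitted as a single chunk instead of being cut by split_text, so the Markdown table built in PDFKnowledge._load stays intact. A standalone sketch of that dispatch (FakeChunk and the split_text helper below are stand-ins, not the dbgpt Chunk class or splitter API):

# Stand-in sketch: table texts ("type": "excel") are kept whole; other texts are split.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class FakeChunk:  # stand-in for dbgpt.core.Chunk
    content: str
    metadata: Dict = field(default_factory=dict)


def split_text(text: str, separator: str = "\n\n") -> List[str]:
    return [part for part in text.split(separator) if part]


def create_chunks(texts: List[str], metadatas: List[Dict]) -> List[FakeChunk]:
    chunks: List[FakeChunk] = []
    for i, text in enumerate(texts):
        if metadatas[i].get("type") == "excel":
            # keep the whole table as one chunk
            chunks.append(FakeChunk(content=text, metadata=dict(metadatas[i])))
        else:
            for piece in split_text(text):
                chunks.append(FakeChunk(content=piece, metadata=dict(metadatas[i])))
    return chunks


table_md = "| Year | Revenue |\n| --- | --- |\n| 2023 | 100 |"
chunks = create_chunks(
    [table_md, "first paragraph\n\nsecond paragraph"],
    [{"type": "excel"}, {"type": "text"}],
)
print(len(chunks))  # 3: one table chunk plus two text chunks
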