Mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-23 15:19:33 +00:00)
text-splitters[minor]: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing (#27678)
This pull request updates `HTMLHeaderTextSplitter` by replacing the implementation of its `split_text_from_file` method. The original method used `lxml` and XSLT to process HTML files, which raised `lxml.etree.XSLTApplyError: maxHead` on large HTML documents due to limitations in the XSLT processor. Fixes #13149.

By switching to BeautifulSoup (`bs4`), we achieve:

- **Improved performance and reliability:** BeautifulSoup processes large HTML files efficiently, without the errors associated with `lxml` and XSLT.
- **Simplified dependencies:** Removes the dependency on `lxml` and external XSLT files, relying instead on the widely used `beautifulsoup4` library.
- **Maintained functionality:** The new method replicates the original behavior, ensuring compatibility with existing code and preserving the extraction of content and metadata.

**Issue:** This change addresses issues related to processing large HTML files with the existing `HTMLHeaderTextSplitter` implementation. It resolves the `lxml.etree.XSLTApplyError: maxHead` failures users hit with large HTML documents.

**Dependencies:**
- **BeautifulSoup (`beautifulsoup4`):** The `beautifulsoup4` library is now used for parsing HTML content.
  - Installation: `pip install beautifulsoup4`

**Code Changes:** Updated the `split_text_from_file` method in `HTMLHeaderTextSplitter` as follows:

```python
def split_text_from_file(self, file: Any) -> List[Document]:
    """Split HTML file using BeautifulSoup.

    Args:
        file: HTML file path or file-like object.

    Returns:
        List of Document objects with page_content and metadata.
    """
    from bs4 import BeautifulSoup
    from langchain.docstore.document import Document
    import bs4

    # Read the HTML content from the file or file-like object
    if isinstance(file, str):
        with open(file, 'r', encoding='utf-8') as f:
            html_content = f.read()
    else:
        # Assuming file is a file-like object
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the header tags and their corresponding metadata keys
    headers_to_split_on = [tag[0] for tag in self.headers_to_split_on]
    header_mapping = dict(self.headers_to_split_on)

    documents = []

    # Find the body of the document
    body = soup.body if soup.body else soup

    # Find all header tags in the order they appear
    all_headers = body.find_all(headers_to_split_on)

    # If there's content before the first header, collect it
    first_header = all_headers[0] if all_headers else None
    if first_header:
        pre_header_content = ''
        for elem in first_header.find_all_previous():
            if isinstance(elem, bs4.Tag):
                text = elem.get_text(separator=' ', strip=True)
                if text:
                    pre_header_content = text + ' ' + pre_header_content
        if pre_header_content.strip():
            documents.append(Document(
                page_content=pre_header_content.strip(),
                metadata={}  # No metadata since there's no header
            ))
    else:
        # If no headers are found, return the whole content
        full_text = body.get_text(separator=' ', strip=True)
        if full_text.strip():
            documents.append(Document(
                page_content=full_text.strip(),
                metadata={}
            ))
        return documents

    # Process each header and its associated content
    for header in all_headers:
        current_metadata = {}
        header_name = header.name
        header_text = header.get_text(separator=' ', strip=True)
        current_metadata[header_mapping[header_name]] = header_text

        # Collect all sibling elements until the next header
        content_elements = []
        for sibling in header.find_next_siblings():
            if sibling.name in headers_to_split_on:
                # Stop at the next header
                break
            if isinstance(sibling, bs4.Tag):
                content_elements.append(sibling)

        # Get the text content of the collected elements
        current_content = ''
        for elem in content_elements:
            text = elem.get_text(separator=' ', strip=True)
            if text:
                current_content += text + ' '

        # Create a Document if there is content
        if current_content.strip():
            documents.append(Document(
                page_content=current_content.strip(),
                metadata=current_metadata.copy()
            ))
        else:
            # If there's no content, but we have metadata, still create a Document
            documents.append(Document(
                page_content='',
                metadata=current_metadata.copy()
            ))

    return documents
```

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
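To try the fix end to end, here is a minimal usage sketch (not part of the diff). It assumes the splitter is importable as `HTMLHeaderTextSplitter` from `langchain_text_splitters`, and `large_page.html` is a placeholder for any HTML document big enough to have triggered the old XSLT error:

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)

# In-memory HTML: header text is surfaced as metadata on each chunk.
docs = splitter.split_text("<h1>Title</h1><p>Body text.</p>")
assert docs[0].metadata == {"Header 1": "Title"}

# A file path or a file-like object both work; parsing no longer goes
# through lxml/XSLT, so large files do not raise XSLTApplyError.
# ("large_page.html" is a hypothetical path.)
docs = splitter.split_text_from_file("large_page.html")
```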
This commit is contained in:
parent
989eec4b7b
commit
d3ed9b86be
*Splitter module:*

```diff
@@ -3,7 +3,7 @@ from __future__ import annotations
 import copy
 import pathlib
 import re
-from io import BytesIO, StringIO
+from io import StringIO
 from typing import (
     Any,
     Callable,
```
```diff
@@ -34,148 +34,291 @@ class ElementType(TypedDict):
 
 
 class HTMLHeaderTextSplitter:
-    """Splitting HTML files based on specified headers.
-
-    Requires lxml package.
+    """Split HTML content into structured Documents based on specified headers.
+
+    Splits HTML content by detecting specified header tags (e.g., <h1>, <h2>) and
+    creating hierarchical Document objects that reflect the semantic structure
+    of the original content. For each identified section, the splitter associates
+    the extracted text with metadata corresponding to the encountered headers.
+
+    If no specified headers are found, the entire content is returned as a single
+    Document. This allows for flexible handling of HTML input, ensuring that
+    information is organized according to its semantic headers.
+
+    The splitter provides the option to return each HTML element as a separate
+    Document or aggregate them into semantically meaningful chunks. It also
+    gracefully handles multiple levels of nested headers, creating a rich,
+    hierarchical representation of the content.
+
+    Args:
+        headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag,
+            header_name) pairs representing the headers that define splitting
+            boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")]
+            will split content by <h1> and <h2> tags, assigning their textual
+            content to the Document metadata.
+        return_each_element (bool): If True, every HTML element encountered
+            (including headers, paragraphs, etc.) is returned as a separate
+            Document. If False, content under the same header hierarchy is
+            aggregated into fewer Documents.
+
+    Returns:
+        List[Document]: A list of Document objects. Each Document contains
+        `page_content` holding the extracted text and `metadata` that maps
+        the header hierarchy to their corresponding titles.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_text_splitters.html_header_text_splitter import (
+                HTMLHeaderTextSplitter,
+            )
+
+            # Define headers for splitting on h1 and h2 tags.
+            headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
+
+            splitter = HTMLHeaderTextSplitter(
+                headers_to_split_on=headers_to_split_on,
+                return_each_element=False
+            )
+
+            html_content = \"\"\"
+            <html>
+              <body>
+                <h1>Introduction</h1>
+                <p>Welcome to the introduction section.</p>
+                <h2>Background</h2>
+                <p>Some background details here.</p>
+                <h1>Conclusion</h1>
+                <p>Final thoughts.</p>
+              </body>
+            </html>
+            \"\"\"
+
+            documents = splitter.split_text(html_content)
+
+            # 'documents' now contains Document objects reflecting the hierarchy:
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Introduction"
+            # - Document with metadata={"Main Topic": "Introduction"} and
+            #   content="Welcome to the introduction section."
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Background"
+            # - Document with metadata={"Main Topic": "Introduction",
+            #   "Sub Topic": "Background"} and content="Some background details here."
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Conclusion"
+            # - Document with metadata={"Main Topic": "Conclusion"} and
+            #   content="Final thoughts."
     """
 
     def __init__(
         self,
         headers_to_split_on: List[Tuple[str, str]],
         return_each_element: bool = False,
-    ):
-        """Create a new HTMLHeaderTextSplitter.
+    ) -> None:
+        """Initialize with headers to split on.
 
         Args:
-            headers_to_split_on: list of tuples of headers we want to track mapped to
-                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
-                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)].
-            return_each_element: Return each element w/ associated headers.
+            headers_to_split_on: A list of tuples where
+                each tuple contains a header tag and its corresponding value.
+            return_each_element: Whether to return each HTML
+                element as a separate Document. Defaults to False.
         """
-        # Output element-by-element or aggregated into chunks w/ common headers
+        self.headers_to_split_on = sorted(
+            headers_to_split_on, key=lambda x: int(x[0][1])
+        )
+        self.header_mapping = dict(self.headers_to_split_on)
+        self.header_tags = [tag for tag, _ in self.headers_to_split_on]
         self.return_each_element = return_each_element
-        self.headers_to_split_on = sorted(headers_to_split_on)
 
-    def aggregate_elements_to_chunks(
-        self, elements: List[ElementType]
-    ) -> List[Document]:
-        """Combine elements with common metadata into chunks.
-
-        Args:
-            elements: HTML element content with associated identifying info and metadata
-        """
-        aggregated_chunks: List[ElementType] = []
-
-        for element in elements:
-            if (
-                aggregated_chunks
-                and aggregated_chunks[-1]["metadata"] == element["metadata"]
-            ):
-                # If the last element in the aggregated list
-                # has the same metadata as the current element,
-                # append the current content to the last element's content
-                aggregated_chunks[-1]["content"] += "  \n" + element["content"]
-            else:
-                # Otherwise, append the current element to the aggregated list
-                aggregated_chunks.append(element)
-
-        return [
-            Document(page_content=chunk["content"], metadata=chunk["metadata"])
-            for chunk in aggregated_chunks
-        ]
-
-    def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL.
-
-        Args:
-            url: web URL
-            **kwargs: Arbitrary additional keyword arguments. These are usually passed
-                to the fetch url content request.
-        """
-        r = requests.get(url, **kwargs)
-        return self.split_text_from_file(BytesIO(r.content))
-
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string.
+        """Split the given text into a list of Document objects.
 
         Args:
-            text: HTML text
+            text: The HTML text to split.
+
+        Returns:
+            A list of split Document objects.
         """
         return self.split_text_from_file(StringIO(text))
 
-    def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
-
-        Args:
-            file: HTML file
-        """
+    def split_text_from_url(
+        self, url: str, timeout: int = 10, **kwargs: Any
+    ) -> List[Document]:
+        """Fetch text content from a URL and split it into documents.
+
+        Args:
+            url: The URL to fetch content from.
+            timeout: Timeout for the request. Defaults to 10.
+            **kwargs: Additional keyword arguments for the request.
+
+        Returns:
+            A list of split Document objects.
+
+        Raises:
+            requests.RequestException: If the HTTP request fails.
+        """
+        kwargs.setdefault("timeout", timeout)
+        response = requests.get(url, **kwargs)
+        response.raise_for_status()
+        return self.split_text(response.text)
+
+    def _header_level(self, tag_name: str) -> int:
+        """Determine the heading level of a tag."""
+        if tag_name.lower() in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+            return int(tag_name[1])
+        # Returns high level if it isn't a header
+        return 9999
+
+    def _dom_depth(self, element: Any) -> int:
+        """Determine the DOM depth of an element by counting its parents."""
+        depth = 0
+        for _ in element.parents:
+            depth += 1
+        return depth
+
+    def _get_elements(self, html_content: str) -> List[Any]:
+        """Parse HTML content and return a list of BeautifulSoup elements.
+
+        This helper function takes HTML content as input,
+        parses it using BeautifulSoup4, and returns all HTML elements
+        found in the document body. If no body tag exists,
+        it returns all elements in the full document.
+
+        Args:
+            html_content: Raw HTML content to be parsed.
+
+        Returns:
+            List[Any]: A list of BeautifulSoup elements found in the HTML document.
+
+        Raises:
+            ImportError: If the BeautifulSoup4 package is not installed.
+        """
         try:
-            from lxml import etree
+            from bs4 import BeautifulSoup  # type: ignore[import-untyped]
         except ImportError as e:
             raise ImportError(
-                "Unable to import lxml, please install with `pip install lxml`."
+                "Unable to import BeautifulSoup/PageElement, \
+                    please install with `pip install \
+                        bs4`."
             ) from e
-        # use lxml library to parse html document and return xml ElementTree
-        # Explicitly encoding in utf-8 allows non-English
-        # html files to be processed without garbled characters
-        parser = etree.HTMLParser(encoding="utf-8")
-        tree = etree.parse(file, parser)
-
-        # document transformation for "structure-aware" chunking is handled with xsl.
-        # see comments in html_chunks_with_headers.xslt for more detailed information.
-        xslt_path = pathlib.Path(__file__).parent / "xsl/html_chunks_with_headers.xslt"
-        xslt_tree = etree.parse(xslt_path)
-        transform = etree.XSLT(xslt_tree)
-        result = transform(tree)
-        result_dom = etree.fromstring(str(result))
-
-        # create filter and mapping for header metadata
-        header_filter = [header[0] for header in self.headers_to_split_on]
-        header_mapping = dict(self.headers_to_split_on)
-
-        # map xhtml namespace prefix
-        ns_map = {"h": "http://www.w3.org/1999/xhtml"}
-
-        # build list of elements from DOM
-        elements = []
-        for element in result_dom.findall("*//*", ns_map):
-            if element.findall("*[@class='headers']") or element.findall(
-                "*[@class='chunk']"
-            ):
-                elements.append(
-                    ElementType(
-                        url=file,
-                        xpath="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='xpath']", ns_map)
-                            ]
-                        ),
-                        content="".join(
-                            [
-                                node.text or ""
-                                for node in element.findall("*[@class='chunk']", ns_map)
-                            ]
-                        ),
-                        metadata={
-                            # Add text of specified headers to metadata using header
-                            # mapping.
-                            header_mapping[node.tag]: node.text or ""
-                            for node in filter(
-                                lambda x: x.tag in header_filter,
-                                element.findall("*[@class='headers']/*", ns_map),
-                            )
-                        },
-                    )
-                )
-
-        if not self.return_each_element:
-            return self.aggregate_elements_to_chunks(elements)
-        else:
-            return [
-                Document(page_content=chunk["content"], metadata=chunk["metadata"])
-                for chunk in elements
-            ]
+        soup = BeautifulSoup(html_content, "html.parser")
+        body = soup.body if soup.body else soup
+        return body.find_all()
+
+    def split_text_from_file(self, file: Any) -> List[Document]:
+        """Split HTML content from a file into a list of Document objects.
+
+        Args:
+            file: A file path or a file-like object containing HTML content.
+
+        Returns:
+            A list of split Document objects.
+        """
+        if isinstance(file, str):
+            with open(file, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        else:
+            html_content = file.read()
+        elements = self._get_elements(html_content)
+        documents: List[Document] = []
+        active_headers: Dict[str, Tuple[str, int, int]] = {}
+        current_chunk: List[str] = []
+        chunk_dom_depth = 0
+
+        def finalize_chunk() -> None:
+            if current_chunk:
+                final_meta = {
+                    key: content
+                    for key, (content, level, dom_depth) in active_headers.items()
+                    if chunk_dom_depth >= dom_depth
+                }
+                combined_text = "  \n".join(
+                    line for line in current_chunk if line.strip()
+                )
+                if combined_text.strip():
+                    documents.append(
+                        Document(page_content=combined_text, metadata=final_meta)
+                    )
+                current_chunk.clear()
+
+        for element in elements:
+            tag = element.name
+            if not tag:
+                continue
+            text = " ".join(
+                t
+                for t in element.find_all(string=True, recursive=False)
+                if isinstance(t, str)
+            ).strip()
+            if not text:
+                continue
+
+            level = self._header_level(tag)
+            dom_depth = self._dom_depth(element)
+
+            if tag in self.header_tags:
+                if not self.return_each_element:
+                    finalize_chunk()
+
+                # Remove headers at same or deeper level
+                headers_to_remove = [
+                    key for key, (_, lvl, _) in active_headers.items() if lvl >= level
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                header_key = self.header_mapping[tag]
+                active_headers[header_key] = (text, level, dom_depth)
+
+                # Produce a document for the header itself
+                header_meta = {
+                    key: content
+                    for key, (content, lvl, dd) in active_headers.items()
+                    if dom_depth >= dd
+                }
+                documents.append(Document(page_content=text, metadata=header_meta))
+                # After encountering a header,
+                # no immediate content goes to current_chunk
+                # (if return_each_element is False, we wait for next content)
+                # (if return_each_element is True, we create docs per element anyway)
+            else:
+                # Non-header element logic
+                # Remove headers that don't apply if dom_depth < their dom_depth
+                headers_to_remove = [
+                    key for key, (_, _, dd) in active_headers.items() if dom_depth < dd
+                ]
+                for key in headers_to_remove:
+                    del active_headers[key]
+
+                if self.return_each_element:
+                    # Produce a doc for this element immediately
+                    element_meta = {
+                        key: content
+                        for key, (content, lvl, dd) in active_headers.items()
+                        if dom_depth >= dd
+                    }
+                    if text.strip():
+                        documents.append(
+                            Document(page_content=text, metadata=element_meta)
+                        )
+                else:
+                    # Accumulate content in current_chunk
+                    if text.strip():
+                        current_chunk.append(text)
+                        chunk_dom_depth = max(chunk_dom_depth, dom_depth)
+
+        if not self.return_each_element:
+            # finalize any remaining chunk
+            finalize_chunk()
+
+        # If no headers were found at all and return_each_element=False, behavior is:
+        # The entire content should be in one document.
+        # The logic above naturally handles it:
+        # If no recognized headers, we never split; we ended up just accumulating text
+        # in current_chunk and finalizing once at the end.
+
+        return documents
 
 
 class HTMLSectionSplitter:
```
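As a reading aid for the hunk above (not part of the commit): a short sketch of how the two output modes of the rewritten class differ, using a made-up HTML snippet. The expected results follow the `active_headers` and `finalize_chunk` bookkeeping in the new `split_text_from_file`, and the import path assumes the `langchain_text_splitters` package layout.

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

html = """
<body>
  <h1>Guide</h1>
  <h2>Setup</h2>
  <p>Install the package.</p>
  <p>Configure it.</p>
</body>
"""
headers = [("h1", "Header 1"), ("h2", "Header 2")]

# Default aggregated mode: the two <p> elements share the same active
# headers, so finalize_chunk() joins them into one Document.
aggregated = HTMLHeaderTextSplitter(headers).split_text(html)
# Expected: "Guide" / "Setup" / "Install the package.  \nConfigure it."

# return_each_element=True: every element becomes its own Document,
# so the two paragraphs stay separate.
per_element = HTMLHeaderTextSplitter(
    headers, return_each_element=True
).split_text(html)
assert len(per_element) == len(aggregated) + 1
```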
```diff
@@ -269,7 +412,10 @@ class HTMLSectionSplitter:
             - 'tag_name': The name of the header tag (e.g., "h1", "h2").
         """
         try:
-            from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
+            from bs4 import (
+                BeautifulSoup,  # type: ignore[import-untyped]
+                PageElement,
+            )
         except ImportError as e:
             raise ImportError(
                 "Unable to import BeautifulSoup/PageElement, \
```
```diff
@@ -343,10 +489,13 @@ class HTMLSectionSplitter:
         return str(result)
 
     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file.
+        """Split HTML content from a file into a list of Document objects.
 
         Args:
-            file: HTML file
+            file: A file path or a file-like object containing HTML content.
+
+        Returns:
+            A list of split Document objects.
         """
         file_content = file.getvalue()
         file_content = self.convert_possible_tags_to_header(file_content)
```
```diff
@@ -844,3 +993,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
         for placeholder, preserved_content in preserved_elements.items():
             content = content.replace(placeholder, preserved_content.strip())
         return content
+
+
+# %%
```
*Test module:*

```diff
@@ -4,7 +4,7 @@ import random
 import re
 import string
 from pathlib import Path
-from typing import Any, List
+from typing import Any, Callable, List, Tuple
 
 import pytest
 from langchain_core.documents import Document
```
```diff
@@ -2039,49 +2039,476 @@ def test_haskell_code_splitter() -> None:
     assert chunks == expected_chunks
 
 
-@pytest.mark.requires("lxml")
-def test_html_header_text_splitter(tmp_path: Path) -> None:
-    splitter = HTMLHeaderTextSplitter(
-        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
-    )
-    content = """
-<h1>Sample Document</h1>
-<h2>Section</h2>
-<p id="1234">Reference content.</p>
-
-<h2>Lists</h2>
-<ul>
-<li>Item 1</li>
-<li>Item 2</li>
-<li>Item 3</li>
-</ul>
-
-<h3>A block</h3>
-<div class="amazing">
-<p>Some text</p>
-<p>Some more text</p>
-</div>
-"""
-    docs = splitter.split_text(content)
-    expected = [
-        Document(
-            page_content="Reference content.",
-            metadata={"Header 1": "Sample Document", "Header 2": "Section"},
-        ),
-        Document(
-            page_content="Item 1 Item 2 Item 3  \nSome text  \nSome more text",
-            metadata={"Header 1": "Sample Document", "Header 2": "Lists"},
-        ),
-    ]
-    assert docs == expected
-
-    with open(tmp_path / "doc.html", "w") as tmp:
-        tmp.write(content)
-    docs_from_file = splitter.split_text_from_file(tmp_path / "doc.html")
-
-    assert docs_from_file == expected
+@pytest.fixture
+@pytest.mark.requires("bs4")
+def html_header_splitter_splitter_factory() -> (
+    Callable[[List[Tuple[str, str]]], HTMLHeaderTextSplitter]
+):
+    """
+    Fixture to create an HTMLHeaderTextSplitter instance with given headers.
+    This factory allows dynamic creation of splitters with different headers.
+    """
+
+    def _create_splitter(
+        headers_to_split_on: List[Tuple[str, str]],
+    ) -> HTMLHeaderTextSplitter:
+        return HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+
+    return _create_splitter
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_input, expected_documents, test_case",
+    [
+        (
+            # Test Case 1: Split on h1 and h2
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Introduction</h1>
+                    <p>This is the introduction.</p>
+                    <h2>Background</h2>
+                    <p>Background information.</p>
+                    <h1>Conclusion</h1>
+                    <p>Final thoughts.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Introduction", metadata={"Header 1": "Introduction"}
+                ),
+                Document(
+                    page_content="This is the introduction.",
+                    metadata={"Header 1": "Introduction"},
+                ),
+                Document(
+                    page_content="Background",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Background information.",
+                    metadata={"Header 1": "Introduction", "Header 2": "Background"},
+                ),
+                Document(
+                    page_content="Conclusion", metadata={"Header 1": "Conclusion"}
+                ),
+                Document(
+                    page_content="Final thoughts.", metadata={"Header 1": "Conclusion"}
+                ),
+            ],
+            "Simple headers and paragraphs",
+        ),
+        (
+            # Test Case 2: Nested headers with h1, h2, and h3
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <div>
+                        <h1>Main Title</h1>
+                        <div>
+                            <h2>Subsection</h2>
+                            <p>Details of subsection.</p>
+                            <div>
+                                <h3>Sub-subsection</h3>
+                                <p>More details.</p>
+                            </div>
+                        </div>
+                    </div>
+                    <h1>Another Main Title</h1>
+                    <p>Content under another main title.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Main Title", metadata={"Header 1": "Main Title"}
+                ),
+                Document(
+                    page_content="Subsection",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Details of subsection.",
+                    metadata={"Header 1": "Main Title", "Header 2": "Subsection"},
+                ),
+                Document(
+                    page_content="Sub-subsection",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="More details.",
+                    metadata={
+                        "Header 1": "Main Title",
+                        "Header 2": "Subsection",
+                        "Header 3": "Sub-subsection",
+                    },
+                ),
+                Document(
+                    page_content="Another Main Title",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+                Document(
+                    page_content="Content under another main title.",
+                    metadata={"Header 1": "Another Main Title"},
+                ),
+            ],
+            "Nested headers with h1, h2, and h3",
+        ),
+        (
+            # Test Case 3: No headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <div>
+                        <p>Paragraph three.</p>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Paragraph one.  \nParagraph two.  \nParagraph three.",
+                    metadata={},
+                )
+            ],
+            "No headers present",
+        ),
+        (
+            # Test Case 4: Multiple headers of the same level
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <h1>Chapter 1</h1>
+                    <p>Content of chapter 1.</p>
+                    <h1>Chapter 2</h1>
+                    <p>Content of chapter 2.</p>
+                    <h1>Chapter 3</h1>
+                    <p>Content of chapter 3.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Chapter 1", metadata={"Header 1": "Chapter 1"}),
+                Document(
+                    page_content="Content of chapter 1.",
+                    metadata={"Header 1": "Chapter 1"},
+                ),
+                Document(page_content="Chapter 2", metadata={"Header 1": "Chapter 2"}),
+                Document(
+                    page_content="Content of chapter 2.",
+                    metadata={"Header 1": "Chapter 2"},
+                ),
+                Document(page_content="Chapter 3", metadata={"Header 1": "Chapter 3"}),
+                Document(
+                    page_content="Content of chapter 3.",
+                    metadata={"Header 1": "Chapter 3"},
+                ),
+            ],
+            "Multiple headers of the same level",
+        ),
+        (
+            # Test Case 5: Headers with no content
+            [("h1", "Header 1"), ("h2", "Header 2")],
+            """
+            <html>
+                <body>
+                    <h1>Header 1</h1>
+                    <h2>Header 2</h2>
+                    <h1>Header 3</h1>
+                </body>
+            </html>
+            """,
+            [
+                Document(page_content="Header 1", metadata={"Header 1": "Header 1"}),
+                Document(
+                    page_content="Header 2",
+                    metadata={"Header 1": "Header 1", "Header 2": "Header 2"},
+                ),
+                Document(page_content="Header 3", metadata={"Header 1": "Header 3"}),
+            ],
+            "Headers with no associated content",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_input: str,
+    expected_documents: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_input (str): The HTML input string to be split.
+        expected_documents (List[Document]): List of expected Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected values.
+    """
+
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_input)
+
+    assert len(docs) == len(expected_documents), (
+        f"Test Case '{test_case}' Failed: Number of documents mismatch. "
+        f"Expected {len(expected_documents)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_documents), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}"
+            f"\nGot: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"Test Case '{test_case}' Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case A: Split on h1 and h2 with h3 in content
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <!DOCTYPE html>
+            <html>
+            <body>
+                <div>
+                    <h1>Foo</h1>
+                    <p>Some intro text about Foo.</p>
+                    <div>
+                        <h2>Bar main section</h2>
+                        <p>Some intro text about Bar.</p>
+                        <h3>Bar subsection 1</h3>
+                        <p>Some text about the first subtopic of Bar.</p>
+                        <h3>Bar subsection 2</h3>
+                        <p>Some text about the second subtopic of Bar.</p>
+                    </div>
+                    <div>
+                        <h2>Baz</h2>
+                        <p>Some text about Baz</p>
+                    </div>
+                    <br>
+                    <p>Some concluding text about Foo</p>
+                </div>
+            </body>
+            </html>
+            """,
+            [
+                Document(metadata={"Header 1": "Foo"}, page_content="Foo"),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content="Some intro text about Foo.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Bar main section",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Bar main section"},
+                    page_content="Some intro text about Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Bar subsection 1",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 1",
+                    },
+                    page_content="Some text about the first subtopic of Bar.",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Bar subsection 2",
+                ),
+                Document(
+                    metadata={
+                        "Header 1": "Foo",
+                        "Header 2": "Bar main section",
+                        "Header 3": "Bar subsection 2",
+                    },
+                    page_content="Some text about the second subtopic of Bar.",
+                ),
+                Document(
+                    metadata={"Header 1": "Foo", "Header 2": "Baz"}, page_content="Baz"
+                ),
+                Document(
+                    metadata={"Header 1": "Foo"},
+                    page_content=(
+                        "Some text about Baz  \nSome concluding text about Foo"
+                    ),
+                ),
+            ],
+            "Test Case A: Split on h1, h2, and h3 with nested headers",
+        ),
+        (
+            # Test Case B: Split on h1 only without any headers
+            [("h1", "Header 1")],
+            """
+            <html>
+                <body>
+                    <p>Paragraph one.</p>
+                    <p>Paragraph two.</p>
+                    <p>Paragraph three.</p>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    metadata={},
+                    page_content="Paragraph one.  \nParagraph two.  \nParagraph three.",
+                )
+            ],
+            "Test Case B: Split on h1 only without any headers",
+        ),
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_additional_html_header_text_splitter(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test the HTML header text splitter.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory function to create
+            the HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
+
+
+@pytest.mark.parametrize(
+    "headers_to_split_on, html_content, expected_output, test_case",
+    [
+        (
+            # Test Case C: Split on h1, h2, and h3 with no headers present
+            [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
+            """
+            <html>
+                <body>
+                    <p>Just some random text without headers.</p>
+                    <div>
+                        <span>More text here.</span>
+                    </div>
+                </body>
+            </html>
+            """,
+            [
+                Document(
+                    page_content="Just some random text without headers."
+                    "  \nMore text here.",
+                    metadata={},
+                )
+            ],
+            "Test Case C: Split on h1, h2, and h3 without any headers",
+        )
+    ],
+)
+@pytest.mark.requires("bs4")
+def test_html_no_headers_with_multiple_splitters(
+    html_header_splitter_splitter_factory: Any,
+    headers_to_split_on: List[Tuple[str, str]],
+    html_content: str,
+    expected_output: List[Document],
+    test_case: str,
+) -> None:
+    """
+    Test HTML content splitting without headers using multiple splitters.
+
+    Args:
+        html_header_splitter_splitter_factory (Any): Factory to create the
+            HTML header splitter.
+        headers_to_split_on (List[Tuple[str, str]]): List of headers to split on.
+        html_content (str): HTML content to be split.
+        expected_output (List[Document]): Expected list of Document objects
+            after splitting.
+        test_case (str): Description of the test case.
+
+    Raises:
+        AssertionError: If the number of documents or their content/metadata
+            does not match the expected output.
+    """
+    splitter = html_header_splitter_splitter_factory(
+        headers_to_split_on=headers_to_split_on
+    )
+    docs = splitter.split_text(html_content)
+
+    assert len(docs) == len(expected_output), (
+        f"{test_case} Failed: Number of documents mismatch. "
+        f"Expected {len(expected_output)}, got {len(docs)}."
+    )
+    for idx, (doc, expected) in enumerate(zip(docs, expected_output), start=1):
+        assert doc.page_content == expected.page_content, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Content mismatch.\nExpected: {expected.page_content}\n"
+            f"Got: {doc.page_content}"
+        )
+        assert doc.metadata == expected.metadata, (
+            f"{test_case} Failed at Document {idx}: "
+            f"Metadata mismatch.\nExpected: {expected.metadata}\nGot: {doc.metadata}"
+        )
 
 
 def test_split_text_on_tokens() -> None:
```
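As a quick sanity check outside pytest, the no-headers behavior exercised by Test Case 3 above can be reproduced in a few lines. This is a sketch, assuming the splitter is importable from `langchain_text_splitters` as in the library's public API:

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(headers_to_split_on=[("h1", "Header 1")])
docs = splitter.split_text(
    "<html><body><p>Paragraph one.</p><p>Paragraph two.</p>"
    "<div><p>Paragraph three.</p></div></body></html>"
)

# With no matching headers, the whole body collapses into a single chunk.
assert len(docs) == 1
assert docs[0].metadata == {}
print(docs[0].page_content)  # "Paragraph one.  \nParagraph two.  \nParagraph three."
```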
```diff
@@ -2480,7 +2907,8 @@ def test_html_splitter_with_custom_extractor() -> None:
 
     expected = [
         Document(
-            page_content="This is an iframe: [iframe:http://example.com](http://example.com)",
+            page_content="This is an iframe: "
+            "[iframe:http://example.com](http://example.com)",
             metadata={"Header 1": "Section 1"},
         ),
     ]
```