This commit is contained in:
Eugene Yurtsev
2023-05-17 14:49:27 -04:00
parent 8fe8ee5f80
commit f7aaf26fb5
7 changed files with 36 additions and 24 deletions

View File

@@ -6,7 +6,6 @@ from typing import Dict, List, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)

View File

@@ -1,3 +1,4 @@
from langchain.document_loaders.parsers.html import BS4HTMLParser
from langchain.document_loaders.parsers.pdf import (
PDFMinerParser,
PDFPlumberParser,
@@ -5,7 +6,6 @@ from langchain.document_loaders.parsers.pdf import (
PyPDFium2Parser,
PyPDFParser,
)
from langchain.document_loaders.parsers.html import BS4HTMLParser
__all__ = [
"BS4HTMLParser",

View File

@@ -1,4 +1,4 @@
from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser
__all__ = ["MarkdownifyHTMLParser", "BS4HTMLParser"]

View File

@@ -1,7 +1,7 @@
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
import logging
from typing import Dict, Union, Iterator, Optional, Mapping, Any
from typing import Any, Dict, Iterator, Mapping, Optional, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
class BS4HTMLParser(BaseBlobParser):
"""Loader that uses beautiful soup to parse HTML files."""
"""Parser that uses beautiful soup to parse HTML files."""
def __init__(
self,
@@ -19,8 +19,7 @@ class BS4HTMLParser(BaseBlobParser):
bs_kwargs: Optional[Mapping[str, Any]] = None,
get_text_separator: str = "",
) -> None:
"""Initialise with path, and optionally, file encoding to use, and any kwargs
to pass to the BeautifulSoup object."""
"""Initialize a bs4 based HTML parser."""
try:
import bs4 # noqa:F401
except ImportError:
@@ -29,7 +28,7 @@ class BS4HTMLParser(BaseBlobParser):
"`pip install beautifulsoup4`"
)
if "features" in bs_kwargs:
if bs_kwargs and "features" in bs_kwargs:
raise ValueError("features cannot be set in bs_kwargs")
_bs_kwargs = bs_kwargs or {}
@@ -54,4 +53,4 @@ class BS4HTMLParser(BaseBlobParser):
"source": blob.source,
"title": title,
}
return [Document(page_content=text, metadata=metadata)]
yield Document(page_content=text, metadata=metadata)

View File

@@ -1,8 +1,9 @@
"""Load and chunk HTMLs with potential pre-processing to clean the html."""
import re
from typing import Iterator, Tuple
from bs4 import BeautifulSoup
from typing import Tuple, Iterator
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob

View File

@@ -1,14 +0,0 @@
"""Tests for the HTML parsers."""
from pathlib import Path
from typing import Iterator
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.html import BS4HTMLParser
# PDFs to test parsers on.
HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
LAYOUT_PARSER_PAPER_PDF = (
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
)

View File

@@ -0,0 +1,27 @@
"""Tests for the HTML parsers."""
import pytest
from pathlib import Path
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.html import BS4HTMLParser
# Directory that contains this test module.
HERE = Path(__file__).parent
# Example fixtures are read from "integration_tests/examples", located three
# directory levels above HERE.
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4")
def test_bs_html_loader() -> None:
    """Test that BS4HTMLParser extracts text and metadata from an HTML blob.

    Parses the ``example.html`` fixture using ``"|"`` as the text separator
    and checks that exactly one document is produced, that its metadata
    carries the page title and the source path of the blob, and that the
    content starts with the expected separator-joined prefix.
    """
    file_path = EXAMPLES / "example.html"
    blob = Blob.from_path(file_path)
    parser = BS4HTMLParser(get_text_separator="|")

    # Materialize the parsed documents so they can be counted and indexed.
    docs = list(parser.lazy_parse(blob))
    assert len(docs) == 1

    metadata = docs[0].metadata
    content = docs[0].page_content
    assert metadata["title"] == "Chew dad's slippers"
    # The source is compared against the string form of the fixture path.
    assert metadata["source"] == str(file_path)
    # With "|" as separator, the extracted text begins with a newline
    # followed by the separator.
    assert content[:2] == "\n|"