mirror of
https://github.com/hwchase17/langchain.git
synced 2026-02-03 15:55:44 +00:00
x
This commit is contained in:
@@ -6,7 +6,6 @@ from typing import Dict, List, Union
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||
from langchain.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
@@ -5,7 +6,6 @@ from langchain.document_loaders.parsers.pdf import (
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
)
|
||||
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||
|
||||
__all__ = [
|
||||
"BS4HTMLParser",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser
|
||||
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
|
||||
from langchain.document_loaders.parsers.html.markdownify import MarkdownifyHTMLParser
|
||||
|
||||
__all__ = ["MarkdownifyHTMLParser", "BS4HTMLParser"]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""Parser that uses bs4 to parse HTML files, enriching metadata with page title."""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Union, Iterator, Optional, Mapping, Any
|
||||
from typing import Any, Dict, Iterator, Mapping, Optional, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BS4HTMLParser(BaseBlobParser):
|
||||
"""Loader that uses beautiful soup to parse HTML files."""
|
||||
"""Parser that uses beautiful soup to parse HTML files."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -19,8 +19,7 @@ class BS4HTMLParser(BaseBlobParser):
|
||||
bs_kwargs: Optional[Mapping[str, Any]] = None,
|
||||
get_text_separator: str = "",
|
||||
) -> None:
|
||||
"""Initialise with path, and optionally, file encoding to use, and any kwargs
|
||||
to pass to the BeautifulSoup object."""
|
||||
"""Initialize a bs4 based HTML parser."""
|
||||
try:
|
||||
import bs4 # noqa:F401
|
||||
except ImportError:
|
||||
@@ -29,7 +28,7 @@ class BS4HTMLParser(BaseBlobParser):
|
||||
"`pip install beautifulsoup4`"
|
||||
)
|
||||
|
||||
if "features" in bs_kwargs:
|
||||
if bs_kwargs and "features" in bs_kwargs:
|
||||
raise ValueError("features cannot be set in bs_kwargs")
|
||||
|
||||
_bs_kwargs = bs_kwargs or {}
|
||||
@@ -54,4 +53,4 @@ class BS4HTMLParser(BaseBlobParser):
|
||||
"source": blob.source,
|
||||
"title": title,
|
||||
}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
"""Load and chunk HTMLs with potential pre-processing to clean the html."""
|
||||
|
||||
import re
|
||||
from typing import Iterator, Tuple
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Tuple, Iterator
|
||||
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
"""Tests for the HTML parsers."""
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||
|
||||
# PDFs to test parsers on.
|
||||
HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
|
||||
|
||||
LAYOUT_PARSER_PAPER_PDF = (
|
||||
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
|
||||
)
|
||||
@@ -0,0 +1,27 @@
|
||||
"""Tests for the HTML parsers."""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||
|
||||
HERE = Path(__file__).parent
|
||||
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_bs_html_loader() -> None:
|
||||
"""Test that BS4HTMLParser parses example.html into a single document with the expected title, source, and separator-joined text."""
|
||||
file_path = EXAMPLES / "example.html"
|
||||
blob = Blob.from_path(file_path)
|
||||
parser = BS4HTMLParser(get_text_separator="|")
|
||||
docs = list(parser.lazy_parse(blob))
|
||||
assert isinstance(docs, list)
|
||||
assert len(docs) == 1
|
||||
|
||||
metadata = docs[0].metadata
|
||||
content = docs[0].page_content
|
||||
|
||||
assert metadata["title"] == "Chew dad's slippers"
|
||||
assert metadata["source"] == str(file_path)
|
||||
assert content[:2] == "\n|"
|
||||
Reference in New Issue
Block a user