Add ability to pass kwargs to loader classes in DirectoryLoader, add ability to modify encoding and BeautifulSoup behaviour in BSHTMLLoader (#2275)

Solves #2247. Note that the only test I added checks for the
BeautifulSoup behaviour change. Happy to add a test for
`DirectoryLoader` as well if deemed necessary.
This commit is contained in:
Sam Cordner-Matthews
2023-04-01 20:48:27 +01:00
committed by GitHub
parent 2d0ff1a06d
commit 1ddd6dbf0b
4 changed files with 69 additions and 4 deletions

View File

@@ -1,5 +1,8 @@
import sys
from pathlib import Path
import pytest
from langchain.document_loaders.html_bs import BSHTMLLoader
@@ -15,3 +18,25 @@ def test_bs_html_loader() -> None:
assert metadata["title"] == "Chew dad's slippers"
assert metadata["source"] == str(file_path)
# Only run on Windows when Python's UTF-8 mode is off; everywhere else the
# test is skipped with the reason given below.
@pytest.mark.skipif(
    not (sys.platform.startswith("win") and not sys.flags.utf8_mode),
    reason="default encoding is utf8",
)
def test_bs_html_loader_non_utf8() -> None:
    """Check that BSHTMLLoader honours an explicit ``open_encoding``.

    Loading the UTF-8 example file without an encoding is expected to raise
    ``UnicodeDecodeError``; loading it with ``open_encoding="utf8"`` must
    yield exactly one document carrying the expected title and source.
    """
    html_path = Path(__file__).parent.parent / "examples/example-utf8.html"
    source = str(html_path)

    # No encoding supplied: decoding the file is expected to fail.
    with pytest.raises(UnicodeDecodeError):
        BSHTMLLoader(source).load()

    # Explicit encoding supplied: the file loads as a single document.
    documents = BSHTMLLoader(source, open_encoding="utf8").load()
    assert len(documents) == 1

    meta = documents[0].metadata
    assert meta["title"] == "Chew dad's slippers"
    assert meta["source"] == source

View File

@@ -0,0 +1,25 @@
<html>
<head>
<title>Chew dad's slippers</title>
</head>
<body>
<h1>
Instead of drinking water from the cat bowl, make sure to steal water from
the toilet
</h1>
<h2>Chase the red dot</h2>
<p>
Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner,
destroy all furniture, especially couch get scared by sudden appearance of
cucumber cat is love, cat is life fat baby cat best buddy little guy for
catch eat throw up catch eat throw up bad birds jump on fridge. Purr like
a car engine oh yes, there is my human woman she does best pats ever that
all i like about her hiss meow .
</p>
<p>
Dead stare with ears cocked when “owners” are asleep, cry for no apparent
reason meow all night. Plop down in the middle where everybody walks favor
packaging over toy. Sit on the laptop kitty pounce, trip, faceplant.
</p>
</body>
</html>