mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-25 04:30:13 +00:00 
			
		
		
		
	# Add bs4 html parser * Some minor refactors * Extract the bs4 html parsing code from the bs html loader * Move some tests from integration tests to unit tests
		
			
				
	
	
		
			29 lines
		
	
	
		
			844 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			29 lines
		
	
	
		
			844 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Tests for the HTML parsers."""
 | |
| from pathlib import Path
 | |
| 
 | |
| import pytest
 | |
| 
 | |
| from langchain.document_loaders.blob_loaders import Blob
 | |
| from langchain.document_loaders.parsers.html import BS4HTMLParser
 | |
| 
 | |
# Directory that contains this test module.
HERE = Path(__file__).parent
# Example fixtures are shared with the integration tests: walk three
# directories up from this module, then into integration_tests/examples.
EXAMPLES = HERE.parent.parent.parent.joinpath("integration_tests", "examples")
 | |
| 
 | |
| 
 | |
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse the example HTML file with ``BS4HTMLParser`` and check the result.

    The parser is built with ``get_text_separator="|"``. The test expects
    exactly one document whose metadata holds the page title and the source
    path, and whose content starts with a newline followed by ``|``.
    """
    html_path = EXAMPLES / "example.html"
    html_blob = Blob.from_path(html_path)
    html_parser = BS4HTMLParser(get_text_separator="|")

    # lazy_parse is consumed eagerly so the results can be counted and indexed.
    parsed = list(html_parser.lazy_parse(html_blob))
    assert isinstance(parsed, list)
    assert len(parsed) == 1

    document = parsed[0]
    doc_metadata = document.metadata
    doc_text = document.page_content

    assert doc_metadata["title"] == "Chew dad's slippers"
    assert doc_metadata["source"] == str(html_path)
    assert doc_text.startswith("\n|")
 |