mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-24 20:20:50 +00:00 
			
		
		
		
	fix import error of bs4 (#1952)
Ran into a broken build when bs4 wasn't installed in the project. This is a minor tweak to follow the other document loaders' optional package-loading conventions. Also updated the HTML docs to include a reference to this new HTML loader. Side note: should there be two different HTML-to-text document loaders? This new one only handles local files, while the existing unstructured HTML loader handles HTML from both local and remote sources. So it seems the improvement was adding the title to the metadata, which is useful but could also be added to `html.py`.
This commit is contained in:
		| @@ -1,5 +1,8 @@ | |||||||
| <!DOCTYPE html> | <!DOCTYPE html> | ||||||
| <html> | <html> | ||||||
|  | <head> | ||||||
|  |     <title>Test Title</title> | ||||||
|  | </head> | ||||||
| <body> | <body> | ||||||
|  |  | ||||||
| <h1>My First Heading</h1> | <h1>My First Heading</h1> | ||||||
|   | |||||||
| @@ -48,9 +48,7 @@ | |||||||
|    "outputs": [ |    "outputs": [ | ||||||
|     { |     { | ||||||
|      "data": { |      "data": { | ||||||
|       "text/plain": [ |       "text/plain": "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]" | ||||||
|        "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]" |  | ||||||
|       ] |  | ||||||
|      }, |      }, | ||||||
|      "execution_count": 4, |      "execution_count": 4, | ||||||
|      "metadata": {}, |      "metadata": {}, | ||||||
| @@ -61,13 +59,57 @@ | |||||||
|     "data" |     "data" | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "source": [ | ||||||
|  |     "## Loading HTML with BeautifulSoup4\n", | ||||||
|  |     "\n", | ||||||
|  |     "We can also use BeautifulSoup4 to load HTML documents using the `BSHTMLLoader`.  This will extract the text from the html into `page_content`, and the page title as `title` into `metadata`." | ||||||
|  |    ], | ||||||
|  |    "metadata": { | ||||||
|  |     "collapsed": false | ||||||
|  |    } | ||||||
|  |   }, | ||||||
|   { |   { | ||||||
|    "cell_type": "code", |    "cell_type": "code", | ||||||
|    "execution_count": null, |    "execution_count": 16, | ||||||
|    "id": "79b1bce4", |    "id": "79b1bce4", | ||||||
|    "metadata": {}, |    "metadata": {}, | ||||||
|    "outputs": [], |    "outputs": [], | ||||||
|    "source": [] |    "source": [ | ||||||
|  |     "from langchain.document_loaders import BSHTMLLoader" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 17, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": "[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', lookup_str='', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'}, lookup_index=0)]" | ||||||
|  |      }, | ||||||
|  |      "execution_count": 17, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "loader = BSHTMLLoader(\"example_data/fake-content.html\")\n", | ||||||
|  |     "data = loader.load()\n", | ||||||
|  |     "data" | ||||||
|  |    ], | ||||||
|  |    "metadata": { | ||||||
|  |     "collapsed": false | ||||||
|  |    } | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": null, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [], | ||||||
|  |    "metadata": { | ||||||
|  |     "collapsed": false | ||||||
|  |    } | ||||||
|   } |   } | ||||||
|  ], |  ], | ||||||
|  "metadata": { |  "metadata": { | ||||||
|   | |||||||
| @@ -3,8 +3,6 @@ | |||||||
| import logging | import logging | ||||||
| from typing import Dict, List, Union | from typing import Dict, List, Union | ||||||
|  |  | ||||||
| from bs4 import BeautifulSoup |  | ||||||
|  |  | ||||||
| from langchain.docstore.document import Document | from langchain.docstore.document import Document | ||||||
| from langchain.document_loaders.base import BaseLoader | from langchain.document_loaders.base import BaseLoader | ||||||
|  |  | ||||||
| @@ -15,9 +13,18 @@ class BSHTMLLoader(BaseLoader): | |||||||
|     """Loader that uses beautiful soup to parse HTML files.""" |     """Loader that uses beautiful soup to parse HTML files.""" | ||||||
|  |  | ||||||
|     def __init__(self, file_path: str) -> None: |     def __init__(self, file_path: str) -> None: | ||||||
|  |         try: | ||||||
|  |             import bs4  # noqa:F401 | ||||||
|  |         except ImportError: | ||||||
|  |             raise ValueError( | ||||||
|  |                 "bs4 package not found, please install it with " "`pip install bs4`" | ||||||
|  |             ) | ||||||
|  |  | ||||||
|         self.file_path = file_path |         self.file_path = file_path | ||||||
|  |  | ||||||
|     def load(self) -> List[Document]: |     def load(self) -> List[Document]: | ||||||
|  |         from bs4 import BeautifulSoup | ||||||
|  |  | ||||||
|         """Load HTML document into document objects.""" |         """Load HTML document into document objects.""" | ||||||
|         with open(self.file_path, "r") as f: |         with open(self.file_path, "r") as f: | ||||||
|             soup = BeautifulSoup(f, features="lxml") |             soup = BeautifulSoup(f, features="lxml") | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user