mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-02 03:26:17 +00:00
[docs]: doc loader changes (#25417)
This commit is contained in:
@@ -10,7 +10,74 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BSHTMLLoader(BaseLoader):
|
||||
"""Load `HTML` files and parse them with `beautiful soup`."""
|
||||
"""
|
||||
BSHTMLLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``langchain-community`` and ``bs4``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community bs4
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import BSHTMLLoader
|
||||
|
||||
loader = BSHTMLLoader(
|
||||
file_path="./example_data/fake-content.html",
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
Test Title
|
||||
|
||||
|
||||
My First Heading
|
||||
My first paragraph.
|
||||
|
||||
|
||||
|
||||
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
|
||||
Test Title
|
||||
|
||||
|
||||
My First Heading
|
||||
My first paragraph.
|
||||
|
||||
|
||||
|
||||
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
Reference in New Issue
Block a user