[docs]: doc loader changes (#25417)

This commit is contained in:
Isaac Francisco
2024-08-14 19:46:33 -07:00
committed by GitHub
parent bd261456f6
commit 966b408634
10 changed files with 1064 additions and 22 deletions

View File

@@ -10,7 +10,74 @@ logger = logging.getLogger(__name__)
class BSHTMLLoader(BaseLoader):
"""Load `HTML` files and parse them with `beautiful soup`."""
"""
BSHTMLLoader document loader integration
Setup:
Install ``langchain-community`` and ``bs4``.
.. code-block:: bash
pip install -U langchain-community bs4
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import BSHTMLLoader
loader = BSHTMLLoader(
file_path="./example_data/fake-content.html",
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Test Title
My First Heading
My first paragraph.
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Test Title
My First Heading
My first paragraph.
{'source': './example_data/fake-content.html', 'title': 'Test Title'}
""" # noqa: E501
def __init__(
self,