[docs]: doc loader changes (#25417)

This commit is contained in:
Isaac Francisco
2024-08-14 19:46:33 -07:00
committed by GitHub
parent bd261456f6
commit 966b408634
10 changed files with 1064 additions and 22 deletions

View File

@@ -13,19 +13,60 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain_community.document_loaders import UnstructuredMarkdownLoader
Setup:
Install ``langchain-community``.
loader = UnstructuredMarkdownLoader(
"example.md", mode="elements", strategy="fast",
)
docs = loader.load()
.. code-block:: bash
pip install -U langchain-community
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import UnstructuredMarkdownLoader
loader = UnstructuredMarkdownLoader(
"./example_data/example.md",
mode="elements",
strategy="fast",
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant (alazy_load() returns an async iterator; iterate it, don't await it):
# async for doc in loader.alazy_load():
#     docs.append(doc)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Sample Markdown Document
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Sample Markdown Document
{'source': './example_data/example.md', 'category_depth': 0, 'last_modified': '2024-08-14T15:04:18', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': './example_data', 'filename': 'example.md', 'category': 'Title', 'element_id': '3d0b313864598e704aa26c728ecb61e5'}
References
----------
https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
"""
""" # noqa: E501
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__