docs, cli[patch]: document loaders doc template (#22862)

From: https://github.com/langchain-ai/langchain/pull/22290 --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
2025-09-13 21:47:12 +00:00 · 2024-06-13 19:28:57 -07:00
parent d1cdde267a
commit 75e966a2fa
4 changed files with 295 additions and 31 deletions
--- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py
+++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -110,14 +110,17 @@ class RecursiveUrlLoader(BaseLoader):
                # ...
            )

-    Load:
-        Use ``.load()`` to synchronously load into memory all Documents, with one
-        Document per visited URL. Starting from the initial URL, we recurse through
-        all linked URLs up to the specified max_depth.
-
+    Lazy load:
        .. code-block:: python

-            docs = loader.load()
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (alazy_load() is an async iterator; don't await it):
+            # docs = [doc async for doc in loader.alazy_load()]
+
+            for doc in docs_lazy:
+                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

@@ -146,29 +149,6 @@ class RecursiveUrlLoader(BaseLoader):
                <meta charset="utf-8" /><
            {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}

-    Lazy load:
-        .. code-block:: python
-
-            docs = []
-            docs_lazy = loader.lazy_load()
-
-            # async variant:
-            # docs_lazy = await loader.alazy_load()
-
-            for doc in docs_lazy:
-                docs.append(doc)
-            print(docs[0].page_content[:100])
-            print(docs[0].metadata)
-
-        .. code-block:: python
-
-            <!DOCTYPE html>
-
-            <html xmlns="http://www.w3.org/1999/xhtml">
-            <head>
-                <meta charset="utf-8" /><
-            {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}
-
    Content parsing / extraction:
        By default the loader sets the raw HTML from each link as the Document page
        content. To parse this HTML into a more human/LLM-friendly format you can pass