docs: Standardize DocumentLoader docstrings (#22932)

**Standardizing DocumentLoader docstrings (of which there are many)** This PR addresses issue #22866 and adds docstrings according to the issue's specified format (in the appendix) for files csv_loader.py and json_loader.py in langchain_community.document_loaders. In particular, the following sections have been added to both CSVLoader and JSONLoader: Setup, Instantiate, Load, Async load, and Lazy load. It may be worth adding a 'Metadata' section to the JSONLoader docstring to clarify how we want to extract the JSON metadata (using the `metadata_func` argument). The files I used to walk through the various sections were `example_2.json` from [HERE](https://support.oneskyapp.com/hc/en-us/articles/208047697-JSON-sample-files) and `hw_200.csv` from [HERE](https://people.sc.fsu.edu/~jburkardt/data/csv/csv.html). --------- Co-authored-by: lucast2021 <lucast2021@headroyce.org> Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
2025-06-26 08:33:49 +00:00 · 2024-06-17 22:26:36 -05:00 · 2024-06-17 22:26:36 -05:00 · e25a5966b5
commit e25a5966b5
parent a56ff199a7
2 changed files with 133 additions and 7 deletions
--- a/libs/community/langchain_community/document_loaders/csv_loader.py
+++ b/libs/community/langchain_community/document_loaders/csv_loader.py
@@ -16,8 +16,9 @@ from langchain_community.document_loaders.unstructured import (
 class CSVLoader(BaseLoader):
    """Load a `CSV` file into a list of Documents.

-    Each document represents one row of the CSV file. Every row is converted into a
-    key/value pair and outputted to a new line in the document's page_content.
+    Each document represents one row of the CSV file. Every row is converted
+    into a key/value pair and outputted to a new line in the document's
+    page_content.

    The source for each document loaded from csv is set to the value of the
    `file_path` argument for all documents by default.
@@ -32,6 +33,67 @@ class CSVLoader(BaseLoader):
            column1: value1
            column2: value2
            column3: value3
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import CSVLoader
+
+            loader = CSVLoader(file_path='./hw_200.csv',
+                csv_args={
+                'delimiter': ',',
+                'quotechar': '"',
+                'fieldnames': ['Index', 'Height', 'Weight']
+            })
+
+    Load:
+        .. code-block:: python
+
+            docs = loader.load()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            Index: Index
+            Height: "Height(Inches)"
+            Weight: "Weight(Pounds)"
+            {'source': './hw_200.csv', 'row': 0}
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            Index: Index
+            Height: "Height(Inches)"
+            Weight: "Weight(Pounds)"
+            {'source': './hw_200.csv', 'row': 0}
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (consume with `async for doc in docs_lazy`):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            Index: Index
+            Height: "Height(Inches)"
+            Weight: "Weight(Pounds)"
+            {'source': './hw_200.csv', 'row': 0}
    """

    def __init__(
--- a/libs/community/langchain_community/document_loaders/json_loader.py
+++ b/libs/community/langchain_community/document_loaders/json_loader.py
@@ -8,12 +8,76 @@ from langchain_community.document_loaders.base import BaseLoader


 class JSONLoader(BaseLoader):
-    """Load a `JSON` file using a `jq` schema.
+    """
+    Load a `JSON` file using a `jq` schema.

-    Example:
-        [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
-        {"key": [{"text": ...}, {"text": ...}, {"text": ...}]} -> schema = .key[].text
-        ["", "", ""] -> schema = .[]
+    Setup:
+        .. code-block:: bash
+
+            pip install -U jq
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import JSONLoader
+            import json
+            from pathlib import Path
+
+            file_path = './sample_quiz.json'
+            data = json.loads(Path(file_path).read_text())
+            loader = JSONLoader(
+                     file_path=file_path,
+                     jq_schema='.quiz',
+                     text_content=False)
+
+    Load:
+        .. code-block:: python
+
+            docs = loader.load()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            {"sport": {"q1": {"question": "Which one is correct team name in
+            NBA?", "options": ["New York Bulls"
+            {'source': '/sample_quiz
+            .json', 'seq_num': 1}
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            {"sport": {"q1": {"question": "Which one is correct team name in
+            NBA?", "options": ["New York Bulls"
+            {'source': '/sample_quiz
+            .json', 'seq_num': 1}
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (consume with `async for doc in docs_lazy`):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            {"sport": {"q1": {"question": "Which one is correct team name in
+            NBA?", "options": ["New York Bulls"
+            {'source': '/sample_quiz
+            .json', 'seq_num': 1}
    """

    def __init__(