docs: Standardize DocumentLoader docstrings (#22932)

**Standardizing DocumentLoader docstrings (of which there are many)**

This PR addresses issue #22866 and adds docstrings according to the
issue's specified format (in the appendix) for files csv_loader.py and
json_loader.py in langchain_community.document_loaders. In particular,
the following sections have been added to both CSVLoader and JSONLoader:
Setup, Instantiate, Load, Async load, and Lazy load. It may be worth
adding a 'Metadata' section to the JSONLoader docstring to clarify how
we want to extract the JSON metadata (using the `metadata_func`
argument). The files I used to walk through the various sections were
`example_2.json` from
[HERE](https://support.oneskyapp.com/hc/en-us/articles/208047697-JSON-sample-files)
and `hw_200.csv` from
[HERE](https://people.sc.fsu.edu/~jburkardt/data/csv/csv.html).

---------

Co-authored-by: lucast2021 <lucast2021@headroyce.org>
Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
This commit is contained in:
Lucas Tucker 2024-06-17 22:26:36 -05:00 committed by GitHub
parent a56ff199a7
commit e25a5966b5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 133 additions and 7 deletions

View File

@ -16,8 +16,9 @@ from langchain_community.document_loaders.unstructured import (
class CSVLoader(BaseLoader):
"""Load a `CSV` file into a list of Documents.
Each document represents one row of the CSV file. Every row is converted into a
key/value pair and outputted to a new line in the document's page_content.
Each document represents one row of the CSV file. Every row is converted
into a key/value pair and outputted to a new line in the document's
page_content.
The source for each document loaded from csv is set to the value of the
`file_path` argument for all documents by default.
@ -32,6 +33,67 @@ class CSVLoader(BaseLoader):
column1: value1
column2: value2
column3: value3
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import CSVLoader
loader = CSVLoader(file_path='./hw_200.csv',
csv_args={
'delimiter': ',',
'quotechar': '"',
'fieldnames': ['Index', 'Height', 'Weight']
})
Load:
.. code-block:: python
docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Index: Index
Height: "Height(Inches)"
Weight: "Weight(Pounds)"
{'source': './hw_200.csv', 'row': 0}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Index: Index
Height: "Height(Inches)"
Weight: "Weight(Pounds)"
{'source': './hw_200.csv', 'row': 0}
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant (alazy_load is an async generator, so iterate it with `async for`):
# async for doc in loader.alazy_load(): docs.append(doc)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Index: Index
Height: "Height(Inches)"
Weight: "Weight(Pounds)"
{'source': './hw_200.csv', 'row': 0}
"""
def __init__(

View File

@ -8,12 +8,76 @@ from langchain_community.document_loaders.base import BaseLoader
class JSONLoader(BaseLoader):
"""Load a `JSON` file using a `jq` schema.
"""
Load a `JSON` file using a `jq` schema.
Example:
[{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
{"key": [{"text": ...}, {"text": ...}, {"text": ...}]} -> schema = .key[].text
["", "", ""] -> schema = .[]
Setup:
.. code-block:: bash
pip install -U jq
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
file_path='./sample_quiz.json'
data = json.loads(Path(file_path).read_text())
loader = JSONLoader(
file_path=file_path,
jq_schema='.quiz',
text_content=False)
Load:
.. code-block:: python
docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
{"sport": {"q1": {"question": "Which one is correct team name in
NBA?", "options": ["New York Bulls"
{'source': '/sample_quiz.json', 'seq_num': 1}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
{"sport": {"q1": {"question": "Which one is correct team name in
NBA?", "options": ["New York Bulls"
{'source': '/sample_quiz.json', 'seq_num': 1}
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant (alazy_load is an async generator, so iterate it with `async for`):
# async for doc in loader.alazy_load(): docs.append(doc)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
{"sport": {"q1": {"question": "Which one is correct team name in
NBA?", "options": ["New York Bulls"
{'source': '/sample_quiz.json', 'seq_num': 1}
"""
def __init__(