From e25a5966b5e0023f61577d47850b70d809526a2b Mon Sep 17 00:00:00 2001 From: Lucas Tucker <47258766+lucas-tucker@users.noreply.github.com> Date: Mon, 17 Jun 2024 22:26:36 -0500 Subject: [PATCH] docs: Standardize DocumentLoader docstrings (#22932) **Standardizing DocumentLoader docstrings (of which there are many)** This PR addresses issue #22866 and adds docstrings according to the issue's specified format (in the appendix) for files csv_loader.py and json_loader.py in langchain_community.document_loaders. In particular, the following sections have been added to both CSVLoader and JSONLoader: Setup, Instantiate, Load, Async load, and Lazy load. It may be worth adding a 'Metadata' section to the JSONLoader docstring to clarify how we want to extract the JSON metadata (using the `metadata_func` argument). The files I used to walk through the various sections were `example_2.json` from [HERE](https://support.oneskyapp.com/hc/en-us/articles/208047697-JSON-sample-files) and `hw_200.csv` from [HERE](https://people.sc.fsu.edu/~jburkardt/data/csv/csv.html). --------- Co-authored-by: lucast2021 Co-authored-by: isaac hershenson --- .../document_loaders/csv_loader.py | 66 ++++++++++++++++- .../document_loaders/json_loader.py | 74 +++++++++++++++++-- 2 files changed, 133 insertions(+), 7 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/csv_loader.py b/libs/community/langchain_community/document_loaders/csv_loader.py index 37e6f565531..efc8507aa6f 100644 --- a/libs/community/langchain_community/document_loaders/csv_loader.py +++ b/libs/community/langchain_community/document_loaders/csv_loader.py @@ -16,8 +16,9 @@ from langchain_community.document_loaders.unstructured import ( class CSVLoader(BaseLoader): """Load a `CSV` file into a list of Documents. - Each document represents one row of the CSV file. Every row is converted into a - key/value pair and outputted to a new line in the document's page_content. 
+ Each document represents one row of the CSV file. Every row is converted + into a key/value pair and outputted to a new line in the document's + page_content. The source for each document loaded from csv is set to the value of the `file_path` argument for all documents by default. @@ -32,6 +33,67 @@ class CSVLoader(BaseLoader): column1: value1 column2: value2 column3: value3 + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import CSVLoader + + loader = CSVLoader(file_path='./hw_200.csv', + csv_args={ + 'delimiter': ',', + 'quotechar': '"', + 'fieldnames': ['Index', 'Height', 'Weight'] + }) + + Load: + .. code-block:: python + + docs = loader.load() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Index: Index + Height: "Height(Inches)" + Weight: "Weight(Pounds)" + {'source': './hw_200.csv', 'row': 0} + + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Index: Index + Height: "Height(Inches)" + Weight: "Weight(Pounds)" + {'source': './hw_200.csv', 'row': 0} + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. 
code-block:: python + + Index: Index + Height: "Height(Inches)" + Weight: "Weight(Pounds)" + {'source': './hw_200.csv', 'row': 0} """ def __init__( diff --git a/libs/community/langchain_community/document_loaders/json_loader.py b/libs/community/langchain_community/document_loaders/json_loader.py index 30eb1962553..839fe482f62 100644 --- a/libs/community/langchain_community/document_loaders/json_loader.py +++ b/libs/community/langchain_community/document_loaders/json_loader.py @@ -8,12 +8,76 @@ from langchain_community.document_loaders.base import BaseLoader class JSONLoader(BaseLoader): - """Load a `JSON` file using a `jq` schema. + """ + Load a `JSON` file using a `jq` schema. - Example: - [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text - {"key": [{"text": ...}, {"text": ...}, {"text": ...}]} -> schema = .key[].text - ["", "", ""] -> schema = .[] + Setup: + .. code-block:: bash + + pip install -U jq + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import JSONLoader + import json + from pathlib import Path + + file_path='./sample_quiz.json' + data = json.loads(Path(file_path).read_text()) + loader = JSONLoader( + file_path=file_path, + jq_schema='.quiz', + text_content=False) + + Load: + .. code-block:: python + + docs = loader.load() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + {"sport": {"q1": {"question": "Which one is correct team name in + NBA?", "options": ["New York Bulls" + {'source': '/sample_quiz + .json', 'seq_num': 1} + + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + {"sport": {"q1": {"question": "Which one is correct team name in + NBA?", "options": ["New York Bulls" + {'source': '/sample_quiz + .json', 'seq_num': 1} + + Lazy load: + .. 
code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + {"sport": {"q1": {"question": "Which one is correct team name in + NBA?", "options": ["New York Bulls" + {'source': '/sample_quiz + .json', 'seq_num': 1} """ def __init__(