diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/open_city_data.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/open_city_data.ipynb new file mode 100644 index 00000000000..0cd457e4f7f --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/open_city_data.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b721926", + "metadata": {}, + "source": [ + "# Open City Data" + ] + }, + { + "cell_type": "markdown", + "id": "35c00849", + "metadata": {}, + "source": [ + "[Socrata](https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6) provides an API for city open data. \n", + "\n", + "For a dataset such as [SF crime](https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-Historical-2003/tmnf-yvry), go to the `API` tab on the top right. \n", + "\n", + "That provides you with the `dataset identifier`.\n", + "\n", + "Use the dataset identifier to grab specific tables for a given city_id (`data.sfgov.org`) - \n", + "\n", + "E.g., `vw6y-z8j6` for [SF 311 data](https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6).\n", + "\n", + "E.g., `tmnf-yvry` for [SF Police data](https://dev.socrata.com/foundry/data.sfgov.org/tmnf-yvry)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c93cc247", + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install sodapy" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b3464a02", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import OpenCityDataLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "478c5255", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = \"vw6y-z8j6\" # 311 data\n", + "dataset = \"tmnf-yvry\" # crime data\n", + "loader = OpenCityDataLoader(city_id=\"data.sfgov.org\",\n", + " dataset_id=dataset,\n", + " limit=2000)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fa914fc1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Requests made without an app_token will be subject to strict throttling limits.\n" + ] + } + ], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "73a6def2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pdid': '4133422003074',\n", + " 'incidntnum': '041334220',\n", + " 'incident_code': '03074',\n", + " 'category': 'ROBBERY',\n", + " 'descript': 'ROBBERY, BODILY FORCE',\n", + " 'dayofweek': 'Monday',\n", + " 'date': '2004-11-22T00:00:00.000',\n", + " 'time': '17:50',\n", + " 'pddistrict': 'INGLESIDE',\n", + " 'resolution': 'NONE',\n", + " 'address': 'GENEVA AV / SANTOS ST',\n", + " 'x': '-122.420084075249',\n", + " 'y': '37.7083109744362',\n", + " 'location': {'type': 'Point',\n", + " 'coordinates': [-122.420084075249, 37.7083109744362]},\n", + " ':@computed_region_26cr_cadq': '9',\n", + " ':@computed_region_rxqg_mtj9': '8',\n", + " ':@computed_region_bh8s_q3mv': '309'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval(docs[0].page_content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/pandas_dataframe.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/pandas_dataframe.ipynb index 7b072d2a587..76402681741 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/pandas_dataframe.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/pandas_dataframe.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "ac273ca1", "metadata": {}, "outputs": [ @@ -116,7 +116,7 @@ "4 Braves 83.31 94" ] }, - "execution_count": 6, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -127,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "66e47a13", "metadata": {}, "outputs": [], @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "2334caca", "metadata": {}, "outputs": [], @@ -147,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "d616c2b0", "metadata": {}, "outputs": [ @@ -186,7 +186,7 @@ " Document(page_content='Astros', metadata={' \"Payroll (millions)\"': 60.65, ' \"Wins\"': 55})]" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -197,11 +197,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "beb55c2f", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Nationals' metadata={' \"Payroll (millions)\"': 81.34, ' \"Wins\"': 98}\n", + "page_content='Reds' 
metadata={' \"Payroll (millions)\"': 82.2, ' \"Wins\"': 97}\n", + "page_content='Yankees' metadata={' \"Payroll (millions)\"': 197.96, ' \"Wins\"': 95}\n", + "page_content='Giants' metadata={' \"Payroll (millions)\"': 117.62, ' \"Wins\"': 94}\n", + "page_content='Braves' metadata={' \"Payroll (millions)\"': 83.31, ' \"Wins\"': 94}\n", + "page_content='Athletics' metadata={' \"Payroll (millions)\"': 55.37, ' \"Wins\"': 94}\n", + "page_content='Rangers' metadata={' \"Payroll (millions)\"': 120.51, ' \"Wins\"': 93}\n", + "page_content='Orioles' metadata={' \"Payroll (millions)\"': 81.43, ' \"Wins\"': 93}\n", + "page_content='Rays' metadata={' \"Payroll (millions)\"': 64.17, ' \"Wins\"': 90}\n", + "page_content='Angels' metadata={' \"Payroll (millions)\"': 154.49, ' \"Wins\"': 89}\n", + "page_content='Tigers' metadata={' \"Payroll (millions)\"': 132.3, ' \"Wins\"': 88}\n", + "page_content='Cardinals' metadata={' \"Payroll (millions)\"': 110.3, ' \"Wins\"': 88}\n", + "page_content='Dodgers' metadata={' \"Payroll (millions)\"': 95.14, ' \"Wins\"': 86}\n", + "page_content='White Sox' metadata={' \"Payroll (millions)\"': 96.92, ' \"Wins\"': 85}\n", + "page_content='Brewers' metadata={' \"Payroll (millions)\"': 97.65, ' \"Wins\"': 83}\n", + "page_content='Phillies' metadata={' \"Payroll (millions)\"': 174.54, ' \"Wins\"': 81}\n", + "page_content='Diamondbacks' metadata={' \"Payroll (millions)\"': 74.28, ' \"Wins\"': 81}\n", + "page_content='Pirates' metadata={' \"Payroll (millions)\"': 63.43, ' \"Wins\"': 79}\n", + "page_content='Padres' metadata={' \"Payroll (millions)\"': 55.24, ' \"Wins\"': 76}\n", + "page_content='Mariners' metadata={' \"Payroll (millions)\"': 81.97, ' \"Wins\"': 75}\n", + "page_content='Mets' metadata={' \"Payroll (millions)\"': 93.35, ' \"Wins\"': 74}\n", + "page_content='Blue Jays' metadata={' \"Payroll (millions)\"': 75.48, ' \"Wins\"': 73}\n", + "page_content='Royals' metadata={' \"Payroll (millions)\"': 60.91, ' \"Wins\"': 72}\n", + 
"page_content='Marlins' metadata={' \"Payroll (millions)\"': 118.07, ' \"Wins\"': 69}\n", + "page_content='Red Sox' metadata={' \"Payroll (millions)\"': 173.18, ' \"Wins\"': 69}\n", + "page_content='Indians' metadata={' \"Payroll (millions)\"': 78.43, ' \"Wins\"': 68}\n", + "page_content='Twins' metadata={' \"Payroll (millions)\"': 94.08, ' \"Wins\"': 66}\n", + "page_content='Rockies' metadata={' \"Payroll (millions)\"': 78.06, ' \"Wins\"': 64}\n", + "page_content='Cubs' metadata={' \"Payroll (millions)\"': 88.19, ' \"Wins\"': 61}\n", + "page_content='Astros' metadata={' \"Payroll (millions)\"': 60.65, ' \"Wins\"': 55}\n" + ] + } + ], + "source": [ + "# Use lazy load for larger table, which won't read the full table into memory \n", + "for i in loader.lazy_load():\n", + " print(i)" + ] } ], "metadata": { @@ -220,7 +261,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 680b8c28cd2..3d80f1c0fd1 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -75,6 +75,7 @@ from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.odt import UnstructuredODTLoader from langchain.document_loaders.onedrive import OneDriveLoader from langchain.document_loaders.onedrive_file import OneDriveFileLoader +from langchain.document_loaders.open_city_data import OpenCityDataLoader from langchain.document_loaders.pdf import ( MathpixPDFLoader, OnlinePDFLoader, @@ -209,6 +210,7 @@ __all__ = [ "OneDriveLoader", "OnlinePDFLoader", "OutlookMessageLoader", + "OpenCityDataLoader", "PDFMinerLoader", "PDFMinerPDFasHTMLLoader", "PDFPlumberLoader", diff --git a/langchain/document_loaders/airtable.py b/langchain/document_loaders/airtable.py index 3dfaf40c4fc..0e2f34a2f1b 100644 --- a/langchain/document_loaders/airtable.py +++ 
b/langchain/document_loaders/airtable.py @@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class AirtableLoader(BaseLoader): - """Loader that loads local airbyte json files.""" + """Loader for Airtable tables.""" def __init__(self, api_token: str, table_id: str, base_id: str): """Initialize with API token and the IDs for table and base""" @@ -14,7 +14,7 @@ class AirtableLoader(BaseLoader): self.base_id = base_id def lazy_load(self) -> Iterator[Document]: - """Load Table.""" + """Lazy load records from table.""" from pyairtable import Table diff --git a/langchain/document_loaders/dataframe.py b/langchain/document_loaders/dataframe.py index 4ae4c5878d2..e0218a3978c 100644 --- a/langchain/document_loaders/dataframe.py +++ b/langchain/document_loaders/dataframe.py @@ -1,5 +1,5 @@ """Load from Dataframe object""" -from typing import Any, List +from typing import Any, Iterator, List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -19,16 +19,15 @@ class DataFrameLoader(BaseLoader): self.data_frame = data_frame self.page_content_column = page_content_column - def load(self) -> List[Document]: - """Load from the dataframe.""" - result = [] - # For very large dataframes, this needs to yield instead of building a list - # but that would require chaging return type to a generator for BaseLoader - # and all its subclasses, which is a bigger refactor. Marking as future TODO. - # This change will allow us to extend this to Spark and Dask dataframes. 
class OpenCityDataLoader(BaseLoader):
    """Load records from a Socrata-hosted open city dataset as Documents.

    Each record returned by the query becomes one ``Document`` whose
    ``page_content`` is ``str(record)`` and whose ``metadata["source"]`` is
    ``"<city_id>_<dataset_id>"``.

    NOTE(review): because ``page_content`` is the ``str()`` form of the
    record, callers that want the structured record back should parse it
    with ``ast.literal_eval`` rather than ``eval``, since the content comes
    from an external service.
    """

    def __init__(self, city_id: str, dataset_id: str, limit: int):
        """Initialize with the portal domain, dataset identifier and limit.

        Example dataset page:
        https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6

        Args:
            city_id: Domain of the open data portal, e.g. ``data.sfgov.org``.
            dataset_id: Socrata dataset identifier, e.g. ``vw6y-z8j6``.
            limit: Forwarded as the ``limit`` argument of the dataset query.
        """
        self.city_id = city_id
        self.dataset_id = dataset_id
        self.limit = limit

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load records.

        Yields:
            One ``Document`` per record returned by the dataset query.
        """
        # Imported here so sodapy is only needed when the loader is used.
        from sodapy import Socrata

        # The second argument is the app token; None means the requests are
        # made without one.
        client = Socrata(self.city_id, None)
        try:
            results = client.get(self.dataset_id, limit=self.limit)
        finally:
            # The records are already fetched at this point, so release the
            # client's HTTP session instead of leaking it.
            client.close()

        # The source label is identical for every record; build it once.
        source = f"{self.city_id}_{self.dataset_id}"
        for record in results:
            yield Document(page_content=str(record), metadata={"source": source})

    def load(self) -> List[Document]:
        """Load all records into a list."""
        return list(self.lazy_load())