mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 17:08:47 +00:00
Loader for OpenCityData and minor cleanups to Pandas, Airtable loaders (#6301)
Many cities have open data portals for events like crime, traffic, etc. Socrata provides an API for many, including SF (e.g., see [here](https://dev.socrata.com/foundry/data.sfgov.org/tmnf-yvry)). This is a new data loader for city data that uses Socrata API.
This commit is contained in:
parent
9d42621fa4
commit
6e69bfbb28
@ -0,0 +1,141 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "9b721926",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Open City Data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "35c00849",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"[Socrata](https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6) provides an API for city open data. \n",
|
||||||
|
"\n",
|
||||||
|
"For a dataset such as [SF crime](https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-Historical-2003/tmnf-yvry), go to the `API` tab on top right. \n",
|
||||||
|
"\n",
|
||||||
|
"That provides you with the `dataset identifier`.\n",
|
||||||
|
"\n",
|
||||||
|
"Use the dataset identifier to grab specific tables for a given city_id (`data.sfgov.org`) - \n",
|
||||||
|
"\n",
|
||||||
|
"E.g., `vw6y-z8j6` for [SF 311 data](https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6).\n",
|
||||||
|
"\n",
|
||||||
|
"E.g., `tmnf-yvry` for [SF Police data](https://dev.socrata.com/foundry/data.sfgov.org/tmnf-yvry)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c93cc247",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"! pip install sodapy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "b3464a02",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import OpenCityDataLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "478c5255",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dataset = \"vw6y-z8j6\" # 311 data\n",
|
||||||
|
"dataset = \"tmnf-yvry\" # crime data\n",
|
||||||
|
"loader = OpenCityDataLoader(city_id=\"data.sfgov.org\",\n",
|
||||||
|
" dataset_id=dataset,\n",
|
||||||
|
" limit=2000)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "fa914fc1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"WARNING:root:Requests made without an app_token will be subject to strict throttling limits.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "73a6def2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'pdid': '4133422003074',\n",
|
||||||
|
" 'incidntnum': '041334220',\n",
|
||||||
|
" 'incident_code': '03074',\n",
|
||||||
|
" 'category': 'ROBBERY',\n",
|
||||||
|
" 'descript': 'ROBBERY, BODILY FORCE',\n",
|
||||||
|
" 'dayofweek': 'Monday',\n",
|
||||||
|
" 'date': '2004-11-22T00:00:00.000',\n",
|
||||||
|
" 'time': '17:50',\n",
|
||||||
|
" 'pddistrict': 'INGLESIDE',\n",
|
||||||
|
" 'resolution': 'NONE',\n",
|
||||||
|
" 'address': 'GENEVA AV / SANTOS ST',\n",
|
||||||
|
" 'x': '-122.420084075249',\n",
|
||||||
|
" 'y': '37.7083109744362',\n",
|
||||||
|
" 'location': {'type': 'Point',\n",
|
||||||
|
" 'coordinates': [-122.420084075249, 37.7083109744362]},\n",
|
||||||
|
" ':@computed_region_26cr_cadq': '9',\n",
|
||||||
|
" ':@computed_region_rxqg_mtj9': '8',\n",
|
||||||
|
" ':@computed_region_bh8s_q3mv': '309'}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"eval(docs[0].page_content)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.16"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -42,7 +42,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 3,
|
||||||
"id": "ac273ca1",
|
"id": "ac273ca1",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -116,7 +116,7 @@
|
|||||||
"4 Braves 83.31 94"
|
"4 Braves 83.31 94"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 6,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -127,7 +127,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 4,
|
||||||
"id": "66e47a13",
|
"id": "66e47a13",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -137,7 +137,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 5,
|
||||||
"id": "2334caca",
|
"id": "2334caca",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -147,7 +147,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 6,
|
||||||
"id": "d616c2b0",
|
"id": "d616c2b0",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -186,7 +186,7 @@
|
|||||||
" Document(page_content='Astros', metadata={' \"Payroll (millions)\"': 60.65, ' \"Wins\"': 55})]"
|
" Document(page_content='Astros', metadata={' \"Payroll (millions)\"': 60.65, ' \"Wins\"': 55})]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 8,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -197,11 +197,52 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 7,
|
||||||
"id": "beb55c2f",
|
"id": "beb55c2f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
"source": []
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"page_content='Nationals' metadata={' \"Payroll (millions)\"': 81.34, ' \"Wins\"': 98}\n",
|
||||||
|
"page_content='Reds' metadata={' \"Payroll (millions)\"': 82.2, ' \"Wins\"': 97}\n",
|
||||||
|
"page_content='Yankees' metadata={' \"Payroll (millions)\"': 197.96, ' \"Wins\"': 95}\n",
|
||||||
|
"page_content='Giants' metadata={' \"Payroll (millions)\"': 117.62, ' \"Wins\"': 94}\n",
|
||||||
|
"page_content='Braves' metadata={' \"Payroll (millions)\"': 83.31, ' \"Wins\"': 94}\n",
|
||||||
|
"page_content='Athletics' metadata={' \"Payroll (millions)\"': 55.37, ' \"Wins\"': 94}\n",
|
||||||
|
"page_content='Rangers' metadata={' \"Payroll (millions)\"': 120.51, ' \"Wins\"': 93}\n",
|
||||||
|
"page_content='Orioles' metadata={' \"Payroll (millions)\"': 81.43, ' \"Wins\"': 93}\n",
|
||||||
|
"page_content='Rays' metadata={' \"Payroll (millions)\"': 64.17, ' \"Wins\"': 90}\n",
|
||||||
|
"page_content='Angels' metadata={' \"Payroll (millions)\"': 154.49, ' \"Wins\"': 89}\n",
|
||||||
|
"page_content='Tigers' metadata={' \"Payroll (millions)\"': 132.3, ' \"Wins\"': 88}\n",
|
||||||
|
"page_content='Cardinals' metadata={' \"Payroll (millions)\"': 110.3, ' \"Wins\"': 88}\n",
|
||||||
|
"page_content='Dodgers' metadata={' \"Payroll (millions)\"': 95.14, ' \"Wins\"': 86}\n",
|
||||||
|
"page_content='White Sox' metadata={' \"Payroll (millions)\"': 96.92, ' \"Wins\"': 85}\n",
|
||||||
|
"page_content='Brewers' metadata={' \"Payroll (millions)\"': 97.65, ' \"Wins\"': 83}\n",
|
||||||
|
"page_content='Phillies' metadata={' \"Payroll (millions)\"': 174.54, ' \"Wins\"': 81}\n",
|
||||||
|
"page_content='Diamondbacks' metadata={' \"Payroll (millions)\"': 74.28, ' \"Wins\"': 81}\n",
|
||||||
|
"page_content='Pirates' metadata={' \"Payroll (millions)\"': 63.43, ' \"Wins\"': 79}\n",
|
||||||
|
"page_content='Padres' metadata={' \"Payroll (millions)\"': 55.24, ' \"Wins\"': 76}\n",
|
||||||
|
"page_content='Mariners' metadata={' \"Payroll (millions)\"': 81.97, ' \"Wins\"': 75}\n",
|
||||||
|
"page_content='Mets' metadata={' \"Payroll (millions)\"': 93.35, ' \"Wins\"': 74}\n",
|
||||||
|
"page_content='Blue Jays' metadata={' \"Payroll (millions)\"': 75.48, ' \"Wins\"': 73}\n",
|
||||||
|
"page_content='Royals' metadata={' \"Payroll (millions)\"': 60.91, ' \"Wins\"': 72}\n",
|
||||||
|
"page_content='Marlins' metadata={' \"Payroll (millions)\"': 118.07, ' \"Wins\"': 69}\n",
|
||||||
|
"page_content='Red Sox' metadata={' \"Payroll (millions)\"': 173.18, ' \"Wins\"': 69}\n",
|
||||||
|
"page_content='Indians' metadata={' \"Payroll (millions)\"': 78.43, ' \"Wins\"': 68}\n",
|
||||||
|
"page_content='Twins' metadata={' \"Payroll (millions)\"': 94.08, ' \"Wins\"': 66}\n",
|
||||||
|
"page_content='Rockies' metadata={' \"Payroll (millions)\"': 78.06, ' \"Wins\"': 64}\n",
|
||||||
|
"page_content='Cubs' metadata={' \"Payroll (millions)\"': 88.19, ' \"Wins\"': 61}\n",
|
||||||
|
"page_content='Astros' metadata={' \"Payroll (millions)\"': 60.65, ' \"Wins\"': 55}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Use lazy load for larger table, which won't read the full table into memory \n",
|
||||||
|
"for i in loader.lazy_load():\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -220,7 +261,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.6"
|
"version": "3.9.16"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -75,6 +75,7 @@ from langchain.document_loaders.obsidian import ObsidianLoader
|
|||||||
from langchain.document_loaders.odt import UnstructuredODTLoader
|
from langchain.document_loaders.odt import UnstructuredODTLoader
|
||||||
from langchain.document_loaders.onedrive import OneDriveLoader
|
from langchain.document_loaders.onedrive import OneDriveLoader
|
||||||
from langchain.document_loaders.onedrive_file import OneDriveFileLoader
|
from langchain.document_loaders.onedrive_file import OneDriveFileLoader
|
||||||
|
from langchain.document_loaders.open_city_data import OpenCityDataLoader
|
||||||
from langchain.document_loaders.pdf import (
|
from langchain.document_loaders.pdf import (
|
||||||
MathpixPDFLoader,
|
MathpixPDFLoader,
|
||||||
OnlinePDFLoader,
|
OnlinePDFLoader,
|
||||||
@ -209,6 +210,7 @@ __all__ = [
|
|||||||
"OneDriveLoader",
|
"OneDriveLoader",
|
||||||
"OnlinePDFLoader",
|
"OnlinePDFLoader",
|
||||||
"OutlookMessageLoader",
|
"OutlookMessageLoader",
|
||||||
|
"OpenCityDataLoader",
|
||||||
"PDFMinerLoader",
|
"PDFMinerLoader",
|
||||||
"PDFMinerPDFasHTMLLoader",
|
"PDFMinerPDFasHTMLLoader",
|
||||||
"PDFPlumberLoader",
|
"PDFPlumberLoader",
|
||||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class AirtableLoader(BaseLoader):
|
class AirtableLoader(BaseLoader):
|
||||||
"""Loader that loads local airbyte json files."""
|
"""Loader for Airtable tables."""
|
||||||
|
|
||||||
def __init__(self, api_token: str, table_id: str, base_id: str):
|
def __init__(self, api_token: str, table_id: str, base_id: str):
|
||||||
"""Initialize with API token and the IDs for table and base"""
|
"""Initialize with API token and the IDs for table and base"""
|
||||||
@ -14,7 +14,7 @@ class AirtableLoader(BaseLoader):
|
|||||||
self.base_id = base_id
|
self.base_id = base_id
|
||||||
|
|
||||||
def lazy_load(self) -> Iterator[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load Table."""
|
"""Lazy load records from table."""
|
||||||
|
|
||||||
from pyairtable import Table
|
from pyairtable import Table
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
"""Load from Dataframe object"""
|
"""Load from Dataframe object"""
|
||||||
from typing import Any, List
|
from typing import Any, Iterator, List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
@ -19,16 +19,15 @@ class DataFrameLoader(BaseLoader):
|
|||||||
self.data_frame = data_frame
|
self.data_frame = data_frame
|
||||||
self.page_content_column = page_content_column
|
self.page_content_column = page_content_column
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load from the dataframe."""
|
"""Lazy load records from dataframe."""
|
||||||
result = []
|
|
||||||
# For very large dataframes, this needs to yield instead of building a list
|
|
||||||
# but that would require chaging return type to a generator for BaseLoader
|
|
||||||
# and all its subclasses, which is a bigger refactor. Marking as future TODO.
|
|
||||||
# This change will allow us to extend this to Spark and Dask dataframes.
|
|
||||||
for _, row in self.data_frame.iterrows():
|
for _, row in self.data_frame.iterrows():
|
||||||
text = row[self.page_content_column]
|
text = row[self.page_content_column]
|
||||||
metadata = row.to_dict()
|
metadata = row.to_dict()
|
||||||
metadata.pop(self.page_content_column)
|
metadata.pop(self.page_content_column)
|
||||||
result.append(Document(page_content=text, metadata=metadata))
|
yield Document(page_content=text, metadata=metadata)
|
||||||
return result
|
|
||||||
|
def load(self) -> List[Document]:
|
||||||
|
"""Load full dataframe."""
|
||||||
|
return list(self.lazy_load())
|
||||||
|
37
langchain/document_loaders/open_city_data.py
Normal file
37
langchain/document_loaders/open_city_data.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
from typing import Iterator, List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class OpenCityDataLoader(BaseLoader):
    """Loader that loads open city data through the Socrata API."""

    def __init__(self, city_id: str, dataset_id: str, limit: int):
        """Initialize with the city domain, dataset identifier and record limit.

        Example dataset page:
        https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6

        Args:
            city_id: Domain handed to the Socrata client,
                e.g. ``data.sfgov.org``.
            dataset_id: Identifier of the dataset to query, e.g. ``vw6y-z8j6``.
            limit: Forwarded as the ``limit`` argument of the Socrata request.
        """
        self.city_id = city_id
        self.dataset_id = dataset_id
        self.limit = limit

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load records.

        Issues one ``get`` request for ``dataset_id`` against ``city_id`` and
        yields one ``Document`` per returned record. ``page_content`` is the
        ``str()`` of the record and ``metadata["source"]`` is
        ``"<city_id>_<dataset_id>"``.
        """

        # Imported lazily so that sodapy is only required when the loader runs.
        from sodapy import Socrata

        # No app token is passed (second argument is None).
        client = Socrata(self.city_id, None)
        try:
            results = client.get(self.dataset_id, limit=self.limit)
        finally:
            # The response has been fetched in full at this point, so the
            # underlying HTTP session can be released before yielding.
            client.close()

        source = self.city_id + "_" + self.dataset_id
        for record in results:
            yield Document(
                page_content=str(record),
                metadata={
                    "source": source,
                },
            )

    def load(self) -> List[Document]:
        """Load records into a list by draining ``lazy_load``."""

        return list(self.lazy_load())
|
Loading…
Reference in New Issue
Block a user