Create ArcGISLoader & example notebook (#8873)

- Description: Adds the ArcGISLoader class to
`langchain.document_loaders`
  - Allows users to load data from ArcGIS Online, Portal, and similar
- Users can authenticate with `arcgis.gis.GIS` or retrieve public data
anonymously
  - Uses the `arcgis.features.FeatureLayer` class to retrieve the data
  - Defines the most relevant keyword arguments and accepts `**kwargs`
- Dependencies: Using this class requires `arcgis` and, optionally,
`bs4.BeautifulSoup`.

Tagging maintainers:
  - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Joshua Sundance Bailey
2023-08-11 17:33:40 -04:00
committed by GitHub
parent e21152358a
commit eaa505fb09
4 changed files with 503 additions and 0 deletions

View File

@@ -29,6 +29,7 @@ from langchain.document_loaders.airbyte import (
from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
from langchain.document_loaders.airtable import AirtableLoader
from langchain.document_loaders.apify_dataset import ApifyDatasetLoader
from langchain.document_loaders.arcgis_loader import ArcGISLoader
from langchain.document_loaders.arxiv import ArxivLoader
from langchain.document_loaders.async_html import AsyncHtmlLoader
from langchain.document_loaders.azlyrics import AZLyricsLoader
@@ -214,6 +215,7 @@ __all__ = [
"AirtableLoader",
"AmazonTextractPDFLoader",
"ApifyDatasetLoader",
"ArcGISLoader",
"ArxivLoader",
"AsyncHtmlLoader",
"AzureBlobStorageContainerLoader",

View File

@@ -0,0 +1,129 @@
"""Document Loader for ArcGIS FeatureLayers."""
from __future__ import annotations
import json
import re
import warnings
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
if TYPE_CHECKING:
import arcgis
# Placeholder used for the layer/item description metadata when the
# description is missing or empty.
_NOT_PROVIDED = "(Not Provided)"
class ArcGISLoader(BaseLoader):
    """Load records from an ArcGIS FeatureLayer.

    Every feature returned by the layer query becomes one ``Document`` whose
    ``page_content`` is the JSON-encoded attribute dict of that feature. The
    layer URL, the layer/item descriptions and the raw layer properties are
    attached as metadata.
    """

    def __init__(
        self,
        layer: Union[str, arcgis.features.FeatureLayer],
        gis: Optional[arcgis.gis.GIS] = None,
        where: str = "1=1",
        out_fields: Optional[Union[List[str], str]] = None,
        return_geometry: bool = False,
        **kwargs: Any,
    ):
        """Set up the loader and read the layer's descriptive properties.

        Args:
            layer: A FeatureLayer URL, or an already constructed
                ``arcgis.features.FeatureLayer``.
            gis: Connection used to look up the layer's item. When omitted,
                a default ``arcgis.gis.GIS()`` is created.
            where: Filter expression forwarded to ``FeatureLayer.query``.
            out_fields: Fields to return, as a list of names or a single
                comma-separated string. ``None`` requests all fields (``"*"``).
            return_geometry: Forwarded to ``FeatureLayer.query``.
            **kwargs: Extra keyword arguments forwarded to
                ``FeatureLayer.query``.

        Raises:
            ImportError: If the ``arcgis`` package is not installed.
        """
        try:
            import arcgis
        except ImportError as e:
            raise ImportError(
                "arcgis is required to use the ArcGIS Loader. "
                "Install it with pip or conda."
            ) from e

        # BeautifulSoup is optional: without it descriptions are kept verbatim.
        try:
            from bs4 import BeautifulSoup  # type: ignore

            self.BEAUTIFULSOUP = BeautifulSoup
        except ImportError:
            warnings.warn("BeautifulSoup not found. HTML will not be parsed.")
            self.BEAUTIFULSOUP = None

        self.gis = gis or arcgis.gis.GIS()

        if isinstance(layer, str):
            self.url = layer
            # Use the resolved connection so the layer and the item lookup
            # in _get_layer_properties share the same GIS.
            self.layer = arcgis.features.FeatureLayer(layer, gis=self.gis)
        else:
            self.url = layer.url
            self.layer = layer

        self.layer_properties = self._get_layer_properties()

        self.where = where

        # Normalise out_fields to the comma-separated string passed to query().
        if isinstance(out_fields, str):
            self.out_fields = out_fields
        elif out_fields is None:
            self.out_fields = "*"
        else:
            self.out_fields = ",".join(out_fields)

        self.return_geometry = return_geometry
        self.kwargs = kwargs

    def _clean_description(self, raw: Optional[str]) -> str:
        """Return *raw* as plain text, or the placeholder when it is empty.

        HTML markup is stripped only when BeautifulSoup is available.
        """
        if not raw:
            # Covers both None and "" so BeautifulSoup is never handed None.
            return _NOT_PROVIDED
        if self.BEAUTIFULSOUP:
            # Name the parser explicitly: otherwise bs4 guesses one from
            # whatever is installed and emits a warning.
            text = self.BEAUTIFULSOUP(raw, "html.parser").text
        else:
            text = raw
        return text or _NOT_PROVIDED

    def _get_layer_properties(self) -> dict:
        """Get the layer properties from the FeatureLayer.

        Returns:
            A dict with the keys ``layer_description``, ``item_description``
            and ``layer_properties`` (the layer's raw properties object).
        """
        layer_number_pattern = re.compile(r"/\d+$")

        props = self.layer.properties

        try:
            lyr_desc = self._clean_description(props["description"])
        except KeyError:
            lyr_desc = _NOT_PROVIDED

        # The whole lookup stays inside the try so that a KeyError raised
        # anywhere in it degrades to the placeholder, as before.
        try:
            item_id = props["serviceItemId"]
            item = self.gis.content.get(item_id)
            if not item:
                # `arcgis` is only bound at module level for type checking,
                # so it must be imported here before use.
                import arcgis

                # Fall back to the URL with the trailing layer number removed.
                item = arcgis.features.FeatureLayer(
                    re.sub(layer_number_pattern, "", self.url),
                    gis=self.gis,
                )
            try:
                raw_desc = item.description
            except AttributeError:
                # No direct attribute: read it from the properties, tolerating
                # its absence there as well.
                raw_desc = getattr(item.properties, "description", None)
            item_desc = self._clean_description(raw_desc)
        except KeyError:
            item_desc = _NOT_PROVIDED

        return {
            "layer_description": lyr_desc,
            "item_description": item_desc,
            "layer_properties": props,
        }

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load records from FeatureLayer.

        Yields:
            One ``Document`` per feature in the query response.
        """
        query_response = self.layer.query(
            where=self.where,
            out_fields=self.out_fields,
            return_geometry=self.return_geometry,
            return_all_records=True,
            **self.kwargs,
        )

        for feature in query_response:
            attributes = feature.as_dict["attributes"]
            yield Document(
                page_content=json.dumps(attributes),
                metadata={
                    "url": self.url,
                    "layer_description": self.layer_properties["layer_description"],
                    "item_description": self.layer_properties["item_description"],
                    "layer_properties": self.layer_properties["layer_properties"],
                },
            )

    def load(self) -> List[Document]:
        """Load all records from FeatureLayer."""
        return list(self.lazy_load())

View File

@@ -0,0 +1,47 @@
from unittest.mock import MagicMock, patch
import pytest
from langchain.document_loaders import ArcGISLoader
@pytest.fixture
def arcgis_mocks(mock_feature_layer, mock_gis):  # type: ignore
    """Patch fake ``arcgis`` entries into ``sys.modules`` for one test."""
    fake_modules = dict(
        [
            ("arcgis", MagicMock()),
            ("arcgis.features.FeatureLayer", mock_feature_layer),
            ("arcgis.gis.GIS", mock_gis),
        ]
    )
    # patch.dict restores sys.modules when the with-block exits.
    with patch.dict("sys.modules", fake_modules):
        yield
@pytest.fixture
def mock_feature_layer():  # type: ignore
    """Build a fake feature layer whose query returns one record."""
    record = MagicMock(as_dict={"attributes": {"field": "value"}})

    layer = MagicMock()
    layer.url = "https://example.com/layer_url"
    layer.properties = {
        "description": "<html><body>Some HTML content</body></html>"
    }
    layer.query.return_value = [record]
    return layer
@pytest.fixture
def mock_gis():  # type: ignore
    """Build a fake GIS whose content lookup returns a described item."""
    described_item = MagicMock(description="Item description")
    connection = MagicMock()
    connection.content.get.return_value = described_item
    return connection
def test_lazy_load(arcgis_mocks, mock_feature_layer, mock_gis):  # type: ignore
    """lazy_load yields one JSON document per feature, with layer metadata."""
    loader = ArcGISLoader(layer=mock_feature_layer, gis=mock_gis)
    loader.BEAUTIFULSOUP = None

    documents = list(loader.lazy_load())

    assert len(documents) == 1
    document = documents[0]
    # The feature's attribute dict is serialised as the page content.
    assert document.page_content == '{"field": "value"}'
    assert document.metadata["url"] == "https://example.com/layer_url"
    # The mocked properties have no "serviceItemId", so the item description
    # falls back to the placeholder.
    assert document.metadata["item_description"] == "(Not Provided)"
    assert document.metadata["layer_properties"] is mock_feature_layer.properties
    mock_feature_layer.query.assert_called_once()