mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-13 13:36:15 +00:00
Create ArcGISLoader & example notebook (#8873)
- Description: Adds the ArcGISLoader class to `langchain.document_loaders` - Allows users to load data from ArcGIS Online, Portal, and similar - Users can authenticate with `arcgis.gis.GIS` or retrieve public data anonymously - Uses the `arcgis.features.FeatureLayer` class to retrieve the data - Defines the most relevant keyword arguments and accepts `**kwargs` - Dependencies: Using this class requires `arcgis` and, optionally, `bs4.BeautifulSoup`. Tagging maintainers: - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
committed by
GitHub
parent
e21152358a
commit
eaa505fb09
@@ -29,6 +29,7 @@ from langchain.document_loaders.airbyte import (
|
||||
from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
|
||||
from langchain.document_loaders.airtable import AirtableLoader
|
||||
from langchain.document_loaders.apify_dataset import ApifyDatasetLoader
|
||||
from langchain.document_loaders.arcgis_loader import ArcGISLoader
|
||||
from langchain.document_loaders.arxiv import ArxivLoader
|
||||
from langchain.document_loaders.async_html import AsyncHtmlLoader
|
||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||
@@ -214,6 +215,7 @@ __all__ = [
|
||||
"AirtableLoader",
|
||||
"AmazonTextractPDFLoader",
|
||||
"ApifyDatasetLoader",
|
||||
"ArcGISLoader",
|
||||
"ArxivLoader",
|
||||
"AsyncHtmlLoader",
|
||||
"AzureBlobStorageContainerLoader",
|
||||
|
129
libs/langchain/langchain/document_loaders/arcgis_loader.py
Normal file
129
libs/langchain/langchain/document_loaders/arcgis_loader.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Document Loader for ArcGIS FeatureLayers."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import warnings
|
||||
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import arcgis
|
||||
|
||||
# Placeholder stored in document metadata when a description is missing or empty.
_NOT_PROVIDED = "(Not Provided)"
|
||||
|
||||
|
||||
class ArcGISLoader(BaseLoader):
    """Load records from an ArcGIS FeatureLayer.

    Each feature returned by the layer query becomes one ``Document`` whose
    ``page_content`` is the JSON-encoded attribute mapping of that feature.
    """

    def __init__(
        self,
        layer: Union[str, arcgis.features.FeatureLayer],
        gis: Optional[arcgis.gis.GIS] = None,
        where: str = "1=1",
        out_fields: Optional[Union[List[str], str]] = None,
        return_geometry: bool = False,
        **kwargs: Any,
    ):
        """Initialize the loader and read the layer's descriptive properties.

        Args:
            layer: URL of a feature layer, or an existing
                ``arcgis.features.FeatureLayer`` object.
            gis: Connection used to look up the layer's parent item. When
                omitted, ``arcgis.gis.GIS()`` is created with no arguments.
            where: Filter expression passed to ``FeatureLayer.query``.
            out_fields: Field names to return, either as a list or as an
                already comma-separated string. ``None`` selects ``"*"``.
            return_geometry: Forwarded to ``FeatureLayer.query``. Only the
                ``"attributes"`` of each feature are written to documents.
            **kwargs: Extra keyword arguments forwarded to
                ``FeatureLayer.query``.

        Raises:
            ImportError: If the ``arcgis`` package is not installed.
        """
        try:
            import arcgis
        except ImportError as e:
            raise ImportError(
                "arcgis is required to use the ArcGIS Loader. "
                "Install it with pip or conda."
            ) from e

        # BeautifulSoup is optional: without it descriptions keep their HTML.
        try:
            from bs4 import BeautifulSoup  # type: ignore

            self.BEAUTIFULSOUP = BeautifulSoup
        except ImportError:
            warnings.warn("BeautifulSoup not found. HTML will not be parsed.")
            self.BEAUTIFULSOUP = None

        self.gis = gis or arcgis.gis.GIS()

        if isinstance(layer, str):
            self.url = layer
            self.layer = arcgis.features.FeatureLayer(layer, gis=gis)
        else:
            self.url = layer.url
            self.layer = layer

        self.layer_properties = self._get_layer_properties()

        self.where = where

        if isinstance(out_fields, str):
            self.out_fields = out_fields
        elif out_fields is None:
            self.out_fields = "*"
        else:
            self.out_fields = ",".join(out_fields)

        self.return_geometry = return_geometry
        self.kwargs = kwargs

    def _clean_description(self, raw: Any) -> Any:
        """Return ``raw`` with HTML stripped, or the placeholder when empty.

        ``None`` and empty values short-circuit to the placeholder so that
        BeautifulSoup is never handed a non-string.
        """
        if not raw:
            return _NOT_PROVIDED
        if self.BEAUTIFULSOUP:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            raw = self.BEAUTIFULSOUP(raw, "html.parser").text
        return raw or _NOT_PROVIDED

    def _get_layer_properties(self) -> dict:
        """Collect the layer description, item description and raw properties.

        Returns:
            A dict with the keys ``"layer_description"``,
            ``"item_description"`` and ``"layer_properties"``. Missing or
            empty descriptions are reported as ``"(Not Provided)"``.
        """
        # At module level ``arcgis`` is imported for type checking only, so it
        # has to be imported here for the FeatureLayer fallback below.
        import arcgis

        props = self.layer.properties

        try:
            lyr_desc = self._clean_description(props["description"])
        except KeyError:
            lyr_desc = _NOT_PROVIDED

        try:
            item_id = props["serviceItemId"]
            # When the item lookup returns nothing, fall back to the parent
            # service: the layer URL without its trailing "/<layer number>".
            item = self.gis.content.get(item_id) or arcgis.features.FeatureLayer(
                re.sub(r"/\d+$", "", self.url),
            )
            try:
                raw_desc = item.description
            except AttributeError:
                raw_desc = item.properties.description
            item_desc = self._clean_description(raw_desc)
        except KeyError:
            item_desc = _NOT_PROVIDED

        return {
            "layer_description": lyr_desc,
            "item_description": item_desc,
            "layer_properties": props,
        }

    def lazy_load(self) -> Iterator[Document]:
        """Query the layer and yield one ``Document`` per returned feature.

        The query always requests all records; ``where``, ``out_fields``,
        ``return_geometry`` and the extra keyword arguments come from the
        constructor.
        """
        query_response = self.layer.query(
            where=self.where,
            out_fields=self.out_fields,
            return_geometry=self.return_geometry,
            return_all_records=True,
            **self.kwargs,
        )
        for feature in query_response:
            yield Document(
                page_content=json.dumps(feature.as_dict["attributes"]),
                metadata={
                    "url": self.url,
                    "layer_description": self.layer_properties["layer_description"],
                    "item_description": self.layer_properties["item_description"],
                    "layer_properties": self.layer_properties["layer_properties"],
                },
            )

    def load(self) -> List[Document]:
        """Load all records from FeatureLayer into a list."""
        return list(self.lazy_load())
|
@@ -0,0 +1,47 @@
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import ArcGISLoader
|
||||
|
||||
|
||||
@pytest.fixture
def arcgis_mocks(mock_feature_layer, mock_gis):  # type: ignore
    """Install a fake ``arcgis`` package in ``sys.modules`` for one test.

    ``sys.modules`` is keyed by module name, so the fake submodules are
    registered as ``arcgis.features`` and ``arcgis.gis``. The classes the
    loader instantiates are attributes of those modules and return the
    ``mock_feature_layer`` and ``mock_gis`` fixtures when called.
    """
    arcgis = MagicMock()
    arcgis.features.FeatureLayer.return_value = mock_feature_layer
    arcgis.gis.GIS.return_value = mock_gis
    sys_modules = {
        "arcgis": arcgis,
        "arcgis.features": arcgis.features,
        "arcgis.gis": arcgis.gis,
    }
    with patch.dict("sys.modules", sys_modules):
        yield
|
||||
|
||||
|
||||
@pytest.fixture
def mock_feature_layer():  # type: ignore
    """Provide a stand-in FeatureLayer with one record and an HTML description."""
    record = MagicMock(as_dict={"attributes": {"field": "value"}})

    layer = MagicMock()
    layer.url = "https://example.com/layer_url"
    layer.properties = {"description": "<html><body>Some HTML content</body></html>"}
    layer.query.return_value = [record]
    return layer
|
||||
|
||||
|
||||
@pytest.fixture
def mock_gis():  # type: ignore
    """Provide a stand-in GIS whose content lookup yields a described item."""
    item = MagicMock(description="Item description")

    fake_gis = MagicMock()
    fake_gis.content.get.return_value = item
    return fake_gis
|
||||
|
||||
|
||||
def test_lazy_load(arcgis_mocks, mock_feature_layer, mock_gis):  # type: ignore
    """Check that one mocked feature becomes one fully populated Document."""
    loader = ArcGISLoader(layer=mock_feature_layer, gis=mock_gis)

    documents = list(loader.lazy_load())

    # The loader's default query arguments must reach the layer unchanged.
    mock_feature_layer.query.assert_called_once_with(
        where="1=1",
        out_fields="*",
        return_geometry=False,
        return_all_records=True,
    )

    assert len(documents) == 1
    document = documents[0]
    assert document.page_content == '{"field": "value"}'
    assert document.metadata["url"] == "https://example.com/layer_url"
    # Descriptions are resolved in __init__, and HTML is stripped only when
    # bs4 is installed, so assert on the text that is present either way.
    assert "Some HTML content" in document.metadata["layer_description"]
    # The mocked layer properties have no "serviceItemId" entry.
    assert document.metadata["item_description"] == "(Not Provided)"
|
Reference in New Issue
Block a user