Added Geometry Validation, Geometry Metadata, and WKT instead of Python str() to GeoDataFrame Loader (#9466)

@rlancemartin The current implementation within `Geopandas.GeoDataFrame`
loader uses the python builtin `str()` function on the input geometries.
While the output looks very close to WKT (Well-Known Text), Python's
`str()` function doesn't guarantee that format.

In the interest of interop., I've changed to the use of the `wkt` property
on the Shapely geometries for generating the text representation of the
geometries.

Also, included here:
- validation of the input `page_content_column` as being a GeoSeries.
- geometry `crs` (Coordinate Reference System) / bounds
(xmin/ymin/xmax/ymax) added to Document metadata. Having the CRS is
critical... having the bounds is just helpful!

I think there is a larger question of "Should the geometry live in the
`page_content`, or should the record be better summarized and tuck the
geom into metadata?" ...something for another day and another PR.
This commit is contained in:
Brendan Collins 2023-08-19 00:35:39 -04:00 committed by GitHub
parent 616e728ef9
commit 9f545825b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 4 deletions

View File

@ -29,19 +29,43 @@ class GeoDataFrameLoader(BaseLoader):
f"Expected data_frame to be a gpd.GeoDataFrame, got {type(data_frame)}" f"Expected data_frame to be a gpd.GeoDataFrame, got {type(data_frame)}"
) )
if page_content_column not in data_frame.columns:
raise ValueError(
f"Expected data_frame to have a column named {page_content_column}"
)
if not isinstance(data_frame[page_content_column], gpd.GeoSeries):
raise ValueError(
f"Expected data_frame[{page_content_column}] to be a GeoSeries"
)
self.data_frame = data_frame self.data_frame = data_frame
self.page_content_column = page_content_column self.page_content_column = page_content_column
def lazy_load(self) -> Iterator[Document]: def lazy_load(self) -> Iterator[Document]:
"""Lazy load records from dataframe.""" """Lazy load records from dataframe."""
# assumes all geometries in GeoSeries are same CRS and Geom Type
crs_str = self.data_frame.crs.to_string() if self.data_frame.crs else None
geometry_type = self.data_frame.geometry.geom_type.iloc[0]
for _, row in self.data_frame.iterrows(): for _, row in self.data_frame.iterrows():
text = row[self.page_content_column] geom = row[self.page_content_column]
xmin, ymin, xmax, ymax = geom.bounds
metadata = row.to_dict() metadata = row.to_dict()
metadata["crs"] = crs_str
metadata["geometry_type"] = geometry_type
metadata["xmin"] = xmin
metadata["ymin"] = ymin
metadata["xmax"] = xmax
metadata["ymax"] = ymax
metadata.pop(self.page_content_column) metadata.pop(self.page_content_column)
# Enforce str since shapely Point objects
# geometry type used in GeoPandas) are not strings # using WKT instead of str() to help GIS system interoperability
yield Document(page_content=str(text), metadata=metadata) yield Document(page_content=geom.wkt, metadata=metadata)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load full dataframe.""" """Load full dataframe."""

View File

@ -17,6 +17,7 @@ else:
def sample_gdf() -> GeoDataFrame: def sample_gdf() -> GeoDataFrame:
import geopandas import geopandas
# TODO: geopandas.datasets will be deprecated in 1.0
path_to_data = geopandas.datasets.get_path("nybb") path_to_data = geopandas.datasets.get_path("nybb")
gdf = geopandas.read_file(path_to_data) gdf = geopandas.read_file(path_to_data)
gdf["area"] = gdf.area gdf["area"] = gdf.area