def make_simple_aoi():
= pd.DataFrame(
df ={
data"col1": [1, 2, 3],
"lat0": [0.0, 1.0, 2.0],
"lon0": [0.0, 0.0, 0.0],
"lat1": [0.0, 1.0, 2.0],
"lon1": [1.0, 1.0, 1.0],
"lat2": [1.0, 2.0, 3.0],
"lon2": [1.0, 1.0, 1.0],
"lat3": [1.0, 2.0, 3.0],
"lon3": [0.0, 0.0, 0.0],
}
)
def square(row):
return Polygon(
(
[
(row.lat0, row.lon0),
(row.lat1, row.lon1),
(row.lat2, row.lon2),
(row.lat3, row.lon3),
]
)
)
return gpd.GeoDataFrame(df, geometry=df.apply(square, axis=1), crs="EPSG:3857") # Use 3857 to match terrain CRS
= make_simple_aoi()
simple_aoi = "../data/sample_terrain.tif" terrain_file
Raster Zonal Stats
Note: This module is a thin layer on top of the rasterstats
and exactextract
packages to make its interface more compatible with the other geowrangler modules (e.g. vector zonal stats)
create_raster_zonal_stats
create_raster_zonal_stats (aoi:Union[str,geopandas.geodataframe.GeoDataF rame], data:Union[str,pathlib.Path], aggregation:Dict[str,Any], extra_args:Dict[str,Any]={'layer': 0, 'band_num': 1, 'nodata': None, 'affine': None, 'all_touched': False})
Compute zonal stats with a vector areas of interest (aoi) from raster data sources. This is a thin layer over the zonal_stats
method from the rasterstats
python package for compatibility with other geowrangler modules. This method currently only supports 1 band for each call, so if you want to create zonal stats for multiple bands with the same raster data, you can call this method for each band (make sure to specify the correct band_num
in the extra_args
parameter). See https://pythonhosted.org/rasterstats/manual.html#zonal-statistics for more details
Type | Default | Details | |
---|---|---|---|
aoi | Union | The area of interest geodataframe, or path to the vector file | |
data | Union | The path to the raster data file | |
aggregation | Dict | A dict specifying the aggregation. See create_zonal_stats from the geowrangler.vector_zonal_stats module for more details |
|
extra_args | Dict | {‘layer’: 0, ‘band_num’: 1, ‘nodata’: None, ‘affine’: None, ‘all_touched’: False} | Extra arguments passed to rasterstats.zonal_stats method |
Returns | GeoDataFrame |
import matplotlib.pyplot as plt
import numpy as np
import rasterio
with rasterio.open(terrain_file) as src:
= src.read(1)
data = src.crs
data_crs = src.bounds
data_bounds print(data.shape, data_crs, data_bounds)
(707, 707) EPSG:3857 BoundingBox(left=-20.0, bottom=-19.994, right=19.994, top=20.0)
simple_aoi.total_bounds
array([0., 0., 3., 1.])
= plt.imshow(data, cmap="pink") ax
= simple_aoi.plot(facecolor="none", edgecolor="blue") ax
= create_raster_zonal_stats(
results
simple_aoi,
terrain_file,=dict(func=["mean", "max", "min", "std"], column="elevation"),
aggregation=dict(nodata=np.nan),
extra_args )
CPU times: user 39.8 ms, sys: 0 ns, total: 39.8 ms
Wall time: 38.6 ms
results
col1 | lat0 | lon0 | lat1 | lon1 | lat2 | lon2 | lat3 | lon3 | geometry | elevation_min | elevation_max | elevation_mean | elevation_std | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0)) | 1238.734161 | 1444.722213 | 1339.126726 | 64.362218 |
1 | 2 | 1.0 | 0.0 | 1.0 | 1.0 | 2.0 | 1.0 | 2.0 | 0.0 | POLYGON ((1 0, 1 1, 2 1, 2 0, 1 0)) | 1222.409102 | 1425.920852 | 1311.903997 | 51.286449 |
2 | 3 | 2.0 | 0.0 | 2.0 | 1.0 | 3.0 | 1.0 | 3.0 | 0.0 | POLYGON ((2 0, 2 1, 3 1, 3 0, 2 0)) | 1231.569771 | 1402.859628 | 1319.624868 | 45.298326 |
Check that create_raster_zonal_stats
uses the nodata
attribute set in the tif file if extra_args.nodata
is set to None
# Load in admin bounds
= gpd.read_file("../data/region3_admin.geojson") phl_adm
If the nodata
parameter is explicitly set to -999999
, the Population count should be > 0.
= create_raster_zonal_stats(
grid_aoi_results
phl_adm,"../data/phl_ppp_2020_constrained.tif",
=dict(
aggregation=["sum"],
func="population",
column=["population_count"],
output=[True],
fillna
),=dict(nodata=-99999),
extra_args )
/home/jc_tm/project_repos/geowrangler/geowrangler/vector_zonal_stats.py:182: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
results[colname].fillna(0, inplace=True)
grid_aoi_results
Reg_Code | Reg_Name | Reg_Alt_Name | geometry | population_count | |
---|---|---|---|---|---|
0 | 030000000 | Region III | Central Luzon | MULTIPOLYGON (((120.11687 14.76309, 120.11684 ... | 10983338.0 |
If the nodata
parameter is not set, create_raster_zonal_stats
should use the nodata
attribute set by the geotiff file so that population count should still be > 0.
= create_raster_zonal_stats(
grid_aoi_results
phl_adm,"../data/phl_ppp_2020_constrained.tif",
=dict(
aggregation=["sum"],
func="population",
column=["population_count"],
output=[True],
fillna
), )
/home/jc_tm/project_repos/geowrangler/geowrangler/vector_zonal_stats.py:182: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
results[colname].fillna(0, inplace=True)
grid_aoi_results
Reg_Code | Reg_Name | Reg_Alt_Name | geometry | population_count | |
---|---|---|---|---|---|
0 | 030000000 | Region III | Central Luzon | MULTIPOLYGON (((120.11687 14.76309, 120.11684 ... | 10983338.0 |
Exactextract Zonal Stats
The next section provides an alternative implementation of raster zonal statistics using the exactextract
package. This package promises more exact zonal statistics by taking into account pixel fractions vs rasterstats that assigns the pixels to a polygon based on their centroid locations.
exactextract provides a fast and accurate algorithm for summarizing values in the portion of a raster dataset that is covered by a polygon, often referred to as zonal statistics. Unlike other zonal statistics implementations, it takes into account raster cells that are partially covered by the polygon.
Advantages
- Results from
exactextract
are more precise thanrasterstats
due to handling pixel fractions in its calculations exactextract
is faster to run thanrasterstats
for the same input AOI and raster data
Disadvantages
- nodata value handling is not yet supported by the
get_exactextract_zonal_stats()
method. To handle NODATA, it is recommended to create a raster file with a properly-set NODATA value, i.e.
import rasterio as rio
= 'path/to/existing_raster.tif'
existing_raster = 'path/to/new_raster.tif'
new_raster = -9999
user_defined_nodata
with rio.open(existing_raster) as src:
= src.profile
profile =user_defined_nodata)
profile.update(nodata= src.read()
data
# Write the new raster with the updated profile with nodata
with rio.open(output_raster_path, 'w', **profile) as dst:
dst.write(data)
/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/fastcore/docscrape.py:230: UserWarning: potentially wrong underline length...
Example usage
-------- in
Computes zonal statistics from raster data sources using vector areas of interest (AOI).
...
else: warn(msg)
create_exactextract_zonal_stats
create_exactextract_zonal_stats (aoi:Union[str,geopandas.geodataframe.Ge oDataFrame], data:Union[str,pathlib.Path], aggregation:List[Dict[str,Any]], include_cols:List[str]=None, include_geom:bool=True, extra_args:dict={'strategy': 'feature- sequential', 'max_cells_in_memory': 30000000})
*Computes zonal statistics from raster data sources using vector areas of interest (AOI).
This function is a wrapper over the exact_extract
method from the exactextract
Python package, designed for compatibility with other geowrangler modules. It takes a list of agg specs, with each agg spec applied to a specific band. See https://github.com/isciences/exactextract/blob/master/python/README.md for more details.*
Type | Default | Details | |
---|---|---|---|
aoi | Union | A GeoDataframe specifying geometries. | |
data | Union | The path to the raster data file | |
aggregation | List | List of agg specs, | |
include_cols | List | None | If not None, list of columns from input AOI to include in output |
include_geom | bool | True | If false, drop geometry column. include_cols takes priority. |
extra_args | dict | {‘strategy’: ‘feature-sequential’, ‘max_cells_in_memory’: 30000000} | Extra arguments to pass to exactextract.exact_extract(). “include_cols”, “include_geom”, and “output” arguments are ignored. |
Returns | GeoDataFrame | Extra arguments to pass to `exactextract.exact_extract(). Ignores output, include_geom, and include_cols. |
Usage Examples of get_exactextract_zonal_stats()
# Single band
= create_exactextract_zonal_stats(
results "EPSG:3857"),
simple_aoi.to_crs(
terrain_file,=[
aggregationdict(band=1, func=["sum", "max", "min", "stdev"], output="elevation"),
dict(band=2, func=["mean", "max", "min", "stdev"], output="elevation")
],
) results
CPU times: user 58.8 ms, sys: 0 ns, total: 58.8 ms
Wall time: 57.8 ms
/tmp/ipykernel_19281/4285980530.py:10: UserWarning: Band number in {"band": 2, "func": ["mean", "max", "min", "stdev"], "output": "elevation"} is invalid. Aggregation will be skipped.
warnings.warn(f"Band number in {json.dumps(agg)} is invalid. Aggregation will be skipped.")
col1 | lat0 | lon0 | lat1 | lon1 | lat2 | lon2 | lat3 | lon3 | geometry | elevation_sum | elevation_max | elevation_min | elevation_stdev | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0)) | 417459.46875 | 1444.722213 | 1232.164947 | 63.263985 |
1 | 2 | 1.0 | 0.0 | 1.0 | 1.0 | 2.0 | 1.0 | 2.0 | 0.0 | POLYGON ((1 0, 1 1, 2 1, 2 0, 1 0)) | 409081.15625 | 1425.920852 | 1213.356474 | 50.523487 |
2 | 3 | 2.0 | 0.0 | 2.0 | 1.0 | 3.0 | 1.0 | 3.0 | 0.0 | POLYGON ((2 0, 2 1, 3 1, 3 0, 2 0)) | 411552.87500 | 1402.859628 | 1220.031795 | 44.781490 |
# Multiple band
= create_exactextract_zonal_stats(
grid_exactextract_aoi_results
phl_adm,"../data/ph_s5p_AER_AI_340_380.tiff",
=[
aggregationdict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
]
) display(grid_exactextract_aoi_results)
Reg_Code | Reg_Name | Reg_Alt_Name | geometry | band_1_mean | band_1_sum | band_2_mean | band_2_sum | prefix_mean | prefix_sum | aer_ai_mean | aer_ai_sum | aer_ai_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 030000000 | Region III | Central Luzon | MULTIPOLYGON (((120.11687 14.76309, 120.11684 ... | 0.000279 | 0.12919 | 0.015252 | 7.070237 | 0.015252 | 7.070237 | 0.000279 | 0.12919 | 463.568115 |
CPU times: user 1.12 s, sys: 39.7 ms, total: 1.16 s
Wall time: 1.16 s
# Multiple band test -> include_cols
= create_exactextract_zonal_stats(
grid_exactextract_aoi_results
phl_adm,"../data/ph_s5p_AER_AI_340_380.tiff",
=[
aggregationdict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],=["Reg_Code"],
include_cols
) display(grid_exactextract_aoi_results)
Reg_Code | band_1_mean | band_1_sum | band_2_mean | band_2_sum | prefix_mean | prefix_sum | aer_ai_mean | aer_ai_sum | aer_ai_count | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 030000000 | 0.000279 | 0.12919 | 0.015252 | 7.070237 | 0.015252 | 7.070237 | 0.000279 | 0.12919 | 463.568115 |
CPU times: user 792 ms, sys: 9.92 ms, total: 801 ms
Wall time: 799 ms
# Using include_geom
= create_exactextract_zonal_stats(
grid_exactextract_aoi_results
phl_adm,"../data/ph_s5p_AER_AI_340_380.tiff",
=[
aggregationdict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],=False
include_geom
) display(grid_exactextract_aoi_results)
Reg_Code | Reg_Name | Reg_Alt_Name | band_1_mean | band_1_sum | band_2_mean | band_2_sum | prefix_mean | prefix_sum | aer_ai_mean | aer_ai_sum | aer_ai_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 030000000 | Region III | Central Luzon | 0.000279 | 0.12919 | 0.015252 | 7.070237 | 0.015252 | 7.070237 | 0.000279 | 0.12919 | 463.568115 |
CPU times: user 820 ms, sys: 70.2 ms, total: 890 ms
Wall time: 888 ms
# Using include_geom
= create_exactextract_zonal_stats(
grid_exactextract_aoi_results
phl_adm,"../data/ph_s5p_AER_AI_340_380.tiff",
=[
aggregationdict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],=["Reg_Code"]
include_cols
)type(grid_exactextract_aoi_results)) display(grid_exactextract_aoi_results,
Reg_Code | band_1_mean | band_1_sum | band_2_mean | band_2_sum | prefix_mean | prefix_sum | aer_ai_mean | aer_ai_sum | aer_ai_count | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 030000000 | 0.000279 | 0.12919 | 0.015252 | 7.070237 | 0.015252 | 7.070237 | 0.000279 | 0.12919 | 463.568115 |
pandas.core.frame.DataFrame
CPU times: user 754 ms, sys: 29.6 ms, total: 784 ms
Wall time: 781 ms
# Using include_geom and include_cols
# As geometry is not in include_cols, it is prioritzed over include_geom=True
= create_exactextract_zonal_stats(
grid_exactextract_aoi_results
phl_adm,"../data/ph_s5p_AER_AI_340_380.tiff",
=[
aggregationdict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],=["Reg_Code"],
include_cols=True
include_geom
)type(grid_exactextract_aoi_results)) display(grid_exactextract_aoi_results,
Reg_Code | band_1_mean | band_1_sum | band_2_mean | band_2_sum | prefix_mean | prefix_sum | aer_ai_mean | aer_ai_sum | aer_ai_count | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 030000000 | 0.000279 | 0.12919 | 0.015252 | 7.070237 | 0.015252 | 7.070237 | 0.000279 | 0.12919 | 463.568115 |
pandas.core.frame.DataFrame
CPU times: user 713 ms, sys: 30.1 ms, total: 744 ms
Wall time: 741 ms
# Multiple band test - from file
= create_exactextract_zonal_stats(
grid_exactextract_aoi_results "../data/region3_admin.geojson",
"../data/ph_s5p_AER_AI_340_380.tiff",
=[
aggregationdict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],
) display(grid_exactextract_aoi_results)
Reg_Code | Reg_Name | Reg_Alt_Name | geometry | band_1_mean | band_1_sum | band_2_mean | band_2_sum | prefix_mean | prefix_sum | aer_ai_mean | aer_ai_sum | aer_ai_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 030000000 | Region III | Central Luzon | MULTIPOLYGON (((120.11687 14.76309, 120.11684 ... | 0.000279 | 0.12919 | 0.015252 | 7.070237 | 0.015252 | 7.070237 | 0.000279 | 0.12919 | 463.568115 |
CPU times: user 1.37 s, sys: 29.3 ms, total: 1.39 s
Wall time: 1.39 s
# Multiple band test - gdal -> UNSUPPORTED
= create_exactextract_zonal_stats(
grid_exactextract_aoi_results
phl_adm,"../data/ph_s5p_AER_AI_340_380.tiff",
=[
aggregationdict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],= dict(output="gdal")
extra_args
) display(grid_exactextract_aoi_results)
/tmp/ipykernel_19281/2402309088.py:70: UserWarning: output in `extra_args` is ignored. Output is set to 'pandas'. Refer to `exactextract.exact_extract()` to use other output options.
warnings.warn("output in `extra_args` is ignored. Output is set to 'pandas'. Refer to `exactextract.exact_extract()` to use other output options.")
Reg_Code | Reg_Name | Reg_Alt_Name | geometry | band_1_mean | band_1_sum | band_2_mean | band_2_sum | prefix_mean | prefix_sum | aer_ai_mean | aer_ai_sum | aer_ai_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 030000000 | Region III | Central Luzon | MULTIPOLYGON (((120.11687 14.76309, 120.11684 ... | 0.000279 | 0.12919 | 0.015252 | 7.070237 | 0.015252 | 7.070237 | 0.000279 | 0.12919 | 463.568115 |
CPU times: user 881 ms, sys: 19.9 ms, total: 901 ms
Wall time: 897 ms