A basic introduction to processing DHS data
%reload_ext autoreload
%autoreload 2
import geopandas as gpd
import pandas as pd
from geowrangler import dhs
# Survey data file; passed to dhs.load_dhs_file further down.
dhs_household_data_path = "../data/ph.DTA"
# Shapefile read with gpd.read_file further down and merged on DHSCLUST.
dhs_gps_coordinates = "../data/ph_gps.shp"
%%time
# Load the raw survey table and preview its first rows.
dhs_df = dhs.load_dhs_file(dhs_household_data_path)
dhs_df.head(5)

# Column mapping for the "ph" survey; echoed so the renames can be inspected.
ph_config = dhs.load_column_config("ph")
ph_config

# Apply the mapping to the column labels and preview the renamed table.
renamed_dhs_df = dhs_df.rename(ph_config, axis="columns")
renamed_dhs_df.head(5)
wealth_col_name = "Wealth Index"
cluster_col_name = "DHSCLUST"

# Mean wealth index per cluster. Grouping moves the cluster id into the index,
# so reset_index() turns it back into a regular column that can be merged on.
summarized = (
    renamed_dhs_df[[wealth_col_name, cluster_col_name]]
    .groupby(cluster_col_name)
    .mean()
    .reset_index()
)
summarized.head()

# Number of rows in the summary (one per distinct cluster id).
len(summarized)

# Count of missing cluster ids in the summary. This check used to appear twice
# in a row; a single evaluation is enough.
# NOTE(review): with pandas' default groupby(dropna=True) missing keys are
# dropped during grouping, so this is expected to be 0 — confirm that a check
# on the raw table was not what was intended.
summarized[cluster_col_name].isna().sum()
# Read the shapefile and preview its first rows.
dhs_shp = gpd.read_file(dhs_gps_coordinates)
dhs_shp.head(5)

# Join the per-cluster summary to the shapefile rows on the cluster id.
# merge() defaults to an inner join, so only ids present on both sides remain.
survey_geo = summarized.merge(dhs_shp, on="DHSCLUST")
survey_geo
# Columns fed into the wealth-index recomputation below.
features = [
    "rooms",
    "electric",
    "mobile telephone",
    "radio",
    "television",
    "car/truck",
    "refrigerator",
    "motorcycle",
    "floor",
    "toilet",
    "drinking water",
]

# Keep the thresholded frame: the call's result used to be discarded, which
# meant the index below was computed from the un-thresholded columns.
# NOTE(review): assumes apply_threshold returns the adjusted frame rather than
# editing renamed_dhs_df in place, and that [0, 25] is a [min, max] pair for
# "rooms" — confirm against geowrangler.dhs.
thresholded_df = dhs.apply_threshold(
    renamed_dhs_df, columns=features, config={"rooms": [0, 25]}
)

# Called with the asset columns only, matching the other assign_wealth_index
# calls in this notebook (the stray second positional `features` is dropped).
renamed_dhs_df["Recomputed Wealth Index"] = dhs.assign_wealth_index(
    thresholded_df[features]
)
renamed_dhs_df.head()
# Survey data files for the four surveys that are combined below.
dhs_ph_path = "../data/ph.DTA"
dhs_kh_path = "../data/kh.DTA"
dhs_mm_path = "../data/mm.DTA"
dhs_tl_path = "../data/tl.DTA"

# Column mappings, loaded in ph, kh, mm, tl order.
ph_config, kh_config, mm_config, tl_config = (
    dhs.load_column_config(code) for code in ("ph", "kh", "mm", "tl")
)


def _load_renamed(path, column_config):
    """Load one survey file and rename its columns with *column_config*."""
    raw_df = dhs.load_dhs_file(path)
    return raw_df.rename(columns=column_config)


# One renamed table per survey, loaded in the same order as the configs.
dhs_ph_df = _load_renamed(dhs_ph_path, ph_config)
dhs_kh_df = _load_renamed(dhs_kh_path, kh_config)
dhs_mm_df = _load_renamed(dhs_mm_path, mm_config)
dhs_tl_df = _load_renamed(dhs_tl_path, tl_config)
# Columns kept from every survey: all rename targets of the "ph" mapping plus
# the "country code and phase" column.
cols = [*ph_config.values(), "country code and phase"]

# Stack the four surveys row-wise on the shared columns, then replace missing
# values with 0 before the index is computed.
country_frames = [dhs_ph_df, dhs_kh_df, dhs_mm_df, dhs_tl_df]
merged_df = pd.concat([frame[cols] for frame in country_frames]).fillna(0)

# Wealth index over the pooled rows, using the same feature columns as before.
pooled_index = dhs.assign_wealth_index(merged_df[features])
merged_df["Recomputed Wealth Index"] = pooled_index
merged_df.head(5)
import matplotlib.pyplot as plt # noqa
# Distribution of the index computed with assign_wealth_index's defaults.
merged_df.hist(column="Recomputed Wealth Index")

# Same feature columns, this time with use_pca=False.
non_pca_index = dhs.assign_wealth_index(merged_df[features], use_pca=False)
merged_df["Recomputed Wealth Index Not PCA"] = non_pca_index
merged_df.head(5)

# Distribution of the non-PCA index, for comparison with the plot above.
merged_df.hist(column="Recomputed Wealth Index Not PCA")