Chapter 15 The Effect of Gun Ownership on Gun-Homicide Rates Using DML for Neural Nets
In this lab, we estimate the effect of gun ownership on the homicide rate using double/debiased machine learning (DML) with a neural network.
library(keras)
## Warning: package 'keras' was built under R version 4.0.5
library(lfe)
## Warning: package 'lfe' was built under R version 4.0.5
## Loading required package: Matrix
# Import relevant packages
import pandas as pd
import numpy as np
import pyreadr
import patsy
import random
import itertools
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from matplotlib import colors
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
import hdmpy
from sklearn import preprocessing, linear_model
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_categorical_dtype
from itertools import compress
First, we need to load and preprocess the data.
data <- read.csv("./data/gun_clean.csv")

# Find Variable Names from Dataset
varlist <- function(df=NULL, type=c("numeric","factor","character"), pattern="", exclude=NULL) {
  vars <- character(0)
  if (any(type %in% "numeric")) {
    vars <- c(vars, names(df)[sapply(df, is.numeric)])
  }
  if (any(type %in% "factor")) {
    vars <- c(vars, names(df)[sapply(df, is.factor)])
  }
  if (any(type %in% "character")) {
    vars <- c(vars, names(df)[sapply(df, is.character)])
  }
  vars[(!vars %in% exclude) & grepl(vars, pattern=pattern)]
}
# creating variables

# Dummy Variables for Year and County Fixed Effects
fixed <- grep("X_Jfips", names(data), value=TRUE, fixed=TRUE)
year <- varlist(data, pattern="X_Tyear")

# Census Control Variables
census <- NULL
census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL", "^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS")

for (i in 1:length(census_var)) {
  census <- append(census, varlist(data, pattern=census_var[i]))
}
# Variables

# Treatment Variable
d <- "logfssl"

# Outcome Variable
y <- "logghomr"

# Other Control Variables
X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing")
X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal")
################################# Partial out Fixed Effects ########################

# New Dataset for Partialled-out Variables
rdata <- as.data.frame(data$CountyCode)
colnames(rdata) <- "CountyCode"

# Variables to be Partialled-out
varlist <- c(y, d, X1, X2, census)

# Partial out Variables in varlist from year and county fixed effects
for (i in 1:length(varlist)) {
  form <- as.formula(paste(varlist[i], "~", paste(paste(year, collapse="+"), paste(fixed, collapse="+"), sep="+")))
  rdata[, varlist[i]] <- lm(form, data)$residuals
}
data1 = pd.read_csv(r"./data/gun_clean.csv")

# To account for heterogeneity across counties and time trends in all variables,
# we remove from them county-specific and time-specific effects in the following
# preprocessing.
data1.shape[1]
## 415
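In equations, this preprocessing is the standard partialling-out (Frisch–Waugh–Lovell) step: each variable \(V\) in the list is regressed by OLS on the full sets of county and year dummies, and only the residual is kept,

\[
\tilde{V}_{it} = V_{it} - \hat{\alpha}_i - \hat{\lambda}_t,
\]

where \(\hat{\alpha}_i\) and \(\hat{\lambda}_t\) denote the fitted county and year effects. The columns of rdata are therefore orthogonal to both sets of fixed effects by construction.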
import re

# Find Variable Names from Dataset
def varlist(df=None, type_dataframe=['numeric', 'categorical', 'string'], pattern="", exclude=None):
    varrs = []

    if 'numeric' in type_dataframe:
        varrs = varrs + df.columns[df.apply(is_numeric_dtype, axis=0).to_list()].tolist()
    if 'categorical' in type_dataframe:
        varrs = varrs + df.columns[df.apply(is_categorical_dtype, axis=0).to_list()].tolist()
    if 'string' in type_dataframe:
        varrs = varrs + df.columns[df.apply(is_string_dtype, axis=0).to_list()].tolist()

    # flag columns whose names match the pattern
    grepl_result = np.array([re.search(pattern, variable) is not None for variable in df.columns.to_list()])

    if exclude is None:
        result = list(compress(varrs, grepl_result))
    else:
        varrs_excluded = np.array([var in exclude for var in varrs])
        and_filter = np.logical_and(~varrs_excluded, grepl_result)
        result = list(compress(varrs, and_filter))

    return result
# creating variables

# Dummy Variables for Year and County Fixed Effects
r = re.compile("X_Jfips")
fixed = list(filter(r.match, data1.columns.to_list()))
year = varlist(data1, pattern="X_Tyear")

# Census Control Variables
census = []
census_var = ["^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL", "^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS"]

for variable in census_var:
    r = re.compile(variable)
    census = census + list(filter(r.match, data1.columns.to_list()))
# variables

# Treatment Variable
d = "logfssl"

# Outcome Variable
y = "logghomr"

# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"]
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"]
# Partial out Fixed Effects

# New dataset for partialled-out variables
# (an explicit copy avoids pandas' SettingWithCopyWarning below)
rdata = data1[['CountyCode']].copy()

# Variables to be partialled out
varlist2 = [y] + [d] + X1 + X2 + census

# Partial out variables in varlist2 from year and county fixed effects
for var_name in varlist2:
    form = var_name + " ~ " + " + ".join(year) + " + " + " + ".join(fixed)
    rdata[var_name] = smf.ols(formula=form, data=data1).fit().resid
15.1 DML for neural nets
The following algorithm consumes \(Y\), \(D\) and \(Z\), and learns the residuals \(\tilde{Y}\) and \(\tilde{D}\) via a neural network, where the residuals are obtained by cross-fitting. It then prints the estimated coefficient \(\hat{\beta}\) and the clustered standard error from the final OLS regression.
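In symbols, writing \(\hat{\ell}(Z)\) and \(\hat{m}(Z)\) for the cross-fitted neural-network predictions of \(Y\) and \(D\) given \(Z\), the procedure forms

\[
\tilde{Y}_i = Y_i - \hat{\ell}(Z_i), \qquad \tilde{D}_i = D_i - \hat{m}(Z_i), \qquad \hat{\beta} = \frac{\sum_{i=1}^{n} \tilde{D}_i \tilde{Y}_i}{\sum_{i=1}^{n} \tilde{D}_i^{2}},
\]

and reports the standard error of \(\hat{\beta}\) clustered at the county level.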
DML2.for.NN <- function(z, d, y, nfold=2, clu, num_epochs, batch_size) {
  nobs <- nrow(z) # number of observations
  foldid <- rep.int(1:nfold, times = ceiling(nobs/nfold))[sample.int(nobs)] # define fold indices
  I <- split(1:nobs, foldid) # split observation indices into folds
  ytil <- dtil <- rep(NA, nobs)
  cat("fold: ")
  for (b in 1:length(I)) {
    # normalize the data, scaling the left-out fold with the training folds' mean and sd
    mean <- apply(z[-I[[b]],], 2, mean)
    std <- apply(z[-I[[b]],], 2, sd)
    z[-I[[b]],] <- scale(z[-I[[b]],], center = mean, scale = std)
    z[I[[b]],] <- scale(z[I[[b]],], center = mean, scale = std)
    # building the model: two hidden layers of 16 ReLU units
    build_model <- function() {
      model <- keras_model_sequential() %>%
        layer_dense(units = 16, activation = "relu",
                    input_shape = dim(z[-I[[b]],])[2]) %>%
        layer_dense(units = 16, activation = "relu") %>%
        layer_dense(units = 1)
      model %>% compile(
        optimizer = "rmsprop",
        loss = "mse",
        metrics = c("mae")
      )
    }
    model.Y <- build_model()
    model.D <- build_model()
    # fitting the models on the training folds
    model.D %>% fit(z[-I[[b]],], d[-I[[b]]],
                    epochs = num_epochs, batch_size = batch_size, verbose = 0)
    model.Y %>% fit(z[-I[[b]],], y[-I[[b]]],
                    epochs = num_epochs, batch_size = batch_size, verbose = 0)
    dhat <- model.D %>% predict(z[I[[b]],])
    yhat <- model.Y %>% predict(z[I[[b]],])
    dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold
    ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residual for the left-out fold
    cat(b, " ")
  }
  # estimate the main parameter by regressing one residual on the other,
  # clustering standard errors at the county level
  data <- data.frame(cbind(ytil, dtil, as.matrix(clu)))
  rfit <- felm(ytil ~ dtil | 0 | 0 | CountyCode, data = data)
  coef.est <- coef(rfit)[2] # extract the coefficient on dtil
  se <- summary(rfit, robust = TRUE)$coefficients[2, 2] # cluster-robust standard error
  cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est, se)) # print output
  return(list(coef.est = coef.est, se = se, dtil = dtil, ytil = ytil, rfit = rfit)) # save output and residuals
}
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
def DML2_for_NN(z, d, y, nfold, clu, num_epochs, batch_size):
    # Number of observations
    nobs = z.shape[0]

    # Define fold indices
    list_1 = [*range(0, nfold, 1)] * nobs
    sample = np.random.choice(nobs, nobs, replace=False).tolist()
    foldid = [list_1[index] for index in sample]

    # Create split function (similar to R)
    def split(x, f):
        count = max(f) + 1
        return tuple(list(itertools.compress(x, (el == i for el in f))) for i in range(count))

    # Split observation indices into folds
    list_2 = [*range(0, nobs, 1)]
    I = split(list_2, foldid)

    # Create arrays to save the residuals; these must be allocated once,
    # before the fold loop, so earlier folds are not overwritten
    dtil = np.zeros(len(z)).reshape(len(z), 1)
    ytil = np.zeros(len(z)).reshape(len(z), 1)

    # Define the keras model; a fresh network is built for each target
    # and each fold, mirroring the R version
    def build_model(input_dim):
        model = Sequential()
        model.add(Dense(16, input_dim=input_dim, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(1))
        opt = keras.optimizers.RMSprop()
        mse = tf.keras.losses.MeanSquaredError()
        mae = tf.keras.metrics.MeanAbsoluteError(name="mean_absolute_error", dtype=None)
        model.compile(loss=mse, optimizer=opt, metrics=mae)
        return model

    # loop over folds
    for b in range(0, len(I)):
        # Split data - the left-out fold is marked in mask as booleans
        include_idx = set(I[b])  # set membership is efficient, but does not preserve order
        mask = np.array([(i in include_idx) for i in range(len(z))])

        # Normalize the data, scaling the left-out fold with the
        # mean and sd of the training folds (as in the R version)
        scaler = StandardScaler()
        scaler.fit(z[~mask,])
        z[~mask,] = scaler.transform(z[~mask,])
        z[mask,] = scaler.transform(z[mask,])

        # Fit on the training folds and predict dhat on the left-out fold
        model_D = build_model(z[~mask,].shape[1])
        model_D.fit(z[~mask,], d[~mask,], epochs=num_epochs, batch_size=batch_size, verbose=0)
        dhat = model_D.predict(z[mask,])

        # Fit on the training folds and predict yhat on the left-out fold
        model_Y = build_model(z[~mask,].shape[1])
        model_Y.fit(z[~mask,], y[~mask,], epochs=num_epochs, batch_size=batch_size, verbose=0)
        yhat = model_Y.predict(z[mask,])

        # save residuals for the left-out fold
        dtil[mask] = d[mask,] - dhat.reshape(len(I[b]), 1)
        ytil[mask] = y[mask,] - yhat.reshape(len(I[b]), 1)
        print(b, " ")

    # Create dataframe of residuals and the cluster variable
    data_2 = pd.DataFrame(np.concatenate((ytil, dtil, clu), axis=1), columns=['ytil', 'dtil', 'CountyCode'])

    # OLS of ytil on dtil, clustering at the county level
    form = "ytil ~ dtil"
    rfit = smf.ols(form, data=data_2).fit().get_robustcov_results(cov_type="cluster", groups=data_2['CountyCode'])

    coef_est = rfit.summary2().tables[1]['Coef.']['dtil']
    se = rfit.summary2().tables[1]['Std.Err.']['dtil']

    print("Coefficient is {}, SE is equal to {}".format(coef_est, se))
    return coef_est, se, dtil, ytil, rfit
15.2 Estimating the effect with DML for neural nets
# Treatment Variable
D <- rdata[which(colnames(rdata) == d)]
# Outcome Variable
Y <- rdata[which(colnames(rdata) == y)]
# Construct matrix Z
Z <- rdata[which(colnames(rdata) %in% c(X1, X2, census))]

# inputs
y_nn <- as.matrix(Y)
d_nn <- as.matrix(D)
z_nn <- as.matrix(Z)
clu <- rdata[which(colnames(rdata) == "CountyCode")]

# DML with a NN
set.seed(123)
DML2.nn <- DML2.for.NN(z_nn, d_nn, y_nn, nfold=2, clu, 100, 10)
## fold: 1 2
## coef (se) = 0.106278 (0.0642281)
# Create main variables
Y = rdata['logghomr']
D = rdata['logfssl']
Z = rdata.drop(['logghomr', 'logfssl', 'CountyCode'], axis=1)
CLU = rdata['CountyCode']

# as matrix
y = Y.to_numpy().reshape(len(Y), 1)
d = D.to_numpy().reshape(len(Y), 1)
z = Z.to_numpy()
clu = CLU.to_numpy().reshape(len(Y), 1)

# DML with a NN
DML2_nn = DML2_for_NN(z, d, y, 2, clu, 100, 10)
## 0
## 1
## Coefficient is 0.2984273747970456, SE is equal to 0.10284209429900727