Chapter 15 The Effect of Gun Ownership on Gun-Homicide Rates using DML for neural nets

In this lab, we estimate the effect of gun ownership on the homicide rate using double machine learning (DML) with neural networks.

library(keras)
library(lfe)
## Loading required package: Matrix

 

# Import relevant packages
import re
import random
import itertools
from itertools import compress

import pandas as pd
import numpy as np
import pyreadr
import patsy

from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

import hdmpy
import statsmodels.api as sm
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
from matplotlib import colors

from sklearn import preprocessing, linear_model
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_categorical_dtype

First, we load and preprocess the data. To account for heterogeneity across counties and for time trends, we partial out county and year fixed effects from every variable (a Frisch-Waugh-Lovell residualization) before fitting the neural networks.

data <- read.csv("./data/gun_clean.csv") 

# Find Variable Names from Dataset 
varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) {
  vars <- character(0)
  if (any(type %in% "numeric")) {
    vars <- c(vars,names(df)[sapply(df,is.numeric)])
  }
  if (any(type %in% "factor")) {
    vars <- c(vars,names(df)[sapply(df,is.factor)])
  }  
  if (any(type %in% "character")) {
    vars <- c(vars,names(df)[sapply(df,is.character)])
  }  
  vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)]
}

# creating variables
# Dummy Variables for Year and County Fixed Effects
fixed  <- grep("X_Jfips", names(data), value=TRUE, fixed=TRUE)
year   <- varlist(data, pattern="X_Tyear")

# Census Control Variables
census     <- NULL
census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS")

for(i in 1:length(census_var)){
  
  census  <- append(census, varlist(data, pattern=census_var[i]))
}

# Variables
# Treatment Variable
d     <- "logfssl"

# Outcome Variable
y     <- "logghomr"

# Other Control Variables
X1    <- c("logrobr", "logburg", "burg_missing", "robrate_missing")
X2    <- c("newblack", "newfhh", "newmove", "newdens", "newmal")

#################################  Partial out Fixed Effects ########################

# New Dataset for Partialled-out Variables
rdata    <- as.data.frame(data$CountyCode) 
colnames(rdata) <- "CountyCode"

# Variables to be Partialled-out (named varlist2 so we do not overwrite the varlist() helper above)
varlist2 <- c(y, d, X1, X2, census)


# Partial out Variables in varlist2 from year and county fixed effects
for(i in 1:length(varlist2)){
  form <- as.formula(paste(varlist2[i], "~", paste(paste(year, collapse="+"), paste(fixed, collapse="+"), sep="+")))
  rdata[, varlist2[i]] <- lm(form, data)$residuals
}

 

data1 = pd.read_csv(r"./data/gun_clean.csv")
data1.shape[1]
## 415

# To account for heterogeneity across counties and time trends in all variables,
# we remove county-specific and time-specific effects in the following
# preprocessing.

# Find variable names from the dataset (mirrors the R varlist() helper)
def varlist(df=None, type_dataframe=['numeric', 'categorical', 'string'], pattern="", exclude=None):
    varrs = []

    if 'numeric' in type_dataframe:
        varrs = varrs + df.columns[df.apply(is_numeric_dtype, axis=0).to_list()].tolist()

    if 'categorical' in type_dataframe:
        varrs = varrs + df.columns[df.apply(is_categorical_dtype, axis=0).to_list()].tolist()

    if 'string' in type_dataframe:
        varrs = varrs + df.columns[df.apply(is_string_dtype, axis=0).to_list()].tolist()

    # match the pattern against the selected variables (not all columns),
    # so the boolean filter stays aligned with varrs
    grepl_result = np.array([re.search(pattern, variable) is not None for variable in varrs])

    if exclude is None:
        result = list(compress(varrs, grepl_result))
    else:
        varrs_excluded = np.array([var in exclude for var in varrs])
        and_filter = np.logical_and(~varrs_excluded, grepl_result)
        result = list(compress(varrs, and_filter))

    return result

# creating variables
# Dummy Variables for Year and County Fixed Effects
r = re.compile("X_Jfips")
fixed = list( filter( r.match, data1.columns.to_list() ) )
year = varlist(data1, pattern="X_Tyear")

# Census Control Variables
census = []
census_var = ["^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS"]

for variable in census_var:
    r = re.compile( variable )
    census = census + list( filter( r.match, data1.columns.to_list() ) )

# Variables
# Treatment Variable
d = "logfssl"

# Outcome Variable
y = "logghomr"

# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"]
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"]

# Partial out Fixed Effects
rdata = data1[['CountyCode']].copy()  # .copy() avoids pandas' SettingWithCopyWarning when adding columns below

# Variables to be Partialled-out
varlist2 = [y] + [d] + X1 + X2 + census

# Partial out Variables in varlist2 from year and county fixed effects
for var_name in varlist2:
    form = var_name + " ~ " + " + ".join(year + fixed)
    rdata[var_name] = smf.ols(formula=form, data=data1).fit().resid
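As a quick check (our own addition, not part of the original lab), OLS residuals are numerically orthogonal to the regressors they were projected on, so the partialled-out outcome should be essentially uncorrelated with, say, the first year dummy:

# Sanity check: correlation of the residualized outcome with a projected-off
# dummy should be numerically zero
print(np.corrcoef(rdata['logghomr'], data1[year[0]])[0, 1])  # expect ~ 0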

15.1 DML for neural nets

The following algorithm consumes \(Y\), \(D\), and \(Z\), and learns the residuals \(\tilde{Y}\) and \(\tilde{D}\) via a neural network, where the residuals are obtained by cross-fitting. It then prints the estimated coefficient \(\hat{\beta}\) and the clustered standard error from the final OLS regression.
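Concretely, with the data split into \(K\) folds, the cross-fitted residuals are

\[
\tilde{Y}_i = Y_i - \hat{\ell}_{-k(i)}(Z_i), \qquad
\tilde{D}_i = D_i - \hat{m}_{-k(i)}(Z_i),
\]

where \(\hat{\ell}_{-k(i)}\) and \(\hat{m}_{-k(i)}\) are neural-network regressions of \(Y\) and \(D\) on \(Z\) trained on the folds that do not contain observation \(i\); the final step regresses \(\tilde{Y}_i\) on \(\tilde{D}_i\) by OLS, clustering at the county level, and reports the slope \(\hat{\beta}\).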

DML2.for.NN <- function(z, d, y, nfold=2, clu, num_epochs, batch_size) {
  nobs <- nrow(z) # number of observations
  foldid <- rep.int(1:nfold, times = ceiling(nobs/nfold))[sample.int(nobs)] # define fold indices
  I <- split(1:nobs, foldid) # split observation indices into folds
  ytil <- dtil <- rep(NA, nobs)
  cat("fold: ")
  for(b in 1:length(I)){
    # normalize the data, using the training-fold mean and sd for both folds
    mean <- apply(z[-I[[b]],], 2, mean)
    std <- apply(z[-I[[b]],], 2, sd)
    z[-I[[b]],] <- scale(z[-I[[b]],], center = mean, scale = std)
    z[I[[b]],] <- scale(z[I[[b]],], center = mean, scale = std)
    # building the model
    build_model <- function(){
      model <- keras_model_sequential() %>%
        layer_dense(units = 16, activation = "relu",
                    input_shape = dim(z[-I[[b]],])[2]) %>%
        layer_dense(units = 16, activation = "relu") %>%
        layer_dense(units = 1)

      model %>% compile(
        optimizer = "rmsprop",
        loss = "mse",
        metrics = c("mae")
      )
    }
    model.Y <- build_model()
    model.D <- build_model()
    # fitting the models on the training folds
    model.D %>% fit(z[-I[[b]],], d[-I[[b]]],
                    epochs = num_epochs, batch_size = batch_size, verbose = 0)
    model.Y %>% fit(z[-I[[b]],], y[-I[[b]]],
                    epochs = num_epochs, batch_size = batch_size, verbose = 0)
    dhat <- model.D %>% predict(z[I[[b]],])
    yhat <- model.Y %>% predict(z[I[[b]],])
    dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold
    ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residual for the left-out fold
    cat(b, " ")
  }
  # estimate the main parameter by regressing one residual on the other,
  # clustering standard errors at the county level
  data <- data.frame(cbind(ytil, dtil, as.matrix(clu)))
  rfit <- felm(ytil ~ dtil | 0 | 0 | CountyCode, data = data)
  coef.est <- coef(rfit)[2] # extract coefficient
  se <- summary(rfit, robust = TRUE)$coefficients[2, 2] # clustered standard error
  cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est, se)) # print output
  return(list(coef.est = coef.est, se = se, dtil = dtil, ytil = ytil, rfit = rfit)) # save output and residuals
}
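Note that build_model() is called separately for model.Y and model.D so that the two nuisance regressions start from independently initialized networks; reusing one fitted model for both would warm-start the second fit from the first one's weights.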

 

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def DML2_for_NN(z, d, y, nfold, clu, num_epochs, batch_size):

    # Number of observations
    nobs = z.shape[0]

    # Define fold indices (random assignment of observations to folds)
    list_1 = [*range(0, nfold, 1)] * nobs
    sample = np.random.choice(nobs, nobs, replace=False).tolist()
    foldid = [list_1[index] for index in sample]

    # Create split function (similar to R's split())
    def split(x, f):
        count = max(f) + 1
        return tuple(list(itertools.compress(x, (el == i for el in f))) for i in range(count))

    # Split observation indices into folds
    list_2 = [*range(0, nobs, 1)]
    I = split(list_2, foldid)

    # Create arrays to save the cross-fitted residuals;
    # initialized once, before the loop, so earlier folds are not overwritten
    dtil = np.zeros(len(z)).reshape(len(z), 1)
    ytil = np.zeros(len(z)).reshape(len(z), 1)

    # loop over folds
    for b in range(0, len(I)):

        # Split data - indices of the held-out fold are flagged in mask as booleans
        include_idx = set(I[b])  # a set makes the membership checks efficient
        mask = np.array([(i in include_idx) for i in range(len(z))])

        # Normalize the data, using the training-fold statistics for both folds
        # (as in the R version)
        scaler = StandardScaler()
        scaler.fit(z[~mask, ])
        z[~mask, ] = scaler.transform(z[~mask, ])
        z[mask, ] = scaler.transform(z[mask, ])

        # define the keras model; build a fresh network for each nuisance fit
        def build_model():
            model = Sequential()
            model.add(Dense(16, input_dim=z[~mask, ].shape[1], activation='relu'))
            model.add(Dense(16, activation='relu'))
            model.add(Dense(1))
            opt = keras.optimizers.RMSprop()
            mse = tf.keras.losses.MeanSquaredError()
            mae = tf.keras.metrics.MeanAbsoluteError(name="mean_absolute_error", dtype=None)
            model.compile(loss=mse, optimizer=opt, metrics=mae)
            return model

        # Fit and predict dhat
        model_d = build_model()
        model_d.fit(z[~mask, ], d[~mask, ], epochs=num_epochs, batch_size=batch_size, verbose=0)
        dhat = model_d.predict(z[mask, ])

        # Fit and predict yhat (a separate network, so the d-fit weights are not reused)
        model_y = build_model()
        model_y.fit(z[~mask, ], y[~mask, ], epochs=num_epochs, batch_size=batch_size, verbose=0)
        yhat = model_y.predict(z[mask, ])

        # save residuals for the held-out fold
        dtil[mask] = d[mask, ] - dhat.reshape(len(I[b]), 1)
        ytil[mask] = y[mask, ] - yhat.reshape(len(I[b]), 1)
        print(b, " ")

    # Create dataframe
    data_2 = pd.DataFrame(np.concatenate((ytil, dtil, clu), axis=1), columns=['ytil', 'dtil', 'CountyCode'])

    # OLS of ytil on dtil, clustering at the County level
    model = "ytil ~ dtil"
    rfit = smf.ols(model, data=data_2).fit().get_robustcov_results(cov_type="cluster", groups=data_2['CountyCode'])

    coef_est = rfit.summary2().tables[1]['Coef.']['dtil']
    se = rfit.summary2().tables[1]['Std.Err.']['dtil']

    print("Coefficient is {}, SE is equal to {}".format(coef_est, se))

    return coef_est, se, dtil, ytil, rfit
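Before applying the estimator to the gun data, it can be sanity-checked on simulated data with a known effect. The following is a minimal sketch under our own assumptions (the variable names and the true coefficient of 0.5 are illustrative, not part of the lab):

# Hypothetical smoke test on simulated data (not the lab's dataset)
rng = np.random.default_rng(0)
n, p = 500, 10
z_sim = rng.normal(size=(n, p))
g = z_sim[:, :3].sum(axis=1, keepdims=True)        # confounding signal
d_sim = g + rng.normal(size=(n, 1))                # treatment depends on z
y_sim = 0.5 * d_sim + g + rng.normal(size=(n, 1))  # true effect is 0.5
clu_sim = np.arange(n).reshape(n, 1)               # one cluster per observation
# the printed coefficient should be close to 0.5
DML2_for_NN(z_sim, d_sim, y_sim, nfold=2, clu=clu_sim, num_epochs=20, batch_size=32)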

15.2 Estimating the effect with DML for neural nets

# Treatment Variable
D    <- rdata[which(colnames(rdata) == d)]
# Outcome Variable
Y     <- rdata[which(colnames(rdata) == y)]
# Construct matrix Z
Z <- rdata[which(colnames(rdata) %in% c(X1,X2,census))]

# inputs
y_nn <- as.matrix(Y)
d_nn <- as.matrix(D)
z_nn <- as.matrix(Z)
clu <- rdata[which(colnames(rdata) == "CountyCode")]

# DML with a NN
set.seed(123)
DML2.nn <- DML2.for.NN(z_nn, d_nn, y_nn, nfold=2, clu=clu, num_epochs=100, batch_size=10)
## fold: 1  2  
## coef (se) = 0.106278 (0.0642281)

 

# Create main variables
Y = rdata['logghomr']
D = rdata['logfssl']
Z = rdata.drop(['logghomr', 'logfssl', 'CountyCode'], axis=1)
CLU = rdata['CountyCode']

# as matrix
y = Y.to_numpy().reshape( len(Y) , 1 )
d = D.to_numpy().reshape( len(Y) , 1 )
z = Z.to_numpy()
clu = CLU.to_numpy().reshape( len(Y) , 1 )

# DML with a NN
DML2_nn = DML2_for_NN(z, d, y, nfold=2, clu=clu, num_epochs=100, batch_size=10)
## 0  
## 1  
## Coefficient is 0.2984273747970456, SE is equal to 0.10284209429900727
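The two runs give point estimates of roughly 0.11 (R) and 0.30 (Python), with clustered standard errors of about 0.06 and 0.10. The implementations differ because the random fold splits and the network initializations are not synchronized across languages, so some variation of this kind is expected, particularly with only two folds.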