Chapter 18 Double/Debiased Machine Learning for the Partially Linear Regression Model.
This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model.
References:
Chernozhukov, Chetverikov, Demirer, Duflo, Hansen, Newey, and Robins, "Double/Debiased Machine Learning for Treatment and Structural Parameters": https://arxiv.org/abs/1608.00060
Taddy, "Business Data Science": https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778
The code is based on the latter book.
18.1 DML algorithm
Here we perform estimation and inference on the predictive coefficient \(\alpha\) in the partially linear statistical model, \[ Y = D\alpha + g(X) + U, \quad E (U | D, X) = 0. \] For \(\tilde Y = Y- E(Y|X)\) and \(\tilde D= D- E(D|X)\), we can write \[ \tilde Y = \alpha \tilde D + U, \quad E (U |\tilde D) =0. \] The parameter \(\alpha\) is then estimated by regressing \(\tilde Y\) on \(\tilde D\). The algorithm consumes \(Y, D, X\), together with machine learning methods for learning the conditional expectations \(E(Y|X)\) and \(E(D|X)\); the residuals \(\tilde Y\) and \(\tilde D\) are obtained by cross-fitting, i.e., each observation's residual is predicted from models fit on the other folds.
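Concretely, writing \(\tilde Y_i\) and \(\tilde D_i\) for the cross-fitted residuals, the resulting estimator and its variance estimate are \[ \hat\alpha = \frac{\sum_{i=1}^n \tilde D_i \tilde Y_i}{\sum_{i=1}^n \tilde D_i^2}, \qquad \widehat{\mathrm{Var}}(\hat\alpha) = \frac{\sum_{i=1}^n \tilde D_i^2 \hat U_i^2}{\Big(\sum_{i=1}^n \tilde D_i^2\Big)^2}, \qquad \hat U_i = \tilde Y_i - \hat\alpha \tilde D_i, \] which is, up to the inclusion of an intercept, the OLS regression of \(\tilde Y\) on \(\tilde D\) with heteroskedasticity-robust standard errors carried out in the implementations below.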
The statistical parameter \(\alpha\) has a causal interpretation as the effect of \(D\) on \(Y\) in the causal DAG \[ D\to Y, \quad X\to (D,Y)\] or the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment \(D\) given \(X\): \[ Y(d) = d\alpha + g(X) + U(d),\quad U(d) \text{ indep } D |X, \quad Y = Y(D), \quad U = U(D). \]
# install.packages("hdm")
# install.packages("AER")
# install.packages("randomForest")
library(AER) #applied econometrics library
library(randomForest) #random Forest library
library(hdm) #high-dimensional econometrics library
library(glmnet) #glm net
# Import relevant packages
import pandas as pd
import numpy as np
import pyreadr
from sklearn import preprocessing
import patsy
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
import hdmpy
import random
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import itertools
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_categorical_dtype
from itertools import compress
import statsmodels.formula.api as smf
from sklearn.feature_selection import SelectFromModel
from statsmodels.tools import add_constant
from sklearn.linear_model import ElasticNet
import warnings
"ignore") warnings.filterwarnings(
DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold = 2) {
  nobs <- nrow(x) # number of observations
  foldid <- rep.int(1:nfold, times = ceiling(nobs/nfold))[sample.int(nobs)] # define fold indices
  I <- split(1:nobs, foldid) # split observation indices into folds
  ytil <- dtil <- rep(NA, nobs)
  cat("fold: ")
  for (b in 1:length(I)) {
    dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out
    yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out
    dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold
    yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold
    dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold
    ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residual for the left-out fold
    cat(b, " ")
  }
  rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other
  coef.est <- coef(rfit)[2] # extract coefficient
  se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error
  cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est, se)) # print output
  return(list(coef.est = coef.est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals
}
class standard_skl_model:

    def __init__(self, model):
        self.model = model

    def fit(self, X, Y):
        # Standardize X, then fit the wrapped sklearn model
        self.scaler_X = StandardScaler()
        self.scaler_X.fit(X)
        std_X = self.scaler_X.transform(X)
        self.model.fit(std_X, Y)
        return self

    def predict(self, X):
        # Apply the scaler fitted on the training data, then predict
        std_X = self.scaler_X.transform(X)
        prediction = self.model.predict(std_X)
        return prediction
class rlasso_sklearn:

    def __init__(self, post):
        self.post = post

    def fit(self, X, Y):
        # Fit the rigorous (plug-in penalty) lasso from hdmpy
        self.rlasso_model = hdmpy.rlasso(X, Y, post = self.post)
        return self

    def predict(self, X):
        # Linear prediction with the estimated coefficients (intercept added via add_constant)
        beta = self.rlasso_model.est['coefficients'].to_numpy()
        prediction = (add_constant(X) @ beta).flatten()
        return prediction
def DML2_for_PLM(x, d, y, dreg, yreg, nfold = 2):
    # Number of observations
    nobs = x.shape[0]

    # Define fold indices
    list_1 = [*range(0, nfold, 1)] * nobs
    sample = np.random.choice(nobs, nobs, replace = False).tolist()
    foldid = [list_1[index] for index in sample]

    # Create a split function (similar to R's split)
    def split(x, f):
        count = max(f) + 1
        return tuple(list(itertools.compress(x, (el == i for el in f))) for i in range(count))

    # Split observation indices into folds
    list_2 = [*range(0, nobs, 1)]
    I = split(list_2, foldid)

    # Create arrays to save the residuals
    dtil = np.zeros(len(x)).reshape(len(x), 1)
    ytil = np.zeros(len(x)).reshape(len(x), 1)

    # Loop over folds
    for b in range(0, len(I)):
        # Split data - indices to keep are stored in mask as booleans
        include_idx = set(I[b])  # a set gives fast membership checks, but does not preserve the order of I[b]
        mask = np.array([(i in include_idx) for i in range(len(x))])

        # Fit the ML methods on all folds except the left-out one
        dfit = dreg(x[~mask,], d[~mask,])
        yfit = yreg(x[~mask,], y[~mask,])

        # Predict the left-out fold
        dhat = dfit.predict(x[mask,])
        yhat = yfit.predict(x[mask,])

        # Save residuals for the left-out fold
        dtil[mask] = d[mask,] - dhat.reshape(len(I[b]), 1)
        ytil[mask] = y[mask,] - yhat.reshape(len(I[b]), 1)
        print(b, " ")

    # Collect the residuals in a dataframe
    data_2 = pd.DataFrame(np.concatenate((ytil, dtil), axis = 1), columns = ['ytil', 'dtil'])

    # OLS of the Y-residuals on the D-residuals with heteroskedasticity-robust (HC3) standard errors
    model = "ytil ~ dtil"
    baseline_ols = smf.ols(model, data = data_2).fit().get_robustcov_results(cov_type = "HC3")
    coef_est = baseline_ols.summary2().tables[1]['Coef.']['dtil']
    se = baseline_ols.summary2().tables[1]['Std.Err.']['dtil']

    Final_result = {'coef_est': coef_est, 'se': se, 'dtil': dtil, 'ytil': ytil}

    print(f"\n Coefficient (se) = {coef_est} ({se})")

    return Final_result
data(GrowthData) # Barro-Lee growth data
y = as.matrix(GrowthData[, 1])            # outcome: growth rate
d = as.matrix(GrowthData[, 3])            # treatment: initial wealth
x = as.matrix(GrowthData[, -c(1, 2, 3)])  # controls: country characteristics
cat(sprintf("\n length of y is %g \n", length(y) ))
##
## length of y is 90
# load GrowthData
= pyreadr.read_r("./data/GrowthData.RData")
rdata_read = rdata_read[ 'GrowthData' ]
GrowthData = GrowthData.shape[0]
n
= GrowthData.iloc[ : , 0 ].to_numpy().reshape( GrowthData.shape[0] , 1 )
y = GrowthData.iloc[ : , 2].to_numpy().reshape( GrowthData.shape[0] , 1 )
d = GrowthData.iloc[ : , 3:].to_numpy()
x
print( f'\n length of y is \n {y.size}' )
##
## length of y is
## 90
cat(sprintf("\n num features x is %g \n", dim(x)[2] ))
##
## num features x is 60
print( f'\n num features x is \n {x.shape[ 1 ]}' )
##
## num features x is
## 60
lres = summary(lm(y ~ d + x))$coef[2, 1:2]
cat(sprintf("\n Naive OLS that uses all features w/o cross-fitting \n"))
##
## Naive OLS that uses all features w/o cross-fitting
lres = sm.OLS(y, add_constant(np.concatenate((d, x), axis = 1))).fit().summary2().tables[1].iloc[1, 0:2]
print("\n Naive OLS that uses all features w/o cross-fitting \n")
##
## Naive OLS that uses all features w/o cross-fitting
cat(sprintf("\ncoef (se) = %g (%g)\n", lres[1] , lres[2]))
##
## coef (se) = -0.00937799 (0.0298877)
print( f'\n coef (se) = {lres[ 0 ]} ({lres[ 1 ]})' )
##
## coef (se) = -0.009377988732377857 (0.02988772637232471)
cat(sprintf("\n DML with OLS w/o feature selection \n"))
##
## DML with OLS w/o feature selection
print( "\n DML with OLS w/o feature selection \n" )
##
## DML with OLS w/o feature selection
#DML with OLS
set.seed(1)
dreg <- function(x, d){ glmnet(x, d, lambda = 0) } # ML method = OLS using glmnet; using lm gives bugs
yreg <- function(x, y){ glmnet(x, y, lambda = 0) } # ML method = OLS

DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold = 10)
## fold: 1 2 3 4 5 6 7 8 9 10
## coef (se) = 0.01013 (0.0167061)
#DML with OLS
def dreg(x, d):
    # Lasso with alpha = 0 reduces to OLS on standardized features
    result = standard_skl_model(linear_model.Lasso(alpha = 0, random_state = 0)).fit(x, d)
    return result

def yreg(x, y):
    result = standard_skl_model(linear_model.Lasso(alpha = 0, random_state = 0)).fit(x, y)
    return result

DML2_ols = DML2_for_PLM(x, d, y, dreg, yreg, 10)
## 0
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
##
## Coefficient (se) = -0.005431864263545796 (0.011458472710862365)
cat(sprintf("\n DML with Lasso \n"))
##
## DML with Lasso
print( "\n DML with Lasso \n" )
##
## DML with Lasso
#DML with Lasso:
set.seed(1)
dreg <- function(x, d){ rlasso(x, d, post = FALSE) } # ML method = lasso from hdm
yreg <- function(x, y){ rlasso(x, y, post = FALSE) } # ML method = lasso from hdm

DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold = 10)
## fold: 1 2 3 4 5 6 7 8 9 10
## coef (se) = -0.0352021 (0.0161357)
# DML with LASSO
def dreg(x, d):
    result = rlasso_sklearn(post = False).fit(x, d)
    return result

def yreg(x, y):
    result = rlasso_sklearn(post = False).fit(x, y)
    return result

DML2_lasso = DML2_for_PLM(x, d, y, dreg, yreg, 10)
## 0
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
##
## Coefficient (se) = -0.040998025492277684 (0.01706393234389699)
cat(sprintf("\n DML with Random Forest \n"))
##
## DML with Random Forest
print( "\n DML with Random Forest \n" )
##
## DML with Random Forest
#DML with Random Forest:
dreg <- function(x, d){ randomForest(x, d) } # ML method = random forest
yreg <- function(x, y){ randomForest(x, y) } # ML method = random forest

set.seed(1)
DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold = 10)
## fold: 1 2 3 4 5 6 7 8 9 10
## coef (se) = -0.0365831 (0.0151539)
#DML with Random Forest:
def dreg(x, d):
    result = RandomForestRegressor(random_state = 0, n_estimators = 500, max_features = 20, n_jobs = 4, min_samples_leaf = 5).fit(x, d)
    return result

def yreg(x, y):
    result = RandomForestRegressor(random_state = 0, n_estimators = 500, max_features = 20, n_jobs = 4, min_samples_leaf = 5).fit(x, y)
    return result

DML2_RF = DML2_for_PLM(x, d, y, dreg, yreg, 2)  # nfold set to 2 due to computation time
## 0
## 1
##
## Coefficient (se) = -0.03019599714145048 (0.013698557551547192)
cat(sprintf("\n DML with Lasso/Random Forest \n"))
##
## DML with Lasso/Random Forest
print( "\n DML with Lasso/Random Forest \n" )
##
## DML with Lasso/Random Forest
#DML MIX:
dreg <- function(x, d){ rlasso(x, d, post = FALSE) } # ML method = lasso from hdm
yreg <- function(x, y){ randomForest(x, y) }         # ML method = random forest

set.seed(1)
DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold = 10)
## fold: 1 2 3 4 5 6 7 8 9 10
## coef (se) = -0.0375019 (0.0135088)
#DML MIX:
def dreg(x, d):
    result = rlasso_sklearn(post = False).fit(x, d)
    return result

def yreg(x, y):
    result = RandomForestRegressor(random_state = 0, n_estimators = 500, max_features = 20, n_jobs = 4, min_samples_leaf = 5).fit(x, y)
    return result

DML2_RF = DML2_for_PLM(x, d, y, dreg, yreg, 2)  # nfold set to 2 due to computation time
## 0
## 1
##
## Coefficient (se) = -0.043979438435625365 (0.014275559775082948)
prRes.D <- c(mean((DML2.OLS$dtil)^2),
             mean((DML2.lasso$dtil)^2),
             mean((DML2.RF$dtil)^2))
prRes.Y <- c(mean((DML2.OLS$ytil)^2),
             mean((DML2.lasso$ytil)^2),
             mean((DML2.RF$ytil)^2))
prRes <- rbind(sqrt(prRes.D), sqrt(prRes.Y))

rownames(prRes) <- c("RMSE D", "RMSE Y")
colnames(prRes) <- c("OLS", "Lasso", "RF")
print(prRes, digit = 2)
## OLS Lasso RF
## RMSE D 0.467 0.372 0.372
## RMSE Y 0.054 0.052 0.046
mods = [DML2_ols, DML2_lasso, DML2_RF]
mods_name = ["OLS", "Lasso", "RF"]

def mdl(model, model_name):
    # Out-of-sample RMSE of the cross-fitted residuals: square, average, then take the root
    RMSEY = np.sqrt(np.mean(model['ytil'] ** 2))
    RMSED = np.sqrt(np.mean(model['dtil'] ** 2))

    result = pd.DataFrame({model_name: [RMSEY, RMSED]}, index = ['RMSEY', 'RMSED'])
    return result

RES = [mdl(model, name) for model, name in zip(mods, mods_name)]

pr_Res = pd.concat(RES, axis = 1)

pr_Res.round(7)
## OLS Lasso RF
## RMSEY 0.0 0.000284 0.000348
## RMSED 0.0 0.003598 0.013163
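Since DML2.for.PLM (and its Python counterpart DML2_for_PLM) returns the point estimate together with its robust standard error, an approximate 95% confidence interval for \(\alpha\) follows from the usual normal approximation. A minimal R sketch using the Lasso-based fit above; the same recipe applies to DML2.OLS and DML2.RF, or to the coef_est and se entries returned by the Python function.
# approximate 95% confidence interval for alpha (normal approximation)
ci.lower <- DML2.lasso$coef.est - 1.96 * DML2.lasso$se
ci.upper <- DML2.lasso$coef.est + 1.96 * DML2.lasso$se
cat(sprintf("95%% CI for alpha: [%g, %g]\n", ci.lower, ci.upper))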