Chapter 18 Double/Debiased Machine Learning for the Partially Linear Regression Model.

This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model.

Reference:

https://arxiv.org/abs/1608.00060

https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778

The code is based on the book.

18.1 DML algorithm

Here we perform estimation and inference on the predictive coefficient \(\alpha\) in the partially linear statistical model, \[ Y = D\alpha + g(X) + U, \quad E (U | D, X) = 0. \] For \(\tilde Y = Y- E(Y|X)\) and \(\tilde D= D- E(D|X)\), we can write \[ \tilde Y = \alpha \tilde D + U, \quad E (U |\tilde D) =0. \] The parameter \(\alpha\) is then estimated by regressing the residual \(\tilde Y\) on the residual \(\tilde D\). The algorithm consumes \(Y, D, X\) together with two machine learning methods, one for learning \(E(Y|X)\) and one for learning \(E(D|X)\); the residuals \(\tilde Y\) and \(\tilde D\) are obtained by cross-fitting, i.e. each observation's residual is computed from models fit on the other folds.
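To make the final stage explicit: after cross-fitting, \(\hat \alpha\) is simply the least squares coefficient from regressing \(\tilde Y\) on \(\tilde D\). As a sketch (the code below includes an intercept and uses an HC-type robust variance; this display drops the intercept and uses the simplest HC0 form), \[ \hat \alpha = \frac{\sum_{i=1}^n \tilde D_i \tilde Y_i}{\sum_{i=1}^n \tilde D_i^2}, \qquad \widehat{se}(\hat \alpha) = \frac{\sqrt{\sum_{i=1}^n \tilde D_i^2 \hat U_i^2}}{\sum_{i=1}^n \tilde D_i^2}, \qquad \hat U_i = \tilde Y_i - \hat \alpha \tilde D_i. \]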

The statistical parameter \(\alpha\) has a causal interpretation as the effect of \(D\) on \(Y\) in the causal DAG \[ D\to Y, \quad X\to (D,Y)\] or in the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment \(D\) given \(X\): \[ Y(d) = d\alpha + g(X) + U(d),\quad U(d) \text{ indep } D |X, \quad Y = Y(D), \quad U = U(D). \]
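To see why, note that \(E(U \mid D, X) = 0\) together with \(U(d)\) being independent of \(D\) given \(X\) implies \(E[U(d) \mid X] = E[U \mid D = d, X] = 0\), so \[ E[Y(d) - Y(d') \mid X] = (d - d')\,\alpha, \] i.e. \(\alpha\) is the per-unit effect of the treatment on the outcome.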

# install.packages("hdm")
# install.packages("AER")
# install.packages("randomForest")
library(AER)  #applied econometrics library
library(randomForest)  #random Forest library
library(hdm) #high-dimensional econometrics library
library(glmnet) #glm net

 

# Import relevant packages
import pandas as pd
import numpy as np
import pyreadr
from sklearn import preprocessing
import patsy

from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

import hdmpy
import random
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import itertools
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_categorical_dtype
from itertools import compress
import statsmodels.formula.api as smf
from sklearn.feature_selection import SelectFromModel
from statsmodels.tools import add_constant
from sklearn.linear_model import ElasticNet
import warnings
warnings.filterwarnings("ignore")
DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2) {
  
    nobs <- nrow(x) #number of observations
  
    foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices
  
    I <- split(1:nobs, foldid)  #split observation indices into folds  
  
    ytil <- dtil <- rep(NA, nobs)
  
    cat("fold: ")
  
    for(b in 1:length(I)){
        
        dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out
        
        yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out
        
        dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold 
        
        yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold 
        
        dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold
        
        ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residual for the left-out fold
        
        cat(b," ")
        
    }
  
    rfit <- lm(ytil ~ dtil)    #estimate the main parameter by regressing one residual on the other
  
    coef.est <- coef(rfit)[2]  #extract coefficient
  
    se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error
  
    cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se))  #printing output
  
    return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals 
    
}

 

class standard_skl_model:
    def __init__(self, model ):
        self.model = model 
    def fit( self, X, Y ):
        # Standardize X before fitting the underlying model
        self.scaler_X = StandardScaler()
        self.scaler_X.fit( X )
        std_X = self.scaler_X.transform( X )
        self.model.fit( std_X , Y )
        return self
    def predict( self , X ):
        # Apply the scaler fitted on the training data, then predict
        std_X = self.scaler_X.transform( X )
        prediction = self.model.predict( std_X )
        return prediction
      
class rlasso_sklearn:
    def __init__(self, post ):
        self.post = post
    def fit( self, X, Y ):
        # Fit the rigorous lasso (rlasso) estimator from hdmpy
        self.rlasso_model = hdmpy.rlasso( X , Y , post = self.post )                
        return self
    def predict( self , X ):
        beta = self.rlasso_model.est['coefficients'].to_numpy()
        prediction = ( add_constant( X ) @ beta ).flatten()
        return prediction
      
def DML2_for_PLM(x, d, y, dreg, yreg, nfold = 2 ):
    
    # Number of observations
    nobs = x.shape[0]
    
    # Define fold indices: randomly assign each observation to one of nfold folds
    list_1 = [*range(0, nfold, 1)]*nobs
    sample = np.random.choice(nobs,nobs, replace=False).tolist()
    foldid = [list_1[index] for index in sample]

    # Create a split function (similar to R's split)
    def split(x, f):
        count = max(f) + 1
        return tuple( list(itertools.compress(x, (el == i for el in f))) for i in range(count) ) 

    # Split observation indices into folds 
    list_2 = [*range(0, nobs, 1)]
    I = split(list_2, foldid)
    
    # Create arrays to save the residuals
    dtil = np.zeros( len(x) ).reshape( len(x) , 1 )
    ytil = np.zeros( len(x) ).reshape( len(x) , 1 )
    
    # Loop over folds
    for b in range(0,len(I)):
    
        # Build a boolean mask that is True for observations in the left-out fold I[b]
        include_idx = set(I[b])  # a set gives fast membership checks
        mask = np.array([(i in include_idx) for i in range(len(x))])

        # Fit the ML methods on all folds except the left-out one
        dfit = dreg(x[~mask,], d[~mask,])
        yfit = yreg(x[~mask,], y[~mask,])

        # Predict on the left-out fold
        dhat = dfit.predict( x[mask,] )
        yhat = yfit.predict( x[mask,] )

        # Record residuals for the left-out fold
        dtil[mask] =  d[mask,] - dhat.reshape( len(I[b]) , 1 )
        ytil[mask] = y[mask,] - yhat.reshape( len(I[b]) , 1 )
        print(b, " ")
    
    # Create dataframe 
    data_2 = pd.DataFrame(np.concatenate( ( ytil, dtil ), axis = 1), columns = ['ytil','dtil' ])
   
    # Final stage: OLS of ytil on dtil with heteroskedasticity-robust (HC3) standard errors
    model = "ytil ~ dtil"
    baseline_ols = smf.ols( model , data = data_2 ).fit().get_robustcov_results(cov_type = "HC3")
    coef_est = baseline_ols.summary2().tables[ 1 ][ 'Coef.' ][ 'dtil' ]
    se = baseline_ols.summary2().tables[ 1 ][ 'Std.Err.' ][ 'dtil' ]
    
    Final_result = { 'coef_est' : coef_est , 'se' : se , 'dtil' : dtil , 'ytil' : ytil }

    print( f"\n Coefficient (se) = {coef_est} ({se})" )
    
    return Final_result
    
data(GrowthData)                     # Barro-Lee growth data
y= as.matrix(GrowthData[,1])         # outcome: growth rate
d= as.matrix(GrowthData[,3])         # treatment: initial wealth
x= as.matrix(GrowthData[,-c(1,2,3)]) # controls: country characteristics

cat(sprintf("\n length of y is %g \n", length(y) ))
## 
##  length of y is 90

 

# load GrowthData
rdata_read = pyreadr.read_r("./data/GrowthData.RData")
GrowthData = rdata_read[ 'GrowthData' ]
n = GrowthData.shape[0]

y = GrowthData.iloc[ : , 0 ].to_numpy().reshape( GrowthData.shape[0] , 1 )
d = GrowthData.iloc[ : , 2].to_numpy().reshape( GrowthData.shape[0] , 1 )
x = GrowthData.iloc[ : , 3:].to_numpy()

print( f'\n length of y is \n {y.size}' )
## 
##  length of y is 
##  90
cat(sprintf("\n num features x is %g \n", dim(x)[2] ))
## 
##  num features x is 60

 

print( f'\n num features x is \n {x.shape[ 1 ]}' )
## 
##  num features x is 
##  60
lres=summary(lm(y~d +x))$coef[2,1:2]
cat(sprintf("\n Naive OLS that uses all features w/o cross-fitting \n"))
## 
##  Naive OLS that uses all features w/o cross-fitting

 

lres = sm.OLS( y , add_constant(np.concatenate( ( d , x ) , axis = 1 ) )  ).fit().summary2().tables[ 1 ].iloc[ 1, 0:2 ]
print( "\n Naive OLS that uses all features w/o cross-fitting \n" )
## 
##  Naive OLS that uses all features w/o cross-fitting
cat(sprintf("\ncoef (se) = %g (%g)\n", lres[1] , lres[2]))
## 
## coef (se) = -0.00937799 (0.0298877)

 

print( f'\n coef (se) = {lres[ 0 ]} ({lres[ 1 ]})' )
## 
##  coef (se) = -0.009377988732377857 (0.02988772637232471)
cat(sprintf("\n DML with OLS w/o feature selection \n"))
## 
##  DML with OLS w/o feature selection

 

print( "\n DML with OLS w/o feature selection \n" )
## 
##  DML with OLS w/o feature selection
#DML with OLS
set.seed(1)
dreg <- function(x,d){ glmnet(x, d, lambda = 0) } #ML method = OLS via glmnet with lambda = 0; using lm directly causes issues with predict here
yreg <- function(x,y){ glmnet(x, y, lambda = 0) } #ML method = OLS via glmnet with lambda = 0

DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)
## fold: 1  2  3  4  5  6  7  8  9  10  
## coef (se) = 0.01013 (0.0167061)

 

#DML with OLS (implemented as Lasso with alpha = 0, i.e. an unpenalized linear fit on standardized X)
def dreg(x,d):
    result = standard_skl_model( linear_model.Lasso( alpha = 0 , random_state = 0 )).fit( x, d )
    return result

def yreg(x,y):
    result = standard_skl_model( linear_model.Lasso( alpha = 0 ,  random_state = 0 ) ).fit( x, y )
    return result

DML2_ols = DML2_for_PLM(x, d, y, dreg, yreg, 10 )
## 0  
## 1  
## 2  
## 3  
## 4  
## 5  
## 6  
## 7  
## 8  
## 9  
## 
##  Coefficient (se) = -0.005431864263545796 (0.011458472710862365)
cat(sprintf("\n DML with Lasso \n"))
## 
##  DML with Lasso

 

print( "\n DML with Lasso \n" )
## 
##  DML with Lasso
#DML with Lasso:
set.seed(1)
dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method= lasso from hdm 
yreg <- function(x,y){ rlasso(x,y, post=FALSE) } #ML method = lasso from hdm
DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)
## fold: 1  2  3  4  5  6  7  8  9  10  
## coef (se) = -0.0352021 (0.0161357)

 

# DML with LASSO

def dreg(x,d):
    result = rlasso_sklearn( post = False ).fit( x , d )
    return result

def yreg(x,y):
    result = rlasso_sklearn( post = False ).fit( x , y )
    return result

DML2_lasso = DML2_for_PLM( x , d , y , dreg , yreg , 10 )
## 0  
## 1  
## 2  
## 3  
## 4  
## 5  
## 6  
## 7  
## 8  
## 9  
## 
##  Coefficient (se) = -0.040998025492277684 (0.01706393234389699)
cat(sprintf("\n DML with Random Forest \n"))
## 
##  DML with Random Forest

 

print( "\n DML with Random Forest \n" )
## 
##  DML with Random Forest
#DML with Random Forest:
dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest 
yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest
set.seed(1)
DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)
## fold: 1  2  3  4  5  6  7  8  9  10  
## coef (se) = -0.0365831 (0.0151539)

 

#DML with Random Forest:
def dreg( x , d ):
    result = RandomForestRegressor( random_state = 0 , n_estimators = 500 , max_features = 20 , n_jobs = 4 , min_samples_leaf = 5 ).fit( x, d )
    return result

def yreg( x , y ):
    result = RandomForestRegressor( random_state = 0 , n_estimators = 500 , max_features = 20 , n_jobs = 4 , min_samples_leaf = 5 ).fit( x, y )
    return result

DML2_RF = DML2_for_PLM( x , d , y , dreg , yreg , 2 )   # set to 2 due to computation time
## 0  
## 1  
## 
##  Coefficient (se) = -0.03019599714145048 (0.013698557551547192)
cat(sprintf("\n DML with Lasso/Random Forest \n"))
## 
##  DML with Lasso/Random Forest

 

print( "\n DML with Lasso/Random Forest \n" )
## 
##  DML with Lasso/Random Forest
#DML MIX:
dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method = lasso from hdm
yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest
set.seed(1)
DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)
## fold: 1  2  3  4  5  6  7  8  9  10  
## coef (se) = -0.0375019 (0.0135088)

 

#DML MIX:
def dreg( x , d ):
    result = rlasso_sklearn( post = False ).fit( x , d )
    return result

def yreg( x , y ):
    result = RandomForestRegressor( random_state = 0 , n_estimators = 500 , max_features = 20 , n_jobs = 4 , min_samples_leaf = 5 ).fit( x, y )
    return result

DML2_RF = DML2_for_PLM( x , d , y , dreg , yreg , 2 )   # set to 2 due to computation time
## 0  
## 1  
## 
##  Coefficient (se) = -0.043979438435625365 (0.014275559775082948)
prRes.D<- c( mean((DML2.OLS$dtil)^2), 
            mean((DML2.lasso$dtil)^2), 
            mean((DML2.RF$dtil)^2));

prRes.Y<- c(mean((DML2.OLS$ytil)^2), 
            mean((DML2.lasso$ytil)^2),
            mean((DML2.RF$ytil)^2));

prRes<- rbind(sqrt(prRes.D), sqrt(prRes.Y)); 

rownames(prRes)<- c("RMSE D", "RMSE Y");

colnames(prRes)<- c("OLS", "Lasso", "RF")


print(prRes,digit=2)
##          OLS Lasso    RF
## RMSE D 0.467 0.372 0.372
## RMSE Y 0.054 0.052 0.046

 

mods = [ DML2_ols, DML2_lasso, DML2_RF ]
mods_name = ["OLS", "Lasso", 'RF']

def mdl( model , model_name ):
    
    RMSEY = np.sqrt( np.mean( model[ 'ytil' ] ** 2 ) ) # RMSE of the outcome residuals
    RMSED = np.sqrt( np.mean( model[ 'dtil' ] ** 2 ) ) # RMSE of the treatment residuals
    
    result = pd.DataFrame( { model_name : [ RMSEY , RMSED ]} , index = [ 'RMSEY' , 'RMSED' ])
    return result

RES = [ mdl( model , name ) for model, name in zip( mods , mods_name ) ]

pr_Res = pd.concat( RES, axis = 1)

pr_Res.round( 7 )
##        OLS     Lasso        RF
## RMSEY  0.0  0.000284  0.000348
## RMSED  0.0  0.003598  0.013163