Chapter 10 Deep Neural Networks for Wage Prediction
So far we have considered many machine learning methods such as Lasso and Random Forests for building a predictive model. In this lab, we extend our toolbox by returning to our wage prediction problem and showing how a neural network can be used for prediction.
10.1 Data Preparation
Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015.
load("./data/wage2015_subsample_inference.Rdata")
Z <- subset(data, select = -c(lwage, wage)) # regressors
# Import relevant packages
import pandas as pd
import numpy as np
import pyreadr
from sklearn import preprocessing
import patsy
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
rdata_read = pyreadr.read_r("./data/wage2015_subsample_inference.Rdata")
data = rdata_read['data']
n = data.shape[0]
First, we split the data into a training and a testing sample and then normalize it.
# Splitting the data
set.seed(1234)
training <- sample(nrow(data), nrow(data)*(3/4), replace = FALSE)

data_train <- data[training, 1:16]
data_test <- data[-training, 1:16]

# Normalize the data
mean <- apply(data_train, 2, mean)
std <- apply(data_train, 2, sd)
data_train <- scale(data_train, center = mean, scale = std)
data_test <- scale(data_test, center = mean, scale = std)
data_train <- as.data.frame(data_train)
data_test <- as.data.frame(data_test)
# Import relevant packages for splitting data
import random
import math

# Setting seed to make the results replicable (generating random numbers)
np.random.seed(0)
random = np.random.randint(0, data.shape[0], size=math.floor(data.shape[0]))
data["random"] = random
data_2 = data.sort_values(by=['random'])

# Creating training and testing sample
data_train = data_2[:math.floor(n*3/4)]   # training sample
data_test = data_2[math.floor(n*3/4):]    # testing sample

data_train = data_train.iloc[:, 0:16]
data_test = data_test.iloc[:, 0:16]

# Normalizing the data: fit the scaler on the training sample only and
# use it to transform both samples (as in the R code above)
scaler = preprocessing.StandardScaler().fit(data_train)

data_train_scaled = scaler.transform(data_train)
data_test_scaled = scaler.transform(data_test)

columns = list(data_train)

data_train_scaled = pd.DataFrame(data_train_scaled, columns=columns)
data_test_scaled = pd.DataFrame(data_test_scaled, columns=columns)
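Note that the test sample is standardized with the mean and standard deviation estimated on the training sample, just as in the R code above; this keeps the preprocessing step free of information from the test observations.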
Then, we construct the inputs for our network.
<- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we"
X_basic <- as.formula(paste("lwage", "~", X_basic))
formula_basic
<- model.matrix(formula_basic,data_train)
model_X_basic_train <- model.matrix(formula_basic,data_test)
model_X_basic_test
<- data_train$lwage
Y_train <- data_test$lwage Y_test
= "lwage ~ sex + exp1 + shs + hsg+ scl + clg + mw + so + we"
formula_basic = patsy.dmatrices(formula_basic, data_train_scaled, return_type='dataframe')
Y_train, model_X_basic_train = patsy.dmatrices(formula_basic, data_test_scaled, return_type='dataframe') Y_test, model_X_basic_test
10.2 Neural Networks
First, we need to determine the structure of our network. We use the R package keras (and the corresponding tensorflow.keras API in Python) to build a simple sequential neural network with three dense layers, applying the ReLU activation function in the two hidden layers.
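The ReLU (rectified linear unit) activation simply sets negative inputs to zero, \(\mathrm{ReLU}(x) = \max(0, x)\), which makes the network nonlinear while remaining cheap to compute.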
library(keras)

build_model <- function() {
  model <- keras_model_sequential() %>%
    layer_dense(units = 20, activation = "relu", # ReLU activation function
                input_shape = dim(model_X_basic_train)[2]) %>%
    layer_dense(units = 10, activation = "relu") %>%
    layer_dense(units = 1)

  model %>% compile(
    optimizer = optimizer_adam(lr = 0.005), # Adam optimizer
    loss = "mse",
    metrics = c("mae")
  )
}
# define the keras model
model = Sequential()
model.add(Dense(20, input_dim = model_X_basic_train.shape[1], activation = 'relu'))
model.add(Dense(10, activation = 'relu'))
model.add(Dense(1))

# Importing relevant packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Compiling the keras model
opt = keras.optimizers.Adam(learning_rate=0.005)
mse = tf.keras.losses.MeanSquaredError()
mae = tf.keras.metrics.MeanAbsoluteError(name="mean_absolute_error", dtype=None)
Let us have a look at the structure of our network in detail.
model <- build_model()
summary(model)
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_5 (Dense) (None, 20) 220
## ________________________________________________________________________________
## dense_4 (Dense) (None, 10) 210
## ________________________________________________________________________________
## dense_3 (Dense) (None, 1) 11
## ================================================================================
## Total params: 441
## Trainable params: 441
## Non-trainable params: 0
## ________________________________________________________________________________
model.compile(loss=mse, optimizer=opt, metrics=mae)
model.summary(line_length=None, positions=None, print_fn=None)
## Model: "sequential"
## _________________________________________________________________
## Layer (type) Output Shape Param #
## =================================================================
## dense (Dense) (None, 20) 220
## _________________________________________________________________
## dense_1 (Dense) (None, 10) 210
## _________________________________________________________________
## dense_2 (Dense) (None, 1) 11
## =================================================================
## Total params: 441
## Trainable params: 441
## Non-trainable params: 0
## _________________________________________________________________
We have \(441\) trainable parameters in total.
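This total can be verified by hand: the design matrix has \(10\) columns (intercept plus nine regressors), so the first layer has \(10 \times 20 + 20 = 220\) weights and biases, the second \(20 \times 10 + 10 = 210\), and the output layer \(10 \times 1 + 1 = 11\), which sums to \(441\).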
Now, let us train the network. Note that this takes substantial computation time. To reduce it, we can use a GPU as an accelerator; the extent of the speed-up varies with a number of factors, including the model architecture, the batch size, and the complexity of the input pipeline.
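As a minimal check before training (assuming the tensorflow R package is installed alongside keras), we can verify whether TensorFlow actually sees a GPU:

library(tensorflow)
# List the GPUs visible to TensorFlow; an empty list means training runs on the CPU
tf$config$list_physical_devices("GPU")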
# Training the network
num_epochs <- 1000
model %>% fit(model_X_basic_train, Y_train,
              epochs = num_epochs, batch_size = 100, verbose = 0)
# Fitting the keras model on the dataset
num_epochs = 1000
model.fit(model_X_basic_train, Y_train, epochs=150, batch_size=10)
After training the neural network, we can evaluate the performance of our model on the test sample.
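Concretely, we compute the out-of-sample mean squared error and report \(R^2 = 1 - MSE_{\mathrm{test}}/\widehat{\mathrm{Var}}(Y_{\mathrm{test}})\), i.e. the share of the variation of log wages in the test sample that the network explains.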
# Evaluating performance
model %>% evaluate(model_X_basic_test, Y_test, verbose = 0)
## loss mae
## 0.8424253 0.7014177
# Calculating the performance measures
pred.nn <- model %>% predict(model_X_basic_test)
MSE.nn <- summary(lm((Y_test - pred.nn)^2 ~ 1))$coef[1:2]
R2.nn <- 1 - MSE.nn[1]/var(Y_test)
# Printing R^2
cat("R^2 of the neural network:",R2.nn)
## R^2 of the neural network: 0.1365884
model.metrics_names
## ['loss', 'mean_absolute_error']
model.evaluate(model_X_basic_test, Y_test, verbose=0)
## [0.8705639839172363, 0.7134189009666443]
pred_nn = model.predict(model_X_basic_test)

import statsmodels.api as sm
import statsmodels.formula.api as smf

resid_basic = (Y_test - pred_nn)**2
MSE_nn_basic = sm.OLS( resid_basic, np.ones(resid_basic.shape[0]) ).fit().summary2().tables[1].iloc[0, 0:2]
R2_nn_basic = 1 - ( MSE_nn_basic[0]/Y_test.var() )

print( f"The R^2 using NN is equal to = {R2_nn_basic[0]}" ) # R^2 NN (basic model)
## The R^2 using NN is equal to = 0.13011187192270934