Chapter 4 Simple Exercise on Overfitting
4.1 1. First set p=n
R code
# No additional packages needed
Python code
import numpy as np
import random
import statsmodels.api as sm
R code
# Simulate pure noise: n observations, p = n predictors, and an outcome
# that is drawn independently of every predictor.
set.seed(123)
n <- 1000
p <- n
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)
Python code
# Simulate pure noise with as many predictors as observations (p = n).
# random.seed() only seeds Python's built-in generator; the draws below come
# from NumPy's global generator, which has to be seeded separately for the
# simulated data to be reproducible.
random.seed(10)
np.random.seed(10)
n = 1000
p = n
X = np.random.normal(0, 1, size=(n, p))
Y = np.random.normal(0, 1,n)
R code
# Fit the regression of Y on X once and reuse its summary for both
# statistics, instead of refitting the same model for every print.
# Lines starting with "##" are the output captured when the document was built.
fit_summary <- summary(lm(Y ~ X))
print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 1
print("R2 is")
## [1] "R2 is"
print(fit_summary$r.squared)
## [1] 1
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(fit_summary$adj.r.squared)
## [1] NaN
Python code
# Regress Y on X by ordinary least squares and report the fit statistics.
# NOTE: sm.OLS does not add an intercept column on its own, so this fit has
# no constant term (R's lm(Y ~ X) includes one).
# Lines starting with "##" are the output captured when the document was built.
mod = sm.OLS(Y, X)  # Describe model
res = mod.fit()
# Reuse the fitted results rather than solving the same n x p problem again.
est2 = res
print("p/n is",p/n)
## p/n is 1.0
print("R2 is",res.rsquared)
## R2 is 1.0
# With p = n the adjusted R2 is nan; the warnings captured below show a
# division by zero on the residual degrees of freedom.
print("Adjusted R2 is",est2.rsquared_adj)
## Adjusted R2 is nan
##
## C:\Users\MSI-NB\ANACON~1\envs\TENSOR~2\lib\site-packages\statsmodels\regression\linear_model.py:1728: RuntimeWarning: divide by zero encountered in true_divide
## return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
## C:\Users\MSI-NB\ANACON~1\envs\TENSOR~2\lib\site-packages\statsmodels\regression\linear_model.py:1728: RuntimeWarning: invalid value encountered in double_scalars
## return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
4.2 2. Second, set p=n/2.
R code
# Simulate pure noise again, now with half as many predictors as
# observations (p = n / 2).
set.seed(123)
n <- 1000
p <- n / 2
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)
Python code
# Simulate pure noise with half as many predictors as observations.
# random.seed() only seeds Python's built-in generator; NumPy's global
# generator must be seeded too, otherwise every run draws different data.
# NOTE(review): the "##" outputs recorded further down presumably came from an
# unseeded NumPy run, so a fresh run will print different R2 values.
random.seed(10)
np.random.seed(10)
n = 1000
p = n/2  # float (500.0); converted with int() where a column count is needed
X = np.random.normal(0, 1, size=(n, int(p)))
Y = np.random.normal(0, 1, n)
# Set up an ordinary-least-squares regression of Y on X. sm.OLS does not add
# an intercept column on its own, so the model has no constant term.
mod = sm.OLS(Y, X) # Describe model
# Estimate the model; `res` holds the fitted results whose R2 is printed
# further down.
res = mod.fit()
# print(res.summary())
R code
# Fit the regression of Y on X once and reuse its summary for both
# statistics, instead of refitting the same model for every print.
# Lines starting with "##" are the output captured when the document was built.
fit_summary <- summary(lm(Y ~ X))
print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 0.5
print("R2 is")
## [1] "R2 is"
print(fit_summary$r.squared)
## [1] 0.4922339
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(fit_summary$adj.r.squared)
## [1] -0.01654975
Python code
# Report the fit statistics for p = n/2. `res` already holds the fitted
# model, so it is reused here instead of fitting the same model a second time.
# Lines starting with "##" are the output captured when the document was built.
est2 = res
print("p/n is",p/n)
## p/n is 0.5
print("R2 is",res.rsquared)
## R2 is 0.5189611963708867
print("Adjusted R2 is",est2.rsquared_adj)
## Adjusted R2 is 0.037922392741773336
4.3 3. Third, set p/n =.05
R code
# Simulate pure noise once more, now with few predictors relative to the
# sample size (p = 0.05 * n).
set.seed(123)
n <- 1000
p <- 0.05 * n
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)
Python code
# Simulate pure noise with few predictors relative to the sample size.
# random.seed() only seeds Python's built-in generator; NumPy's global
# generator must be seeded too, otherwise every run draws different data.
# NOTE(review): the "##" outputs recorded further down presumably came from an
# unseeded NumPy run, so a fresh run will print different R2 values.
random.seed(10)
np.random.seed(10)
n = 1000
p = 0.05*n  # float (50.0); converted with int() where a column count is needed
int(p)
## 50
X = np.random.normal(0, 1, size=(n, int(p)))
Y = np.random.normal(0, 1, n)
# Set up an ordinary-least-squares regression of Y on X. sm.OLS does not add
# an intercept column on its own, so the model has no constant term.
mod = sm.OLS(Y, X) # Describe model
# Estimate the model; `res` holds the fitted results whose R2 is printed
# further down.
res = mod.fit()
# print(res.summary())
R code
# Fit the regression of Y on X once and reuse its summary for both
# statistics, instead of refitting the same model for every print.
# Lines starting with "##" are the output captured when the document was built.
fit_summary <- summary(lm(Y ~ X))
print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 0.05
print("R2 is")
## [1] "R2 is"
print(fit_summary$r.squared)
## [1] 0.04295907
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(fit_summary$adj.r.squared)
## [1] -0.00746458
Python code
# Report the predictor-to-sample ratio and the fit statistics of `res`.
# Lines starting with "##" are the output captured when the document was built.
print("p/n is \n",p/n )
#print("summary()\n",res.summary())
## p/n is
## 0.05
print("rsquared\n",res.rsquared)
## rsquared
## 0.048571820855195846
print("rsquared_adj\n",res.rsquared_adj)
## rsquared_adj
## -0.0015033464682148168