Chapter 4 Simple Exercise on Overfitting

4.1 1. First, set p=n.

R code

# No additional packages needed

 

Python code

import numpy as np
import random
import statsmodels.api as sm

R code

# Simulate pure noise with as many regressors as observations (p = n).
set.seed(123)
n <- 1000
p <- n

# n x p matrix of rnorm() draws; Y is drawn separately, so it is unrelated to X.
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)

 

Python code

# Seed NumPy's global generator: the draws below come from np.random, which
# the standard-library random.seed() does not affect.
np.random.seed(10)
n = 1000
p = n  # as many regressors as observations

# Pure-noise data: X is an n x p matrix of N(0, 1) draws and Y is drawn
# separately, so there is no true relationship between them.
X = np.random.normal(0, 1, size=(n, p))
Y = np.random.normal(0, 1, n)

R code

# Fit the regression once and reuse its summary, so the model is not
# re-estimated for every statistic that is printed.
fit_summary <- summary(lm(Y ~ X))

print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 1
print("R2 is")
## [1] "R2 is"
print(fit_summary$r.squared)
## [1] 1
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(fit_summary$adj.r.squared)
## [1] NaN

 

Python code

# Regress the noise outcome Y on the noise regressors X.
# NOTE(review): sm.OLS does not add an intercept on its own, whereas the R
# version lm(Y~X) does; wrap X in sm.add_constant(X) if the two should match.
mod = sm.OLS(Y, X)    # Describe model
res = mod.fit()

# est2 is kept as a second name for the same results; fitting the identical
# model again would only repeat the computation.
est2 = res

print("p/n is",p/n)
## p/n is 1.0
print("R2 is",res.rsquared)
## R2 is 1.0
print("Adjusted R2 is",est2.rsquared_adj)
## Adjusted R2 is nan
## 
## C:\Users\MSI-NB\ANACON~1\envs\TENSOR~2\lib\site-packages\statsmodels\regression\linear_model.py:1728: RuntimeWarning: divide by zero encountered in true_divide
##   return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
## C:\Users\MSI-NB\ANACON~1\envs\TENSOR~2\lib\site-packages\statsmodels\regression\linear_model.py:1728: RuntimeWarning: invalid value encountered in double_scalars
##   return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)

4.2 2. Second, set p=n/2.

R code

# Simulate pure noise with half as many regressors as observations (p = n/2).
set.seed(123)
n <- 1000
p <- n / 2
# n x p matrix of rnorm() draws; Y is drawn separately, so it is unrelated to X.
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)

 

Python code

# Seed NumPy's global generator: the draws below come from np.random, which
# the standard-library random.seed() does not affect.
# NOTE: Python outputs recorded in this document came from unseeded draws, so
# re-running with the seed in place gives slightly different numbers.
np.random.seed(10)
n = 1000
p = n/2  # float (500.0); converted with int() where a shape is required
X = np.random.normal(0, 1, size=(n, int(p)))
Y = np.random.normal(0, 1, n)
# Set up the least-squares regression of Y on X, then estimate it.
mod = sm.OLS(endog=Y, exog=X)
res = mod.fit()
# Uncomment to inspect the full regression table:
# print(res.summary())

R code

# Fit the regression once and reuse its summary, so the model is not
# re-estimated for every statistic that is printed.
fit_summary <- summary(lm(Y ~ X))

print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 0.5
print("R2 is")
## [1] "R2 is"
print(fit_summary$r.squared)
## [1] 0.4922339
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(fit_summary$adj.r.squared)
## [1] -0.01654975

 

Python code

# Reuse the results already stored in res: mod has not changed since it was
# fitted, so calling mod.fit() again would only repeat the same computation.
est2 = res
print("p/n is",p/n)
## p/n is 0.5
print("R2 is",res.rsquared)
## R2 is 0.5189611963708867
print("Adjusted R2 is",est2.rsquared_adj)
## Adjusted R2 is 0.037922392741773336

4.3 3. Third, set p/n = 0.05.

R code

# Simulate pure noise with few regressors relative to observations (p/n = 0.05).
set.seed(123)
n <- 1000

p <- 0.05 * n
# n x p matrix of rnorm() draws; Y is drawn separately, so it is unrelated to X.
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
Y <- rnorm(n)

 

Python code

# Seed NumPy's global generator: the draws below come from np.random, which
# the standard-library random.seed() does not affect.
# NOTE: Python outputs recorded in this document came from unseeded draws, so
# re-running with the seed in place gives slightly different numbers.
np.random.seed(10)
n = 1000
p = 0.05*n  # float (50.0); converted with int() where a shape is required
int(p)
## 50
X = np.random.normal(0, 1, size=(n, int(p)))
Y = np.random.normal(0, 1, n)
# Set up the least-squares regression of Y on X, then estimate it.
mod = sm.OLS(endog=Y, exog=X)
res = mod.fit()
# Uncomment to inspect the full regression table:
# print(res.summary())

R code

# Fit the regression once and reuse its summary, so the model is not
# re-estimated for every statistic that is printed.
fit_summary <- summary(lm(Y ~ X))

print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 0.05
print("R2 is")
## [1] "R2 is"
print(fit_summary$r.squared)
## [1] 0.04295907
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(fit_summary$adj.r.squared)
## [1] -0.00746458

 

Python code

# Report the regressor-to-observation ratio and both fit statistics; each
# label ends in a newline, so the value is printed on the following line.
ratio = p / n
print("p/n is \n", ratio)
#print("summary()\n",res.summary())
## p/n is 
##  0.05
r_squared = res.rsquared
print("rsquared\n", r_squared)
## rsquared
##  0.048571820855195846
adjusted_r_squared = res.rsquared_adj
print("rsquared_adj\n", adjusted_r_squared)
## rsquared_adj
##  -0.0015033464682148168