Chapter 4 Simple Exercise on Overfitting
4.1 1. First set p=n
R code
# No additional packages needed
Python code
import numpy as np
import random
import statsmodels.api as sm
R code
set.seed(123)
n = 1000
p = n
X<- matrix(rnorm(n*p), n, p)
Y<- rnorm(n)
Python code
random.seed(10)
n = 1000
p = n
X = np.random.normal(0, 1, size=(n, p))
Y = np.random.normal(0, 1,n)
R code
print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 1
print("R2 is")
## [1] "R2 is"
print(summary(lm(Y~X))$r.squared)
## [1] 1
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(summary(lm(Y~X))$adj.r.squared)
## [1] NaN
Python code
mod = sm.OLS(Y, X) # Describe model
res = mod.fit()
est2 = mod.fit()
print("p/n is",p/n)
## p/n is 1.0
print("R2 is",res.rsquared)
## R2 is 1.0
print("Adjusted R2 is",est2.rsquared_adj)
## Adjusted R2 is nan
##
## C:\Users\MSI-NB\ANACON~1\envs\TENSOR~2\lib\site-packages\statsmodels\regression\linear_model.py:1728: RuntimeWarning: divide by zero encountered in true_divide
## return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
## C:\Users\MSI-NB\ANACON~1\envs\TENSOR~2\lib\site-packages\statsmodels\regression\linear_model.py:1728: RuntimeWarning: invalid value encountered in double_scalars
## return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
4.2 2. Second, set p=n/2.
R code
set.seed(123)
n = 1000
p = n/2
X<- matrix(rnorm(n*p), n, p)
Y<- rnorm(n)
Python code
random.seed(10)
n = 1000
p = n/2
X = np.random.normal(0, 1, size=(n, int(p)))
Y = np.random.normal(0, 1,n)
mod = sm.OLS(Y, X) # Describe model
res = mod.fit()
# print(res.summary())
R code
print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 0.5
print("R2 is")
## [1] "R2 is"
print(summary(lm(Y~X))$r.squared)
## [1] 0.4922339
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(summary(lm(Y~X))$adj.r.squared)
## [1] -0.01654975
Python code
est2 = mod.fit()
print("p/n is",p/n)
## p/n is 0.5
print("R2 is",res.rsquared)
## R2 is 0.5189611963708867
print("Adjusted R2 is",est2.rsquared_adj)
## Adjusted R2 is 0.037922392741773336
4.3 3. Third, set p/n =.05
R code
set.seed(123)
n = 1000
p = .05*n
X<- matrix(rnorm(n*p), n, p)
Y<- rnorm(n)
Python code
random.seed(10)
n = 1000
p = 0.05*n
int(p)
## 50
X = np.random.normal(0, 1, size=(n, int(p)))
Y = np.random.normal(0, 1,n)
mod = sm.OLS(Y, X) # Describe model
res = mod.fit()
# print(res.summary())
R code
print("p/n is")
## [1] "p/n is"
print(p/n)
## [1] 0.05
print("R2 is")
## [1] "R2 is"
print(summary(lm(Y~X))$r.squared)
## [1] 0.04295907
print("Adjusted R2 is")
## [1] "Adjusted R2 is"
print(summary(lm(Y~X))$adj.r.squared)
## [1] -0.00746458
Python code
print("p/n is \n",p/n )
#print("summary()\n",res.summary())
## p/n is
## 0.05
print("rsquared\n",res.rsquared)
## rsquared
## 0.048571820855195846
print("rsquared_adj\n",res.rsquared_adj)
## rsquared_adj
## -0.0015033464682148168