# import packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker, cm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from scipy.interpolate import splrep, BSpline
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.pyplot import subplots
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import sklearn
import sklearn.model_selection  # needed below; `import sklearn` alone does not expose this submodule
import warnings
warnings.filterwarnings('ignore')
Lecture 1
Linear Regression example
Specify parameters
n = 100
p = 1
beta = 3
sigma = 1
Generate data
x = np.random.normal(size=(n, p))
y = x * beta + np.random.normal(size=(n, 1)) * sigma
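That is, the simulated model is $y_i = \beta x_i + \varepsilon_i$ with $\varepsilon_i \sim \mathcal{N}(0, \sigma^2)$, here with $\beta = 3$ and $\sigma = 1$.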
colnames = ['x' + str(i) for i in range(1, p+1)]
colnames.insert(0, 'y')
df = pd.DataFrame(np.hstack((y, x)), columns=colnames)
Fit linear regression model using sklearn
lm = LinearRegression()
lm.fit(x, y)
y_hat = lm.predict(x)
resid = y - y_hat
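For reference, sklearn's estimates can be checked against the closed-form least-squares solution $\hat\beta = (X^\top X)^{-1} X^\top y$; a minimal sketch (not part of the original notebook; `X_design` and `beta_hat` are illustrative names) reusing the `x` and `y` simulated above:

# closed-form OLS: prepend an intercept column, then solve the normal equations
X_design = np.hstack([np.ones((n, 1)), x])
beta_hat = np.linalg.solve(X_design.T @ X_design, X_design.T @ y)
print(beta_hat[0, 0], lm.intercept_[0])  # intercepts should agree
print(beta_hat[1, 0], lm.coef_[0][0])    # slopes should agree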
Plot x vs. y using seaborn
sns.set_theme()
lm_plot = sns.relplot(df, x='x1', y='y', height=3, aspect=1.2)
plt.axline((0, lm.intercept_[0]), slope=lm.coef_[0][0])
Plot x vs. y, including residual distances
y_min = np.minimum(y, y_hat)
y_max = np.maximum(y, y_hat)
lm_plot = sns.relplot(df, x='x1', y='y', height=3, aspect=1.2)
plt.axline((0, lm.intercept_[0]), slope=lm.coef_[0][0])
lm_plot.ax.vlines(x=list(x[:, 0]), ymin=list(y_min[:, 0]), ymax=list(y_max[:, 0]), color='red', alpha=0.5)
Overfitting example
sort_ind = np.argsort(x, axis=0)
xsort = np.take_along_axis(x, sort_ind, axis=0)
ysort = np.take_along_axis(y, sort_ind, axis=0)
tck = splrep(xsort[:, 0], ysort[:, 0], s=20)  # splrep expects 1-D arrays

xspline = np.arange(x.min(), x.max(), 0.01)
yspline = BSpline(*tck)(xspline)
lm_plot = sns.relplot(df, x='x1', y='y', height=3.5, aspect=1.2)
plt.axline((0, lm.intercept_[0]), slope=lm.coef_[0][0], label="Linear Regression")
plt.plot(xspline, yspline, color='orange', label="Spline")
plt.legend(loc='upper left')
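To quantify the overfitting, one can compare training errors; a minimal sketch (not in the original notebook) reusing `tck` and `resid` from above, with `spline_fit` introduced just for illustration:

# the wiggly spline tracks the training data more closely than the linear fit
spline_fit = BSpline(*tck)(x[:, 0])
print('linear training MSE:', np.mean(resid**2))
print('spline training MSE:', np.mean((y[:, 0] - spline_fit)**2))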
Shrinkage plot
Lasso example
This code is adapted from ISLP labs.
n = 100
p = 90
beta = np.zeros(p)
beta[0] = 3
beta[1] = 3
cov = 0.6 * np.ones((p, p))
np.fill_diagonal(cov, 1)
x = np.random.multivariate_normal(mean=np.zeros(p), cov=cov, size=n)
y = np.matmul(x, beta) + np.random.normal(size=n)

x_columns = ['x' + str(i+1) for i in range(p)]
# set up cross-validation
K = 5
kfold = sklearn.model_selection.KFold(K, random_state=0, shuffle=True)
# transformer to standardize input
scaler = StandardScaler(with_mean=True, with_std=True)

lassoCV = sklearn.linear_model.ElasticNetCV(n_alphas=100, l1_ratio=1, cv=kfold)
pipeCV = Pipeline(steps=[('scaler', scaler), ('lasso', lassoCV)])

pipeCV.fit(x, y)
tuned_lasso = pipeCV.named_steps['lasso']

tuned_lasso.alpha_
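Beyond the selected penalty, it is worth checking how sparse the tuned model is; a short sketch (assuming the pipeline above has been fit):

# lasso drives most of the 90 coefficients exactly to zero
print('selected alpha:', tuned_lasso.alpha_)
print('nonzero coefficients:', np.sum(tuned_lasso.coef_ != 0), 'of', p)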
lambdas, soln_array = sklearn.linear_model.Lasso.path(x, y, l1_ratio=1, n_alphas=100)[:2]
soln_path = pd.DataFrame(soln_array.T, columns=x_columns, index=-np.log(lambdas))
path_fig, ax = subplots(figsize=(8, 8))
soln_path.plot(ax=ax, legend=False)
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Standardized coefficients', fontsize=20);
lassoCV_fig, ax = subplots(figsize=(8, 8))
ax.errorbar(-np.log(tuned_lasso.alphas_), tuned_lasso.mse_path_.mean(1),
            yerr=tuned_lasso.mse_path_.std(1) / np.sqrt(K))
ax.axvline(-np.log(tuned_lasso.alpha_), c='k', ls='--')
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Cross-validated MSE', fontsize=20);
Ridge regression
This code is adapted from ISLP labs.
lambdas = 10**np.linspace(8, -2, 100) / y.std()
ridgeCV = sklearn.linear_model.ElasticNetCV(alphas=lambdas,
                                            l1_ratio=0,
                                            cv=kfold)
pipeCV = Pipeline(steps=[('scaler', scaler),
                         ('ridge', ridgeCV)])
pipeCV.fit(x, y)
lambdas, soln_array = sklearn.linear_model.ElasticNet.path(x, y, l1_ratio=0, alphas=lambdas)[:2]
soln_path = pd.DataFrame(soln_array.T, columns=x_columns, index=-np.log(lambdas))
path_fig, ax = subplots(figsize=(8, 8))
soln_path.plot(ax=ax, legend=False)
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Standardized coefficients', fontsize=20);
tuned_ridge = pipeCV.named_steps['ridge']
ridgeCV_fig, ax = subplots(figsize=(8, 8))
ax.errorbar(-np.log(lambdas),
            tuned_ridge.mse_path_.mean(1),
            yerr=tuned_ridge.mse_path_.std(1) / np.sqrt(K))
ax.axvline(-np.log(tuned_ridge.alpha_), c='k', ls='--')
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Cross-validated MSE', fontsize=20);
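For contrast with the lasso above, a quick sketch reusing the fitted `tuned_ridge`: ridge shrinks coefficients toward zero but does not set them exactly to zero.

# ridge keeps all coefficients nonzero, only shrinking them
print('selected alpha:', tuned_ridge.alpha_)
print('nonzero coefficients:', np.sum(tuned_ridge.coef_ != 0), 'of', p)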
Logistic regression example
Generate data
n = 100
p = 2
x = np.random.uniform(-2, 2, size=(n, p))

beta = np.array([2.5, -2.5])
mu = np.matmul(x, beta)
prob = 1/(1 + np.exp(-mu))

y = np.zeros((n))
for i in range(n):
    y[i] = np.random.binomial(1, prob[i], 1)[0]

df = np.hstack([y.reshape((n, 1)), x])
df = pd.DataFrame(df, columns=['y', 'x1', 'x2'])
sns.set_theme()
logit_plot = sns.relplot(df, x='x1', y='x2', hue='y', style='y')
logit_plot.figure.subplots_adjust(top=.9)
Fitting the logistic regression model
log_fit = LogisticRegression()
log_fit.fit(x, y)
coeffs = log_fit.coef_[0]
coeff = -coeffs[0]/coeffs[1]
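Here `coeff` is the slope of the decision boundary: setting the log-odds to zero (the fitted intercept is near zero by construction, since the data were simulated without one) gives $\hat\beta_1 x_1 + \hat\beta_2 x_2 = 0$, i.e. $x_2 = -(\hat\beta_1/\hat\beta_2)\,x_1$.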
Plot $x_1\beta_1 + x_2\beta_2 = 0$
logit_plot = sns.relplot(df, x='x1', y='x2', hue='y', style='y')
plt.axline([0, 0], slope=coeff)
## title
logit_plot.figure.subplots_adjust(top=.9)
logit_plot.figure.suptitle(str(round(coeffs[0], 2)) + r'$x_1$ - ' + str(round(-coeffs[1], 2)) + r'$x_2 = 0$')
## fill in area
x_fill = np.linspace(-2, 2, num=200)
y_line = coeff * x_fill
logit_plot.ax.fill_between(x_fill, y_line, 2, color='blue', alpha=0.2)
logit_plot.ax.fill_between(x_fill, -2, y_line, color='orange', alpha=0.2)

logit_plot.ax.annotate(r'$\bf P(Y=1)<0.5$', (0.5, 2.1), color='blue')
logit_plot.ax.annotate(r'$\bf P(Y=1)>0.5$', (0.5, -2.2), color='darkorange')
# Create a meshgrid for x1 and x2
x1_range = np.linspace(x[:, 0].min(), x[:, 0].max(), 200)
x2_range = np.linspace(x[:, 1].min(), x[:, 1].max(), 200)
X1, X2 = np.meshgrid(x1_range, x2_range)

# Compute the sigmoid function using the fitted logistic regression coefficients
Z = 1 / (1 + np.exp(-(log_fit.intercept_[0] + log_fit.coef_[0, 0]*X1 + log_fit.coef_[0, 1]*X2)))

fig = plt.figure(figsize=(5, 7))
ax = fig.add_subplot(111, projection='3d')

# Set background to white
ax.set_facecolor('white')
fig.patch.set_facecolor('white')

# Plot with smooth color transitions
surf = ax.plot_surface(X1, X2, Z, cmap='coolwarm', antialiased=True, linewidth=0, rstride=1, cstride=1)

ax.view_init(elev=15, azim=65+155)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel(r'P(y=1)')

ax.set_title(" "*115)
ax.text2D(0.5, 0.91, r'g(' + str(round(coeffs[0], 2)) + r'$x_1$ ' +
          str(round(coeffs[1], 2)) + r'$x_2)$',
          transform=ax.transAxes, ha='center', va='top')
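Here $g$ in the title denotes the logistic (sigmoid) function $g(t) = 1/(1 + e^{-t})$, applied to the fitted linear predictor to produce the plotted surface.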
Principal Components Analysis
penguins = sns.load_dataset("penguins")

fig = plt.figure()
ax = Axes3D(fig)
fig.add_axes(ax)
cmap = matplotlib.colors.ListedColormap(sns.color_palette("Paired", 3))

cols = penguins['species'].copy()
cols[cols=='Adelie'] = 1
cols[cols=='Chinstrap'] = 2
cols[cols=='Gentoo'] = 3

sc = ax.scatter3D(penguins['bill_depth_mm'],
                  penguins['bill_length_mm'],
                  penguins['flipper_length_mm'],
                  c=cols,
                  cmap=cmap,
                  alpha=1)
ax.set_xlabel('bill depth')
ax.set_ylabel('bill length')
ax.set_zlabel('flipper length')
ax.set_facecolor((1.0, 1.0, 1.0, 0.0))
x = penguins[['bill_depth_mm', 'bill_length_mm', 'flipper_length_mm']]
x = x.dropna(axis=0)

pca_fit = PCA()
pca_fit.fit(x)
z = pca_fit.transform(x)
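A quick property check (a sketch, not in the original notebook): the principal component scores are uncorrelated by construction.

# off-diagonal correlations between the score columns should be (numerically) zero
print(np.round(np.corrcoef(z, rowvar=False), 3))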
z_df = pd.DataFrame(z[:, 0:2], columns=['z1', 'z2'])
z_df['species'] = penguins['species'].loc[x.index].values  # align labels with the rows kept after dropna

sns.set_theme()
pca_plot = sns.relplot(z_df, x='z1', y='z2', hue='species', palette=sns.color_palette("Paired", 3), height=4)
PC_values = np.linspace(1, 3, 3).reshape(3, 1)
scree_df = np.hstack([PC_values, pca_fit.explained_variance_ratio_.reshape(3, 1)])
scree_df = pd.DataFrame(scree_df, columns=['Principal Components', 'Explained Variance Ratio'])
scree_plot = sns.relplot(scree_df, x='Principal Components', y='Explained Variance Ratio', marker='o', kind='line', height=4)