import numpy as np import copy import math from scipy.stats import norm import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import axes3d from matplotlib.ticker import MaxNLocator dlblue = '#0096ff'; dlorange = '#FF9300'; dldarkred='#C00000'; dlmagenta='#FF40FF'; dlpurple='#7030A0'; plt.style.use('./deeplearning.mplstyle') def load_data_multi(): data = np.loadtxt("data/ex1data2.txt", delimiter=',') X = data[:,:2] y = data[:,2] return X, y ########################################################## # Plotting Routines ########################################################## def plt_house_x(X, y,f_wb=None, ax=None): ''' plot house with aXis ''' if not ax: fig, ax = plt.subplots(1,1) ax.scatter(X, y, marker='x', c='r', label="Actual Value") ax.set_title("Housing Prices") ax.set_ylabel('Price (in 1000s of dollars)') ax.set_xlabel(f'Size (1000 sqft)') if f_wb is not None: ax.plot(X, f_wb, c=dlblue, label="Our Prediction") ax.legend() def mk_cost_lines(x,y,w,b, ax): ''' makes vertical cost lines''' cstr = "cost = (1/2m)*1000*(" ctot = 0 label = 'cost for point' for p in zip(x,y): f_wb_p = w*p[0]+b c_p = ((f_wb_p - p[1])**2)/2 c_p_txt = c_p/1000 ax.vlines(p[0], p[1],f_wb_p, lw=3, color=dlpurple, ls='dotted', label=label) label='' #just one cxy = [p[0], p[1] + (f_wb_p-p[1])/2] ax.annotate(f'{c_p_txt:0.0f}', xy=cxy, xycoords='data',color=dlpurple, xytext=(5, 0), textcoords='offset points') cstr += f"{c_p_txt:0.0f} +" ctot += c_p ctot = ctot/(len(x)) cstr = cstr[:-1] + f") = {ctot:0.0f}" ax.text(0.15,0.02,cstr, transform=ax.transAxes, color=dlpurple) def inbounds(a,b,xlim,ylim): xlow,xhigh = xlim ylow,yhigh = ylim ax, ay = a bx, by = b if (ax > xlow and ax < xhigh) and (bx > xlow and bx < xhigh) \ and (ay > ylow and ay < yhigh) and (by > ylow and by < yhigh): return(True) else: return(False) from mpl_toolkits.mplot3d import axes3d def plt_contour_wgrad(x, y, hist, ax, w_range=[-100, 500, 5], b_range=[-500, 500, 5], contours = [0.1,50,1000,5000,10000,25000,50000], resolution=5, w_final=200, b_final=100,step=10 ): b0,w0 = np.meshgrid(np.arange(*b_range),np.arange(*w_range)) z=np.zeros_like(b0) n,_ = w0.shape for i in range(w0.shape[0]): for j in range(w0.shape[1]): z[i][j] = compute_cost(x, y, w0[i][j], b0[i][j] ) CS = ax.contour(w0, b0, z, contours, linewidths=2, colors=[dlblue, dlorange, dldarkred, dlmagenta, dlpurple]) ax.clabel(CS, inline=1, fmt='%1.0f', fontsize=10) ax.set_xlabel("w"); ax.set_ylabel("b") ax.set_title('Contour plot of cost J(w,b), vs b,w with path of gradient descent') w = w_final; b=b_final ax.hlines(b, ax.get_xlim()[0],w, lw=2, color=dlpurple, ls='dotted') ax.vlines(w, ax.get_ylim()[0],b, lw=2, color=dlpurple, ls='dotted') base = hist[0] for point in hist[0::step]: edist = np.sqrt((base[0] - point[0])**2 + (base[1] - point[1])**2) if(edist > resolution or point==hist[-1]): if inbounds(point,base, ax.get_xlim(),ax.get_ylim()): plt.annotate('', xy=point, xytext=base,xycoords='data', arrowprops={'arrowstyle': '->', 'color': 'r', 'lw': 3}, va='center', ha='center') base=point return # plots p1 vs p2. Prange is an array of entries [min, max, steps]. In feature scaling lab. def plt_contour_multi(x, y, w, b, ax, prange, p1, p2, title="", xlabel="", ylabel=""): contours = [1e2, 2e2,3e2,4e2, 5e2, 6e2, 7e2,8e2,1e3, 1.25e3,1.5e3, 1e4, 1e5, 1e6, 1e7] px,py = np.meshgrid(np.linspace(*(prange[p1])),np.linspace(*(prange[p2]))) z=np.zeros_like(px) n,_ = px.shape for i in range(px.shape[0]): for j in range(px.shape[1]): w_ij = w b_ij = b if p1 <= 3: w_ij[p1] = px[i,j] if p1 == 4: b_ij = px[i,j] if p2 <= 3: w_ij[p2] = py[i,j] if p2 == 4: b_ij = py[i,j] z[i][j] = compute_cost(x, y, w_ij, b_ij ) CS = ax.contour(px, py, z, contours, linewidths=2, colors=[dlblue, dlorange, dldarkred, dlmagenta, dlpurple]) ax.clabel(CS, inline=1, fmt='%1.2e', fontsize=10) ax.set_xlabel(xlabel); ax.set_ylabel(ylabel) ax.set_title(title, fontsize=14) def plt_equal_scale(X_train, X_norm, y_train): fig,ax = plt.subplots(1,2,figsize=(12,5)) prange = [ [ 0.238-0.045, 0.238+0.045, 50], [-25.77326319-0.045, -25.77326319+0.045, 50], [-50000, 0, 50], [-1500, 0, 50], [0, 200000, 50]] w_best = np.array([0.23844318, -25.77326319, -58.11084634, -1.57727192]) b_best = 235 plt_contour_multi(X_train, y_train, w_best, b_best, ax[0], prange, 0, 1, title='Unnormalized, J(w,b), vs w[0],w[1]', xlabel= "w[0] (size(sqft))", ylabel="w[1] (# bedrooms)") # w_best = np.array([111.1972, -16.75480051, -28.51530411, -37.17305735]) b_best = 376.949151515151 prange = [[ 111-50, 111+50, 75], [-16.75-50,-16.75+50, 75], [-28.5-8, -28.5+8, 50], [-37.1-16,-37.1+16, 50], [376-150, 376+150, 50]] plt_contour_multi(X_norm, y_train, w_best, b_best, ax[1], prange, 0, 1, title='Normalized, J(w,b), vs w[0],w[1]', xlabel= "w[0] (normalized size(sqft))", ylabel="w[1] (normalized # bedrooms)") fig.suptitle("Cost contour with equal scale", fontsize=18) #plt.tight_layout(rect=(0,0,1.05,1.05)) fig.tight_layout(rect=(0,0,1,0.95)) plt.show() def plt_divergence(p_hist, J_hist, x_train,y_train): x=np.zeros(len(p_hist)) y=np.zeros(len(p_hist)) v=np.zeros(len(p_hist)) for i in range(len(p_hist)): x[i] = p_hist[i][0] y[i] = p_hist[i][1] v[i] = J_hist[i] fig = plt.figure(figsize=(12,5)) plt.subplots_adjust( wspace=0 ) gs = fig.add_gridspec(1, 5) fig.suptitle(f"Cost escalates when learning rate is too large") #=============== # First subplot #=============== ax = fig.add_subplot(gs[:2], ) # Print w vs cost to see minimum fix_b = 100 w_array = np.arange(-70000, 70000, 1000) cost = np.zeros_like(w_array) for i in range(len(w_array)): tmp_w = w_array[i] cost[i] = compute_cost(x_train, y_train, tmp_w, fix_b) ax.plot(w_array, cost) ax.plot(x,v, c=dlmagenta) ax.set_title("Cost vs w, b set to 100") ax.set_ylabel('Cost') ax.set_xlabel('w') ax.xaxis.set_major_locator(MaxNLocator(2)) #=============== # Second Subplot #=============== tmp_b,tmp_w = np.meshgrid(np.arange(-35000, 35000, 500),np.arange(-70000, 70000, 500)) z=np.zeros_like(tmp_b) for i in range(tmp_w.shape[0]): for j in range(tmp_w.shape[1]): z[i][j] = compute_cost(x_train, y_train, tmp_w[i][j], tmp_b[i][j] ) ax = fig.add_subplot(gs[2:], projection='3d') ax.plot_surface(tmp_w, tmp_b, z, alpha=0.3, color=dlblue) ax.xaxis.set_major_locator(MaxNLocator(2)) ax.yaxis.set_major_locator(MaxNLocator(2)) ax.set_xlabel('w', fontsize=16) ax.set_ylabel('b', fontsize=16) ax.set_zlabel('\ncost', fontsize=16) plt.title('Cost vs (b, w)') # Customize the view angle ax.view_init(elev=20., azim=-65) ax.plot(x, y, v,c=dlmagenta) return # draw derivative line # y = m*(x - x1) + y1 def add_line(dj_dx, x1, y1, d, ax): x = np.linspace(x1-d, x1+d,50) y = dj_dx*(x - x1) + y1 ax.scatter(x1, y1, color=dlblue, s=50) ax.plot(x, y, '--', c=dldarkred,zorder=10, linewidth = 1) xoff = 30 if x1 == 200 else 10 ax.annotate(r"$\frac{\partial J}{\partial w}$ =%d" % dj_dx, fontsize=14, xy=(x1, y1), xycoords='data', xytext=(xoff, 10), textcoords='offset points', arrowprops=dict(arrowstyle="->"), horizontalalignment='left', verticalalignment='top') def plt_gradients(x_train,y_train, f_compute_cost, f_compute_gradient): #=============== # First subplot #=============== fig,ax = plt.subplots(1,2,figsize=(12,4)) # Print w vs cost to see minimum fix_b = 100 w_array = np.linspace(-100, 500, 50) w_array = np.linspace(0, 400, 50) cost = np.zeros_like(w_array) for i in range(len(w_array)): tmp_w = w_array[i] cost[i] = f_compute_cost(x_train, y_train, tmp_w, fix_b) ax[0].plot(w_array, cost,linewidth=1) ax[0].set_title("Cost vs w, with gradient; b set to 100") ax[0].set_ylabel('Cost') ax[0].set_xlabel('w') # plot lines for fixed b=100 for tmp_w in [100,200,300]: fix_b = 100 dj_dw,dj_db = f_compute_gradient(x_train, y_train, tmp_w, fix_b ) j = f_compute_cost(x_train, y_train, tmp_w, fix_b) add_line(dj_dw, tmp_w, j, 30, ax[0]) #=============== # Second Subplot #=============== tmp_b,tmp_w = np.meshgrid(np.linspace(-200, 200, 10), np.linspace(-100, 600, 10)) U = np.zeros_like(tmp_w) V = np.zeros_like(tmp_b) for i in range(tmp_w.shape[0]): for j in range(tmp_w.shape[1]): U[i][j], V[i][j] = f_compute_gradient(x_train, y_train, tmp_w[i][j], tmp_b[i][j] ) X = tmp_w Y = tmp_b n=-2 color_array = np.sqrt(((V-n)/2)**2 + ((U-n)/2)**2) ax[1].set_title('Gradient shown in quiver plot') Q = ax[1].quiver(X, Y, U, V, color_array, units='width', ) qk = ax[1].quiverkey(Q, 0.9, 0.9, 2, r'$2 \frac{m}{s}$', labelpos='E',coordinates='figure') ax[1].set_xlabel("w"); ax[1].set_ylabel("b") def norm_plot(ax, data): scale = (np.max(data) - np.min(data))*0.2 x = np.linspace(np.min(data)-scale,np.max(data)+scale,50) _,bins, _ = ax.hist(data, x, color="xkcd:azure") #ax.set_ylabel("Count") mu = np.mean(data); std = np.std(data); dist = norm.pdf(bins, loc=mu, scale = std) axr = ax.twinx() axr.plot(bins,dist, color = "orangered", lw=2) axr.set_ylim(bottom=0) axr.axis('off') def plot_cost_i_w(X,y,hist): ws = np.array([ p[0] for p in hist["params"]]) rng = max(abs(ws[:,0].min()),abs(ws[:,0].max())) wr = np.linspace(-rng+0.27,rng+0.27,20) cst = [compute_cost(X,y,np.array([wr[i],-32, -67, -1.46]), 221) for i in range(len(wr))] fig,ax = plt.subplots(1,2,figsize=(12,3)) ax[0].plot(hist["iter"], (hist["cost"])); ax[0].set_title("Cost vs Iteration") ax[0].set_xlabel("iteration"); ax[0].set_ylabel("Cost") ax[1].plot(wr, cst); ax[1].set_title("Cost vs w[0]") ax[1].set_xlabel("w[0]"); ax[1].set_ylabel("Cost") ax[1].plot(ws[:,0],hist["cost"]) plt.show() ########################################################## # Regression Routines ########################################################## def compute_gradient_matrix(X, y, w, b): """ Computes the gradient for linear regression Args: X : (array_like Shape (m,n)) variable such as house size y : (array_like Shape (m,1)) actual value w : (array_like Shape (n,1)) Values of parameters of the model b : (scalar ) Values of parameter of the model Returns dj_dw: (array_like Shape (n,1)) The gradient of the cost w.r.t. the parameters w. dj_db: (scalar) The gradient of the cost w.r.t. the parameter b. """ m,n = X.shape f_wb = X @ w + b e = f_wb - y dj_dw = (1/m) * (X.T @ e) dj_db = (1/m) * np.sum(e) return dj_db,dj_dw #Function to calculate the cost def compute_cost_matrix(X, y, w, b, verbose=False): """ Computes the gradient for linear regression Args: X : (array_like Shape (m,n)) variable such as house size y : (array_like Shape (m,)) actual value w : (array_like Shape (n,)) parameters of the model b : (scalar ) parameter of the model verbose : (Boolean) If true, print out intermediate value f_wb Returns cost: (scalar) """ m,n = X.shape # calculate f_wb for all examples. f_wb = X @ w + b # calculate cost total_cost = (1/(2*m)) * np.sum((f_wb-y)**2) if verbose: print("f_wb:") if verbose: print(f_wb) return total_cost # Loop version of multi-variable compute_cost def compute_cost(X, y, w, b): """ compute cost Args: X : (ndarray): Shape (m,n) matrix of examples with multiple features w : (ndarray): Shape (n) parameters for prediction b : (scalar): parameter for prediction Returns cost: (scalar) cost """ m = X.shape[0] cost = 0.0 for i in range(m): f_wb_i = np.dot(X[i],w) + b cost = cost + (f_wb_i - y[i])**2 cost = cost/(2*m) return(np.squeeze(cost)) def compute_gradient(X, y, w, b): """ Computes the gradient for linear regression Args: X : (ndarray Shape (m,n)) matrix of examples y : (ndarray Shape (m,)) target value of each example w : (ndarray Shape (n,)) parameters of the model b : (scalar) parameter of the model Returns dj_dw : (ndarray Shape (n,)) The gradient of the cost w.r.t. the parameters w. dj_db : (scalar) The gradient of the cost w.r.t. the parameter b. """ m,n = X.shape #(number of examples, number of features) dj_dw = np.zeros((n,)) dj_db = 0. for i in range(m): err = (np.dot(X[i], w) + b) - y[i] for j in range(n): dj_dw[j] = dj_dw[j] + err * X[i,j] dj_db = dj_db + err dj_dw = dj_dw/m dj_db = dj_db/m return dj_db,dj_dw #This version saves more values and is more verbose than the assigment versons def gradient_descent_houses(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): """ Performs batch gradient descent to learn theta. Updates theta by taking num_iters gradient steps with learning rate alpha Args: X : (array_like Shape (m,n) matrix of examples y : (array_like Shape (m,)) target value of each example w_in : (array_like Shape (n,)) Initial values of parameters of the model b_in : (scalar) Initial value of parameter of the model cost_function: function to compute cost gradient_function: function to compute the gradient alpha : (float) Learning rate num_iters : (int) number of iterations to run gradient descent Returns w : (array_like Shape (n,)) Updated values of parameters of the model after running gradient descent b : (scalar) Updated value of parameter of the model after running gradient descent """ # number of training examples m = len(X) # An array to store values at each iteration primarily for graphing later hist={} hist["cost"] = []; hist["params"] = []; hist["grads"]=[]; hist["iter"]=[]; w = copy.deepcopy(w_in) #avoid modifying global w within function b = b_in save_interval = np.ceil(num_iters/10000) # prevent resource exhaustion for long runs print(f"Iteration Cost w0 w1 w2 w3 b djdw0 djdw1 djdw2 djdw3 djdb ") print(f"---------------------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|") for i in range(num_iters): # Calculate the gradient and update the parameters dj_db,dj_dw = gradient_function(X, y, w, b) # Update Parameters using w, b, alpha and gradient w = w - alpha * dj_dw b = b - alpha * dj_db # Save cost J,w,b at each save interval for graphing if i == 0 or i % save_interval == 0: hist["cost"].append(cost_function(X, y, w, b)) hist["params"].append([w,b]) hist["grads"].append([dj_dw,dj_db]) hist["iter"].append(i) # Print cost every at intervals 10 times or as many iterations if < 10 if i% math.ceil(num_iters/10) == 0: #print(f"Iteration {i:4d}: Cost {cost_function(X, y, w, b):8.2f} ") cst = cost_function(X, y, w, b) print(f"{i:9d} {cst:0.5e} {w[0]: 0.1e} {w[1]: 0.1e} {w[2]: 0.1e} {w[3]: 0.1e} {b: 0.1e} {dj_dw[0]: 0.1e} {dj_dw[1]: 0.1e} {dj_dw[2]: 0.1e} {dj_dw[3]: 0.1e} {dj_db: 0.1e}") return w, b, hist #return w,b and history for graphing def run_gradient_descent(X,y,iterations=1000, alpha = 1e-6): m,n = X.shape # initialize parameters initial_w = np.zeros(n) initial_b = 0 # run gradient descent w_out, b_out, hist_out = gradient_descent_houses(X ,y, initial_w, initial_b, compute_cost, compute_gradient_matrix, alpha, iterations) print(f"w,b found by gradient descent: w: {w_out}, b: {b_out:0.2f}") return(w_out, b_out, hist_out) # compact extaction of hist data #x = hist["iter"] #J = np.array([ p for p in hist["cost"]]) #ws = np.array([ p[0] for p in hist["params"]]) #dj_ws = np.array([ p[0] for p in hist["grads"]]) #bs = np.array([ p[1] for p in hist["params"]]) def run_gradient_descent_feng(X,y,iterations=1000, alpha = 1e-6): m,n = X.shape # initialize parameters initial_w = np.zeros(n) initial_b = 0 # run gradient descent w_out, b_out, hist_out = gradient_descent(X ,y, initial_w, initial_b, compute_cost, compute_gradient_matrix, alpha, iterations) print(f"w,b found by gradient descent: w: {w_out}, b: {b_out:0.4f}") return(w_out, b_out) def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): """ Performs batch gradient descent to learn theta. Updates theta by taking num_iters gradient steps with learning rate alpha Args: X : (array_like Shape (m,n) matrix of examples y : (array_like Shape (m,)) target value of each example w_in : (array_like Shape (n,)) Initial values of parameters of the model b_in : (scalar) Initial value of parameter of the model cost_function: function to compute cost gradient_function: function to compute the gradient alpha : (float) Learning rate num_iters : (int) number of iterations to run gradient descent Returns w : (array_like Shape (n,)) Updated values of parameters of the model after running gradient descent b : (scalar) Updated value of parameter of the model after running gradient descent """ # number of training examples m = len(X) # An array to store values at each iteration primarily for graphing later hist={} hist["cost"] = []; hist["params"] = []; hist["grads"]=[]; hist["iter"]=[]; w = copy.deepcopy(w_in) #avoid modifying global w within function b = b_in save_interval = np.ceil(num_iters/10000) # prevent resource exhaustion for long runs for i in range(num_iters): # Calculate the gradient and update the parameters dj_db,dj_dw = gradient_function(X, y, w, b) # Update Parameters using w, b, alpha and gradient w = w - alpha * dj_dw b = b - alpha * dj_db # Save cost J,w,b at each save interval for graphing if i == 0 or i % save_interval == 0: hist["cost"].append(cost_function(X, y, w, b)) hist["params"].append([w,b]) hist["grads"].append([dj_dw,dj_db]) hist["iter"].append(i) # Print cost every at intervals 10 times or as many iterations if < 10 if i% math.ceil(num_iters/10) == 0: #print(f"Iteration {i:4d}: Cost {cost_function(X, y, w, b):8.2f} ") cst = cost_function(X, y, w, b) print(f"Iteration {i:9d}, Cost: {cst:0.5e}") return w, b, hist #return w,b and history for graphing def load_house_data(): data = np.loadtxt("./data/houses.txt", delimiter=',', skiprows=1) X = data[:,:4] y = data[:,4] return X, y def zscore_normalize_features(X,rtn_ms=False): """ returns z-score normalized X by column Args: X : (numpy array (m,n)) Returns X_norm: (numpy array (m,n)) input normalized by column """ mu = np.mean(X,axis=0) sigma = np.std(X,axis=0) X_norm = (X - mu)/sigma if rtn_ms: return(X_norm, mu, sigma) else: return(X_norm)