import numpy as np
import matplotlib.pyplot as plt
In particular, use $z = f(x, y) = -\left(\cos^2 x + \cos^2 y \right)^2$.
def f(x, y): # Define function z = f(x, y).
z = -(np.cos(x)**2 + np.cos(y)**2)**2
return z
def gradient_f(x, y): # Define function grad(z) = (df_dx, df_dy).
df_dx = -2 * (np.cos(x)**2 + np.cos(y)**2) * (2 * np.cos(x) * (-np.sin(x)))
df_dy = -2 * (np.cos(x)**2 + np.cos(y)**2) * (2 * np.cos(y) * (-np.sin(y)))
return (df_dx, df_dy)
grid_n = 20
limit = np.pi/2
x_coords = np.linspace(start=-limit, stop=limit, num=grid_n)
y_coords = x_coords
grid_x, grid_y = np.meshgrid(x_coords, y_coords) # coordinate matrices from coordinate vectors
# https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html
vectorized_f = np.vectorize(f)
# set z = f(x, y) for each z[i, j] = f(x = grid_x[i, j], y = grid_y[i,j]
grid_z = vectorized_f(grid_x, grid_y)
# plot z = f(x, y) over -pi/2 <= x, y <= pi/2
fig = plt.figure(figsize=(10, 10)) # (width, height) in inches
ax = fig.add_subplot(111, projection='3d') # 111 => nrows=1, ncols=1, index=1
ax.plot_wireframe(grid_x, grid_y, grid_z, color='gray', linewidth=1)
ax.grid(False)
# prepare to start gradient descent at (x, y)
x = -.98*limit
y = -.70*limit
print(f'Start at ({x:.2}, {y:.2}).')
old_x = x # save old values for plotting line segments showing steps taken
old_y = y
n_steps = 20
alpha = .1 # step size parameter (library-quality algorithm varies alpha with a line search)
# run gradient descent
for i in range(n_steps):
df_dx, df_dy = gradient_f(x, y)
x = x - alpha * df_dx
y = y - alpha * df_dy
if True: # this block is not part of the algorithm: it just draws
print(f' i={i}, moving ({-alpha * df_dx:.2}, {-alpha * df_dy:.2})')
z = f(x, y)
shift = .1
ax.text(x - shift, y + shift, 0, str(i))
xy_legend, = ax.plot3D(x, y, 0, 'ob', markersize=7, label=f'$(x, y)$') # blue circle in 2D z=0 plane
xyz_legend, = ax.plot3D(x, y, z, 'or', markersize=7, label=f'$(x, y, z=f(x, y))$') # red circle in 3D
label = r'$-\alpha (\partial f / \partial x, \partial f / \partial y)$'
gradient_legend, = ax.plot3D([x, old_x], [y, old_y], 0, '-', color='lime', linewidth=2, label=label)
ax.plot3D([x, x], [y, y], [0, z], '-r', linewidth=1)
old_x = x
old_y = y
print(f'End at ({x:.2}, {y:.2}).')
ax.view_init(elev=60, azim=-50)
plt.title('Gradient descent')
plt.xlabel(f'$x$')
plt.ylabel(f'$y$')
ax.set_zlabel(f'$z$')
# If I call plt.legend(), I get as many points in the legend as I plotted, 2*n_steps,
# plus n_steps lines. I saved some "handle" (?) from each ax.plot() call, overwriting
# it until the last loop iteration. I use those final handles to make a legend
# with only three items:
plt.legend(handles=[xy_legend, xyz_legend, gradient_legend])
plt.savefig(fname='gradientDescent.png')
plt.show(block=False)
Start at (-1.5, -1.1). i=0, moving (0.0026, 0.034) i=1, moving (0.0032, 0.04) i=2, moving (0.004, 0.048) i=3, moving (0.0052, 0.058) i=4, moving (0.0068, 0.071) i=5, moving (0.0093, 0.087) i=6, moving (0.013, 0.11) i=7, moving (0.019, 0.12) i=8, moving (0.028, 0.13) i=9, moving (0.042, 0.12) i=10, moving (0.062, 0.1) i=11, moving (0.089, 0.071) i=12, moving (0.13, 0.046) i=13, moving (0.18, 0.029) i=14, moving (0.26, 0.017) i=15, moving (0.31, 0.0095) i=16, moving (0.25, 0.004) i=17, moving (0.095, 0.0011) i=18, moving (0.021, 0.00023) i=19, moving (0.0041, 4.5e-05) End at (-0.001, -1.1e-05).
def gradient_descent(x_vector, gradient, alpha=.1, epsilon=.01, n=30, verbose=False):
for i in np.arange(n):
gr = gradient(x_vector)
x_vector = x_vector - alpha * gr
gr_size = np.sum(np.abs(gr))
if verbose:
with np.printoptions(precision=3): # set precision for this block only
# I use np.array([...]) below to ensure that np.printoptions() applies.
print(f'i={i}, x_vector={np.array([x_vector])}, gr={np.array([gr])},' +
f'gr_size={np.array([gr_size])}, moving {np.array([-alpha*gr])}')
if (gr_size < epsilon):
break
return x_vector
# Here is a simple function y = x**2 which has a minimum at x=0.
def f(x):
return x**2
def g(x): # gradient
return 2*x
# Use x=1 as the starting point for gradient descent to find the minimum.
gradient_descent(x_vector=np.array(1), gradient=g, verbose=True)
i=0, x_vector=[0.8], gr=[2],gr_size=[2], moving [-0.2] i=1, x_vector=[0.64], gr=[1.6],gr_size=[1.6], moving [-0.16] i=2, x_vector=[0.512], gr=[1.28],gr_size=[1.28], moving [-0.128] i=3, x_vector=[0.41], gr=[1.024],gr_size=[1.024], moving [-0.102] i=4, x_vector=[0.328], gr=[0.819],gr_size=[0.819], moving [-0.082] i=5, x_vector=[0.262], gr=[0.655],gr_size=[0.655], moving [-0.066] i=6, x_vector=[0.21], gr=[0.524],gr_size=[0.524], moving [-0.052] i=7, x_vector=[0.168], gr=[0.419],gr_size=[0.419], moving [-0.042] i=8, x_vector=[0.134], gr=[0.336],gr_size=[0.336], moving [-0.034] i=9, x_vector=[0.107], gr=[0.268],gr_size=[0.268], moving [-0.027] i=10, x_vector=[0.086], gr=[0.215],gr_size=[0.215], moving [-0.021] i=11, x_vector=[0.069], gr=[0.172],gr_size=[0.172], moving [-0.017] i=12, x_vector=[0.055], gr=[0.137],gr_size=[0.137], moving [-0.014] i=13, x_vector=[0.044], gr=[0.11],gr_size=[0.11], moving [-0.011] i=14, x_vector=[0.035], gr=[0.088],gr_size=[0.088], moving [-0.009] i=15, x_vector=[0.028], gr=[0.07],gr_size=[0.07], moving [-0.007] i=16, x_vector=[0.023], gr=[0.056],gr_size=[0.056], moving [-0.006] i=17, x_vector=[0.018], gr=[0.045],gr_size=[0.045], moving [-0.005] i=18, x_vector=[0.014], gr=[0.036],gr_size=[0.036], moving [-0.004] i=19, x_vector=[0.012], gr=[0.029],gr_size=[0.029], moving [-0.003] i=20, x_vector=[0.009], gr=[0.023],gr_size=[0.023], moving [-0.002] i=21, x_vector=[0.007], gr=[0.018],gr_size=[0.018], moving [-0.002] i=22, x_vector=[0.006], gr=[0.015],gr_size=[0.015], moving [-0.001] i=23, x_vector=[0.005], gr=[0.012],gr_size=[0.012], moving [-0.001] i=24, x_vector=[0.004], gr=[0.009],gr_size=[0.009], moving [-0.001]
0.0037778931862957168
def f(x_vector): # function
x = x_vector[0]
y = x_vector[1]
return x**2 + y**2 - 6*x - 4*y + 13
def g(x_vector): # gradient
x = x_vector[0]
y = x_vector[1]
return np.array([2*x - 6, 2*y - 4])
gradient_descent(x_vector=np.array([0, 0]), gradient=g, alpha=0.4, verbose=True)
i=0, x_vector=[[2.4 1.6]], gr=[[-6 -4]],gr_size=[10], moving [[2.4 1.6]] i=1, x_vector=[[2.88 1.92]], gr=[[-1.2 -0.8]],gr_size=[2.], moving [[0.48 0.32]] i=2, x_vector=[[2.976 1.984]], gr=[[-0.24 -0.16]],gr_size=[0.4], moving [[0.096 0.064]] i=3, x_vector=[[2.995 1.997]], gr=[[-0.048 -0.032]],gr_size=[0.08], moving [[0.019 0.013]] i=4, x_vector=[[2.999 1.999]], gr=[[-0.01 -0.006]],gr_size=[0.016], moving [[0.004 0.003]] i=5, x_vector=[[3. 2.]], gr=[[-0.002 -0.001]],gr_size=[0.003], moving [[0.001 0.001]]
array([2.999808, 1.999872])