Gradient descent¶

In [1]:
import numpy as np
import matplotlib.pyplot as plt

Define a function returning $z = f(x, y)$ and a function returning its gradient $\left(\frac{\partial f}{\partial x}, \frac{\partial f}{\partial y}\right)$.¶

In particular, use $z = f(x, y) = -\left(\cos^2 x + \cos^2 y \right)^2$.
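
By the chain rule, $\frac{d}{dx}\cos^2 x = -2\cos x \sin x = -\sin 2x$, so the partial derivatives coded in the next cell are

$$\frac{\partial f}{\partial x} = -2\left(\cos^2 x + \cos^2 y\right)\left(-2\cos x \sin x\right) = 2\left(\cos^2 x + \cos^2 y\right)\sin 2x,$$

and symmetrically $\frac{\partial f}{\partial y} = 2\left(\cos^2 x + \cos^2 y\right)\sin 2y$.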

In [2]:
def f(x, y): # Define function z = f(x, y).
    z = -(np.cos(x)**2 + np.cos(y)**2)**2
    return z

def gradient_f(x, y): # Define function grad(z) = (df_dx, df_dy).
    df_dx = -2 * (np.cos(x)**2 + np.cos(y)**2) * (2 * np.cos(x) * (-np.sin(x)))
    df_dy = -2 * (np.cos(x)**2 + np.cos(y)**2) * (2 * np.cos(y) * (-np.sin(y)))
    return (df_dx, df_dy)
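
As a sanity check on the hand-derived gradient (a small sketch, not part of the original run), the analytic partials can be compared to central finite differences at a few random points; the step size h = 1e-6 is an arbitrary choice.

# Sketch: verify gradient_f() against central finite differences.
h = 1e-6
rng = np.random.default_rng(0)
for x0, y0 in rng.uniform(-np.pi/2, np.pi/2, size=(3, 2)):
    df_dx, df_dy = gradient_f(x0, y0)
    num_dx = (f(x0 + h, y0) - f(x0 - h, y0)) / (2 * h)  # central difference in x
    num_dy = (f(x0, y0 + h) - f(x0, y0 - h)) / (2 * h)  # central difference in y
    assert np.allclose([df_dx, df_dy], [num_dx, num_dy], atol=1e-5)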

Make 2D matrices grid_x, grid_y, and grid_z for a 3D plot of $\left\{(x, y, z)\right\}$, where $z=f(x, y)$¶

In [3]:
grid_n = 20
limit = np.pi/2
x_coords = np.linspace(start=-limit, stop=limit, num=grid_n)
y_coords = x_coords
grid_x, grid_y = np.meshgrid(x_coords, y_coords) # coordinate matrices from coordinate vectors
# https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html
vectorized_f = np.vectorize(f)
# set grid_z[i, j] = f(x = grid_x[i, j], y = grid_y[i, j]) for every i, j
grid_z = vectorized_f(grid_x, grid_y)
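
Because f() is built from NumPy ufuncs, it already broadcasts over arrays, so np.vectorize is a convenience here rather than a speed-up (the NumPy docs note it is essentially a Python loop). A one-line check of the equivalence:

# f() broadcasts on its own, so this matches the vectorized result.
assert np.allclose(grid_z, f(grid_x, grid_y))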

Show gradient descent:¶

  • Plot $z = f(x, y)$ over the domain $-\frac{\pi}{2} \le x, y \le \frac{\pi}{2}$
  • Run gradient descent to find the $(x, y)$ to minimize $z = f(x, y)$
  • Show progress on graph
In [4]:
# plot z = f(x, y) over -pi/2 <= x, y <= pi/2
fig = plt.figure(figsize=(10, 10)) # (width, height) in inches
ax  = fig.add_subplot(111, projection='3d') # 111 => nrows=1, ncols=1, index=1
ax.plot_wireframe(grid_x, grid_y, grid_z, color='gray', linewidth=1)
ax.grid(False)

# prepare to start gradient descent at (x, y)
x = -.98*limit
y = -.70*limit
print(f'Start at ({x:.2}, {y:.2}).')
old_x = x # save old values for plotting line segments showing steps taken
old_y = y
n_steps = 20
alpha = .1 # step size parameter (library-quality algorithm varies alpha with a line search)

# run gradient descent
for i in range(n_steps):
  df_dx, df_dy = gradient_f(x, y)
  x = x - alpha * df_dx
  y = y - alpha * df_dy

  if True: # this block is not part of the algorithm: it just draws
    print(f'  i={i}, moving ({-alpha * df_dx:.2}, {-alpha * df_dy:.2})')
    z = f(x, y)
    shift = .1
    ax.text(x - shift, y + shift, 0, str(i))
    xy_legend, = ax.plot3D(x, y, 0, 'ob', markersize=7, label=f'$(x, y)$') # blue circle in 2D z=0 plane
    xyz_legend, = ax.plot3D(x, y, z, 'or', markersize=7, label=f'$(x, y, z=f(x, y))$') # red circle in 3D
    label = r'$-\alpha (\partial f / \partial x, \partial f / \partial y)$'
    gradient_legend, = ax.plot3D([x, old_x], [y, old_y], 0, '-', color='lime', linewidth=2, label=label)
    ax.plot3D([x, x], [y, y], [0, z], '-r', linewidth=1)
    old_x = x
    old_y = y
    
print(f'End at ({x:.2}, {y:.2}).')

ax.view_init(elev=60, azim=-50)
plt.title('Gradient descent')
plt.xlabel(f'$x$')
plt.ylabel(f'$y$')
ax.set_zlabel(f'$z$')
# Calling plt.legend() with no arguments would list every artist plotted above: 2*n_steps
# points plus n_steps line segments. Instead, the handle returned by each ax.plot3D() call
# was saved (and overwritten on every loop iteration), so the handles left after the final
# iteration can be used to build a legend with only three items:
plt.legend(handles=[xy_legend, xyz_legend, gradient_legend])
plt.savefig(fname='gradientDescent.png')
Start at (-1.5, -1.1).
  i=0, moving (0.0026, 0.034)
  i=1, moving (0.0032, 0.04)
  i=2, moving (0.004, 0.048)
  i=3, moving (0.0052, 0.058)
  i=4, moving (0.0068, 0.071)
  i=5, moving (0.0093, 0.087)
  i=6, moving (0.013, 0.11)
  i=7, moving (0.019, 0.12)
  i=8, moving (0.028, 0.13)
  i=9, moving (0.042, 0.12)
  i=10, moving (0.062, 0.1)
  i=11, moving (0.089, 0.071)
  i=12, moving (0.13, 0.046)
  i=13, moving (0.18, 0.029)
  i=14, moving (0.26, 0.017)
  i=15, moving (0.31, 0.0095)
  i=16, moving (0.25, 0.004)
  i=17, moving (0.095, 0.0011)
  i=18, moving (0.021, 0.00023)
  i=19, moving (0.0041, 4.5e-05)
End at (-0.001, -1.1e-05).

Convert the code above into a function¶

In [5]:
def gradient_descent(x_vector, gradient, alpha=.1, epsilon=.01, n=30, verbose=False):
    """Minimize a function by gradient descent.

    x_vector: starting point (NumPy scalar or array)
    gradient: function mapping x_vector to the gradient of the objective at x_vector
    alpha: step size; epsilon: stop once the L1 norm of the gradient falls below this
    n: maximum number of steps; verbose: print progress at each step
    """
    for i in np.arange(n):
        gr = gradient(x_vector)
        x_vector = x_vector - alpha * gr
        gr_size = np.sum(np.abs(gr)) # L1 norm of the gradient, used as the stopping criterion
        if verbose:
            with np.printoptions(precision=3): # set precision for this block only
                # I use np.array([...]) below to ensure that np.printoptions() applies.
                print(f'i={i}, x_vector={np.array([x_vector])}, gr={np.array([gr])},' +
                      f'gr_size={np.array([gr_size])}, moving {np.array([-alpha*gr])}')
        if (gr_size < epsilon):
            break
    return x_vector
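
As a quick usage example (a sketch, assuming gradient_f() from the first cells is still in scope), the vector interface can drive descent on the earlier cosine surface by wrapping its gradient:

# Sketch: adapt the earlier (x, y) gradient to the vector interface of gradient_descent().
def g_cos(x_vector):
    return np.array(gradient_f(x_vector[0], x_vector[1]))

start = np.array([-.98 * np.pi/2, -.70 * np.pi/2]) # same starting point as the plot above
gradient_descent(x_vector=start, gradient=g_cos, alpha=.1, n=100) # expected to approach the minimum near (0, 0)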

Test gradient_descent() on a familiar 1D function, $f(x) = x^2$¶

In [6]:
def f(x):
    return x**2

def g(x): # gradient
    return 2*x

# Use x=-2 as the starting point for gradient descent to find the minimum.
gradient_descent(x_vector=np.array(-2), gradient=g, alpha=.75, verbose=True)
# Also try alpha=1 (cycle), alpha=1/4 (converges without alternating sign),
# alpha=1.1 (diverges).
i=0, x_vector=[1.], gr=[-4],gr_size=[4], moving [3.]
i=1, x_vector=[-0.5], gr=[2.],gr_size=[2.], moving [-1.5]
i=2, x_vector=[0.25], gr=[-1.],gr_size=[1.], moving [0.75]
i=3, x_vector=[-0.125], gr=[0.5],gr_size=[0.5], moving [-0.375]
i=4, x_vector=[0.062], gr=[-0.25],gr_size=[0.25], moving [0.188]
i=5, x_vector=[-0.031], gr=[0.125],gr_size=[0.125], moving [-0.094]
i=6, x_vector=[0.016], gr=[-0.062],gr_size=[0.062], moving [0.047]
i=7, x_vector=[-0.008], gr=[0.031],gr_size=[0.031], moving [-0.023]
i=8, x_vector=[0.004], gr=[-0.016],gr_size=[0.016], moving [0.012]
i=9, x_vector=[-0.002], gr=[0.008],gr_size=[0.008], moving [-0.006]
Out[6]:
-0.001953125
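
To see the behaviors mentioned in the comment above (monotone convergence at alpha=1/4, cycling at alpha=1, divergence at alpha=1.1), a small sketch that reruns the same 1D problem for several step sizes:

# Sketch: each step multiplies x by (1 - 2*alpha), so that factor's size and sign tell the story.
for a in (0.25, 0.75, 1.0, 1.1):
    result = gradient_descent(x_vector=np.array(-2.0), gradient=g, alpha=a, n=10)
    print(f'alpha={a}: x after at most 10 steps = {result}')
# alpha=0.25 shrinks x toward 0 without changing sign, alpha=0.75 converges with alternating sign,
# alpha=1.0 cycles between -2 and 2, and alpha=1.1 grows without bound.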

Test gradient_descent() on the 2D function from the lecture notes¶

In [7]:
def f(x_vector): # function
    x = x_vector[0]
    y = x_vector[1]
    return x**2 + y**2 - 6*x - 4*y + 13

def g(x_vector): # gradient
    x = x_vector[0]
    y = x_vector[1]
    return np.array([2*x - 6, 2*y - 4])

gradient_descent(x_vector=np.array([0, 0]), gradient=g, alpha=0.4, verbose=True)
i=0, x_vector=[[2.4 1.6]], gr=[[-6 -4]],gr_size=[10], moving [[2.4 1.6]]
i=1, x_vector=[[2.88 1.92]], gr=[[-1.2 -0.8]],gr_size=[2.], moving [[0.48 0.32]]
i=2, x_vector=[[2.976 1.984]], gr=[[-0.24 -0.16]],gr_size=[0.4], moving [[0.096 0.064]]
i=3, x_vector=[[2.995 1.997]], gr=[[-0.048 -0.032]],gr_size=[0.08], moving [[0.019 0.013]]
i=4, x_vector=[[2.999 1.999]], gr=[[-0.01  -0.006]],gr_size=[0.016], moving [[0.004 0.003]]
i=5, x_vector=[[3. 2.]], gr=[[-0.002 -0.001]],gr_size=[0.003], moving [[0.001 0.001]]
Out[7]:
array([2.999808, 1.999872])
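
For this quadratic the answer can be checked analytically: completing the square gives $f(x, y) = (x - 3)^2 + (y - 2)^2$, so the exact minimizer is $(x, y) = (3, 2)$ with $f(3, 2) = 0$, and the iterate above agrees to about three decimal places.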

Check whether stochastic gradient descent (SGD) can help with speed¶

In [8]:
from sklearn.datasets import make_classification, make_regression
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier, SGDRegressor
import time

n = 1000000
X, y = make_classification(n_samples=n, n_features=10, n_redundant=3, n_informative=7,
                           random_state=0, n_clusters_per_class=1)
clfs = [linear_model.LogisticRegression(), SGDClassifier(loss='log_loss')]
for clf in clfs:
    start = time.time()
    clf.fit(X, y)
    end = time.time()
    print(f'clf={clf}, time={end - start:.3}')
X, y = make_regression(n_samples=n, n_features=10, noise=2, random_state=0)
models = [linear_model.LinearRegression(), SGDRegressor(loss='squared_error', penalty=None)]
for model in models:
    start = time.time()
    model.fit(X, y)
    end = time.time()
    print(f'model={model}, time={end - start:.3}')
clf=LogisticRegression(), time=1.13
clf=SGDClassifier(loss='log_loss'), time=2.27
model=LinearRegression(), time=0.456
model=SGDRegressor(penalty=None), time=1.61

Well, SGD slowed down both the classification fit and the regression fit! This requires further investigation.
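
One way to follow up (a sketch, not a definitive benchmark; the sample sizes are arbitrary choices) is to time both regression estimators across several values of n, since any SGD advantage typically shows up only at larger scale:

# Sketch: compare fit times for the two linear-regression estimators at several sample sizes.
import time
from sklearn.datasets import make_regression
from sklearn import linear_model
from sklearn.linear_model import SGDRegressor

for n_samples in (10_000, 100_000, 1_000_000):
    X, y = make_regression(n_samples=n_samples, n_features=10, noise=2, random_state=0)
    for model in (linear_model.LinearRegression(), SGDRegressor(loss='squared_error', penalty=None)):
        start = time.time()
        model.fit(X, y)
        print(f'n={n_samples}, model={type(model).__name__}, time={time.time() - start:.3}')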