2.10. Projections, Revisited

We’re almost ready to return to our original motivation for studying linear algebra, which was to perform linear regression using multiple input variables. This section outlines the final piece of the puzzle.


Approximating using a Single Vector

In Chapter 2.3, we introduced the approximation problem, which asked:

Among all vectors of the form k \, {\color{#3d81f6} \vec x}, which one is closest to {\color{orange}\vec y}?

We now know the answer is the vector p\color{#004d40} \vec p, where

p=(yxxx)x{\color{#004d40} \vec p} = \left( \frac{{\color{orange}\vec y} \cdot \color{#3d81f6} \vec x}{{\color{#3d81f6}\vec x} \cdot {\color{#3d81f6}\vec x}} \right) \color{#3d81f6} \vec x

p\color{#004d40} \vec p is called the orthogonal projection of y\color{orange} \vec y onto x\color{#3d81f6} \vec x.

Note that I’ve used y\color{orange} \vec y and x\color{#3d81f6} \vec x here rather than u\color{orange} \vec u and v\color{#3d81f6} \vec v, just to make the notation more consistent with the notation we’ll use as we move back into the world of machine learning.


As we’ve studied, the resulting error vector,

e=yp{\color{#d81a60} \vec e} = {\color{orange} \vec y} - {\color{#004d40} \vec p}

is orthogonal to x\color{#3d81f6} \vec x.
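
If you'd like to verify this numerically, here's a quick NumPy sketch (the particular x and y below are made up for illustration): it computes the projection using the formula above and checks that the error vector is orthogonal to x.

import numpy as np

x = np.array([2.0, 1.0])    # the vector we're projecting onto (made up)
y = np.array([1.0, 3.0])    # the vector we're approximating (made up)

p = (y @ x) / (x @ x) * x   # orthogonal projection of y onto x
e = y - p                   # error vector

print(p)        # [2. 1.]
print(e @ x)    # 0.0 (up to floating point error): e is orthogonal to x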

In our original look at the approximation problem, we were approximating y\color{orange} \vec y using a scalar multiple of just a single vector, x\color{#3d81f6} \vec x. The set of all scalar multiples of x\color{#3d81f6} \vec x, denoted by span({x})\text{span}(\{ {\color{#3d81f6} \vec x}\}), is a line in Rn\mathbb{R}^n.

Key idea: instead of projecting onto the subspace spanned by just a single vector, how might we project onto the subspace spanned by multiple vectors?


Approximating using Multiple Vectors

Equipped with our understanding of linear independence, spans, subspaces, and column spaces, we're ready to tackle a more advanced version of the approximation problem. It can be phrased in three equivalent ways:

  1. Among all linear combinations of {\color{#3d81f6} \vec x^{(1)}}, {\color{#3d81f6} \vec x^{(2)}}, ..., {\color{#3d81f6} \vec x^{(d)}}, which one is closest to {\color{orange} \vec y}?

  2. Among all vectors in \text{span}(\{ {\color{#3d81f6} \vec x^{(1)}}, {\color{#3d81f6} \vec x^{(2)}}, \ldots, {\color{#3d81f6} \vec x^{(d)}} \}) = \text{colsp}({\color{#3d81f6} X}), which one is closest to {\color{orange} \vec y}?

  3. Among all vectors of the form {\color{#3d81f6} X} \vec w, where \vec w \in \mathbb{R}^d, which one is closest to {\color{orange} \vec y}?

All three statements are asking the exact same question; I've presented all three forms so that you see more clearly how the ideas of spans, column spaces, and matrix-vector multiplication fit together. I will tend to refer to the latter two versions of the problem the most. In what follows, suppose {\color{#3d81f6} X} is an n \times d matrix whose columns {\color{#3d81f6} \vec x^{(1)}}, {\color{#3d81f6} \vec x^{(2)}}, ..., {\color{#3d81f6} \vec x^{(d)}} are the building blocks we want to approximate {\color{orange} \vec y} with.

First, let’s get the trivial case out of the way. If ycolsp(X){\color{orange} \vec y} \in \text{colsp}({\color{#3d81f6} X}), then the vector in colsp(X)\text{colsp}({\color{#3d81f6} X}) that is closest to y\color{orange} \vec y is just y\color{orange} \vec y itself. If that’s the case, there exists some w\vec w such that y=Xw{\color{orange} \vec y} = {\color{#3d81f6} X} \vec w exactly. This w\vec w is unique if and only if X{\color{#3d81f6} X}'s columns are linearly independent; otherwise, there will be infinitely many such w\vec w’s.

But, that’s not the case I’m really interested in. I care more about when y\color{orange} \vec y is not in colsp(X)\text{colsp}({\color{#3d81f6} X}). (Remember, this is the case we’re interested in when we’re doing linear regression: usually, it’s not possible to make our predictions 100% correct, and we’ll have to settle for some error.)

Then what?

In general, colsp(X)\text{colsp}({\color{#3d81f6} X}) is an rr-dimensional subspace of Rn\mathbb{R}^n, where r=rank(X)r = \text{rank}({\color{#3d81f6} X}). In the diagram below, I’ve used a plane to represent colsp(X)\text{colsp}({\color{#3d81f6} X}); just remember that X{\color{#3d81f6} X} may have more than 3 rows or columns.

[Figure: y sitting outside of colsp(X), which is drawn as a plane.]

Remember that colsp(X)\text{colsp}({\color{#3d81f6} X}) is the set of linear combinations of X{\color{#3d81f6} X}'s columns, so it’s the set of all vectors that can be written as Xw{\color{#3d81f6} X} \vec w, where wRd\vec w \in \mathbb{R}^d.

Let’s consider two possible vectors of the form Xw{\color{#3d81f6} X} \vec w, and look at their corresponding error vectors, e=yXw{\color{#d81a60} \vec e} = {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w. I won’t draw the columns of X{\color{#3d81f6} X}, since those would clutter up the picture.

[Figure: two candidate vectors in colsp(X), X w_o and X w', along with their error vectors e_o (orthogonal to the plane) and e' (not orthogonal to the plane).]

Our problem boils down to finding the w\vec w that minimizes the norm of the error vector. Since it’s a bit easier to work with squared norms (remember that x2=xx\lVert \vec x \rVert^2 = \vec x \cdot \vec x), we’ll minimize the squared norm of the error vector instead; this is an equivalent problem, since the norm is non-negative to begin with.

e2=yXw2which w minimizes this?\underbrace{\lVert {\color{#d81a60} \vec e} \rVert^2 = \lVert {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w \rVert^2}_{\text{which $\vec w$ minimizes this?}}

Think of yXw2\lVert {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w \rVert^2 as a function of w\vec w only; X\color{#3d81f6} X and y\color{orange} \vec y should be thought of as fixed. This is a least squares problem: we’re looking for the w\vec w that minimizes the sum of squared errors between y\color{orange} \vec y and Xw{\color{#3d81f6} X} \vec w.
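
Here's what this objective looks like in code, using the same (made-up) X and y that appear in a worked example later in this section. Different choices of w give different squared errors; our job is to find the w that makes this number as small as possible.

import numpy as np

X = np.array([[1.0, 0.0],
              [2.0, 1.0],
              [1.0, 1.0],
              [0.0, -1.0]])
y = np.array([1.0, 0.0, 4.0, 5.0])

def squared_error(w):
    # The quantity we want to minimize: ||y - Xw||^2.
    e = y - X @ w
    return e @ e

print(squared_error(np.array([1.0, 1.0])))    # 49.0
print(squared_error(np.array([2.0, -2.0])))   # 30.0: smaller, but is it the smallest possible?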

There are two ways we’ll minimize this function of w\vec w:

  1. Using a geometric argument, as we did in the single vector case.

  2. Using calculus. This is more involved than before, since the input variable is a vector, not a scalar, but it can be done, as we’ll see in Chapter 4.

Let’s focus on the geometric argument. What does our intuition tell us? Extending the single vector case, we expect the vector in colsp(X)\text{colsp}({\color{#3d81f6} X}) that is closest to y\color{orange} \vec y to be the orthogonal projection of y\color{orange} \vec y onto colsp(X)\text{colsp}({\color{#3d81f6} X}): that is, its error should be orthogonal to colsp(X)\text{colsp}({\color{#3d81f6} X}).

We could see this intuitively in the visual above. wo\vec w_\text{o} was chosen to make eo\color{#d81a60}\vec e_\text{o} orthogonal to colsp(X)\text{colsp}({\color{#3d81f6} X}), meaning that eo\color{#d81a60}\vec e_\text{o} is orthogonal to every vector in colsp(X)\text{colsp}({\color{#3d81f6} X}). (The subscript “o” stands for “orthogonal”.) w\vec w' was some other arbitrary vector, leading e\color{#d81a60}\vec e' to not be orthogonal to colsp(X)\text{colsp}({\color{#3d81f6} X}). Clearly eo\color{#d81a60}\vec e_\text{o} is shorter than e\color{#d81a60}\vec e'.

To prove that the optimal choice of w\vec w comes from making the error vector orthogonal to colsp(X)\text{colsp}({\color{#3d81f6} X}), you could use the same argument as in the single vector case: if you consider two vectors, wo\vec w_\text{o} with an orthogonal error vector eo\color{#d81a60}\vec e_\text{o}, and w\vec w' with an error vector e\color{#d81a60}\vec e' that is not orthogonal to colsp(X)\text{colsp}({\color{#3d81f6} X}), then we can draw a right triangle with e\color{#d81a60}\vec e' as the hypotenuse and eo\color{#d81a60}\vec e_\text{o} as one of the legs, making

e2>eo2\lVert {\color{#d81a60}\vec e'} \rVert^2 > \lVert {\color{#d81a60}\vec e_\text{o}} \rVert^2

This is such an important idea that I want to redraw the picture above with just the orthogonal projection. Note that I’ve replaced wo\vec w_\text{o} with w\vec w^* to indicate that this is the optimal choice of w\vec w.

[Figure: the orthogonal projection X w* of y onto colsp(X), with its error vector e orthogonal to the plane.]

A Proof that the Orthogonal Error Vector is the Shortest

In office hours, a student asked for more justification that the shortest possible error vector is one that’s orthogonal to the column space of X\color{#3d81f6} X, especially because it’s hard to visualize what orthogonality looks like in higher dimensions. Remember, all it means for two vectors to be orthogonal is that their dot product is 0.

Given that, let’s assume only that

  • wo\vec w_\text{o} is chosen so that eo=yXwo{\color{#d81a60} \vec e_\text{o}} = {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w_\text{o} is orthogonal to colsp(X)\text{colsp}({\color{#3d81f6} X}), and

  • w\vec w' is any other choice of w\vec w, with a corresponding error vector e=yXw{\color{#d81a60} \vec e'} = {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w'.

Just with these facts alone, we can show that eo\color{#d81a60} \vec e_\text{o} is the shortest possible error vector. To do so, let’s start by considering the (squared) magnitude of e\color{#d81a60} \vec e':

\begin{align*} \lVert {\color{#d81a60} \vec e'} \rVert^2 &= \lVert {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w' \rVert^2 \\ &= \underbrace{\lVert {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w' + {\color{#3d81f6} X} \vec w_\text{o} - {\color{#3d81f6} X} \vec w_\text{o} \rVert^2}_{\text{seems arbitrary, but it's a legal operation that brings } \vec w_\text{o} \text{ back in}} \\ &= \lVert \underbrace{{\color{orange} \vec y} - {\color{#3d81f6} X} \vec w_\text{o}}_{``a"} + \underbrace{{\color{#3d81f6} X} (\vec w_\text{o} - \vec w')}_{``b"} \rVert^2 \\ &= \underbrace{\lVert {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w_\text{o} \rVert^2 + \lVert {\color{#3d81f6} X} (\vec w_\text{o} - \vec w') \rVert^2 + 2 ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w_\text{o}) \cdot {\color{#3d81f6} X} (\vec w_\text{o} - \vec w')}_{\lVert \vec a + \vec b \rVert^2 = \lVert \vec a \rVert^2 + \lVert \vec b \rVert^2 + 2 \vec a \cdot \vec b} \\ &= \lVert {\color{#d81a60} \vec e_\text{o}} \rVert^2 + \lVert {\color{#3d81f6} X} (\vec w_\text{o} - \vec w') \rVert^2 + \underbrace{2 {\color{#d81a60} \vec e_\text{o}} \cdot {\color{#3d81f6} X} (\vec w_\text{o} - \vec w')}_{\text{0, because } {\color{#d81a60} \vec e_\text{o}} \text{ is orthogonal to the columns of } \color{#3d81f6} X} \\ &= \lVert {\color{#d81a60} \vec e_\text{o}} \rVert^2 + \lVert {\color{#3d81f6} X} (\vec w_\text{o} - \vec w') \rVert^2 \\ &\geq \lVert {\color{#d81a60} \vec e_\text{o}} \rVert^2 \end{align*}

So, no matter what choice of w\vec w' we make, the magnitude of e\color{#d81a60} \vec e' can’t be smaller than the magnitude of eo\color{#d81a60} \vec e_\text{o}. This means that the error vector that is orthogonal to the column space of X\color{#3d81f6} X is the shortest possible error vector.

This is really just the same proof as in Chapter 2.3, where we argued that eo{\color{#d81a60} \vec e_\text{o}}, X(wow){\color{#3d81f6} X}(\vec w_\text{o} - \vec w'), and e\color{#d81a60} \vec e' form a right triangle, where e\color{#d81a60} \vec e' is the hypotenuse.
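
Here's a quick numerical version of the same argument, reusing the X and y from the earlier sketch. The vector w_o below is computed by solving the normal equation, which we derive in the next subsection; the point here is just that no other choice of w ever produces a shorter error vector.

import numpy as np

X = np.array([[1.0, 0.0],
              [2.0, 1.0],
              [1.0, 1.0],
              [0.0, -1.0]])
y = np.array([1.0, 0.0, 4.0, 5.0])

# w_o makes the error vector orthogonal to colsp(X); it solves X^T X w = X^T y
# (the normal equation, derived next).
w_o = np.linalg.solve(X.T @ X, X.T @ y)
e_o = y - X @ w_o

rng = np.random.default_rng(23)
for _ in range(1000):
    w_prime = w_o + rng.normal(size=2)     # some other choice of w
    e_prime = y - X @ w_prime
    assert e_prime @ e_prime >= e_o @ e_o  # never shorter than the orthogonal error

print(e_o @ e_o)   # roughly 29.67, the smallest possible squared error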


The Normal Equation

We’ve come to the conclusion that in order to find the w\vec w that minimizes

e2=yXw2\lVert {\color{#d81a60} \vec e} \rVert^2 = \lVert {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w \rVert^2

we need to find the w\vec w that makes the error vector e=yXw{\color{#d81a60} \vec e} = {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w orthogonal to colsp(X)\text{colsp}({\color{#3d81f6} X}). colsp(X)\text{colsp}({\color{#3d81f6} X}) is the set of all linear combinations of X{\color{#3d81f6} X}'s columns. So, if we can find an e\color{#d81a60} \vec e that is orthogonal to every column of X{\color{#3d81f6} X}, then it must be orthogonal to any of their linear combinations, too.

So, we're looking for an {\color{#d81a60} \vec e} = {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w that satisfies

x(1)(yXw)=0x(2)(yXw)=0x(d)(yXw)=0\begin{align*} {\color{#3d81f6} \vec x^{(1)}} \cdot ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) &= 0 \\ {\color{#3d81f6} \vec x^{(2)}} \cdot ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) &= 0 \\ &\vdots \\ {\color{#3d81f6} \vec x^{(d)}} \cdot ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) &= 0 \end{align*}

As you might have guessed, there’s an easier way to write these dd equations simultaneously. Above, we’re taking the dot product of yXw{\color{orange} \vec y} - {\color{#3d81f6} X} \vec w with each of the columns of X\color{#3d81f6} X. We’ve learned that AvA \vec v contains the dot products of v\vec v with the rows of AA. So how do we get the dot products of yXw{\color{orange} \vec y} - {\color{#3d81f6} X} \vec w with the columns of X\color{#3d81f6} X? Transpose it!

XT(yXw)=[x(1)Tx(2)Tx(d)T](yXw)=[x(1)(yXw)x(2)(yXw)x(d)(yXw)]{\color{#3d81f6} X^T}({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) = {\color{#3d81f6}\begin{bmatrix} {\color{#3d81f6} —} & {\color{#3d81f6} \vec{x}^{(1)}}^T & {\color{#3d81f6} —} \\ {\color{#3d81f6} —} & {\color{#3d81f6} \vec{x}^{(2)}}^T & {\color{#3d81f6} —} \\ & \vdots & \\ {\color{#3d81f6} —} & {\color{#3d81f6} \vec{x}^{(d)}}^T & {\color{#3d81f6} —} \end{bmatrix}} ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) = \begin{bmatrix} {\color{#3d81f6}\vec x^{(1)}} \cdot ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) \\ {\color{#3d81f6}\vec x^{(2)}} \cdot ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) \\ \vdots \\ {\color{#3d81f6}\vec x^{(d)}} \cdot ({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) \end{bmatrix}

So, if we want yXw{\color{orange} \vec y} - {\color{#3d81f6} X} \vec w to be orthogonal to each of the columns of X\color{#3d81f6} X, then we need XT(yXw)=0{\color{#3d81f6} X^T}({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) = \vec 0 (note that this is the vector 0Rd\vec 0 \in \mathbb{R}^d, not the scalar 0.) Another way of saying this is that we need the error vector to be in the left null space of X\color{#3d81f6} X, i.e. enull(XT){\color{#d81a60} \vec e} \in \text{null}({\color{#3d81f6} X^T}).

\begin{align*} {\color{#3d81f6}X^T}{\color{#d81a60} \vec e} &= \vec 0 \\ {\color{#3d81f6} X^T}({\color{orange} \vec y} - {\color{#3d81f6} X} \vec w) &= \vec 0 \\ {\color{#3d81f6} X^T} {\color{orange} \vec y} - {\color{#3d81f6} X^T} {\color{#3d81f6} X} \vec w &= \vec 0 \\ {\color{#3d81f6} X^T} {\color{#3d81f6} X} \vec w &= {\color{#3d81f6} X^T} {\color{orange} \vec y} \end{align*}

The final equation above is called the normal equation. “Normal” means “orthogonal”. Sometimes it’s called the normal equations to reference the fact that it’s a system of dd equations and dd unknowns, where the unknowns are components of w\vec w (w1,w2,,wdw_1, w_2, \ldots, w_d.)
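
In code, "solving the normal equation" and "making the error vector orthogonal to every column of X" are the same thing. Here's a small sketch with randomly generated data (the dimensions and the random seed are arbitrary) that checks this.

import numpy as np

rng = np.random.default_rng(42)
n, d = 6, 3
X = rng.normal(size=(n, d))   # a random n x d matrix; its columns are (almost surely) linearly independent
y = rng.normal(size=n)

# Solve the normal equation X^T X w = X^T y.
w_star = np.linalg.solve(X.T @ X, X.T @ y)

e = y - X @ w_star
print(X.T @ e)   # all entries are (numerically) zero: e is orthogonal to every column of X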

Note that XTXw=XTy{\color{#3d81f6} X^T} {\color{#3d81f6} X} \vec w = {\color{#3d81f6} X^T} {\color{orange} \vec y} looks a lot like Xw=y{\color{#3d81f6} X} \vec w = \color{orange} \vec y, with an extra factor of XT\color{#3d81f6} X^T multiplied on the left of both sides. Remember that if y\color{orange} \vec y is in colsp(X)\text{colsp}({\color{#3d81f6} X}), then Xw=y{\color{#3d81f6} X} \vec w = \color{orange} \vec y has a solution, but that’s usually not the case, which is why we’re attempting to approximate y\color{orange} \vec y with a linear combination of X\color{#3d81f6} X’s columns.

Is there a unique vector w\vec w that satisfies the normal equation? That depends on whether XTX\color{#3d81f6} X^TX is invertible. XTX\color{#3d81f6} X^TX is a d×dd \times d matrix with the same rank as the n×dn \times d matrix X\color{#3d81f6} X, as we proved in Chapter 2.8.

rank(XTX)=rank(X)\text{rank}({\color{#3d81f6} X^TX}) = \text{rank}({\color{#3d81f6} X})

So, XTX\color{#3d81f6} X^TX is invertible if and only if rank(X)=d\text{rank}({\color{#3d81f6} X}) = d, meaning all of X\color{#3d81f6} X’s columns are linearly independent. In that case, the best choice of w\vec w is the unique vector

w=(XTX)1XTy\boxed{\vec w^* = ({\color{#3d81f6} X^TX})^{-1} \color{#3d81f6}X^T \color{orange} \vec y}

w\vec w^* has a star on it, denoting that it is the best choice of w\vec w. I don’t ask you to memorize much (you get to bring a notes sheet into your exams, after all), but this equation is perhaps the most important of the semester! It might even look familiar: back in the single vector case in Chapter 2.3, the optimal coefficient on x\vec x was xyxx=xTyxTx\frac{\vec x \cdot \vec y}{\vec x \cdot \vec x} = \frac{\vec x^T \vec y}{\vec x^T \vec x}, which looks similar to the one above. The difference is that here, XTX\color{#3d81f6} X^T X is a matrix, not a scalar. (But, if X\color{#3d81f6} X is just a matrix with a single column, then XTX\color{#3d81f6} X^T X is just the dot product of X\color{#3d81f6} X with itself, which is a scalar, and the boxed formula above reduces to the formula from Chapter 2.3.)
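
To see the connection to Chapter 2.3 concretely, here's a sketch in which X has just one column (the numbers are made up): the boxed formula and the single-vector projection coefficient give the same answer.

import numpy as np

x = np.array([2.0, 1.0, 2.0])   # a single column (made up)
y = np.array([1.0, 3.0, 0.0])   # made up

# Single-vector formula from Chapter 2.3.
k = (x @ y) / (x @ x)

# Boxed formula, treating x as a 3x1 matrix X.
X = x.reshape(-1, 1)
w_star = np.linalg.inv(X.T @ X) @ X.T @ y

print(k)        # 0.5555...
print(w_star)   # [0.5555...]: the same number, just wrapped in a length-1 vector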

What if XTX\color{#3d81f6} X^TX isn’t invertible? Then, there are infinitely many w\vec w^*'s that satisfy the normal equation,

XTXw=XTy{\color{#3d81f6} X^T} {\color{#3d81f6} X} \vec w = {\color{#3d81f6} X^T} {\color{orange} \vec y}

It’s not immediately obvious what it means for there to be infinitely many solutions to the normal equation; I’ve dedicated a whole subsection to it below to give this idea the consideration it deserves.

In the examples that follow, we’ll look at how to find all of the solutions to this equation when there are infinitely many.

First, let’s start with a straightforward example. Let

X=[10211101],y=[1045]{\color{#3d81f6} X = \begin{bmatrix} 1 & 0 \\ 2 & 1 \\ 1 & 1 \\ 0 & -1 \end{bmatrix}}, \quad {\color{orange} \vec y = \begin{bmatrix} 1 \\ 0 \\ 4 \\ 5 \end{bmatrix}}

The vector in colsp(X)\text{colsp}({\color{#3d81f6} X}) that is closest to y\color{orange} \vec y is the vector Xw{\color{#3d81f6} X} \vec w^*, where w\vec w^* is the solution to the normal equations,

XTXw=XTy{\color{#3d81f6} X^T X} \vec w^* = {\color{#3d81f6} X^T} \color{orange} \vec y

The first step is to compute XTX{\color{#3d81f6} X^T X}, which is a 2×22 \times 2 matrix of dot products of the columns of X{\color{#3d81f6} X}.

XTX=[6333]{\color{#3d81f6} X^T X = \begin{bmatrix} 6 & 3 \\ 3 & 3 \end{bmatrix}}

XTX{\color{#3d81f6} X^T X} is invertible, so we can solve for w\vec w^* uniquely. Remember that in practice, we’d ask Python to run np.linalg.solve(X.T @ X, X.T @ y), but here XTX\color{#3d81f6} X^T X is small enough that we can invert it by hand.

XTX=[6333]    (XTX)1=19[3336]=[1/31/31/32/3]{\color{#3d81f6} X^T X = \begin{bmatrix} 6 & 3 \\ 3 & 3 \end{bmatrix}} \implies ({\color{#3d81f6} X^T X})^{-1} = \frac{1}{9} \begin{bmatrix} 3 & -3 \\ -3 & 6 \end{bmatrix} = \begin{bmatrix} 1/3 & -1/3 \\ -1/3 & 2/3 \end{bmatrix}

Then,

\vec w^* = ({\color{#3d81f6} X^T X})^{-1} {\color{#3d81f6} X^T} {\color{orange} \vec y} = \underbrace{\begin{bmatrix} 1/3 & -1/3 \\ -1/3 & 2/3 \end{bmatrix}}_{({\color{#3d81f6} X^T X})^{-1}} \underbrace{{\color{#3d81f6}\begin{bmatrix} 1 & 2 & 1 & 0 \\ 0 & 1 & 1 & -1 \end{bmatrix}}}_{\color{#3d81f6}X^T} {\color{orange}\begin{bmatrix} 1 \\ 0 \\ 4 \\ 5 \end{bmatrix}} = \begin{bmatrix} 2 \\ -7/3 \end{bmatrix}

The magic is in the interpretation of the numbers in \vec w^*, 2 and -\frac{7}{3}. These are the coefficients of the columns of {\color{#3d81f6} X} in the linear combination that is closest to {\color{orange} \vec y}. Meaning,

Xw=2[1210]x(1)73[0111]x(2)=[25/31/37/3]{\color{#3d81f6} X} \vec w^* = 2 \underbrace{\color{#3d81f6}\begin{bmatrix} 1 \\ 2 \\ 1 \\ 0 \end{bmatrix}}_{\color{#3d81f6}\vec x^{(1)}} - \frac{7}{3} \underbrace{\color{#3d81f6}\begin{bmatrix} 0 \\ 1 \\ 1 \\ -1 \end{bmatrix}}_{\color{#3d81f6}\vec x^{(2)}} = \begin{bmatrix} 2 \\ 5/3 \\ -1/3 \\ 7/3 \end{bmatrix}

is the vector in colsp(X)\text{colsp}({\color{#3d81f6} X}) that is closest to y\color{orange} \vec y. This vector is the orthogonal projection of y\color{orange} \vec y onto colsp(X)\text{colsp}({\color{#3d81f6} X}).
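
Here's the same computation in NumPy. As mentioned earlier, in practice we'd use np.linalg.solve on the normal equation rather than explicitly inverting X^T X; both give the same w* here.

import numpy as np

X = np.array([[1, 0],
              [2, 1],
              [1, 1],
              [0, -1]])
y = np.array([1, 0, 4, 5])

w_star = np.linalg.solve(X.T @ X, X.T @ y)
print(w_star)       # [ 2.  -2.3333...], i.e. [2, -7/3]
print(X @ w_star)   # [ 2.  1.6667  -0.3333  2.3333], i.e. [2, 5/3, -1/3, 7/3]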


Examples

The worked example above is the most concrete one. The examples that follow will build our understanding of how orthogonal projections really work.

Example: Point and Plane

Find the point on the plane 6x3y+2z=06x - 3y + 2z = 0 that is closest to the point (1,1,1)(1, 1, 1).

Example: What if ycolsp(X){\color{orange} \vec y} \in \text{colsp}({\color{#3d81f6} X})?

Find the orthogonal projection of y=[1321]\color{orange} \vec y = \begin{bmatrix} 1 \\ 3 \\ 2 \\ -1 \end{bmatrix} onto colsp(X)\text{colsp}({\color{#3d81f6} X}), where X=[10211101]{\color{#3d81f6} X = \begin{bmatrix} 1 & 0 \\ 2 & 1 \\ 1 & 1 \\ 0 & -1 \end{bmatrix}}.

Example: Orthogonality with the Columns of XX

Let X=[12320212]\color{#3d81f6} X = \begin{bmatrix} 1 & 2 \\ 3 & 2 \\ 0 & 2 \\ 1 & 2 \end{bmatrix}, Z=[10010130]\color{#004d40} Z = \begin{bmatrix} 1 & 0 \\ 0 & -1 \\ 0 & 1 \\ 3 & 0 \end{bmatrix}, and y\color{orange} \vec y be any vector in R4\mathbb{R}^4.

Let pX\vec p_X and pZ\vec p_Z be the orthogonal projections of y\color{orange} \vec y onto colsp(X)\text{colsp}({\color{#3d81f6} X}) and colsp(Z)\text{colsp}({\color{#004d40} Z}), respectively.

Explain why it is guaranteed that the components of the vector

eX=ypX{\color{#d81a60} \vec e_X} = {\color{orange} \vec y} - \vec p_X

sum to zero, but the components of the vector eZ=ypZ{\color{#d81a60} \vec e_Z} = {\color{orange} \vec y} - \vec p_Z do not necessarily.

Example: XX with Orthogonal Columns

Let X=[3/134/54/133/512/130]\color{#3d81f6} X = \begin{bmatrix} 3/13 & -4/5 \\ 4/13 & 3/5 \\ 12/13 & 0 \end{bmatrix} and y=[101]\color{orange} \vec y = \begin{bmatrix} 1 \\ 0 \\ 1 \end{bmatrix}.

Find the value of w\vec w^* that minimizes yXw2\lVert {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w \rVert^2. What about X\color{#3d81f6} X makes this easier than in other examples?

Example: Why Can’t We Separate?

Why can’t we separate

w=(XTX)1XTy\vec w^* = ({\color{#3d81f6} X^T X})^{-1} {\color{#3d81f6} X^T} \color{orange} \vec y

into

w=X1(XT)1XTIy=X1y\vec w^* = {\color{#3d81f6} X^{-1}} \underbrace{({\color{#3d81f6} X^T})^{-1} {\color{#3d81f6} X^T}}_{I} {\color{orange} \vec y} = {\color{#3d81f6} X^{-1}} \color{orange} \vec y

in general?


What if XX’s Columns are Linearly Dependent?

In the case where X\color{#3d81f6} X’s columns are linearly dependent, we can’t invert XTX\color{#3d81f6} X^T \color{#3d81f6} X to solve for w\vec w^*. This means that

XTXw=XTy{\color{#3d81f6} X^T} {\color{#3d81f6} X} \vec w = {\color{#3d81f6} X^T} {\color{orange} \vec y}

has infinitely many solutions. Let’s give more thought to what these solutions actually are.

First, note that all of these solutions for \vec w^* correspond to the same projection, \vec p = {\color{#3d81f6} X} \vec w^*. The “best approximation” of {\color{orange} \vec y} in \text{colsp}({\color{#3d81f6}X}) is always just one vector; if there are infinitely many \vec w^*'s, that just means there are infinitely many ways of describing that one best approximation. This is because when vectors are linearly dependent, the same linear combination can be written using multiple different sets of coefficients; when they are linearly independent, each vector in their span has exactly one representation.

Let me drive this point home further. Let’s suppose both w1\vec w_1 and w2\vec w_2 satisfy

XTXw=XTy{\color{#3d81f6} X^T} {\color{#3d81f6} X} \vec w = {\color{#3d81f6} X^T} {\color{orange} \vec y}

Then,

X^TX \vec w_1 - X^TX \vec w_2 = X^T \vec y - X^T \vec y = \vec 0

which means that

(XTX)(w1w2)=0(X^TX)(\vec w_1 - \vec w_2) = \vec 0

i.e. the difference between the two vectors, w1w2\vec w_1 - \vec w_2, is in nullsp(XTX)\text{nullsp}(X^TX). But, back in Chapter 2.8, we proved that XTXX^TX and XX have the same null space, meaning any vector that gets sent to 0\vec 0 by XX also gets sent to 0\vec 0 by XTXX^TX, and vice versa.

So,

X(w1w2)=0X(\vec w_1 - \vec w_2) = \vec 0

too, but that just means

Xw1=Xw2X \vec w_1 = X \vec w_2

meaning that even though w1\vec w_1 and w2\vec w_2 are different-looking coefficient vectors, they both still correspond to the same linear combination of XX’s columns!
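
The key fact used above is that X and X^T X have the same null space (from Chapter 2.8). Here's a quick numerical illustration, using a small made-up matrix whose third column is the sum of its first two.

import numpy as np

# Made-up matrix with linearly dependent columns: col3 = col1 + col2.
X = np.array([[1.0, 2.0, 3.0],
              [0.0, 1.0, 1.0],
              [2.0, 0.0, 2.0]])

v = np.array([1.0, 1.0, -1.0])   # col1 + col2 - col3 = 0, so v is in nullsp(X)

print(X @ v)         # [0. 0. 0.]
print(X.T @ X @ v)   # [0. 0. 0.]: v is in nullsp(X^T X) too
print(np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X))   # 2 2: same rank, hence same null space dimension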

Let’s see how we can apply this to an example. Let X=[310621310]{\color{#3d81f6} X = \begin{bmatrix} 3 & 1 & 0 \\ 6 & 2 & 1 \\ 3 & 1 & 0 \end{bmatrix}} and y=[213]{\color{orange} \vec y = \begin{bmatrix} 2 \\ 1 \\ 3 \end{bmatrix}}. This is an example of a matrix with linearly dependent columns, so there’s no unique w\vec w^* that satisfies the normal equations.

Finding One Solution

One way to find a possible vector w\vec w^* is to solve the normal equations. XTX\color{#3d81f6} X^T \color{#3d81f6} X is not invertible, so we can’t solve for w\vec w^* uniquely, but we can try and find a parameterized solution.

I’m going to take a different approach here: instead, let’s just toss out the linearly dependent columns of X\color{#3d81f6} X and solve for w\vec w^* using the remaining columns. Then, w\vec w^* for the full X\color{#3d81f6} X can use the same coefficients for the linearly independent columns, but 0s for the dependent ones. Removing the linearly dependent columns does not change colsp(X)\text{colsp}({\color{#3d81f6} X}) (i.e. the set of all linear combinations of X\color{#3d81f6} X’s columns), so the projection is the same.

The easy choice is to keep columns 2 and 3, since their entries are the smallest. (Notice that column 1 is just 3 times column 2, so dropping it doesn't change \text{colsp}({\color{#3d81f6} X}).) So, for now, let's say

X=[102110],y=[213]X' = \begin{bmatrix} 1 & 0 \\ 2 & 1 \\ 1 & 0 \end{bmatrix}, \quad \color{orange} \vec y = \begin{bmatrix} 2 \\ 1 \\ 3 \end{bmatrix}

Here, w=(XTX)1XTy=[5/24]\vec w' = (X'^T X')^{-1} X'^T {\color{orange} \vec y} = \begin{bmatrix} 5/2 \\ -4 \end{bmatrix}. I won’t bore you with the calculations; you can verify them yourself.

Now, one possible w\vec w^* for the full X\color{#3d81f6} X is [05/24]\begin{bmatrix} 0 \\ 5/2 \\ -4\end{bmatrix}, which keeps the same coefficients on columns 2 and 3 as in w\vec w', but 0 for the column we didn’t use.
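
If you'd like to check those numbers, here's a short sketch. It also confirms that padding w' with a 0 really does satisfy the normal equations for the full X.

import numpy as np

X = np.array([[3.0, 1.0, 0.0],
              [6.0, 2.0, 1.0],
              [3.0, 1.0, 0.0]])
y = np.array([2.0, 1.0, 3.0])

X_prime = X[:, [1, 2]]   # keep only columns 2 and 3 (which are linearly independent)
w_prime = np.linalg.solve(X_prime.T @ X_prime, X_prime.T @ y)
print(w_prime)           # [ 2.5 -4. ]

w_star = np.array([0.0, w_prime[0], w_prime[1]])
print(np.allclose(X.T @ X @ w_star, X.T @ y))   # True: w_star satisfies the normal equations
print(X @ w_star)                               # [2.5 1.  2.5], the projection of y onto colsp(X)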

Finding All Solutions

As I mentioned above, if there are infinitely many solutions to the normal equation, then the difference between any two solutions is in nullsp(XTX)\text{nullsp}({\color{#3d81f6} X^TX}), which is also nullsp(X)\text{nullsp}({\color{#3d81f6} X}). Put another way, if ws\vec w_s satisfies the normal equations, then so does ws+n\vec w_s + \vec n for any nnullsp(XTX)\vec n \in \text{nullsp}({\color{#3d81f6} X^TX}).

\begin{align*} {\color{#3d81f6} X^T X} \vec w_s &= {\color{#3d81f6} X^T} {\color{orange} \vec y} \\ {\color{#3d81f6} X^T X} (\vec w_s + \vec n) &= {\color{#3d81f6} X^T X} \vec w_s + \underbrace{{\color{#3d81f6} X^T X} \vec n}_{\vec 0, \text{ by definition of null space}} = {\color{#3d81f6} X^T} {\color{orange} \vec y} + \vec 0 = {\color{#3d81f6} X^T} {\color{orange} \vec y} \end{align*}

So, once we have one w\vec w^*, to get the rest, just add any vector in nullsp(XTX)\text{nullsp}({\color{#3d81f6} X^TX}) or nullsp(X)\text{nullsp}({\color{#3d81f6} X}) (since those are the same spaces).

What is nullsp(X)\text{nullsp}({\color{#3d81f6} X})? It’s the set of vectors v\vec v such that Xv=0{\color{#3d81f6} X} \vec v = \vec 0.

In our particular example,

X=[310621310]{\color{#3d81f6} X = \begin{bmatrix} 3 & 1 & 0 \\ 6 & 2 & 1 \\ 3 & 1 & 0 \end{bmatrix}}

we see that rank(X)=2\text{rank}({\color{#3d81f6} X}) = 2, so nullsp(X)\text{nullsp}({\color{#3d81f6} X}) has a dimension of 32=13 - 2 = 1 (by the rank-nullity theorem), so it’s going to be the span of a single vector. All we need to do now is find one vector in nullsp(X)\text{nullsp}({\color{#3d81f6} X}), and we will know that the null space is the set of scalar multiples of that vector.

X[130]=[310621310][130]=[000]{\color{#3d81f6} X} \begin{bmatrix} 1 \\ -3 \\ 0 \end{bmatrix} = {\color{#3d81f6}\begin{bmatrix} 3 & 1 & 0 \\ 6 & 2 & 1 \\ 3 & 1 & 0 \end{bmatrix}}\begin{bmatrix} 1 \\ -3 \\ 0 \end{bmatrix} = \begin{bmatrix} 0 \\ 0 \\ 0 \end{bmatrix}

So, since nullsp(X)=nullsp(XTX)=span({[130]})\text{nullsp}({\color{#3d81f6} X}) = \text{nullsp}({\color{#3d81f6} X^TX}) = \text{span}\left(\left\{\begin{bmatrix} 1 \\ -3 \\ 0 \end{bmatrix}\right\}\right), we know that the set of all possible w\vec w^*'s is

[05/24]+t[130],tRthere are infinitely many solutions to the normal equations, but they’re all of this form\underbrace{\begin{bmatrix} 0 \\ 5/2 \\ -4 \end{bmatrix} + t\begin{bmatrix} 1 \\ -3 \\ 0 \end{bmatrix}, t \in \mathbb{R}}_\text{there are infinitely many solutions to the normal equations, but they're all of this form}

This is not a subspace, since it doesn’t contain the zero vector.
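
As a numerical check, here's a sketch showing that every w* of this form produces the exact same projection Xw*. It also shows what np.linalg.lstsq does here: when X has linearly dependent columns, it returns one particular solution of the least squares problem (the minimum-norm one), which is generally not the one we found by zeroing out a column, but it corresponds to the same projection.

import numpy as np

X = np.array([[3.0, 1.0, 0.0],
              [6.0, 2.0, 1.0],
              [3.0, 1.0, 0.0]])
y = np.array([2.0, 1.0, 3.0])

w0 = np.array([0.0, 2.5, -4.0])      # the particular solution found above
n_vec = np.array([1.0, -3.0, 0.0])   # a basis vector for nullsp(X)

for t in [-2.0, 0.0, 1.0, 10.0]:
    print(X @ (w0 + t * n_vec))      # always [2.5 1.  2.5]

w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
print(w_lstsq)       # a different-looking solution...
print(X @ w_lstsq)   # ...but the same projection, [2.5 1.  2.5]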


There’s another way to arrive at this set of possible w\vec w^*'s: we can solve the normal equations directly. I wouldn’t recommend this second approach since it’s much longer, but I’ll add it here for completeness.

XTX=[363121010][310621310]=[541861862621]XTy=[363121010][213]=[2171]\begin{align*} {\color{#3d81f6}X^TX}&={\color{#3d81f6}\begin{bmatrix} 3 & 6 & 3 \\ 1 & 2 & 1 \\ 0 & 1 & 0 \end{bmatrix} \begin{bmatrix} 3 & 1 & 0 \\ 6 & 2 & 1 \\ 3 & 1 & 0 \end{bmatrix}}={\color{#3d81f6}\begin{bmatrix}54 & 18 & 6 \\ 18 & 6 & 2 \\ 6 & 2 & 1 \end{bmatrix}} \\ \\ {\color{#3d81f6}X^T}{\color{orange}\vec y}&={\color{#3d81f6}\begin{bmatrix} 3 & 6 & 3 \\ 1 & 2 & 1 \\ 0 & 1 & 0 \end{bmatrix}}{\color{orange}\begin{bmatrix} 2 \\ 1 \\ 3 \end{bmatrix}}=\begin{bmatrix} 21 \\ 7 \\ 1 \end{bmatrix} \end{align*}

Then, the normal equations XTXw=XTy{\color{#3d81f6}X^TX}\vec w^*={\color{#3d81f6}X^T}\color{orange}\vec y give us

54w1+18w2+6w3=2118w1+6w2+2w3=76w1+2w2+w3=1\begin{align*} 54w^*_1 + 18w^*_2 + 6w^*_3 &= 21 \\18w^*_1 + 6w^*_2 + 2w^*_3&=7 \\6w^*_1 + 2w^*_2 + w^*_3 &= 1 \end{align*}

The first and second equations are just scalar multiples of each other, so we can disregard one of them, and solve for a form where we can use one unknown as a parameter for the other two. To illustrate, let’s pick w1=tw_1^* = t.

18t+6w2+2w3=76t+2w2+w3=1\begin{align*} 18t + 6w^*_2 + 2w^*_3&=7 \tag{2} \\6t + 2w^*_2 + w^*_3 &= 1 \tag{3} \end{align*}

(2)3(3)(2) - 3 \cdot (3) gives us w3=4w_3^* = -4. Plugging this into both equations gives us

\begin{align*} 18t + 6w_2^* - 8 &= 7 \implies 18t + 6w_2^* = 15 \\ 6t + 2w_2^* - 4 &= 1 \implies 6t + 2w_2^* = 5 \end{align*}

These are now both the same equation; the first one is just 3 times the second. So, we can solve for w2w_2^* in terms of tt:

w2=56t2w_2^* = \frac{5-6t}{2}

which gives us the complete solution

w=[t56t24],tR\vec w^* = \begin{bmatrix} t \\ \frac{5-6t}{2} \\ -4 \end{bmatrix}, t \in \mathbb{R}

This is the exact same line as using the null space approach! Plug in t=0t = 0 to get [05/24]\begin{bmatrix} 0 \\ 5/2 \\ -4 \end{bmatrix}, for example. Again, this is not a subspace, since it doesn’t contain the zero vector.


The Projection Matrix

So far, we’ve established that the vector in colsp(X)\text{colsp}({\color{#3d81f6} X}) that is closest to y\color{orange} \vec y is the vector Xw{\color{#3d81f6} X} \vec w^*, where w\vec w^* is the solution to the normal equations,

XTXw=XTy{\color{#3d81f6} X^T X} \vec w^* = {\color{#3d81f6} X^T} \color{orange} \vec y

If XTXX^TX is invertible, then w\vec w^* is the unique vector

w=(XTX)1XTy\vec w^* = ({\color{#3d81f6} X^T X})^{-1} \color{#3d81f6} X^T \color{orange} \vec y

meaning that the vector in colsp(X)\text{colsp}({\color{#3d81f6} X}) that is closest to y\color{orange} \vec y is

p=Xw=X(XTX)1XTy\vec p = {\color{#3d81f6} X} \vec w^* = {\color{#3d81f6} X} ({\color{#3d81f6} X^T X})^{-1} \color{#3d81f6} X^T \color{orange} \vec y

You’ll notice that the above expression also looks like a linear transformation applied to y\color{orange} \vec y, where y\color{orange} \vec y is being multiplied by the matrix

P=X(XTX)1XTP = {\color{#3d81f6} X} ({\color{#3d81f6} X^T X})^{-1} \color{#3d81f6} X^T

The matrix PP is called the projection matrix. In other classes, it is called the “hat matrix”, because they might use w^\hat{\mathbf{w}} instead of w\vec w^* and y^\hat{\mathbf{y}} instead of p\vec p, and in that notation, y^=Py\hat{\mathbf{y}} = P \mathbf{y}, so PP puts a “hat” on y\mathbf{y}. (I don’t use hat notation in this class because drawing a hat on top of a vector is awkward. Doesn’t w^\hat{\vec w} look strange?)

So,

p=Xw=Py\vec p = {\color{#3d81f6} X} \vec w^* = P \color{orange} \vec y

shows us that there are two ways to interpret the act of projecting y\color{orange} \vec y onto colsp(X)\text{colsp}({\color{#3d81f6} X}):

  1. The resulting vector is some optimal linear combination of X{\color{#3d81f6} X}'s columns.

  2. The resulting vector is the result of applying the linear transformation PP to y\color{orange} \vec y.

Let’s work out an example. Suppose

X=[30015460],y=[123]{\color{#3d81f6} X = \begin{bmatrix} {\color{#3d81f6} 3} & {\color{#3d81f6} 0} \\ {\color{#3d81f6} 0} & {\color{#3d81f6} 154} \\ {\color{#3d81f6} 6} & {\color{#3d81f6} 0} \end{bmatrix}}, \quad {\color{orange} \vec y = \begin{bmatrix} {\color{orange} 1} \\ {\color{orange} 2} \\ {\color{orange} 3} \end{bmatrix}}

X\color{#3d81f6} X’s columns are linearly independent, so XTX\color{#3d81f6} X^T X is invertible, and

P=X(XTX)1XTP = {\color{#3d81f6} X} ({\color{#3d81f6} X^T X})^{-1} \color{#3d81f6} X^T

is well-defined.

X = np.array([[3, 0], 
              [0, 154], 
              [6, 0]])

P = X @ np.linalg.inv(X.T @ X) @ X.T
P
array([[0.2, 0. , 0.4], [0. , 1. , 0. ], [0.4, 0. , 0.8]])
P @ np.array([1, 2, 3])
array([1.4, 2. , 2.8])

P=[0.200.40100.400.8]P = \begin{bmatrix} 0.2 & 0 & 0.4 \\ 0 & 1 & 0 \\ 0.4 & 0 & 0.8 \end{bmatrix} contains the information we need to project y\color{orange} \vec y onto colsp(X)\text{colsp}({\color{#3d81f6} X}). Each row of PP tells us the right mixture of y\color{orange} \vec y’s components we need to construct the projection.

Notice that P's second row is \begin{bmatrix} 0 & 1 & 0 \end{bmatrix}. This came from the fact that {\color{#3d81f6} X}'s first column has a second component of 0, while its second column has a non-zero second component and zeros in its other two components, meaning we can scale {\color{#3d81f6} X}'s second column to exactly match {\color{orange} \vec y}'s second component. Change the 154 in {\color{#3d81f6} X} to any other non-zero value and P won't change!
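
You can check that last claim directly; here's a quick sketch comparing the projection matrix for the original X with the projection matrix we get after replacing the 154 with some other non-zero value.

import numpy as np

X1 = np.array([[3, 0], [0, 154], [6, 0]])
X2 = np.array([[3, 0], [0,   7], [6, 0]])   # 154 replaced with another non-zero value

P1 = X1 @ np.linalg.inv(X1.T @ X1) @ X1.T
P2 = X2 @ np.linalg.inv(X2.T @ X2) @ X2.T
print(np.allclose(P1, P2))   # True: the projection matrix doesn't change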

Additionally, if we consider some y\color{orange} \vec y that is already in colsp(X)\text{colsp}({\color{#3d81f6} X}), then multiplying it by PP doesn’t change it! For example, if we set y=[31546]\color{orange} \vec y = \begin{bmatrix} 3 \\ 154 \\ 6 \end{bmatrix} (the sum of X\color{#3d81f6} X’s columns), then Py=yP {\color{orange} \vec y} = {\color{orange} \vec y}.

P @ np.array([3, 154, 6])
array([ 3., 154., 6.])

Let’s work through some examples that develop our intuition for PP.

Example: Is PP invertible?

Suppose P=X(XTX)1XTP = {\color{#3d81f6} X} ({\color{#3d81f6} X^T X})^{-1} \color{#3d81f6} X^T exists, meaning XTX\color{#3d81f6} X^T X is invertible. Is PP invertible? If so, what is its inverse?

Example: Is PP orthogonal?

Is PP orthogonal?

Example: Is PP symmetric?

Is PP symmetric?

Example: Is PP idempotent?

Recall, an idempotent matrix AA satisfies A2=AA^2 = A. Is PP idempotent?

Example: What is PXPX, and why?

What is PXPX? What does the result mean?

Example: Rotations, Reflections, and Projections

Suppose AA is an arbitrary n×dn \times d matrix. Describe the conditions on AA that make the corresponding linear transformation f(x)=Axf(\vec x) = A \vec x a...

  1. Rotation

  2. Reflection

  3. Projection


Summary

Let’s take a step back and walk through our logic from this section once more, since it’s that important.

Suppose X{\color{#3d81f6} X} is an n×dn \times d matrix and y\color{orange} \vec y is some vector in Rn\mathbb{R}^n.

Orthogonal Projections

  1. Our goal is to find the linear combination of X\color{#3d81f6} X’s columns that is closest to y\color{orange} \vec y.

  2. This boils down to finding the vector w\vec w that minimizes yXw2\lVert {\color{orange} \vec y} - {\color{#3d81f6}X } \vec w \rVert^2.

  3. The vector w\vec w^* that minimizes yXw2\lVert {\color{orange} \vec y} - {\color{#3d81f6}X } \vec w \rVert^2 makes the resulting error vector, e=yXw{\color{#d81a60} \vec e} = {\color{orange} \vec y} - {\color{#3d81f6} X} \vec w^*, orthogonal to the columns of X\color{#3d81f6} X.

  4. The w\vec w^* that makes the error vector orthogonal to the columns of X\color{#3d81f6} X is the one that satisfies the normal equation,

    XTXw=XTy{\color{#3d81f6} X^T} {\color{#3d81f6} X} \vec w^* = {\color{#3d81f6} X^T} {\color{orange} \vec y}
  5. If XTX{\color{#3d81f6} X^T} {\color{#3d81f6} X} is invertible, which happens if and only if X\color{#3d81f6}X’s columns are linearly independent, then w\vec w^* is the unique vector

    w=(XTX)1XTy\vec w^* = ({\color{#3d81f6} X^T} {\color{#3d81f6} X})^{-1} {\color{#3d81f6} X^T} \color{orange} \vec y

    otherwise, there are infinitely many solutions to the normal equation. All of these infinitely many solutions correspond to the same projection, p=Xw\vec p = {\color{#3d81f6} X} \vec w^*. If w\vec w' is one solution (which can be found by removing the linearly dependent columns of X\color{#3d81f6} X), then all other solutions are of the form w+n\vec w' + \vec n, where n\vec n is any vector in nullsp(X)=nullsp(XTX)\text{nullsp}({\color{#3d81f6} X}) = \text{nullsp}({\color{#3d81f6} X^TX}).

The Projection Matrix

Assuming X\color{#3d81f6} X has linearly independent columns, the projection matrix is

P=X(XTX)1XTP = {\color{#3d81f6} X} ({\color{#3d81f6} X^T X})^{-1} \color{#3d81f6} X^T

PP is defined such that PyP \color{orange} \vec y is the vector in colsp(X)\text{colsp}({\color{#3d81f6} X}) that is closest to y\color{orange} \vec y. PP is symmetric and idempotent, but (in general) neither invertible nor orthogonal.

We’re now finally ready to head back to the land of machine learning.