
8.5. Convexity

As we saw in Chapter 8.3, gradient descent is prone to getting stuck in local minima.

When I use gradient descent to minimize an empirical risk function, getting stuck at a local minimum that is not global means landing on a parameter vector $\vec w$ that is not actually optimal. That is a real problem.

In modern machine learning, the loss surfaces people care about are often huge and messy, and dealing with bad local minima is part of the game. I am not going to get into those fixes here. Instead, I want to focus on a special family of functions where life is much nicer: convex functions.

The following video, recorded in an earlier semester, summarizes the key ideas of this section.


Formal Definition of Convexity

Let me start with the picture, because the picture is the whole point.

If you need a refresher on what a secant line is, take a quick look at the appendix. For a scalar-to-scalar function, I want every secant line between two points on the graph to lie on or above the graph itself. If that happens no matter which two points I pick, the function has the familiar bowl-shaped behavior I want.

The figure below is the geometry I have in mind.


That picture turns almost directly into algebra. Suppose I pick two inputs $x$ and $y$, and some $t \in [0,1]$.

  • The input that is a fraction $t$ of the way from $x$ to $y$ is $(1-t)x + ty$.

  • The height of the secant line at that same horizontal location is $(1-t)f(x) + tf(y)$.

So saying “the secant line lies above the graph” is the same as saying

$$f((1-t)x + ty) \le (1-t)f(x) + tf(y).$$

That is the formal definition in one dimension. In higher dimensions, nothing really changes: I just replace the scalar inputs $x$ and $y$ with vectors $\vec x$ and $\vec y$, and think about the line segment connecting them.

For $d = 1$, this is exactly the secant-line picture above. For $d > 1$, it says the same thing along every line segment in the domain.
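The one-dimensional inequality is easy to sandbox numerically. Here is a minimal sketch, using $f(x) = x^2$ (my choice of test function, not one from the text) and randomly sampled pairs of points:

```python
import numpy as np

rng = np.random.default_rng(0)

def f(x):
    # A familiar convex function to test the definition against.
    return x ** 2

# Check f((1-t)x + ty) <= (1-t)f(x) + t f(y) at many random points.
for _ in range(1000):
    x, y = rng.uniform(-10, 10, size=2)
    t = rng.uniform(0, 1)
    lhs = f((1 - t) * x + t * y)        # graph height at the blended input
    rhs = (1 - t) * f(x) + t * f(y)     # secant-line height at the same input
    assert lhs <= rhs + 1e-12
print("secant-line inequality holds at all sampled points")
```

Replacing `f` with a non-convex function such as `np.sin` makes the assertion fail for some sampled pairs, which is a quick way to see the definition doing real work.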


This is one of those definitions that is much more useful than it first looks. I do not just want a fancy way to say “bowl-shaped.” I want an inequality that I can actually plug into proofs.

Once I know that every point on every line segment satisfies a weighted-average inequality, I can stop arguing from a picture and start choosing $\vec x$, $\vec y$, and $t$ strategically. That is what makes the next result work.


Local Minima are Global Minima

This is the payoff.

Suppose $f$ is convex, and suppose $\vec x^*$ is a local minimum of $f$. I want to show that $\vec x^*$ is automatically a global minimum.

Take any other point $\vec z$. Since $\vec x^*$ is a local minimum, points on the line segment from $\vec x^*$ toward $\vec z$ that stay close enough to $\vec x^*$ cannot have smaller function value. So for all sufficiently small $t > 0$,

$$f((1-t)\vec x^* + t \vec z) \ge f(\vec x^*).$$

But convexity also gives

$$f((1-t)\vec x^* + t \vec z) \le (1-t)f(\vec x^*) + t f(\vec z).$$

Putting those together,

$$f(\vec x^*) \le (1-t)f(\vec x^*) + t f(\vec z).$$

Subtract $(1-t)f(\vec x^*)$ from both sides:

$$t f(\vec x^*) \le t f(\vec z).$$

Since $t > 0$, I can divide by $t$ and get

$$f(\vec x^*) \le f(\vec z).$$

And since $\vec z$ was arbitrary, $\vec x^*$ does at least as well as every other point in the domain. So $\vec x^*$ is a global minimum.

This is why convexity matters so much in optimization. It does not magically make minimization easy, but it does remove the possibility of bad local minima.

There is one important caveat, though: convex functions do not need to have global minima in the first place. A standard example is $f(x) = e^x$. It is convex, and it keeps getting smaller as I move left, but it never actually attains its infimum of 0.


Strict Convexity

Convexity rules out bad local minima, but it does not rule out ties.

A convex function can have a completely flat bottom, in which case every point in that flat region is a local minimum and a global minimum. The example below does exactly that.


So if I want a guarantee of a single best point, I need to strengthen the definition a little.

A function $f$ is strictly convex if for all $\vec x \ne \vec y$ and all $t \in (0, 1)$,

$$f((1-t)\vec x + t \vec y) < (1-t)f(\vec x) + t f(\vec y).$$

The only difference is that the inequality is now strict once I move away from the endpoints. Geometrically, the secant line is allowed to touch the graph at the two endpoints, but not in between.

Now I can prove the uniqueness statement I really want. Suppose a strictly convex function has a global minimum, but that there are two different minimizers, $\vec x^*$ and $\vec y^*$. Let their common minimum value be $m$.

For any $t \in (0,1)$, strict convexity says

$$f((1-t)\vec x^* + t \vec y^*) < (1-t)f(\vec x^*) + t f(\vec y^*) = (1-t)m + tm = m.$$

But that says there is a point with function value smaller than the global minimum value $m$, which is impossible. So a strictly convex function can have at most one global minimum. In other words: if a global minimum exists, it is unique.

Strict Convexity and Mean Squared Error

This is exactly the distinction I want you to keep in mind for mean squared error.

Recall that

$$R_{\mathrm{sq}}(\vec w) = \frac{1}{n} \lVert \vec y - X \vec w \rVert^2.$$

When the columns of $X$ are linearly independent, this risk surface curves upward in every direction, so there is a single best parameter vector. When the columns of $X$ are linearly dependent, there are flat directions: I can move in some directions without changing the predictions, and the minimum need not be unique.

The contour plots below show both behaviors.


The left-hand plot is the nice case: one bowl, one bottom, one minimizer. The right-hand plot still describes a convex function, but not a strictly convex one, because there is a whole line of minimizers.

I am not proving the full criterion for $R_{\mathrm{sq}}$ here just yet. The clean explanation comes from the Hessian, and one chapter from now we will phrase that explanation using eigenvalues of $X^T X$. But the geometry is already visible: full rank gives curvature in every direction; linear dependence creates flat directions.
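The flat-valley case is easy to reproduce numerically. Here is a minimal sketch with a design matrix of my own construction, whose second column duplicates the first, so the columns are linearly dependent:

```python
import numpy as np

rng = np.random.default_rng(1)

# Design matrix with a duplicated column: the columns are linearly
# dependent, so the risk surface has a flat direction.
n = 50
x = rng.normal(size=n)
X = np.column_stack([x, x])            # second column repeats the first
y = 3 * x + rng.normal(scale=0.1, size=n)

def risk(w):
    # Mean squared error R_sq(w) = (1/n) * ||y - Xw||^2
    return np.mean((y - X @ w) ** 2)

# Any split of the total weight across the two identical columns gives
# the same predictions, hence the same risk: a whole line of minimizers.
w1 = np.array([3.0, 0.0])
w2 = np.array([0.0, 3.0])
w3 = np.array([1.5, 1.5])
print(risk(w1), risk(w2), risk(w3))    # all three values agree
```

Every $\vec w$ with $w_1 + w_2 = 3$ produces the identical prediction vector $3\vec x$, which is exactly the line of minimizers in the right-hand contour plot.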


Second Derivative Test

The formal definition is the ground truth, but it is not always the fastest way to check that a function is convex.

For scalar-to-scalar functions, you may already know the second derivative test from calculus: if a twice-differentiable function satisfies

$$\frac{d^2 f}{dx^2}(x) > 0$$

for all $x$ in its domain, then $f$ is convex. In fact, the slightly weaker condition $\frac{d^2 f}{dx^2}(x) \ge 0$ is enough for convexity, while $> 0$ points toward strict convexity.
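A quick sanity check of the scalar test, using $f(x) = x^4$ as my own example: its second derivative $12x^2$ is nonnegative everywhere but vanishes at $x = 0$, so it passes the weak test. It is in fact strictly convex, which shows that $f'' > 0$ everywhere is sufficient for strict convexity but not necessary.

```python
import numpy as np

def f(x):
    return x ** 4        # convex

def f2(x):
    return 12 * x ** 2   # second derivative, computed by hand; f2(0) = 0

# The weak second-derivative test certifies convexity.
xs = np.linspace(-5, 5, 101)
assert np.all(f2(xs) >= 0)

# x**4 is nevertheless strictly convex: the secant inequality is strict.
t, x, y = 0.5, -1.0, 1.0
assert f((1 - t) * x + t * y) < (1 - t) * f(x) + t * f(y)
print("x**4 passes the weak test and satisfies the strict secant inequality")
```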

What does this look like for vector-to-scalar functions? Now there is no single second derivative. There are many of them.

For example, if

$$f(x_1, x_2) = x_1^2 + x_1 x_2 + 2x_2^2,$$

then the first partial derivatives are

$$\frac{\partial f}{\partial x_1} = 2x_1 + x_2, \qquad \frac{\partial f}{\partial x_2} = x_1 + 4x_2,$$

and the second partial derivatives are

$$\frac{\partial^2 f}{\partial x_1^2} = 2, \qquad \frac{\partial^2 f}{\partial x_1 \partial x_2} = 1, \qquad \frac{\partial^2 f}{\partial x_2^2} = 4.$$

The natural thing to do is collect all of those second partial derivatives into a matrix: the Hessian $H_f(\vec x)$, whose $(i, j)$ entry is $\frac{\partial^2 f}{\partial x_i \partial x_j}$.

For the example above,

$$H_f(\vec x) = \begin{bmatrix} 2 & 1 \\ 1 & 4 \end{bmatrix},$$

which does not even depend on $\vec x$.
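One way to double-check a hand-computed Hessian is a central finite-difference estimate. A small sketch (the helper `hessian_fd` is my own, not from the text):

```python
import numpy as np

def f(x):
    # f(x1, x2) = x1^2 + x1 x2 + 2 x2^2, the example from the text
    return x[0] ** 2 + x[0] * x[1] + 2 * x[1] ** 2

def hessian_fd(f, x, h=1e-5):
    # Central finite-difference estimate of the Hessian of f at x.
    d = len(x)
    H = np.empty((d, d))
    for i in range(d):
        for j in range(d):
            ei = np.zeros(d); ei[i] = h
            ej = np.zeros(d); ej[j] = h
            H[i, j] = (f(x + ei + ej) - f(x + ei - ej)
                       - f(x - ei + ej) + f(x - ei - ej)) / (4 * h ** 2)
    return H

H = hessian_fd(f, np.array([0.7, -1.3]))
print(np.round(H, 4))   # matches [[2, 1], [1, 4]] at any base point
```

Trying several base points confirms the claim in the text: for this quadratic, the Hessian is the same constant matrix everywhere.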

The vector-valued second derivative test says: a twice-differentiable function $f$ is convex exactly when its Hessian is positive semidefinite everywhere, meaning

$$\vec v^T H_f(\vec x) \vec v \ge 0 \qquad \text{for all } \vec x \in \mathbb{R}^d \text{ and all } \vec v \in \mathbb{R}^d.$$

If the Hessian is positive definite everywhere, then I get strict convexity.
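For a symmetric matrix, positive definiteness can be checked by confirming that every eigenvalue is positive, or by sampling the quadratic form directly. A small sketch for the example Hessian above:

```python
import numpy as np

H = np.array([[2.0, 1.0],
              [1.0, 4.0]])   # the constant Hessian from the example

# A symmetric matrix is positive definite iff all eigenvalues are > 0.
eigs = np.linalg.eigvalsh(H)
print(eigs)                  # both eigenvalues are positive: 3 -/+ sqrt(2)

# Equivalent spot check: v^T H v > 0 along random nonzero directions.
rng = np.random.default_rng(0)
for _ in range(1000):
    v = rng.normal(size=2)
    assert v @ H @ v > 0
```

So this Hessian is positive definite, and the quadratic $x_1^2 + x_1 x_2 + 2x_2^2$ is strictly convex.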

This brings us back to mean squared error. For

$$R_{\mathrm{sq}}(\vec w) = \frac{1}{n} \lVert \vec y - X \vec w \rVert^2,$$

we already computed

$$\nabla R_{\mathrm{sq}}(\vec w) = \frac{2}{n}\left(X^T X \vec w - X^T \vec y\right).$$

Differentiate once more, and the Hessian is

$$H_{R_{\mathrm{sq}}}(\vec w) = \frac{2}{n} X^T X.$$

That is a really nice conclusion: the Hessian is constant, and it is always positive semidefinite, since $\vec v^T X^T X \vec v = \lVert X \vec v \rVert^2 \ge 0$ for every $\vec v$. So mean squared error is always convex. If the columns of $X$ are linearly independent, then $X^T X$ is positive definite, which is why $R_{\mathrm{sq}}$ becomes strictly convex and has a unique minimizer. If the columns are dependent, there is a zero-curvature direction, which is exactly the flat valley we saw above.
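Both regimes can be checked directly by computing eigenvalues of $\frac{2}{n} X^T X$. A small sketch with randomly generated data of my own:

```python
import numpy as np

rng = np.random.default_rng(2)
n = 40

# Independent columns: the Hessian (2/n) X^T X is positive definite.
X_ind = rng.normal(size=(n, 2))
H_ind = (2 / n) * X_ind.T @ X_ind
print(np.linalg.eigvalsh(H_ind))   # both eigenvalues positive for generic data

# Dependent columns (second column = twice the first): one eigenvalue
# is 0, the zero-curvature direction along the flat valley.
x = rng.normal(size=n)
X_dep = np.column_stack([x, 2 * x])
H_dep = (2 / n) * X_dep.T @ X_dep
print(np.linalg.eigvalsh(H_dep))   # smallest eigenvalue is (numerically) 0
```

The eigenvector attached to the zero eigenvalue of `H_dep` is the direction in which $\vec w$ can move without changing the predictions $X\vec w$.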


Aside: Tangent Hyperplanes

For scalar-to-scalar functions, there is another way to recognize convexity: a differentiable convex function always lies above each of its tangent lines.

For vector-to-scalar functions, the same story holds, except the tangent line becomes a tangent hyperplane. When the input has two coordinates, that hyperplane is literally just a plane in 3D.

The next figure shows a convex surface together with its tangent plane at one point.


Suppose I fix a point $\vec a$. The tangent hyperplane to $f$ at $\vec a$ is the linear approximation

$$L_{\vec a}(\vec x) = f(\vec a) + \nabla f(\vec a)^T (\vec x - \vec a).$$

This is the vector-valued version of the tangent-line formula from calculus.

In words: the graph of a differentiable convex function lies above every tangent hyperplane.

I like this characterization because it tells me that the local linear information in the gradient is globally trustworthy. On a non-convex function, a tangent line or tangent plane can point you in a misleading direction. On a convex function, the tangent hyperplane never overshoots the graph, which is a big part of why gradient-based optimization behaves so nicely in this setting.
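The "never overshoots" claim can be sampled numerically. A minimal sketch, using the convex function $f(\vec x) = \lVert \vec x \rVert^2$ and a base point of my own choosing:

```python
import numpy as np

rng = np.random.default_rng(3)

def f(x):
    # A convex function of two variables (my choice for illustration).
    return x @ x          # f(x) = ||x||^2

def grad_f(x):
    return 2 * x          # gradient of ||x||^2, computed by hand

a = np.array([1.0, -2.0])

def tangent(x):
    # L_a(x) = f(a) + grad f(a)^T (x - a)
    return f(a) + grad_f(a) @ (x - a)

# The graph should lie on or above the tangent hyperplane everywhere.
for _ in range(1000):
    x = rng.uniform(-10, 10, size=2)
    assert f(x) >= tangent(x) - 1e-9
print("f(x) >= L_a(x) at all sampled points")
```

For this particular $f$ the gap is exactly $\lVert \vec x - \vec a \rVert^2 \ge 0$, so the inequality holds with equality only at $\vec a$ itself.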