%% Cell type:markdown id:302a6da0-890d-45b5-a498-126e4c498ae3 tags:
# Decision Theory
%% Cell type:code id:51ef5557 tags:thebe-remove-input-init
``` python
# pip install packages that are not in Pyodide
%pip install ipympl==0.9.3
%pip install seaborn==0.12.2
```
%% Cell type:code id:920b602e tags:disable-download-page,thebe-remove-input-init,auto-execute-page
``` python
# Import the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from mude_tools import magicplotter, biasvarianceplotter
from cycler import cycler
import seaborn as sns

%matplotlib widget

# Set the color scheme
sns.set_theme()
colors = [
    "#0076C2",
    "#EC6842",
    "#A50034",
    "#009B77",
    "#FFB81C",
    "#E03C31",
    "#6CC24A",
    "#EF60A3",
    "#0C2340",
    "#00B8C8",
    "#6F1D77",
]
plt.rcParams["axes.prop_cycle"] = cycler(color=colors)
```
%% Cell type:markdown id:2851b3b8 tags:
## Introduction
<iframe width="560" height="315" src="https://www.youtube.com/embed/7JHvoZfSFBY?si=9UvxfuVizLSxkBAy" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>
In the previous chapter, we built a k-nearest neighbors model and observed the influence of choosing various values for $k$. We found that the choice of $k$ depends, among other things, on the specific dataset used and the noise in the data. By tweaking $k$, we could get a model which __qualitatively__ fits our data well.
In this chapter, we will try to __quantify__ how well a specific model performs for any dataset, which can help us choose the best one.
%% Cell type:code id:d8eaabc5 tags:thebe-remove-input-init
``` python
# The true function relating t to x
def f_truth(x, freq=1, **kwargs):
    # Return a sine with the given frequency
    return np.sin(x * freq)


# The data generation function
def f_data(epsilon=0.7, N=100, **kwargs):
    # Apply a seed if one is given
    if "seed" in kwargs:
        np.random.seed(kwargs["seed"])

    # Get the minimum and maximum
    xmin = kwargs.get("xmin", 0)
    xmax = kwargs.get("xmax", 2 * np.pi)

    # Generate N evenly spaced observation locations
    x = np.linspace(xmin, xmax, N)

    # Generate N noisy observations (1 at each location)
    t = f_truth(x, **kwargs) + np.random.normal(0, epsilon, N)

    # Return both the locations and the observations
    return x, t


# Define a function that makes a KNN prediction at the given locations, based on the given (x,t) data
def KNN(x, t, x_pred, k=1, **kwargs):
    # Convert x and x_pred to column vectors so that KNeighborsRegressor accepts them
    X = x.reshape(-1, 1)
    X_pred = x_pred.reshape(-1, 1)

    # Train the KNN based on the given (x,t) data
    neigh = KNeighborsRegressor(k)
    neigh.fit(X, t)

    # Make a prediction at the locations given by x_pred
    y = neigh.predict(X_pred)

    # Later we require two predictions from a single KNN predictor.
    # To prevent training it twice, the option to predict for 2 separate sets is created
    if "extra_predictions" in kwargs:
        x_pred_extra = kwargs.get("extra_predictions").reshape(-1, 1)
        return y, neigh.predict(x_pred_extra)

    # Return the predicted values
    return y
```
%% Cell type:markdown id:76afeffd tags:
## Loss function
To quantify the "closeness" between predictions of some model $y(\mathbf{x})$ and the observed data $t$, we introduce the squared loss function:
$$
L ( t, y(\mathbf{x}))= (y(\mathbf{x}) - t)^2,
$$
where $t$ are the observed values corresponding to $\mathbf{x}$. Squaring the difference has a number of nice properties:
* The loss is always positive, regardless of whether we underestimate or overestimate a prediction.
* It is differentiable at $y(\mathbf{x})=t$, which is not true for all loss functions (e.g. the absolute loss).
Below we plot this loss for some data points. Notice that in this case we are comparing the observations $t$ to the ground truth $f(\mathbf{x})$ (as opposed to some fitted model). The error therefore comes solely from the noise in our observations.
%% Cell type:code id:f245cf39 tags:remove-cell
``` python
# Get the observed data in 8 locations
x, t = f_data(N=8, seed=0)
# Define the prediction locations
x_pred = np.linspace(0, 2 * np.pi, 1000)
# Plot the data and the ground truth
fig = plt.figure(figsize=(6, 4))
fig.canvas.toolbar_visible = False
plt.plot(x_pred, f_truth(x_pred), "k-", label=r"Ground truth $f(x)$")
plt.plot(x, t, "x", label=r"Noisy data $(x,t)$")
# Plot the difference and print the squared loss
for i, x_i in enumerate(x):
    ymin = min(t[i], f_truth(x_i))
    ymax = max(t[i], f_truth(x_i))
    if i == 0:  # Only plot the label once
        plt.vlines(x_i, ymin, ymax, "C2", label="Squared Loss")
    else:
        plt.vlines(x_i, ymin, ymax, "C2")
    plt.text(x_i + 0.05, (ymin + ymax) / 2, f"{(ymax-ymin)**2:.2f}")
plt.xlabel("x")
plt.ylabel("t")
plt.legend()
plt.show()
```
%% Cell type:markdown id:c1f8a70a tags:
```{figure} ./figs/fig2.png
Ground truth vs noisy data
```
%% Cell type:markdown id:b930770f tags:
## Theory
To obtain our model, we want a single value that tells us how well it explains all the data. Therefore it is natural to compute the expected loss:
$$
\mathbb{E}[L]= \int \int (y(\mathbf{x})-t)^2 p( \mathbf{x},t) d\mathbf{x} dt
$$
where $(y(\mathbf{x})-t)^2$ is the error term we have seen before, and we multiply it by $p( \mathbf{x},t)$, the probability of this particular point occurring. Integrating this over the complete probability density $p(\mathbf{x},t)$ results in a single scalar that summarizes our expected prediction error.
Naturally, our goal is to choose a model $y(\mathbf{x})$ so as to minimize $\mathbb{E}[L]$. We can do this using calculus of variations:
$$
\frac{\delta \mathbb{E}[L]}{\delta y(\mathbf{x})}= 2 \int (y(\mathbf{x})-t) p( \mathbf{x},t) dt = 0,
$$
where we keep the detailed steps of the derivation out of the scope of this discussion. Solving for $y(\mathbf{x})$ we get:
$$
\begin{align}
\int y(\mathbf{x}) p( \mathbf{x},t) dt &= \int t p( \mathbf{x},t) dt \\
y(\mathbf{x}) p( \mathbf{x}) &= \int t p( \mathbf{x},t) dt \\
y(\mathbf{x}) &= \frac{\int t p( \mathbf{x},t) dt}{p(\mathbf{x})} = \int t p(t|\mathbf{x})dt = \mathbb{E}_t[t|\mathbf{x}]
\end{align},
$$
where the Sum Rule of probability $p(\mathbf{x}) = \int{p(\mathbf{x},t)dt}$ has been employed and we factorize the joint probability distribution as $p(\mathbf{x},t) = p(\mathbf{x})p(t\vert\mathbf{x})$ (Product Rule). From the result above, the model that minimizes the squared loss is therefore given by the mean of the conditional distribution $p(t|\mathbf{x})$. This important result can also be seen graphically for a regression model with a single input:
<center><img src="https://surfdrive.surf.nl/files/index.php/s/OGErUzhQqGl4BCh/download", width=300px/></center>
In practice we generally do not know $p( \mathbf{x},t)$ or $p(t|\mathbf{x})$ exactly. Instead, we can estimate the loss for any model by taking the average of the squared losses for each point in our limited dataset. We can then tweak $y(\mathbf{x})$ to minimize this loss.
$$
\int \int (y(\mathbf{x})-t)^2 p( \mathbf{x},t) d\mathbf{x} dt \approx \dfrac{1}{N} \sum_i^N (y(\mathbf{x}_i)-t_i)^2
$$
This is known as the Mean Squared Error (MSE) function. We now revisit our k-nearest neighbors model with the value of the training loss shown in the plot title.
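As a minimal sketch, this training loss can be computed directly with the `f_data` and `KNN` helpers defined above (the choices of $N$, the seed, and $k$ below are arbitrary and only meant for illustration):
``` python
# Estimate the training MSE of a k-NN model by averaging the squared errors
x, t = f_data(N=100, seed=0)
y = KNN(x, t, x, k=5)              # predict at the training locations themselves
mse_train = np.mean((y - t) ** 2)  # (1/N) * sum of squared errors
print(f"Training MSE for k=5: {mse_train:.3f}")
```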
%% Cell type:code id:2b2a57a9 tags:thebe-remove-input-init,full-width
``` python
# Get the observed data
x, t = f_data(N=100)
# Define the prediction locations
x_pred = np.linspace(0, 2 * np.pi, 1000)
# Create the interactive plot including training error
plot1 = magicplotter(f_data, f_truth, KNN, x_pred)
plot1.fig.canvas.toolbar_visible = False
plot1.add_sliders("epsilon", "k", "N", "freq")
plot1.add_buttons("truth", "seed", "reset")
plot1.title = "Mean squared error: {mse_train:.3f}"
plot1.show()
```
%% Cell type:markdown id:3ab91be0 tags:
## Model selection
For $k = 1$ our loss function goes to exactly zero, yet the resulting model will not yield good predictions for unseen data. Our model has fitted the noise in the dataset, and therefore does not generalize well. We say that we have __overfitted__ our model to the training data. Clearly, evaluating the squared loss solely on our training points does not suffice if our goal is to make an informed choice for $k$.
To solve this problem a validation set is introduced. This set consists of different observations from the process which are not included during training. We also define a test set with samples that can be used __at the very end__ of our model building process in order to assess in an unbiased way if our choice of $k$ has been well informed. We therefore have three separate loss values:
| Loss | Description |
| :--- | :--- |
| **Training Loss:** | The value of the loss function based on all observations used to train a model |
| **Validation Loss:** | The value of the loss function based on observations not used during training, used for finding hyperparameters such as $k$ |
| **Test Loss:** | The value of the loss function based on observations not used during either training or validation, used for evaluating the fit of the final model (for example to compare to alternative models) |
We repeat the experiment above with an additional validation set for which we compute the validation loss. This validation error represents the error we obtain when making predictions for data that was hidden from the model during training, and is therefore a much better indicator for assessing our model than the training error. Minimizing the error on this validation set allows us to find a good value for $k$.
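Conceptually, selecting $k$ with a validation set could look like the sketch below (the 80/20 random split and the range of $k$ values are arbitrary choices for illustration; the helpers defined above are reused):
``` python
# Split the data into a training and a validation set
x, t = f_data(N=100, seed=0)
rng = np.random.default_rng(0)
perm = rng.permutation(len(x))
idx_train, idx_val = perm[:80], perm[80:]

# Evaluate the validation MSE for a range of k values and keep the best
best_k, best_val_mse = None, np.inf
for k in range(1, 31):
    y_val = KNN(x[idx_train], t[idx_train], x[idx_val], k=k)
    val_mse = np.mean((y_val - t[idx_val]) ** 2)
    if val_mse < best_val_mse:
        best_k, best_val_mse = k, val_mse

print(f"Best k: {best_k} (validation MSE {best_val_mse:.3f})")
```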
%% Cell type:code id:e97f456c tags:full-width,thebe-remove-input-init
``` python
# Get the observed data
x, t = f_data(N=100)
# Define the prediction locations
x_pred = np.linspace(0, 2 * np.pi, 1000)
# Create the interactive plot including training error
plot2 = magicplotter(f_data, f_truth, KNN, x_pred)
plot2.fig.canvas.toolbar_visible = False
plot2.add_sliders("epsilon", "k", "N", "freq")
plot2.add_buttons("truth", "seed", "reset")
plot2.add_sidebar()
plot2.title = "Training error: {mse_train:.3f}, Validation error: {mse_validation:.3f}"
plot2.show()
```
%% Cell type:markdown id:6bb50f34 tags:
__Try it out above!__ How does this compare with your findings from the previous chapter? Note that we now leave 20% of data points out for validation, and that the trained function does not fit them exactly. For a given dataset size, move the $k$ slider until the validation loss is as low as possible.
Having selected a value for $k$, we could then compute the loss on our test dataset as one final check of model validity. We leave that step to you when working on our real-world problems later on.
## Bias-variance tradeoff
As we determined earlier, the optimal prediction (in the sense that it minimizes the squared loss function), which we will denote as $h(\mathbf{x})$, is given by:
$$
h(\mathbf{x})= \mathbb{E}[t|\mathbf{x}] = \int t p(t|\mathbf{x})dt
$$
Starting from the expression for the loss function introduced earlier, we add and subtract $h(\mathbf{x})$ inside the squared term, with no loss of generality. This allows us to split the loss expression into two important parts:
$$
\begin{aligned}
\mathbb{E}[L] &= \int\int \left(y(\mathbf{x}) - t\right)^2p(\mathbf{x},t) d\mathbf{x}dt \\
&= \int\int \left(y(\mathbf{x}) - h(\mathbf{x}) + h(\mathbf{x}) - t \right)^2 p(\mathbf{x},t) d\mathbf{x}dt \\
&= \int\int \left(y(\mathbf{x}) - h(\mathbf{x})\right)^2 p(\mathbf{x},t) d\mathbf{x}dt + \int\int \left(h(\mathbf{x}) - t \right)^2 p(\mathbf{x},t) d\mathbf{x}dt + 2 \int\int \left(y(\mathbf{x}) - h(\mathbf{x})\right)\left(h(\mathbf{x}) - t\right) p(\mathbf{x},t) d\mathbf{x}dt
\end{aligned}
$$
Now note that the first term no longer depends on $t$ (remember that $h(\mathbf{x})$ is already an integral over $t$). This means we can simplify it to an expectation only over $\mathbf{x}$. Furthermore, the third term vanishes once we integrate over $t$. We therefore end up with the following very important decomposition:
$$
\mathbb{E}[L]= \underbrace{\int \left(y(\mathbf{x}) - h(\mathbf{x})\right)^2 p(\mathbf{x}) d\mathbf{x}}_{\text{reducible loss}} + \underbrace{\int\int \left(h(\mathbf{x}) - t \right)^2 p(\mathbf{x},t) d\mathbf{x}dt}_{\text{irreducible noise}}
$$
The irreducible noise term represents the variance that will always be present in our model because we make noisy observations. Even if we had the perfect model $h(\mathbf{x})$, we would still have an expected loss greater than zero due to the noise inherent to our data. For the kNN model above, this is a bit hidden because we have __exactly one observation__ for each value of $x$. This is the reason why we can still obtain zero loss with our kNN estimator, but in practice we would ideally have several observations for the same value of our inputs.
Regardless, focusing now on the first term, we might note that actually, our model $y(\mathbf{x})$ not only depends on $\mathbf{x}$ but also on which dataset $\mathcal{D}$ we used for training. So, the first integral should be replaced as follows:
$$
\int \left(y(\mathbf{x}) - h(\mathbf{x})\right)^2 p(\mathbf{x}) d\mathbf{x} \quad \Longrightarrow \quad \int \mathbb{E}_\mathcal{D}\left[\left(y(\mathbf{x},\mathcal{D}) - h(\mathbf{x})\right)^2\right] p(\mathbf{x}) d\mathbf{x}
$$
Now, we do some rearrangement in the inner part of this expression:
$$
\begin{aligned}
\left(y(\mathbf{x},\mathcal{D}) - h(\mathbf{x})\right)^2 &= \left(y(\mathbf{x},\mathcal{D}) - \mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right] + \mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right] - h(\mathbf{x})\right)^2 \\
&= \left(y(\mathbf{x},\mathcal{D}) - \mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right]\right)^2 + \left(\mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right] - h(\mathbf{x})\right)^2 + 2 \left(y(\mathbf{x},\mathcal{D}) - \mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right]\right)\left(\mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right] - h(\mathbf{x})\right)
\end{aligned}
$$
and then taking the expectation over all possible datasets we get:
$$
\begin{aligned}
\mathbb{E}_\mathcal{D}\left[\left(y(\mathbf{x},\mathcal{D}) - h(\mathbf{x})\right)^2\right] &= \mathbb{E}_\mathcal{D}\left[\left(y(\mathbf{x},\mathcal{D}) - \mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right]\right)^2\right] + \mathbb{E}_\mathcal{D}\left[\left(\mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right] - h(\mathbf{x})\right)\right]^2 + 0 \\
&= \underbrace{\mathbb{E}_\mathcal{D}\left[\left(y(\mathbf{x},\mathcal{D}) - \mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right]\right)^2\right]}_{\text{variance}} + \underbrace{\left\{\mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right] - h(\mathbf{x})\right\}^2}_{\text{bias}^2}
\end{aligned}
$$
Overall, the expected loss can be decomposed as follows:
$$
\text{expected loss} = \text{bias}^2 + \text{variance} + \text{noise}
$$
```{admonition} MUDE Exam Information
:class: tip, dropdown
You will not be asked to reproduce this derivation on the exam, but you should be familiar with the meaning of this final result and with the interpretation of each of these important terms.
```
These terms can be understood as follows:
- __Bias__: Represents the mismatch between the expectation of our model $\mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right]$ and the optimal prediction $h(\mathbf{x})$. An __unbiased__ model will on average give us the optimal prediction, whereas a __biased__ model will on average deviate from the optimal prediction.
- __Variance__: Represents the degree to which a given model $y(\mathbf{x},\mathcal{D})$ (trained with one specific realization of $\mathcal{D}$) deviates from the average model $\mathbb{E}_\mathcal{D}\left[y(\mathbf{x},\mathcal{D})\right]$. If the variance is large, it means that a new dataset would produce a very different model.
- __Noise__: Represents the inherent variation in our dataset. This term cannot be reduced because it stems from our data and not from our choice of model.
Generally speaking, there is a certain balance to strike between bias and variance. A more flexible model tends to target the optimal prediction $h(\mathbf{x})$ on average but can be significantly influenced by the dataset to which it is fitted. In other words, it has a lower bias but a higher variance. On the other hand, a less flexible model tends to be less affected by the dataset to which it is fitted, but this causes it to deviate from $h(\mathbf{x})$. In other words, it has a lower variance but a higher bias. This dilemma is referred to as the __bias-variance tradeoff__.
Looking back at the plot above, try to answer the following questions:
* When $k$ is low, do we have a high or a low bias? And what about the variance?
* When $k$ is high, do we have a high or a low bias? And what about the variance?
We will now quantify the bias and variance by creating several datasets and averaging over them, following the equations above. Notice that computing the expressions above is only possible here because we know the ground truth in this example, that is, we know what the perfect model $h(\mathbf{x})$ should look like.
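A bare-bones version of this computation could look as follows (a sketch reusing the helpers defined above; the number of datasets, $N$, and $k$ are arbitrary choices):
``` python
# Monte Carlo estimate of bias^2 and variance for a k-NN model
n_datasets, k = 200, 5
x_plot = np.linspace(0, 2 * np.pi, 200)

# Collect predictions y(x, D) for many independently generated datasets D
predictions = np.zeros((n_datasets, len(x_plot)))
for d in range(n_datasets):
    x, t = f_data(N=100, seed=d)
    predictions[d] = KNN(x, t, x_plot, k=k)

y_mean = predictions.mean(axis=0)        # E_D[y(x, D)]
bias2 = (y_mean - f_truth(x_plot)) ** 2  # squared bias at each x
variance = predictions.var(axis=0)       # variance at each x

print(f"average bias^2:   {bias2.mean():.4f}")
print(f"average variance: {variance.mean():.4f}")
```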
%% Cell type:code id:6c2e3a15 tags:thebe-remove-input-init
``` python
plot3 = biasvarianceplotter(f_data, f_truth, KNN, x_pred)
plot3.fig.canvas.toolbar_visible = False
plot3.add_sliders("epsilon", "k", "N", "freq")
plot3.add_buttons("seed", "reset", "D_small", "D_medium", "D_large")
plot3.show()
```
%% Cell type:markdown id:5d9b7f08 tags:
## Playing around with the plot
Understanding bias and variance is critical for any method you use, and this will come back in many of the following lectures. Being able to answer some of the following questions can help you understand what is going on:
* Why is the variance small for a large $k$?
````{admonition} Answer
:class: dropdown
With higher values of $k$, we are averaging over a larger neighborhood. Intuitively, you can imagine that if our noise is high, generating multiple datasets of the same size will result in very different datasets, especially if $N$ is small. But over larger and larger neighborhoods, the noise we are adding tends to cancel out between points (remember the expected value of our noise is zero). It is therefore more likely that our resulting model will not change all that much for different datasets. This is the definition of a model with low variance, which for kNN with a large $k$ typically comes at the cost of a higher bias.
````
* Why is the bias not exactly zero everywhere when $k=1$?
````{admonition} Answer
:class: dropdown
The bias should be zero at the locations of every one of the $N$ data points and when we take expectations over an infinite number of datasets. Take a look at what happens with the bias between data point locations and for different numbers of datasets.
````
* Why does the bias increase when choosing a high frequency $freq$?
````{admonition} Answer
:class: dropdown
The complexity of our ground truth increases with frequency. So it makes sense that we would then need a more complex model to capture it properly. This in turn means models with the same $k$ will likely show an increase in bias. Note that this is not the case for $k=1$.
````
__TIP__: When playing with the plot, be aware that training the model with $2000$ different datasets might take a while! When moving the sliders, set a lower number of datasets and only click the '2000 datasets' button once you are done adjusting them.
## Final remarks
So far, we have looked at our k-nearest neighbors regressor, which is relatively easy to understand and play around with. However, this method is not often used in practice, as it scales poorly for large datasets. Furthermore, computing the closest points becomes computationally expensive when considering multiple inputs (high dimensions). In a limited data setting, taking the average of the nearest neighbors will often lead to poor performance. k-nearest neighbors is also sensitive to outliers. In the following chapter, we will apply what we have learned to a different model, namely, linear regression.
......
%% Cell type:markdown id:2b2ff7b6-91a6-4736-9ac3-72d7ac39e0c1 tags:
<figure>
<IMG SRC="https://raw.githubusercontent.com/fmeer/public-files/main/TUlogo.png" WIDTH=200 ALIGN="right">
</figure>
# Feedforward Neural Networks
%% Cell type:code id:5e19062f tags:thebe-remove-input-init
``` python
# pip install packages that are not in Pyodide
%pip install ipympl==0.9.3
%pip install seaborn==0.12.2
```
%% Cell type:code id:87cf6bca tags:disable-download-page,thebe-remove-input-init,auto-execute-page
``` python
# Import the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from mude_tools import neuralnetplotter, draw_neural_net
from cycler import cycler
import seaborn as sns

%matplotlib widget

# Set the color scheme
sns.set_theme()
colors = [
    "#0076C2",
    "#EC6842",
    "#A50034",
    "#009B77",
    "#FFB81C",
    "#E03C31",
    "#6CC24A",
    "#EF60A3",
    "#0C2340",
    "#00B8C8",
    "#6F1D77",
]
plt.rcParams["axes.prop_cycle"] = cycler(color=colors)
```
%% Cell type:markdown id:73f4d0fd-ec99-4e92-be60-69cceb55cf63 tags:thebe-remove-input-init
## Introduction
<iframe width="560" height="315" src="https://www.youtube.com/embed/n6ycKSB66TQ?si=2OJ_vivw9PQHlk7c" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>
Recall that in the previous chapters we have applied linear models with basis functions
$$
y(x,\mathbf{w}) = \sum_{j=0}^M w_j \phi_j(x) = \mathbf{w}^T \boldsymbol{\phi} (x).
$$
Here $\mathbf{w}$ are the flexible parameters, and $\boldsymbol{\phi}$ the basis functions.
Because a linear model is linear in its parameters $\mathbf{w}$, we could solve for $\bar{\mathbf{w}}$ directly
$$
\bar{\mathbf{w}} = \left(\boldsymbol{\Phi}^T \boldsymbol{\Phi} \right)^{-1} \boldsymbol{\Phi}^T \mathbf{t},
$$
where $\mathbf{\Phi}$ is the collection of basis functions evaluated in all data points. The basis functions need to be chosen *a priori* for this approach. When the phenomenon to be modeled is complex, relying on pre-defined basis functions might not give sufficient accuracy. We can overcome this issue with a more flexible model. Aside from the pragmatic strategy of increasing the number of basis functions, we can also achieve more flexibility by replacing the basis functions with parametric functions. In this chapter we will dive into one variant of this concept, namely **neural networks**. The underlying problem will stay the same: we are trying to learn a process based on a limited number of noisy observations $\mathcal{D}=\{\mathbf{X}, \mathbf{t}\}$. Following decision theory, we need to minimize the mean squared error loss function
$$
E_D = \frac{1}{N} \sum_{n=1}^N \left(t_n - y(x_n, \mathbf{w}) \right)^2
$$
where $y(x, \mathbf{w})$ now comes from a neural network.
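As a reminder of how the closed-form solution above is used in practice, here is a minimal sketch (the polynomial basis and all settings are arbitrary choices for illustration):
``` python
import numpy as np

# Generate some noisy observations of a sine
rng = np.random.default_rng(0)
x = np.linspace(0, 2 * np.pi, 50)
t = np.sin(2 * x) + rng.normal(0, 0.3, x.shape)

# Polynomial basis functions phi_j(x) = x^j, j = 0, ..., M
M = 5
Phi = np.vander(x, M + 1, increasing=True)

# Closed-form least-squares solution w = (Phi^T Phi)^{-1} Phi^T t
w = np.linalg.solve(Phi.T @ Phi, Phi.T @ t)
y = Phi @ w  # model predictions at the observation locations
```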
%% Cell type:code id:fc3c6c8a tags:thebe-remove-input-init
``` python
# The true function relating t to x
def f_truth(x, freq=2, **kwargs):
    # Return a sine with the given frequency
    return np.sin(x * freq)


# The data generation function
def f_data(epsilon=0.7, N=100, **kwargs):
    # Apply a seed if one is given
    if "seed" in kwargs:
        np.random.seed(kwargs["seed"])

    # Get the minimum and maximum
    xmin = kwargs.get("xmin", 0)
    xmax = kwargs.get("xmax", 2 * np.pi)

    # Generate N evenly spaced observation locations
    x = np.linspace(xmin, xmax, N)

    # Generate N noisy observations (1 at each location)
    t = f_truth(x, **kwargs) + np.random.normal(0, epsilon, N)

    # Return both the locations and the observations
    return x, t
```
%% Cell type:markdown id:9aa5d0f7 tags:
## Neural network architecture
A neural network consists of neurons connected by weights, with information flowing from input neurons towards output neurons. In **supervised learning**, the states of the input and output neurons are known during training. There are additional neurons in layers between the inputs and outputs, forming so-called hidden layers. Neurons are separated into layers, where all neurons of one layer depend on (at least) the neurons of the previous layer.
The state of a neuron is determined by a linear combination of states $z$ from the previous layer with their connecting weights $w$
$$
a^{(l)}_{j} = \sum_{i=1}^{D} w_{ji}^{(l)} z_{i}^{(l-1)} + w_{j0}^{(l)}
$$
where $w_{j0}^{(l)}$ are so-called biases, allowing the model to have an offset. Make sure not to confuse this quantity with the model bias from the *bias-variance* tradeoff discussion. This linear combination of states is followed by a nonlinear transformation with an activation function $h(\cdot)$:
$$
z^{(l)}_{j} = h(a^{(l)}_{j}).
$$
In the plot below, we can see the identity (or linear), sigmoid, hyperbolic tangent (tanh), and rectified linear unit (relu) activation functions applied on an arbitrary state $z$ in the $[-4,4]$ range.
%% Cell type:code id:0a65ffd4 tags:remove-cell
``` python
x = np.linspace(-4, 4, 25)

# Compute activation functions
identity = x
sigmoid = 1 / (1 + np.exp(-x))
tanh = np.tanh(x)
relu = np.maximum(0, x)

# Plot figure
fig, ax = plt.subplots(figsize=(8, 4.5))
fig.canvas.toolbar_visible = False
ax.set_position([0.2, 0.1, 0.7, 0.8])
plt.plot(x, identity, "-v", label="Identity (linear)")
plt.plot(x, sigmoid, "-|", label="Sigmoid")
plt.plot(x, tanh, "-o", label="Tanh")
plt.plot(x, relu, "-x", label="ReLU")
plt.xlim(-4, 4)
plt.ylim(-2, 2)
plt.xlabel("$a$")
plt.ylabel("$z$")
# plt.title("Activation functions")
plt.legend()
plt.show()
```
%% Cell type:markdown id:8e2c92e9 tags:
```{figure} ./figs/fig18.png
Activation functions
```
%% Cell type:markdown id:bc3c3bed tags:
The number of layers in a neural network commonly refers to the number of hidden layers. Following the aforementioned setup of compounding linear transformations with nonlinear activations, the output of a two-layer neural network can be written as:
$$
y(x, \mathbf{w}) = h^{(out)} \left( \sum_{k=1}^{K} w_{k}^{(3)} h^{(2)} \bigg( \sum_{j=1}^{J} w_{kj}^{(2)} h^{(1)} \Big( \sum_{i=1}^{I} w_{ji}^{(1)} x_i + w_{j0}^{(1)} \Big) + w_{k0}^{(2)} \bigg) + w_{0}^{(3)} \right)
$$
Since the activation function can be nonlinear and quantities proportional to the weights pass through it, the model is, in general, no longer linear w.r.t. the weights, and no closed-form Maximum Likelihood solution can be found. Compare this with the linear basis function models from before. Instead of seeking an analytical solution that no longer exists, a gradient-based optimization scheme, such as the SGD discussed in the previous chapter, is required to calibrate the weights.
When your dataset contains multiple inputs or outputs, this model can easily be extended by including multiple neurons in the input or output layer, and the other procedures stay the same. Generally, the activation function of the outputs $h^{(out)}$ is linear and the activations in hidden layers are of a nonlinear type.
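To make the expression above concrete, here is a minimal numpy sketch of a forward pass through a network with two hidden layers of five tanh neurons and a linear output (the weights are random and purely illustrative):
``` python
rng = np.random.default_rng(0)
W1, b1 = rng.normal(size=(5, 1)), rng.normal(size=5)  # input (1) -> hidden layer 1 (5)
W2, b2 = rng.normal(size=(5, 5)), rng.normal(size=5)  # hidden layer 1 -> hidden layer 2
W3, b3 = rng.normal(size=(1, 5)), rng.normal(size=1)  # hidden layer 2 -> output (1)


def forward(x):
    z1 = np.tanh(W1 @ x + b1)   # a^(1) = W1 x + b1,  z^(1) = h(a^(1))
    z2 = np.tanh(W2 @ z1 + b2)  # a^(2), z^(2)
    return W3 @ z2 + b3         # linear output activation


print(forward(np.array([0.5])))
```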
%% Cell type:code id:0358be43-981b-43e6-8587-7f7121a6bbc6 tags:remove-cell
``` python
fig, ax = plt.subplots(1, 2, figsize=(8, 3))
fig.canvas.toolbar_visible = False
draw_neural_net(ax[0], 0.1, 0.9, 0.1, 0.9, [1, 5, 1])
draw_neural_net(ax[1], 0.1, 0.9, 0.1, 0.9, [2, 5, 3])
ax[0].set_title("1 input, 5 hidden nodes, 1 output")
ax[1].set_title("2 inputs, 5 hidden nodes, 3 outputs")
[axs.axis("off") for axs in ax]
plt.show()
```
%% Cell type:markdown id:7c6a7e88 tags:
```{figure} ./figs/fig19.png
Two neural networks
```
%% Cell type:markdown id:9f7aec86-a550-4c2b-8e0d-86010fc8f2b7 tags:
````{admonition} Coding neural nets
Feedforward neural networks are often also called **Multilayer Perceptrons (MLP)** in machine learning software packages. This is also the case for [`scikit-learn`](https://scikit-learn.org/stable/), the package you will be using during the workshop this week. In that package, a neural network model for regression can be instantiated by using the [`MLPRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor) class.
Refer to the documentation of `scikit-learn` to see which arguments you can pass when creating a new `MLPRegressor`. These include the number of layers in the network, the number of units in each layer, the type of activation function, etc.
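For instance, a regressor with two hidden layers could be instantiated as in the sketch below (all hyperparameter values are illustrative, not recommendations):
```python
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(
    hidden_layer_sizes=(10, 10),  # two hidden layers with 10 neurons each
    activation="tanh",            # nonlinear activation in the hidden layers
    solver="sgd",                 # stochastic gradient descent
    learning_rate_init=0.01,
    max_iter=2000,
)
# model.fit(X_train, t_train) would then train it; note that X_train must be 2D, e.g. of shape (N, 1)
```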
````
%% Cell type:markdown id:925b70dd tags:
## Model flexibility
The flexibility of a feedforward neural network can be adapted by varying the number of neurons per hidden layer, or by adding more hidden layers. Both options lead to an increase in the number of parameters $\mathbf{w}$. When a neural network has too few parameters, it generally puts us at risk of underfitting the data, whereas having too many parameters can quickly lead to overfitting. Since they control model complexity, the number of layers and the number of neurons per layer are hyperparameters, which need to be calibrated. Once again, remember that **hyperparameters are calibrated with validation data**. Simply minimizing the training error w.r.t. these hyperparameters will always lead to huge and severely overfit models, especially when we do not have a lot of data available.
````{admonition} Coding neural nets
It is usual to perform a so-called *grid search* when performing model selection for models with more than one hyper-parameter. The strategy is to train a separate model for **each combination** of hyperparameters and pick the one with the lowest value for validation loss. Conceptually, code for this could look something like:
```python
hp1vals = [val1, val2, ...]
hp2vals = [val1, val2, ...]
val_losses = np.zeros((len(hp1vals), len(hp2vals)))

for i, hp1 in enumerate(hp1vals):
    for j, hp2 in enumerate(hp2vals):
        # build the model
        model = Model(hp1, hp2)
        # train the model
        lowest_val_loss = train_model(model, X_train, t_train, X_val, t_val, n_epochs)
        # store loss
        val_losses[i, j] = lowest_val_loss

# pick the model with the lowest entry in val_losses
```
````
In the following interactive plot you can study the influence of the number of neurons per layer on the model prediction. The number of hidden layers is fixed at two. You have to click the re-run button to retrain the model after varying the parameter. **Be aware that the required computations can take a few moments to run.** Hide the table of contents to see the full image.
%% Cell type:code id:fb8137de tags:thebe-remove-input-init
``` python
# Define the prediction locations
# (note that these are different from the locations where we observed our data)
x_pred = np.linspace(-1, 2 * np.pi + 1, 200)

xscaler = StandardScaler()
xscaler.fit(f_data()[0][:, None])


# Function that creates a NN
def NN_create(**kwargs):
    return MLPRegressor(
        solver="sgd",
        hidden_layer_sizes=(kwargs["neurons"], kwargs["neurons"]),
        activation=kwargs["activation"],
        batch_size=kwargs["batch_size"],
    )


# Function that trains a given NN for a given number of epochs
def NN_train(x, t, network, epochs_per_block):
    # Convert the training data to a column vector and normalize it
    X = x.reshape(-1, 1)
    X = xscaler.transform(X)

    # Run a number of epochs
    for i in range(epochs_per_block):
        network.partial_fit(X, t)

    return network, network.loss_curve_


# Function that returns predictions from a given NN model
def NN_pred(x_pred, network):
    # Convert the prediction data to a column vector and normalize it
    X_pred = x_pred.reshape(-1, 1)
    X_pred = xscaler.transform(X_pred)

    # Make a prediction at the locations given by x_pred
    return network.predict(X_pred)
```
%% Cell type:code id:21c8265d tags:full-width,thebe-remove-input-init
``` python
# Pass everything to the neuralnetplotter
plot1 = neuralnetplotter(
    f_data,
    f_truth,
    NN_create,
    NN_train,
    NN_pred,
    x_pred,
    N=100,
    val_pct=60,
    epochs=20000,
)
plot1.fig.canvas.toolbar_visible = False
plot1.add_sliders("neurons", valmax=20, valinit=3)
plot1.add_buttons("truth", "seed", "reset", "rerun")
```
%% Cell type:markdown id:11764482 tags:thebe-remove-input-init
As you might have noticed, using the default setting of three neurons per layer was not enough for the network to learn the underlying trend in the data. Which number of neurons per layer gave you a visibly good fit? Did you ever spot overfitting?
Pay close attention to what we show on the right-hand plot during training:
<!--
NOTE: the formulas below look bad in the downloaded book, but nice in the online version.
Turn the dollar signs in the final column into double dollar signs to make the formulas look good in the downloaded version
-->
| Line color | Quantity | Expression |
| :-- | :-- | :-- |
| Blue | Training set loss ($40\%$ of $N$) | $E_D=\frac{1}{N_\mathrm{train}}\sum_{n=1}^{N_\mathrm{train}} \left(t_n - y(x_n, \mathbf{w}) \right)^2$ |
| Purple | Validation set loss ($60\%$ of $N$) | $E_D=\frac{1}{N_\mathrm{val}}\sum_{n=1}^{N_\mathrm{val}} \left(t_n - y(x_n, \mathbf{w}) \right)^2$ |
| Black | Expected loss (numerical integration) | $\mathbb{E}[L]=\int\int\left(t-y(x,\mathbf{w})\right)^2p(x,t)dxdt$ |
The black line is a very precise version of our error, but one that requires a lot of data to obtain. Since analytical integration is not possible due to the complex nature of $t$ and $y(x,\mathbf{w})$, we are forced to compute it through numerical integration, in this case by using $1500$ equally-spaced predictions in the range $[0,2\pi]$. Although interesting to include here for educational purposes, it is obvious that **we will not have access to this measure in practice**.
What we have instead is the validation loss (purple line) computed as a relatively crude Monte Carlo approximation of the expected loss, since we are only using a very small number of data points to compute it. In practice, we need to rely on this approximation to make model selection decisions!
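For reference, a rough sketch of how such a numerical estimate could be computed, assuming a trained `network`, the `NN_pred` and `f_truth` helpers from above, and the default noise level $\epsilon = 0.7$ of `f_data`:
``` python
# Approximate the expected loss on a dense grid of prediction locations
x_dense = np.linspace(0, 2 * np.pi, 1500)
y_dense = NN_pred(x_dense, network)

# E[(y - t)^2 | x] = (y(x) - f(x))^2 + epsilon^2, averaged over the grid
expected_loss = np.mean((y_dense - f_truth(x_dense)) ** 2) + 0.7**2
```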
````{admonition} Coding neural nets
As with any other model this week, we need to perform a training-validation-test split to our original dataset in order to arrive at a robust machine learning estimator for $y(x)$. A simple (incomplete) implementation of this procedure could look like:
```python
import numpy as np
# fix the seed of the RNG, for reproducibility
np.random.seed(42)
# get an index array with randomized order
perm = np.random.permutation(N)
# compute the number of samples for training/validation/test
n_train = ?
n_val = ?
n_test = ?
# slice the permutation vector from before
idcs_train = perm[0:n_train]
idcs_val = perm[?:?]
idcs_test = perm[?:?]
# use the indices to slice X and t
X_train = X[idcs_train]
t_train = t[idcs_train]
# ...
```
````
## Early stopping
Choosing a high number of neurons increases the number of parameters and therefore slows down training. In addition, it can make the model too flexible and prone to overfitting. It is therefore good practice to always monitor the predictive capability of a NN on a validation set. First run the model below, then select the model you think best fits the data by pulling the corresponding slider. At which epoch do you find the best model?
%% Cell type:code id:0de98308 tags:full-width,thebe-remove-input-init
``` python
plot2 = neuralnetplotter(
    f_data,
    f_truth,
    NN_create,
    NN_train,
    NN_pred,
    x_pred,
    neurons=20,
    epochs=12000,
    N=40,
    val_pct=60,
    batch_size=2,
)
plot2.fig.canvas.toolbar_visible = False
plot2.seed = 4
plot2.add_sliders("cur_model")
plot2.add_buttons("truth", "seed", "reset", "rerun")
plot2.show()
```
%% Cell type:markdown id:9008b730 tags:
The example above is a good illustration of overfitting. Note how the training loss almost immediately becomes an unreliable estimate of the true expected loss (black line). In contrast, the validation loss (purple line) does not exactly agree with the black line but consistently follows the same trends. Crucially, **the validation loss correctly points to the model with the lowest true loss**, at around $3000$ training epochs. Move the model selection slider there and try to see why the true loss starts to increase from that point on.
It is possible to train a neural network for a long time and then select the $\mathbf{w}$ that corresponds to the lowest validation error. An alternative, known as early stopping, halts training as soon as the validation loss has been increasing for a given number of epochs.
````{admonition} Coding neural nets
To make early stopping work, we need to continuously monitor the validation loss, either every time the model is updated or at least once every epoch. Together with the fact that we train with **mini-batches** of our dataset (as explained in the video), this means we need to code a training loop that implements our SGD routine.
In `scikit-learn` we could call the `fit` function to fully train a network. However, in order to properly implement early stopping, keep track of our validation loss, and see which mini-batches are being picked, we could code our own training loop. To accommodate this, the `MLPRegressor` model offers the `partial_fit` function, which performs a single update of the network for a given pair of `X_batch` and `t_batch`. One possible (incomplete) implementation could then look like:
```python
def train_model(model, X_train, t_train, X_val, t_val, n_epochs):
    t_losses, v_losses = [], []

    for epoch in range(n_epochs):
        # slice the dataset into mini-batches
        batches = slice_dataset(X_train, t_train)

        for X_batch, t_batch in batches:
            # update network weights
            model.partial_fit(X_batch, t_batch)

        # compute the relevant losses
        train_y = model.predict(X_train)
        val_y = model.predict(X_val)
        train_loss = loss_function(t_train, train_y)
        val_loss = loss_function(t_val, val_y)

        # append losses to the relevant lists
        t_losses.append(train_loss)
        v_losses.append(val_loss)

    return t_losses, v_losses
```
where care should of course be taken to properly normalize the datasets. The losses can then be plotted, the model with the best validation loss can be picked, etc.
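Given the recorded validation losses (e.g. the `v_losses` list returned by the sketch above), selecting the best model or applying a simple patience rule could look like:
```python
import numpy as np

# Model selection after the fact: pick the epoch with the lowest validation loss
best_epoch = int(np.argmin(v_losses))

# A simple patience rule (in practice applied inside the training loop):
# stop once `patience` epochs pass without a new best validation loss
patience = 10
best, wait = np.inf, 0
for epoch, loss in enumerate(v_losses):
    if loss < best:
        best, wait = loss, 0
    else:
        wait += 1
    if wait >= patience:
        print(f"Early stopping triggered at epoch {epoch}")
        break
```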
````
## Manual model selection
In [](./linear_models_interactive.ipynb) we used L<sub>2</sub>-regularization to control the model complexity. The application of this technique to neural networks is straightforward and will therefore not be demonstrated here. Instead, we will focus on the impact of the number of trainable parameters and the number of samples on overfitting. The ability to display our models at different stages of the training phase will help us to find and inspect particularly good or bad models.
%% Cell type:code id:5254be84 tags:full-width,thebe-remove-input-init
``` python
plot3 = neuralnetplotter(
    f_data, f_truth, NN_create, NN_train, NN_pred, x_pred, nnlinewidth=0.25
)
plot3.fig.canvas.toolbar_visible = False
plot3.add_sliders("freq", "neurons", "N", "cur_model", "val_pct")
plot3.add_buttons("truth", "seed", "reset", "rerun")
plot3.add_radiobuttons("activation")
plot3.show()
```
%% Cell type:markdown id:1eebac5a tags:
Train a number of different neural networks with varying hyperparameter settings, and try to understand the influence of all the parameters on the resulting model. Try to answer the following questions:
* Is the model with the lowest validation error always the one that gives the best fit visually?
````{admonition} Answer
:class: dropdown
There is more to this question than meets the eye. Remember in practice we do not know what the ground truth is. Also remember that the validation set is sometimes just a crude approximation of the true loss. For specific settings of the sliders above, it will sometimes look like models with higher validation error are actually visually better when looking at the ground truth.
````
* Try out a model with linear (*identity*) activation function. Can you make sense of what you observe?
````{admonition} Answer
:class: dropdown
Recall the expression for the predictions of a neural network. Setting the activation function to linear will make the model collapse back into linear regression! Remember that even though we have a lot of weights here, they just get multiplied with each other and lead to equivalent weights for a very simple linear model.
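A tiny numerical check of this collapse (arbitrary weights, biases omitted for brevity):
```python
rng = np.random.default_rng(0)
W1, W2 = rng.normal(size=(5, 1)), rng.normal(size=(1, 5))
W_equiv = W2 @ W1                  # the equivalent single linear-model weight
x = np.array([[0.7]])
print(W2 @ (W1 @ x), W_equiv @ x)  # identical outputs
```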
````
* We plot our activation functions next to their selector buttons. How does the shape of your trained model correlate with the shape of your activation function?
````{admonition} Answer
:class: dropdown
How each activation function handles nonlinearities clearly affects our final model. Note how the *ReLU* function, which is itself piecewise linear, leads to a piecewise linear model in the end.
````
* In practical situations it is often difficult to visualize model predictions for the whole domain. Can you detect when a model is underfit based only on the training and validation loss?
````{admonition} Answer
:class: dropdown
This can be quite tricky in practice. A somewhat reliable sign would be to look at how the training and validation losses change with training epochs. If there is basically no change after a small number of epochs, it might pay off to try a more flexible model and see what happens. Of course, when doing this you should always keep an eye out for overfitting.
````
* For a well-trained flexible model with a large training size (N) the errors usually converge to a specific value. What is this value and why does this happen? Can we ever get rid of it?
````{admonition} Answer
:class: dropdown
Remember the discussion on bias/variance decomposition from before. We can decompose our loss into three parts: squared bias, variance and irreducible noise. The irreducible part of the loss will therefore always be there even if we can achieve the idealized model $h(x)=\mathbb{E}[t\vert x]$ (by having enough flexibility and a lot of data to train the model with). Unless we find a way to explain this irreducible observation noise (e.g. by observing another variable of interest), we can never get rid of it.
````
* How well does the model predict **outside of the training range**?
````{admonition} Answer
:class: dropdown
Once again, machine learning models with purely data-driven bias/variance should be used in extrapolation with extreme care. Here we see how complex models give senseless predictions outside of their training range. More advanced deep learning models can somehow break this curse and perform well in extrapolation, under specific choices of architecture, datasets and training procedures. Look online for "zero-shot learning" if you are curious about this.
````
%% Cell type:markdown id:b4829586 tags:
## Wrap-up
In these videos you have seen a non-parametric model, namely k-nearest neighbors, and have learned about the bias-variance tradeoff. Linear regression was shown as a parametric model that is linear in its parameters and has a closed-form solution. Ridge regression has been introduced to prevent overfitting. Stochastic gradient descent has been shown as a way to train a model in an iterative fashion. In this final chapter, we explored a model with a nonlinear dependence on its parameters. You now understand the underlying principles of a broad set of machine learning techniques, and know how to distinguish naive curve fitting from extracting (or learning) the relevant trends and patterns in data.
......
......@@ -374,21 +374,16 @@ function setupSpecialTaggedElements() {
for (const taggedElement of window.specialTaggedElements) {
switch (taggedElement.tag) {
case "thebe-remove-input-init": {
const { newCellInfo, newNotebookCell } = setupNewCell(
undefined,
undefined,
taggedElement.code
);
// The 4 following lines are an ugly hack to make sure we preserve init order
// Maybe improving runInitCells could circumvent this
newNotebookCell.area.node.setAttribute("data-thebe-id", newNotebookCell.id);
taggedElement.element.style.display = "block";
newNotebookCell.attachToDOM(taggedElement.placeholder);
taggedElement.placeholder.classList.add("tag_thebe-init");
taggedElement.placeholder.style.display = "block";
const cell_input_div = taggedElement.element.querySelector(".cell_input");
cell_input_div.classList.remove("cell_input");
cell_input_div.querySelector(".highlight")?.classList.remove("highlight");
cell_input_div.querySelector(".thebe-input")?.remove();
const cell_controls = cell_input_div.querySelector(".thebe-controls");
if (cell_controls) { cell_controls.style.display = "none"; }
break;
}
default: {
......@@ -446,10 +441,11 @@ var initThebe = async () => {
// 3. Eval the string of the override_pyodide_lookup function in JS, this brings it into scope
// 4. Execute the override_pyodide_lookup function in JS, and bake in the relative path from root in the book (the home)
// NOTE: All functions used in override_pyodide_lookup should be nested inside it, since the web worker cannot access functions in this script
thebelab.session.kernel.requestExecute({
code: `import js; import pyodide_js; js.fs = pyodide_js.FS; js.eval("""${override_pyodide_lookup.toString()}"""); js.eval(f"override_pyodide_lookup(fs, '${
location.pathname.split("/").slice(0, -1).join("/") + "/"
}')")`,
code: `import js; import pyodide_js; js.fs = pyodide_js.FS; js.eval("""${override_pyodide_lookup.toString()}"""); js.eval(f"override_pyodide_lookup(fs, '${
location.pathname.split("/").slice(0, -1).join("/") + "/"
}')")`,
});
const request = new XMLHttpRequest();
......@@ -492,21 +488,14 @@ var detectLanguage = (language) => {
};
function handleThebeRemoveInputTag(element) {
const placeholder = document.createElement("div");
placeholder.style.display = "none";
element.style.display = "none";
window.specialTaggedElements.push({
tag: "thebe-remove-input-init",
placeholder: placeholder,
code: element.querySelector("pre").textContent?.trim() ?? "",
element: element
});
element.before(placeholder);
const placeholderOutput = element.querySelector(".cell_output");
if (placeholderOutput !== null) {
element.after(placeholderOutput);
}
element.remove();
element.classList.add("tag_thebe-init");
}
function handleDisableExecutionTag(element) {
......