From d3faabe62cd3dd585efc1fe89cc80edb3dc483e2 Mon Sep 17 00:00:00 2001
From: Robert Lanzafame <R.C.Lanzafame@tudelft.nl>
Date: Fri, 20 Sep 2024 05:57:03 +0200
Subject: [PATCH] update solution

---
 content/GA_1_3/Analysis_Solution.ipynb | 38 +++++++------
 content/GA_1_3/Report.md               | 14 ++++-
 content/GA_1_3/Report_solution.md      | 76 ++++++++++++++++++--------
 content/GA_1_3/functions.py            |  6 +-
 4 files changed, 87 insertions(+), 47 deletions(-)

diff --git a/content/GA_1_3/Analysis_Solution.ipynb b/content/GA_1_3/Analysis_Solution.ipynb
index 94fdfefe..23178b07 100644
--- a/content/GA_1_3/Analysis_Solution.ipynb
+++ b/content/GA_1_3/Analysis_Solution.ipynb
@@ -191,7 +191,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 4,
    "id": "1f28eba3",
    "metadata": {
     "tags": []
@@ -431,7 +431,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 8,
    "id": "edf14892",
    "metadata": {
     "tags": []
@@ -469,7 +469,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 9,
    "id": "6cdfb46b-1324-4c2b-8148-5a6a102ede2e",
    "metadata": {
     "tags": []
@@ -682,9 +682,11 @@
     "## Task 2: Set-up linear functional model\n",
     "\n",
     "We want to investigate how we could model the observed displacements of the road. Because the road is built in the Green Heart we expect that the observed displacements are related to the groundwater level. Furthermore, we assume that the displacements can be modeled using a constant velocity. The model is defined as \n",
+    "\n",
     "$$\n",
     "d = d_0 + vt + k \\ \\textrm{GW},\n",
     "$$\n",
+    "\n",
     "where $d$ is the displacement, $t$ is time and $\\textrm{GW}$ is the groundwater level (that we assume to be deterministic). \n",
     "\n",
     "Therefore, the model has 3 unknowns:\n",
@@ -1191,7 +1193,7 @@
     "    e_hat = y - y_hat\n",
     "\n",
     "    Sigma_Y_hat = A @ Sigma_X_hat @ A.T\n",
-    "    std_y = np.sqrt(Sigma_Y_hat.diagonal())\n",
+    "    std_Y_hat = np.sqrt(Sigma_Y_hat.diagonal())\n",
     "\n",
     "    Sigma_e_hat = Sigma_Y - Sigma_Y_hat\n",
     "    std_e_hat = np.sqrt(Sigma_e_hat.diagonal())\n",
@@ -1201,7 +1203,7 @@
     "    d['y_hat'] = y_hat\n",
     "    d['e_hat'] = e_hat\n",
     "    d['Sigma_Y_hat'] = Sigma_Y_hat\n",
-    "    d['std_y'] = std_y\n",
+    "    d['std_Y_hat'] = std_Y_hat\n",
     "    d['Sigma_e_hat'] = Sigma_e_hat\n",
     "    d['std_e_hat'] = std_e_hat\n",
     "\n",
@@ -1454,7 +1456,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 23,
    "id": "2711da12",
    "metadata": {},
    "outputs": [],
@@ -1468,7 +1470,7 @@
     "    \"\"\"\n",
     "\n",
     "    std_e_hat = d['std_e_hat']\n",
-    "    std_y = d['std_y']\n",
+    "    std_Y_hat = d['std_Y_hat']\n",
     "\n",
     "    # k = YOUR_CODE_HERE\n",
     "    # CI_y = YOUR_CODE_HERE\n",
@@ -1476,11 +1478,11 @@
     "\n",
     "    # SOLUTION:\n",
     "    k = norm.ppf(1 - 0.5*alpha)\n",
-    "    CI_y = k*std_y\n",
+    "    CI_Y_hat = k*std_Y_hat\n",
     "    CI_res = k*std_e_hat\n",
     "\n",
     "    d['alpha'] = alpha\n",
-    "    d['CI_y'] = CI_y\n",
+    "    d['CI_Y_hat'] = CI_Y_hat\n",
     "    d['CI_res'] = CI_res\n",
     "\n",
     "    return d"
@@ -1488,7 +1490,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 25,
    "id": "d9a41ea5",
    "metadata": {},
    "outputs": [],
@@ -1511,7 +1513,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 26,
    "id": "b3bb808e",
    "metadata": {},
    "outputs": [
@@ -1531,11 +1533,11 @@
       "y_hat            -->    <class 'numpy.ndarray'>\n",
       "e_hat            -->    <class 'numpy.ndarray'>\n",
       "Sigma_Y_hat      -->    <class 'numpy.ndarray'>\n",
-      "std_y            -->    <class 'numpy.ndarray'>\n",
+      "std_Y_hat        -->    <class 'numpy.ndarray'>\n",
       "Sigma_e_hat      -->    <class 'numpy.ndarray'>\n",
       "std_e_hat        -->    <class 'numpy.ndarray'>\n",
       "alpha            -->    <class 'float'>\n",
-      "CI_y             -->    <class 'numpy.ndarray'>\n",
+      "CI_Y_hat         -->    <class 'numpy.ndarray'>\n",
       "CI_res           -->    <class 'numpy.ndarray'>\n",
       "\n",
       "Keys and Values (type) for model_gnss:\n",
@@ -1550,11 +1552,11 @@
       "y_hat            -->    <class 'numpy.ndarray'>\n",
       "e_hat            -->    <class 'numpy.ndarray'>\n",
       "Sigma_Y_hat      -->    <class 'numpy.ndarray'>\n",
-      "std_y            -->    <class 'numpy.ndarray'>\n",
+      "std_Y_hat        -->    <class 'numpy.ndarray'>\n",
       "Sigma_e_hat      -->    <class 'numpy.ndarray'>\n",
       "std_e_hat        -->    <class 'numpy.ndarray'>\n",
       "alpha            -->    <class 'float'>\n",
-      "CI_y             -->    <class 'numpy.ndarray'>\n",
+      "CI_Y_hat         -->    <class 'numpy.ndarray'>\n",
       "CI_res           -->    <class 'numpy.ndarray'>\n"
      ]
     }
@@ -1596,7 +1598,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 27,
    "id": "ec7c8bef",
    "metadata": {},
    "outputs": [
@@ -1631,7 +1633,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 28,
    "id": "104d155d",
    "metadata": {},
    "outputs": [
@@ -1719,7 +1721,7 @@
     "\n",
     "$\\textbf{Solution: the True Model}$\n",
     "    \n",
-    "The data used in this exercise was generated using Monte Carlo Simulation. It is added to the plots here to illustrate where and how our models differ (it is your job to interpret \"why\").    \n",
    "The data used in this exercise was generated primarily using Monte Carlo Simulation (but some data were modified \"by hand\"). It is added to the plots here to illustrate where and how our models differ (it is your job to interpret \"why\").    \n",
     "</p>\n",
     "</div>"
    ]
diff --git a/content/GA_1_3/Report.md b/content/GA_1_3/Report.md
index ed099157..64926b88 100644
--- a/content/GA_1_3/Report.md
+++ b/content/GA_1_3/Report.md
@@ -12,19 +12,26 @@ Give a short explanation about the data and the models we created. Include brief
 
 _Your answer should only be a few sentences long, and be sure to use quantitative information! You do not need to reproduce the model, but it would be useful to do things like confirm the number of models, model parameters, observations, etc. You can use bullet lists to summarize and state interesting information._
 
+_We copy the LaTeX equation for the model here, in case you would like to refer to it in your Report._
+
+$$
+d = d_0 + vt + k \ \textrm{GW},
+$$
+
 _Write your answer here._
 
 **Question 2**
 
-Describe what the confidence intervals are and mention specifically the type of information that is included in the algorithm to calculate. Make observations about the CI's for each model, and explain why all of the data points are not contained within the interval, even though the value is close to 100%.
+Two types of confidence intervals are used in this assignment. Describe what the confidence intervals are and mention specifically the type of information that is included in the algorithm to calculate them. Make observations about the CI's for each model. Explain why, even though the $\alpha$ value is small (i.e., the CI is close to 100%), in the time series plots many of the data are _not_ contained within the CI, whereas they _are_ contained within the CI for the residual plots.
+
+_Hint: Are the intervals uniform for the entire time series? Which uncertainties are propagated into each type of CI?_
 
-_Hint: which uncertainties are propagated into the CI? Are the intervals uniform for the entire time series?_
 
 _Write your answer here._
 
 **Question 3**
 
-Do you see any systematic effects in the residuals for each model? Make a few relevant observations about the residuals and the , do you see any systematic effect? Give your interpretation for any discrepancy between observations and the model.
+Do you see any systematic effects in the residuals for each model? Make a few relevant observations about the residuals. Give your interpretation for any discrepancy between observations and the model.
 
 _It may be useful to include qualitative and quantitative observations about the residuals, for example, the histogram/PDF comparison and the moments of the residuals._
 
@@ -34,6 +41,7 @@ _Write your answer here._
 
 Compare the results you found for the InSAR observations and the GNSS observations. Discuss the differences between the results. Be quantitative!
 
+
 _Write your answer here._
 
 **Last Question: How did things go? (Optional)**
diff --git a/content/GA_1_3/Report_solution.md b/content/GA_1_3/Report_solution.md
index 7bd6977d..bfc1d69c 100644
--- a/content/GA_1_3/Report_solution.md
+++ b/content/GA_1_3/Report_solution.md
@@ -12,44 +12,64 @@ Give a short explanation about the data and the models we created. Include brief
 
 _Your answer should only be a few sentences long, and be sure to use quantitative information! You do not need to reproduce the model, but it would be useful to do things like confirm the number of models, model parameters, observations, etc. You can use bullet lists to summarize and state interesting information._
 
+_We copy the LaTeX equation for the model here, in case you would like to refer to it in your Report._
+
+$$
+d = d_0 + vt + k \ \textrm{GW},
+$$
+
 **Solution**
 
-Three data sets were used. We had InSAR data, GNSS and waterlevel data. InSAR and GNSS were used as observations and waterlevel was used as parameter. Two models were made using BLUE, one using InSAR and the other using GNSS as observations. All three datasets are reported in mm. The standard deviation of InSAR and GNSS is assumed to be 2 mm and 15 mm respectively. 
+_Note that you were not expected to write this much in your own solution!_
+
+Three data sets were used. We had InSAR and GNSS data (both satellite observations of the ground surface) and groundwater level data. InSAR and GNSS were used as observations and groundwater level was used as a (deterministic!) parameter. Two models were made using BLUE, one using InSAR and the other using GNSS as observations.
+
+InSAR has 730 observations and GNSS 61, groundwater has 25. The standard deviation of each InSAR and GNSS measurement is assumed to be 2 mm and 15 mm, respectively. All three datasets are converted to mm when used in the Analysis notebook (satellite data is converted from m).
+
+The model has three parameters, each of which are linear with respect to the computed displacement of the ground surface:
+- initial displacement, $d_0$ mm
+- rate of displacement as a function of time, $v$ mm/day
+- displacement due to the current groundwater level, $k$ [$-$] (dimensionless, since $d$ and $\textrm{GW}$ are both in mm)
+
+$$
+d = d_0 + v\ t + k \ \textrm{GW},
+$$
 
-InSAR has 730 observations and GNSS 61, groundwater has 25. To use the groundwater data it is linearly interpolated to get 730 and 61 values for both models. This is necessary since for each observation we need a value for each parameter.
+Note in particular the differences between each term: $d_0$ and $k\ \textrm{GW}$ are constant in time, whereas $v\ t$ is not. The term $k\ \textrm{GW}$ is unique as it changes directly with the groundwater level; the relationship with time is thus non-linear, but deterministic.
+
+To construct the model we need a parameter value for each observation (GNSS/InSAR); this is problematic as we only have 25 groundwater measurements. Luckily the groundwater level changes relatively slowly so we can interpolate (linearly) between the observations. This allows us to have 730 and 61 values that match the other datasets, which are used to construct the two models. 
 
 **Question 2**
 
-Describe what the confidence intervals are and mention specifically the type of information that is included in the algorithm to calculate. Make observations about the CI's for each model, and explain why all of the data points are not contained within the interval, even though the value is close to 100%.
+Two types of confidence intervals are used in this assignment. Describe what the confidence intervals are and mention specifically the type of information that is included in the algorithm to calculate them. Make observations about the CI's for each model. Explain why, even though the $\alpha$ value is small (i.e., the CI is close to 100%), in the time series plots many of the data are _not_ contained within the CI, whereas they _are_ contained within the CI for the residual plots.
 
-_Hint: which uncertainties are propagated into the CI? Are the intervals uniform for the entire time series?_
+_Hint: Are the intervals uniform for the entire time series? Which uncertainties are propagated into each type of CI?_
 
 **Solution**
 
-Confidence intervals are a useful way to report uncertainty. They are based on theoretical distribution. Using the first and second moments (i.e. mean and std. dev.), we can create confidence intervals based on a specific critical value. When our sample size is large enough and our assumptions that our random variable is distributed randomly, we will see that the number of data points in our CI is (almost) equal to our critical value (96%).
+Confidence intervals are a useful way to report uncertainty and are based on an assumed theoretical distribution around a quantity of interest; in this case we assume a Normal distribution about each model prediction, $\hat{y}_i$, as well as each residual, $y_i-\hat{y}_i$. Using the first and second moments (i.e. mean and std. dev.), we can create confidence intervals based on a specific significance level, $\alpha$. When our sample size is large enough and our assumption that our random variable is normally distributed is reasonably correct, we will see that the fraction of data points in our CI is (almost) equal to our confidence level, $1-\alpha$.
 
-However in this case our residuals are not distributed normally and our data sets (especially the GNSS) which will mean our CI's are not the truth. However, it can still be a useful tool to analyze our (propagation of ) uncertainty 
+The two confidence intervals are different because of the way the covariance matrices $\Sigma_{\hat{Y}}$ and $\Sigma_{\hat{\epsilon}}$ are calculated, and the information that they represent. Even though both incorporate uncertainty of the observations, $\Sigma_{Y}$, the confidence interval of each model prediction $\hat{y}_i$ is based on the uncertainty of the model _parameters_, which is what the CI in the time series plot illustrates.
 
 **Question 3**
 
-Do you see any systematic effects in the residuals for each model? Make a few relevant observations about the residuals and the , do you see any systematic effect? Give your interpretation for any discrepancy between observations and the model.
+Do you see any systematic effects in the residuals for each model? Make a few relevant observations about the residuals. Give your interpretation for any discrepancy between observations and the model.
 
 _It may be useful to include qualitative and quantitative observations about the residuals, for example, the histogram/PDF comparison and the moments of the residuals._
 
 **Solution**
-    
-_Note: here we plotted the true model as well, which you did not have._
-    
-- The mean value and standard deviation of the InSAR residuals is 0.0 mm and 3.115 mm. 
-- The mean value and standard deviation of the GNSS residuals is 0.0 mm and 15.393 mm.
-    
-First of all, for InSAR almost all residuals are within the 99% confidence bounds, indicating that the quality that we assumed for the observations was good. 
+
+The mean value and standard deviation of the InSAR residuals is 0.0 mm and 3.115 mm.
+
+The mean value and standard deviation of the GNSS residuals is 0.0 mm and 15.393 mm.
+
+After examining the residual plots, for InSAR almost all residuals are within the confidence bounds, indicating that the quality that we assumed for the observations was good.
     
 The fitted model seems to follow the observations relatively well, but does not capture the signal completely. This can especially be seen in the residual plot with the confidence bounds. You see that the residuals are not completely centered around zero but that we still see some 'signal' where the model underpredicts at the ends and overpredicts in the middle. Although the values are negative, we can see that the residual plot removes the trend described by the model and illustrates the "over" and "under" aspect quite clearly. 
     
-Moreover, when reviewing the results for GNSS we see only a few outliers (residuals outside the 99% confidence bounds), which is logical given the 99% limit. Furthermore, the left side of the plot have many more observations that are below the confidence bound; this can also be seen in the left tail of the GNSS histogram, which is slightly asymmetric.
+Moreover, when reviewing the results for GNSS we see only a few outliers (residuals outside the confidence bounds), which is logical given the high limit (low $\alpha$). Furthermore, the left side of the plot has many more observations that are below the confidence bound; this can also be seen in the left tail of the GNSS histogram, which is slightly asymmetric.
     
-All of these observations indicate that the model is generally good, but misses some important characteristics in the data. Perhaps we should consider adding a bit of complexity (Part 2!).
+All of these observations indicate that the model is generally good, but misses some important characteristics in the data. Perhaps we should consider adding a bit of complexity (we will do this next week!).
 
 **Question 4**
 
@@ -57,23 +77,33 @@ Compare the results you found for the InSAR observations and the GNSS observatio
 
 **Solution**
 
-Estimated parameters, hence fitted model, is different. 
+The estimated parameters, hence the fitted model, are different.
+
+| Model | $d_0$ [mm] | $v$ [mm/day] | $k$ [$-$] |
+| :---: | :---: | :---: | :---: |
+| InSAR | 9.174 | -0.0243 | 0.202 |
+| GNSS | 1.181 | -0.0209 | 0.16 |
     
-Factors that have an impact are
+Factors that have an impact are:
     
 - precision of the observations
-    
 - number of observations
-    
 - outliers in the GNSS data
+
+The precision (standard deviation) of each parameter is:
+
+| Model | $\sigma_{d_0}$ [mm] | $\sigma_{v}$ [mm/day] | $\sigma_{k}$ [$-$] |
+| :---: | :---: | :---: | :---: |
+| InSAR | 2.128 | 0.0012 | 0.016 |
+| GNSS | 4.467 | 0.0026 | 0.035 |
     
-    
-Although the quality of the GNSS data is lower compared to InSAR (15 mm vs 2 mm), the precision of the estimated parameters is only a factor 2 worse. Here we see the effect of 'more' data points: the much lower precision of the observations is somewhat compensated by the much higher number of observations.
+Although the quality of the GNSS data is lower compared to InSAR (15 mm vs 2 mm), the precision of the estimated parameters is only a factor 2 worse. Here we see the effect of "more" data points: the much lower precision of the observations is somewhat compensated by the much higher number of observations.
 
 The GNSS data seems to have some outliers in the beginning and therefore the model fit is maybe not so good compared to InSAR. 
 
 Also, when reviewing the residuals for both datasets, it seems that the model that we use is maybe too simple since we miss part of the signal. 
-    
+
+_Note: in the solution we plotted the true model as well, which you did not have. This makes it easier to see where the data have some outliers; in fact these were the "manually" adjusted values, which deviate from the rest of the data that were generated randomly using a Monte Carlo Simulation with Normally distributed noise._
 
 **Last Question: How did things go? (Optional)**
 
diff --git a/content/GA_1_3/functions.py b/content/GA_1_3/functions.py
index 895a3b0e..0d030438 100644
--- a/content/GA_1_3/functions.py
+++ b/content/GA_1_3/functions.py
@@ -19,7 +19,7 @@ def plot_model(d, alt_model=None):
     times = d['times']
     y = d['y']
     y_hat = d['y_hat']
-    CI_y = d['CI_y']
+    CI_Y_hat = d['CI_Y_hat']
     data_type = d['data_type']
 
     fig, ax = plt.subplots(figsize = (15,5))
@@ -27,8 +27,8 @@ def plot_model(d, alt_model=None):
     ax.plot(times, y, 'k+',  label = 'Observations')
     ax.plot(times, y_hat,  label = 'Fitted model')
     ax.fill_between(times,
-                    (y_hat - CI_y), 
-                    (y_hat + CI_y),
+                    (y_hat - CI_Y_hat), 
+                    (y_hat + CI_Y_hat),
                     facecolor='orange',
                     alpha=0.4,
                     label = f'{(1-d['alpha'])*100:.1f}%' + ' Confidence Region')
-- 
GitLab