diff --git a/paper/bib.bib b/paper/bib.bib
index 017eb71711b80c7449625a2867481e395037e73d..4bdef5a081f7383e2fdd7a25bbe56036d94b127a 100644
--- a/paper/bib.bib
+++ b/paper/bib.bib
@@ -2678,4 +2678,49 @@
   shorttitle = {Probabilistic machine learning},
 }
 
+@TechReport{artelt2021evaluating,
+  author      = {Artelt, André and Vaquet, Valerie and Velioglu, Riza and Hinder, Fabian and Brinkrolf, Johannes and Schilling, Malte and Hammer, Barbara},
+  date        = {2021-07},
+  institution = {arXiv},
+  title       = {Evaluating {Robustness} of {Counterfactual} {Explanations}},
+  note        = {arXiv:2103.02354 [cs]},
+  url         = {http://arxiv.org/abs/2103.02354},
+  urldate     = {2023-03-24},
+  abstract    = {Transparency is a fundamental requirement for decision making systems when these should be deployed in the real world. It is usually achieved by providing explanations of the system's behavior. A prominent and intuitive type of explanations are counterfactual explanations. Counterfactual explanations explain a behavior to the user by proposing actions -- as changes to the input -- that would cause a different (specified) behavior of the system. However, such explanation methods can be unstable with respect to small changes to the input -- i.e. even a small change in the input can lead to huge or arbitrary changes in the output and of the explanation. This could be problematic for counterfactual explanations, as two similar individuals might get very different explanations. Even worse, if the recommended actions differ considerably in their complexity, one would consider such unstable (counterfactual) explanations as individually unfair. In this work, we formally and empirically study the robustness of counterfactual explanations in general, as well as under different models and different kinds of perturbations. Furthermore, we propose that plausible counterfactual explanations can be used instead of closest counterfactual explanations to improve the robustness and consequently the individual fairness of counterfactual explanations.},
+  annotation  = {Comment: Rewrite paper to make things more clear; Remove one theorem \& corollary due to buggy proof},
+  file        = {:artelt2021evaluating - Evaluating Robustness of Counterfactual Explanations.pdf:PDF},
+  keywords    = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence},
+}
+
+@Article{guidotti2022counterfactual,
+  author       = {Guidotti, Riccardo},
+  date         = {2022-04},
+  journaltitle = {Data Mining and Knowledge Discovery},
+  title        = {Counterfactual explanations and how to find them: literature review and benchmarking},
+  doi          = {10.1007/s10618-022-00831-6},
+  issn         = {1573-756X},
+  language     = {en},
+  url          = {https://doi.org/10.1007/s10618-022-00831-6},
+  urldate      = {2023-03-24},
+  abstract     = {Interpretable machine learning aims at unveiling the reasons behind predictions returned by uninterpretable classifiers. One of the most valuable types of explanation consists of counterfactuals. A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome. For instance, a bank customer asks for a loan that is rejected. The counterfactual explanation consists of what should have been different for the customer in order to have the loan accepted. Recently, there has been an explosion of proposals for counterfactual explainers. The aim of this work is to survey the most recent explainers returning counterfactual explanations. We categorize explainers based on the approach adopted to return the counterfactuals, and we label them according to characteristics of the method and properties of the counterfactuals returned. In addition, we visually compare the explanations, and we report quantitative benchmarking assessing minimality, actionability, stability, diversity, discriminative power, and running time. The results make evident that the current state of the art does not provide a counterfactual explainer able to guarantee all these properties simultaneously.},
+  file         = {Full Text PDF:https\://link.springer.com/content/pdf/10.1007%2Fs10618-022-00831-6.pdf:application/pdf},
+  keywords     = {Explainable AI, Counterfactual explanations, Contrastive explanations, Interpretable machine learning},
+  shorttitle   = {Counterfactual explanations and how to find them},
+}
+
+@TechReport{mahajan2020preserving,
+  author      = {Mahajan, Divyat and Tan, Chenhao and Sharma, Amit},
+  date        = {2020-06},
+  institution = {arXiv},
+  title       = {Preserving {Causal} {Constraints} in {Counterfactual} {Explanations} for {Machine} {Learning} {Classifiers}},
+  doi         = {10.48550/arXiv.1912.03277},
+  note        = {arXiv:1912.03277 [cs, stat]},
+  url         = {http://arxiv.org/abs/1912.03277},
+  urldate     = {2023-03-24},
+  abstract    = {To construct interpretable explanations that are consistent with the original ML model, counterfactual examples---showing how the model's output changes with small perturbations to the input---have been proposed. This paper extends the work in counterfactual explanations by addressing the challenge of feasibility of such examples. For explanations of ML models in critical domains such as healthcare and finance, counterfactual examples are useful for an end-user only to the extent that perturbation of feature inputs is feasible in the real world. We formulate the problem of feasibility as preserving causal relationships among input features and present a method that uses (partial) structural causal models to generate actionable counterfactuals. When feasibility constraints cannot be easily expressed, we consider an alternative mechanism where people can label generated CF examples on feasibility: whether it is feasible to intervene and realize the candidate CF example from the original input. To learn from this labelled feasibility data, we propose a modified variational auto encoder loss for generating CF examples that optimizes for feasibility as people interact with its output. Our experiments on Bayesian networks and the widely used ''Adult-Income'' dataset show that our proposed methods can generate counterfactual explanations that better satisfy feasibility constraints than existing methods. Code repository can be accessed here: https://github.com/divyat09/cf-feasibility},
+  annotation  = {Comment: 2019 NeurIPS Workshop on Do the right thing: Machine learning and Causal Inference for improved decision making},
+  file        = {:mahajan2020preserving - Preserving Causal Constraints in Counterfactual Explanations for Machine Learning Classifiers.pdf:PDF},
+  keywords    = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Statistics - Machine Learning},
+}
+
 @Comment{jabref-meta: databaseType:biblatex;}
diff --git a/paper/paper.pdf b/paper/paper.pdf
index d1de8dbb6765b6e8b2e9dc04dc08931794021cc8..de36c10c9ca075d9ab9e102ad66ea8e44fd01ea4 100644
Binary files a/paper/paper.pdf and b/paper/paper.pdf differ
diff --git a/paper/paper.tex b/paper/paper.tex
index ddc3a150fd2f0bfc1eeb4f2a7cb25b0c4aaee9e7..4ac24b0af255e45647ea3475d8ed72adc5265c18 100644
--- a/paper/paper.tex
+++ b/paper/paper.tex
@@ -125,9 +125,11 @@ where $\text{yloss}$ denotes the primary loss function already introduced above
 
 Solutions to Equation~\ref{eq:general} are considered valid as soon as the predicted label matches the target label. A stripped-down counterfactual explanation is therefore little different from an adversarial example. In Figure~\ref{fig:adv}, for example, we have the baseline approach proposed in \citet{wachter2017counterfactual} to MNIST data (centre panel). This approach solves Equation~\ref{eq:general} through gradient-descent in the feature space with a penalty for the distance between the factual $x$ and the counterfactual $x^{\prime}$. The underlying classifier $M_{\theta}$ is a simple Multi-Layer Perceptron (MLP) with good test accuracy. For the generated counterfactual $x^{\prime}$ the model predicts the target label with high confidence (centre panel in Figure~\ref{fig:adv}). The explanation is valid by definition, even though it looks a lot like an Adversarial Example \citep{goodfellow2014explaining}. \citet{schut2021generating} make the connection between Adversarial Examples and Counterfactual Explanations explicit and propose using a Jacobian-Based Saliency Map Attack to solve Equation~\ref{eq:general}. They demonstrate that this approach yields realistic and sparse counterfactuals for Bayesian, adversarially robust classifiers. Applying their approach to our simple MNIST classifier does not yield a realistic counterfactual but this one, too, is valid (right panel in Figure~\ref{fig:adv}). 
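+
+For illustration, the listing below provides a minimal sketch of this generic gradient-based counterfactual search (in Python, assuming a differentiable classifier that returns logits; function and variable names are purely illustrative). It is a simplified re-implementation of the idea behind Equation~\ref{eq:general}, not the original code of \citet{wachter2017counterfactual}.
+
+\begin{verbatim}
+# Minimal sketch of gradient-based counterfactual search: gradient descent
+# in the feature space on yloss(M(x'), t) + lambda * dist(x, x').
+import torch
+
+def gradient_counterfactual(model, x, target, lam=0.1, steps=500, lr=0.05):
+    # `model` is assumed to map a feature batch (1, d) to class logits.
+    x_prime = x.clone().detach().requires_grad_(True)
+    opt = torch.optim.Adam([x_prime], lr=lr)
+    t = torch.tensor([target])
+    for _ in range(steps):
+        opt.zero_grad()
+        yloss = torch.nn.functional.cross_entropy(model(x_prime), t)
+        dist = torch.norm(x_prime - x, p=1)  # proximity penalty
+        (yloss + lam * dist).backward()
+        opt.step()
+        # stop as soon as the counterfactual is valid
+        with torch.no_grad():
+            if model(x_prime).argmax(dim=-1).item() == target:
+                break
+    return x_prime.detach()
+\end{verbatim}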
 
-The crucial difference between Adversarial Examples (AE) and Counterfactual Explanations is one of intent. While an AE is intended to go unnoticed, a CE should have certain desirable properties. The literature has made this explicit by introducing various so-called \textit{desiderata}. To properly serve both AI practitioners and individuals affected by AI decision-making systems, counterfactuals should be sparse, proximate~\citep{wachter2017counterfactual}, actionable~\citep{ustun2019actionable}, diverse~\citep{mothilal2020explaining}, plausible~\citep{joshi2019realistic,poyiadzi2020face,schut2021generating}, robust~\citep{upadhyay2021robust,pawelczyk2022probabilistically,altmeyer2023endogenous} and causal~\citep{karimi2021algorithmic} among other things. Researchers have come up with various ways to meet these desiderata, which have been surveyed in~\citep{verma2020counterfactual} and~\citep{karimi2020survey}. 
+The crucial difference between Adversarial Examples (AE) and Counterfactual Explanations is one of intent. While an AE is intended to go unnoticed, a CE should have certain desirable properties. The literature has made this explicit by introducing various so-called \textit{desiderata}. To properly serve both AI practitioners and individuals affected by AI decision-making systems, counterfactuals should be sparse, proximate~\citep{wachter2017counterfactual}, actionable~\citep{ustun2019actionable}, diverse~\citep{mothilal2020explaining}, plausible~\citep{joshi2019realistic,poyiadzi2020face,schut2021generating}, robust~\citep{upadhyay2021robust,pawelczyk2022probabilistically,altmeyer2023endogenous} and causal~\citep{karimi2021algorithmic} among other things. 
 
-Finding ways to generate \textit{plausible} counterfactuals has been one of the primary concerns. To this end, \citet{joshi2019realistic} were among the first to suggest that instead of searching counterfactuals in the feature space $\mathcal{X}$, we can instead traverse a latent embedding $\mathcal{Z}$ that implicitly codifies the data generating process (DGP) of $x\sim\mathcal{X}$. To learn the latent embedding, they introduce a surrogate model. In particular, they propose to use the latent embedding of a Variational Autoencoder (VAE) trained to generate samples $x^* \leftarrow \mathcal{G}(z)$ where $\mathcal{G}$ denotes the decoder part of the VAE. Provided the surrogate model is well-trained, their proposed approach ---REVISE--- can yield compelling counterfactual explanations like the one in the centre panel of Figure~\ref{fig:vae}. 
+Researchers have come up with various ways to meet these desiderata, which have been extensively surveyed and evaluated in recent studies~\citep{verma2020counterfactual,karimi2020survey,pawelczyk2021carla,artelt2021evaluating,guidotti2022counterfactual}. Perhaps unsurprisingly, the different desiderata are often positively correlated. For example, \citet{artelt2021evaluating} find that plausibility typically also leads to improved robustness. Similarly, plausibility has been connected to causality in the sense that plausible counterfactuals respect causal relationships~\citep{mahajan2020preserving}. 
+
+Arguably, the plausibility of counterfactuals has been among the primary concerns, and some works have focused explicitly on this goal. \citet{joshi2019realistic}, for example, were among the first to suggest that instead of searching for counterfactuals in the feature space $\mathcal{X}$, we can traverse a latent embedding $\mathcal{Z}$ that implicitly codifies the data generating process (DGP) of $x\sim\mathcal{X}$. To learn the latent embedding, they introduce a surrogate model. In particular, they propose to use the latent embedding of a Variational Autoencoder (VAE) trained to generate samples $x^* \leftarrow \mathcal{G}(z)$, where $\mathcal{G}$ denotes the decoder part of the VAE. Provided the surrogate model is well-trained, their proposed approach ---REVISE--- can yield compelling counterfactual explanations like the one in the centre panel of Figure~\ref{fig:vae}. 
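+
+In a similar spirit to the sketch above, the following listing illustrates latent-space counterfactual search (again in Python, assuming a pre-trained encoder and decoder; the names are illustrative). It only captures the core idea and is not the implementation of \citet{joshi2019realistic}.
+
+\begin{verbatim}
+# Minimal sketch of latent-space counterfactual search: instead of perturbing
+# x directly, optimize a latent code z and decode candidate counterfactuals.
+import torch
+
+def latent_counterfactual(model, encoder, decoder, x, target,
+                          lam=0.1, steps=500, lr=0.05):
+    # `encoder` maps x to a latent code, `decoder` maps codes back to features.
+    z = encoder(x).detach().requires_grad_(True)  # start from the embedding of x
+    opt = torch.optim.Adam([z], lr=lr)
+    t = torch.tensor([target])
+    for _ in range(steps):
+        opt.zero_grad()
+        x_prime = decoder(z)  # candidate counterfactual G(z)
+        yloss = torch.nn.functional.cross_entropy(model(x_prime), t)
+        dist = torch.norm(x_prime - x, p=1)
+        (yloss + lam * dist).backward()
+        opt.step()
+    return decoder(z).detach()
+\end{verbatim}
+
+Because every candidate is decoded from the latent embedding, the search is confined to regions that the generative surrogate deems likely under the data, which is what is meant to drive the gain in plausibility.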
 
 Others have proposed similar approaches. \citet{dombrowski2021diffeomorphic} traverse the base space of a normalizing flow to solve Equation~\ref{eq:general}, essentially relying on a different surrogate model for the generative task. \citet{poyiadzi2020face} use density estimators ($\hat{p}: \mathcal{X} \mapsto [0,1]$) to constrain the counterfactual paths. \citet{karimi2021algorithmic} argue that counterfactuals should comply with the causal model that generates the data. All of these different approaches share a common goal: ensuring that the generated counterfactuals comply with the true and unobserved DGP. To summarize this broad objective, we propose the following definition:
 
@@ -192,16 +194,42 @@ While $\mathbf{x}_K$ is only guaranteed to distribute as $p_{\theta}(\mathbf{x}|
   \item How exactly do we plan to quantify plausibility and conformity? Elaborate on measures.
 \end{itemize}
 
+\subsection{Evaluation Measures}\label{evaluation}
+
+Above, we have defined plausibility (Definition~\ref{def:plausible}) and conformity (Definition~\ref{def:conformal}) for Counterfactual Explanations. In this subsection, we introduce evaluation measures that quantify how well counterfactuals meet these two objectives. 
+
+Firstly, to assess the plausibility of counterfactuals, we adapt the implausibility metric proposed by \citet{guidotti2022counterfactual}. The authors propose to evaluate plausibility in terms of the distance of the counterfactual $x^{\prime}$ from its nearest neighbour in the target class $t$: the smaller this distance, the more plausible the counterfactual. Instead of focusing only on the nearest neighbour of $x^{\prime}$, we suggest computing the average distance from multiple (possibly all) observed instances in the target class. Formally, for a single counterfactual, we have:
+
+\begin{equation}\label{eq:impl}
+  \begin{aligned}
+    \text{impl} = \frac{1}{\lvert \mathcal{X}|t \rvert} \sum_{x \in \mathcal{X}|t} \text{dist}(x^{\prime},x)
+  \end{aligned}
+\end{equation}
+
+This measure is straightforward to compute and should be less sensitive to outliers in the target class than the one based on the nearest neighbour alone. It also gives rise to a very similar evaluation measure for conformity. We merely swap out the observed instances in the target class for samples generated from the model's conditional distribution:
+
+\begin{equation}\label{eq:conf}
+  \begin{aligned}
+    \text{conf} = \frac{1}{\lvert \mathcal{X}_{\theta}|t \rvert} \sum_{x \in \mathcal{X}_{\theta}|t} \text{dist}(x^{\prime},x)
+  \end{aligned}
+\end{equation}
+
+As noted by \citet{guidotti2022counterfactual}, these distance-based measures are simplistic, and more complex alternatives may ultimately be more appropriate for the task. For example, we considered using statistical divergence measures instead. This would involve generating not one but many counterfactuals and comparing the resulting empirical distribution to the target distributions in Definitions~\ref{def:plausible} and~\ref{def:conformal}. While this approach is potentially more rigorous, generating enough counterfactuals is not always practical. 
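+
+For concreteness, both distance-based measures can be computed with a few lines of code, as sketched below (in Python; the Euclidean norm and the function name are merely illustrative choices):
+
+\begin{verbatim}
+# Minimal sketch of the implausibility and conformity measures: the average
+# distance of a counterfactual x' to a set of reference samples.
+import numpy as np
+
+def avg_distance(x_prime, samples):
+    # `samples` is an (n, d) array; `x_prime` is a (d,) array.
+    return np.mean(np.linalg.norm(samples - x_prime, axis=1))
+
+# impl: reference samples are observed instances in the target class.
+# conf: reference samples are drawn from the model's conditional distribution.
+\end{verbatim}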
+
 \subsection{Conformal Training meets Counterfactual Explanations}
 
 Now that we have a way of evaluating Counterfactual Explanations in terms of their plausibility and conformity, we are interested in finding a way to generate counterfactuals that are as plausible and conformal as possible. We hypothesize that a narrow focus on plausibility may come at the cost of reduced conformity. Using a surrogate model for the generative task, for example, may improve plausibility but inadvertently yield counterfactuals that are more consistent with the surrogate than the Black Box Model itself. 
 
-One way to ensure model conformity is to rely strictly on the model itself.~\citet{schut2021generating} demonstrate that this restriction need not impede plausibility, since we can rely on predictive uncertainty estimates to guide our counterfactual search. By avoiding counterfactual paths that are associated with high predictive uncertainty, we end up generating counterfactuals for which the model $M_{\theta}$ predicts the target label $t$ with high confidence. Provided the model is well-calibrated, these counterfactuals are plausible. 
+One way to ensure model conformity is to rely strictly on the model itself.~\citet{schut2021generating} demonstrate that this restriction need not impede plausibility, since we can rely on predictive uncertainty estimates to guide our counterfactual search. By avoiding counterfactual paths that are associated with high predictive uncertainty, we end up generating counterfactuals for which the model $M_{\theta}$ predicts the target label $t$ with high confidence. Provided the model is well-calibrated, these counterfactuals are plausible. The authors demonstrate this empirically for Bayesian, adversarially robust classifiers. 
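+
+The sketch below illustrates this idea in simplified form (in Python, for an ensemble of classifiers; names and parameters are illustrative). Note that \citet{schut2021generating} use a Jacobian-based saliency map attack rather than plain gradient descent, so this is only meant to convey the underlying principle.
+
+\begin{verbatim}
+# Minimal sketch of uncertainty-aware counterfactual search: averaging the
+# target loss over ensemble members implicitly penalizes feature regions
+# where the ensemble is uncertain about the target class.
+import torch
+
+def ensemble_counterfactual(ensemble, x, target, steps=500, lr=0.05):
+    # `ensemble` is a list of classifiers, each mapping features to logits.
+    x_prime = x.clone().detach().requires_grad_(True)
+    opt = torch.optim.Adam([x_prime], lr=lr)
+    t = torch.tensor([target])
+    for _ in range(steps):
+        opt.zero_grad()
+        losses = [torch.nn.functional.cross_entropy(m(x_prime), t)
+                  for m in ensemble]
+        torch.stack(losses).mean().backward()
+        opt.step()
+    return x_prime.detach()
+\end{verbatim}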
 
 Interestingly, \citet{schut2021generating} point to this connection between the generative task and predictive uncertainty quantification.
 
 \section{Experiments}
 
+
+
+
+
 \medskip
 
 \bibliography{bib}