diff --git a/paper/bib.bib b/paper/bib.bib index 71b57f70f9c999651e1f3738d14a42b8dabdc3ab..d04512783d51988a61ef8fc8fe3e63e093e8c9d7 100644 --- a/paper/bib.bib +++ b/paper/bib.bib @@ -2737,4 +2737,14 @@ keywords = {Statistics - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Machine Learning}, } +@Misc{altmeyer2022conformal, + author = {Altmeyer, Patrick}, + date = {2022-10}, + title = {{Conformal} {Prediction} in {Julia}}, + language = {en}, + url = {https://www.paltmeyer.com/blog/posts/conformal-prediction/}, + urldate = {2023-03-27}, + abstract = {A (very) gentle introduction to Conformal Prediction in Julia using my new package ConformalPrediction.jl.}, +} + @Comment{jabref-meta: databaseType:biblatex;} diff --git a/paper/paper.pdf b/paper/paper.pdf index af18cf30669d9f8767735d51e17c19ee36a0b514..5ddcf57c8c64494c59e0b687c7cfc09331a5a661 100644 Binary files a/paper/paper.pdf and b/paper/paper.pdf differ diff --git a/paper/paper.tex b/paper/paper.tex index 47fa3c49b188383b479f6f0c25a69aa1e5cbbb04..cd6119279b9e7847a12aba42daebe5077d1cc101 100644 --- a/paper/paper.tex +++ b/paper/paper.tex @@ -98,7 +98,7 @@ Counterfactual Explanations are a powerful, flexible and intuitive way to not only explain Black Box Models but also enable affected individuals to challenge them through the means of Algorithmic Recourse. Instead of opening the black box, Counterfactual Explanations work under the premise of strategically perturbing model inputs to understand model behaviour \citep{wachter2017counterfactual}. Intuitively speaking, we generate explanations in this context by asking simple what-if questions of the following nature: `Our credit risk model currently predicts that this individual's credit profile is too risky to offer them a loan. What if they reduced their monthly expenditures by 10\%? Will our model then predict that the individual is credit-worthy'? -This is typically implemented by defining a target outcome $t \in \mathcal{Y}$ for some individual $x \in \mathcal{X}$, for which the model $M_{\theta}:\mathcal{X}\mapsto\mathcal{Y}$ initially predicts a different outcome: $M_{\theta}(x)\ne t$. Counterfactuals are then searched by minimizing a loss function that compares the predicted model output to the target outcome: $\text{yloss}(M_{\theta}(x),t)$. Since Counterfactual Explanations (CE) work directly with the Black Box Model, valid counterfactuals always have full local fidelity by construction \citep{mothilal2020explaining}. Fidelity is defined as the degree to which explanations approximate the predictions of the Black Box Model. This is arguably one of the most important evaluation metrics for model explanations, since any explanation that explains a prediction not actually made by the model is useless \citep{molnar2020interpretable}. +This is typically implemented by defining a target outcome $\mathbf{y}^* \in \mathcal{Y}$ for some individual $\mathbf{x} \in \mathcal{X}=\mathbb{R}^D$ described by $D$ attributes, for which the model $M_{\theta}:\mathcal{X}\mapsto\mathcal{Y}$ initially predicts a different outcome: $M_{\theta}(\mathbf{x})\ne \mathbf{y}^*$. Counterfactuals are then searched by minimizing a loss function that compares the predicted model output to the target outcome: $\text{yloss}(M_{\theta}(\mathbf{x}),\mathbf{y}^*)$. Since Counterfactual Explanations (CE) work directly with the Black Box Model, valid counterfactuals always have full local fidelity by construction \citep{mothilal2020explaining}. 
Fidelity is defined as the degree to which explanations approximate the predictions of the Black Box Model. This is arguably one of the most important evaluation metrics for model explanations, since any explanation that explains a prediction not actually made by the model is useless \citep{molnar2020interpretable}. In situations where full fidelity is a requirement, CE therefore offers a more appropriate solution to Explainable Artificial Intelligence (XAI) than other popular approaches like LIME \citep{ribeiro2016why} and SHAP \citep{lundberg2017unified}, which involve local surrogate models. But even full fidelity is not a sufficient condition for ensuring that an explanation adequately describes the behaviour of a model. That is because two very distinct explanations can both lead to the same model prediction, especially when dealing with heavily parameterized models: @@ -113,34 +113,36 @@ In the context of CE, the idea that no two explanations are the same arises almo \section{From Adversarial Examples to Plausible Explanations}\label{background} -Most state-of-the-art approaches to generating Counterfactual Explanations rely on gradient descent to optimize different flavours of the same counterfactual search objective, +In this section, we provide some background on Counterfactual Explanations and our motivation for this work. To start off, we briefly introduce the methodology underlying most state-of-the-art (SOTA) counterfactual generators. + +While Counterfactual Explanations can be generated for arbitrary regression models \citep{spooner2021counterfactual}, existing work has primarily focused on classification problems. Let $\mathcal{Y}=(0,1)^K$ denote the one-hot-encoded output domain with $K$ classes. Then most SOTA counterfactual generators rely on gradient descent to optimize different flavours of the following counterfactual search objective: \begin{equation} \label{eq:general} \begin{aligned} -\mathbf{s}^\prime &= \arg \min_{\mathbf{s}^\prime \in \mathcal{S}} \left\{ {\text{yloss}(M_{\theta}(f(\mathbf{s}^\prime)),y^*)}+ \lambda {\text{cost}(f(\mathbf{s}^\prime)) } \right\} +\mathbf{Z}^\prime &= \arg \min_{\mathbf{Z}^\prime \in \mathcal{Z}^M} \left\{ {\text{yloss}(M_{\theta}(f(\mathbf{Z}^\prime)),\mathbf{y}^*)}+ \lambda {\text{cost}(f(\mathbf{Z}^\prime)) } \right\} \end{aligned} \end{equation} -where $\text{yloss}$ denotes the primary loss function already introduced above and $\text{cost}$ is either a single penalty or a collection of penalties that are used to impose constraints through regularization. Following the convention in \citet{altmeyer2023endogenous} we use $\mathbf{s}^\prime=\{ s_k\}_K$ to denote the vector $K$-dimensional array of counterfactual states. This is to explicitly account for the fact that we can generate multiple counterfactuals, as with DiCE \citep{mothilal2020explaining}, and may choose to traverse a latent representation $\mathcal{Z}$ of the feature space $\mathcal{X}$, as we will discuss further below. +Here $\text{yloss}$ denotes the primary loss function already introduced above and $\text{cost}$ is either a single penalty or a collection of penalties that are used to impose constraints through regularization. Following the convention in \citet{altmeyer2023endogenous}, we use $\mathbf{Z}^\prime=\{ \mathbf{z}_m\}_M$ to denote the $M$-dimensional array of counterfactual states. 
This is to explicitly account for the fact that we can generate multiple ($M$) counterfactuals, as with DiCE \citep{mothilal2020explaining}, and may choose to traverse a latent representation $\mathcal{Z}$ of the feature space $\mathcal{X}$, as we will discuss further below. -Solutions to Equation~\ref{eq:general} are considered valid as soon as the predicted label matches the target label. A stripped-down counterfactual explanation is therefore little different from an adversarial example. In Figure~\ref{fig:adv}, for example, we have the baseline approach proposed in \citet{wachter2017counterfactual} to MNIST data (centre panel). This approach solves Equation~\ref{eq:general} through gradient-descent in the feature space with a penalty for the distance between the factual $x$ and the counterfactual $x^{\prime}$. The underlying classifier $M_{\theta}$ is a simple Multi-Layer Perceptron (MLP) with good test accuracy. For the generated counterfactual $x^{\prime}$ the model predicts the target label with high confidence (centre panel in Figure~\ref{fig:adv}). The explanation is valid by definition, even though it looks a lot like an Adversarial Example \citep{goodfellow2014explaining}. \citet{schut2021generating} make the connection between Adversarial Examples and Counterfactual Explanations explicit and propose using a Jacobian-Based Saliency Map Attack (JSMA) to solve Equation~\ref{eq:general}. They demonstrate that this approach yields realistic and sparse counterfactuals for Bayesian, adversarially robust classifiers. Applying their approach to our simple MNIST classifier does not yield a realistic counterfactual but this one, too, is valid (right panel in Figure~\ref{fig:adv}). +Solutions to Equation~\ref{eq:general} are considered valid as soon as the predicted label matches the target label. A stripped-down counterfactual explanation is therefore little different from an adversarial example. In Figure~\ref{fig:adv}, for example, we have applied the baseline approach proposed in \citet{wachter2017counterfactual} to MNIST data (centre panel). This approach solves Equation~\ref{eq:general} through gradient-descent in the feature space with a penalty for the distance between the factual $\mathbf{x}$ and the counterfactual $\mathbf{x}^{\prime}$. The underlying classifier $M_{\theta}$ is a simple Multi-Layer Perceptron (MLP) with good test accuracy. For the generated counterfactual $\mathbf{x}^{\prime}$ the model predicts the target label with high confidence (centre panel in Figure~\ref{fig:adv}). The explanation is valid by definition, even though it looks a lot like an Adversarial Example \citep{goodfellow2014explaining}. \citet{schut2021generating} make the connection between Adversarial Examples and Counterfactual Explanations explicit and propose using a Jacobian-Based Saliency Map Attack (JSMA) to solve Equation~\ref{eq:general}. They demonstrate that this approach yields realistic and sparse counterfactuals for Bayesian, adversarially robust classifiers. Applying their approach to our simple MNIST classifier does not yield a realistic counterfactual but this one, too, is valid (right panel in Figure~\ref{fig:adv}). The crucial difference between Adversarial Examples (AE) and Counterfactual Explanations is one of intent. While an AE is intended to go unnoticed, a CE should have certain desirable properties. The literature has made this explicit by introducing various so-called \textit{desiderata}. 
To properly serve both AI practitioners and individuals affected by AI decision-making systems, counterfactuals should be sparse, proximate~\citep{wachter2017counterfactual}, actionable~\citep{ustun2019actionable}, diverse~\citep{mothilal2020explaining}, plausible~\citep{joshi2019realistic,poyiadzi2020face,schut2021generating}, robust~\citep{upadhyay2021robust,pawelczyk2022probabilistically,altmeyer2023endogenous} and causal~\citep{karimi2021algorithmic} among other things. Researchers have come up with various ways to meet these desiderata, which have been extensively surveyed and evaluated in various studies~\citep{verma2020counterfactual,karimi2020survey,pawelczyk2021carla,artelt2021evaluating,guidotti2022counterfactual}. Perhaps unsurprisingly, the different desiderata are often positively correlated. For example, \citet{artelt2021evaluating} find that plausibility typically also leads to improved robustness. Similarly, plausibility has also been connected to causality in the sense that plausible counterfactuals respect causal relationships \citep{mahajan2020preserving}. -Arguably, the plausibility of counterfactuals has been among the primary concerns and some have focused explicitly on this goal. \citet{joshi2019realistic}, for example, were among the first to suggest that instead of searching counterfactuals in the feature space $\mathcal{X}$, we can instead traverse a latent embedding $\mathcal{Z}$ that implicitly codifies the data generating process (DGP) of $x\sim\mathcal{X}$. To learn the latent embedding, they introduce a surrogate model. In particular, they propose to use the latent embedding of a Variational Autoencoder (VAE) trained to generate samples $x^* \leftarrow \mathcal{G}(z)$ where $\mathcal{G}$ denotes the decoder part of the VAE. Provided the surrogate model is well-trained, their proposed approach ---REVISE--- can yield compelling counterfactual explanations like the one in the centre panel of Figure~\ref{fig:vae}. +Arguably, the plausibility of counterfactuals has been among the primary concerns and some have focused explicitly on this goal. \citet{joshi2019realistic}, for example, were among the first to suggest that instead of searching counterfactuals in the feature space $\mathcal{X}$, we can instead traverse a latent embedding $\mathcal{Z}$ that implicitly codifies the data generating process (DGP) of $\mathbf{x}\sim\mathcal{X}$. To learn the latent embedding, they introduce a surrogate model. In particular, they propose to use the latent embedding of a Variational Autoencoder (VAE) trained to generate samples $\mathbf{x}^* \leftarrow \mathcal{G}(\mathbf{z})$ where $\mathcal{G}$ denotes the decoder part of the VAE. Provided the surrogate model is well-trained, their proposed approach ---REVISE--- can yield compelling counterfactual explanations like the one in the centre panel of Figure~\ref{fig:vae}. Others have proposed similar approaches. \citet{dombrowski2021diffeomorphic} traverse the base space of a normalizing flow to solve Equation~\ref{eq:general}, essentially relying on a different surrogate model for the generative task. \citet{poyiadzi2020face} use density estimators ($\hat{p}: \mathcal{X} \mapsto [0,1]$) to constrain the counterfactual paths. \citet{karimi2021algorithmic} argue that counterfactuals should comply with the causal model that generates the data. All of these different approaches share a common goal: ensuring that the generated counterfactuals comply with the true and unobserved DGP. 
To summarize this broad objective, we propose the following definition: \begin{definition}[Plausible Counterfactuals] \label{def:plausible} - Let $\mathcal{X}|y=t$ denote the true conditional distribution of samples in the target class $t$. Then for $x^{\prime}$ to be considered a plausible counterfactual, we need: $x^{\prime} \sim \mathcal{X}|y=t$. + Let $\mathcal{X}|\mathbf{y}^*$ denote the true conditional distribution of samples in the target class $\mathbf{y}^*$. Then for $\mathbf{x}^{\prime}$ to be considered a plausible counterfactual, we need: $\mathbf{x}^{\prime} \sim \mathcal{X}|\mathbf{y}^*$. \end{definition} Note that Definition~\ref{def:plausible} is consistent with the notion of plausible counterfactual paths, since we can simply apply it to each counterfactual state along the path. -Surrogate models offer an obvious solution to achieve this objective. Unfortunately, surrogates also introduce a dependency: the generated explanations no longer depend exclusively on the Black Box Model itself, but also on the surrogate model. This is not necessarily problematic if the primary objective is not to explain the behaviour of the model but to offer recourse to individuals affected by it. It may become problematic even in this context if the dependency turns into a vulnerability. To illustrate this point, we have used REVISE \citep{joshi2019realistic} with an underfitted VAE to generate the counterfactual in the right panel of Figure~\ref{fig:vae}: in this case, the decoder step of the VAE fails to yield plausible values ($\{x^{\prime} \leftarrow \mathcal{G}(z)\} \not\sim \mathcal{X}|y=t$) and hence the counterfactual search in the learned latent space is doomed. +Surrogate models offer an obvious solution to achieve this objective. Unfortunately, surrogates also introduce a dependency: the generated explanations no longer depend exclusively on the Black Box Model itself, but also on the surrogate model. This is not necessarily problematic if the primary objective is not to explain the behaviour of the model but to offer recourse to individuals affected by it. It may become problematic even in this context if the dependency turns into a vulnerability. To illustrate this point, we have used REVISE \citep{joshi2019realistic} with an underfitted VAE to generate the counterfactual in the right panel of Figure~\ref{fig:vae}: in this case, the decoder step of the VAE fails to yield plausible values ($\{\mathbf{x}^{\prime} \leftarrow \mathcal{G}(\mathbf{z})\} \not\sim \mathcal{X}|\mathbf{y}^*$) and hence the counterfactual search in the learned latent space is doomed. \begin{figure} \centering @@ -166,26 +168,26 @@ The word \textit{fidelity} stems from the Latin word `fidelis', which means `fai \begin{definition}[Conformal Counterfactuals] \label{def:conformal} - Let $\mathcal{X}_{\theta}|t = p_{\theta}(x|y=t)$ denote the conditional distribution of $x$ in the target class $t$, where $theta$ denotes the parameters of model $M_{\theta}$. Then for $x^{\prime}$ to be considered a conformal counterfactual, we need: $x^{\prime} \sim \mathcal{X}_{\theta}|t$. + Let $\mathcal{X}_{\theta}|\mathbf{y}^* = p_{\theta}(\mathbf{x}|\mathbf{y}^*)$ denote the conditional distribution of $\mathbf{x}$ in the target class $\mathbf{y}^*$, where $\theta$ denotes the parameters of model $M_{\theta}$. Then for $\mathbf{x}^{\prime}$ to be considered a conformal counterfactual, we need: $\mathbf{x}^{\prime} \sim \mathcal{X}_{\theta}|\mathbf{y}^*$. 
\end{definition} -In words, conformal counterfactuals conform with what the predictive model has learned about the input data $x$. Since this definition works with distributional properties, it explicitly accounts for the multiplicity of explanations we discussed earlier. Except for the posterior conditional distribution $p_{\theta}(x|y=t)$, we already have access to all the ingredients in Definition~\ref{def:conformal}. +In words, conformal counterfactuals conform with what the predictive model has learned about the input data $\mathbf{x}$. Since this definition works with distributional properties, it explicitly accounts for the multiplicity of explanations we discussed earlier. Except for the posterior conditional distribution $p_{\theta}(\mathbf{x}|\mathbf{y}^*)$, we already have access to all the ingredients in Definition~\ref{def:conformal}. -How can we quantify $p_{\theta}(\mathbf{x}|y=t)$? After all, the predictive model $M_{\theta}$ was trained to discriminate outputs conditional on inputs, which is a different conditional distribution: $p_{\theta}(y|x)$. Learning the distribution over inputs $p_{\theta}(\mathbf{x}|y=t)$ is a generative task that $M_{\theta}$ was not explicitly trained for. In the context of Counterfactual Explanations, it is the task that existing approaches have reallocated from the model itself to a surrogate. +How can we quantify $p_{\theta}(\mathbf{x}|\mathbf{y}^*)$? After all, the predictive model $M_{\theta}$ was trained to discriminate outputs conditional on inputs, which is a different conditional distribution: $p_{\theta}(\mathbf{y}|\mathbf{x})$. Learning the distribution over inputs $p_{\theta}(\mathbf{x}|\mathbf{y}^*)$ is a generative task that $M_{\theta}$ was not explicitly trained for. In the context of Counterfactual Explanations, it is the task that existing approaches have reallocated from the model itself to a surrogate. -Fortunately, recent work by \citet{grathwohl2020your} on Energy Based Models (EBM) has pointed out that there is a `generative model hidden within every standard discriminative model'. The authors show that we can draw samples from the posterior conditional distribution $p_{\theta}(\mathbf{x}|y)$ using Stochastic Gradient Langevin Dynamics (SGLD). In doing so, it is possible to train classifiers jointly for the discriminative task using standard cross-entropy and the generative task using SGLD. They demonstrate empirically that among other things this improves predictive uncertainty quantification for discriminative models. +Fortunately, recent work by \citet{grathwohl2020your} on Energy Based Models (EBM) has pointed out that there is a `generative model hidden within every standard discriminative model'. The authors show that we can draw samples from the posterior conditional distribution $p_{\theta}(\mathbf{x}|\mathbf{y})$ using Stochastic Gradient Langevin Dynamics (SGLD). In doing so, it is possible to train classifiers jointly for the discriminative task using standard cross-entropy and the generative task using SGLD. They demonstrate empirically that among other things this improves predictive uncertainty quantification for discriminative models. 
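To make this connection explicit (a brief restatement of the idea in \citet{grathwohl2020your}, added here for clarity), note that the logits of a standard classifier can be reinterpreted as defining an energy-based model of the joint distribution,

\begin{equation*}
p_{\theta}(\mathbf{x},\mathbf{y}) = \frac{\exp(M_{\theta}(\mathbf{x})[\mathbf{y}])}{Z(\theta)}, \qquad p_{\theta}(\mathbf{x}|\mathbf{y}) \propto \exp(M_{\theta}(\mathbf{x})[\mathbf{y}]),
\end{equation*}

where $Z(\theta)$ is an intractable normalizing constant and $M_{\theta}(\mathbf{x})[\mathbf{y}]$ denotes the logit corresponding to class $\mathbf{y}$. Normalizing the joint over classes recovers the usual softmax classifier $p_{\theta}(\mathbf{y}|\mathbf{x})$, while the negative logit plays the role of an energy function for the class-conditional distribution over inputs.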
-To see how their proposed conditional sampling strategy can be applied in our context, note that if we fix $y$ to our target value $t$, we can sample from $p_{\theta}(\mathbf{x}|y=t)$ using SGLD as follows, +To see how their proposed conditional sampling strategy can be applied in our context, note that if we fix $\mathbf{y}$ to our target value $\mathbf{y}^*$, we can sample from $p_{\theta}(\mathbf{x}|\mathbf{y}^*)$ using SGLD as follows, \begin{equation}\label{eq:sgld} \begin{aligned} - \mathbf{x}_{j+1} &\leftarrow \mathbf{x}_j - \frac{\epsilon^2}{2} \mathcal{E}(\mathbf{x}_j|y=t) + \epsilon \mathbf{r}_j, && j=1,...,J + \mathbf{x}_{j+1} &\leftarrow \mathbf{x}_j - \frac{\epsilon^2}{2} \nabla_{\mathbf{x}_j}\mathcal{E}(\mathbf{x}_j|\mathbf{y}^*) + \epsilon \mathbf{r}_j, && j=1,...,J \end{aligned} \end{equation} -where $\mathbf{r}_j \sim \mathcal{N}(\mathbf{0},\mathbf{I})$ is the stochastic term and the step-size $\epsilon$ is typically polynomially decayed. The term $\mathcal{E}(\mathbf{x}_j|y=t)$ denotes the energy function. Following \citet{grathwohl2020your} we use $\mathcal{E}(\mathbf{x}_j|y=t)=-M_{\theta}(x)[t]$, that is the negative logit corresponding to the target class label $t$. +where $\mathbf{r}_j \sim \mathcal{N}(\mathbf{0},\mathbf{I})$ is the stochastic term and the step-size $\epsilon$ is typically polynomially decayed. The term $\mathcal{E}(\mathbf{x}_j|\mathbf{y}^*)$ denotes the energy function where, following \citet{grathwohl2020your}, we use $\mathcal{E}(\mathbf{x}_j|\mathbf{y}^*)=-M_{\theta}(\mathbf{x}_j)[\mathbf{y}^*]$, that is, the negative logit corresponding to the target class label $\mathbf{y}^*$. -While $\mathbf{x}_K$ is only guaranteed to distribute as $p_{\theta}(\mathbf{x}|y=t)$ if $\epsilon \rightarrow 0$ and $J \rightarrow \infty$, the bias introduced for a small finite $\epsilon$ is negligible in practice \citep{murphy2023probabilistic,grathwohl2020your}. While \citet{grathwohl2020your} use Equation~\ref{eq:sgld} during training, we are interested in applying the conditional sampling procedure in a post hoc fashion to any standard discriminative model. Generating multiple samples in this manner yields an empirical distribution $\hat{\mathcal{X}}_{\theta}|t$, which we can use to assess if a given counterfactual $x^{\prime}$ conforms with the model $M_{\theta}$ (Definition~\ref{def:conformal}). +While $\mathbf{x}_J$ is only guaranteed to distribute as $p_{\theta}(\mathbf{x}|\mathbf{y}^*)$ if $\epsilon \rightarrow 0$ and $J \rightarrow \infty$, the bias introduced for a small finite $\epsilon$ is negligible in practice \citep{murphy2023probabilistic,grathwohl2020your}. While \citet{grathwohl2020your} use Equation~\ref{eq:sgld} during training, we are interested in applying the conditional sampling procedure in a post hoc fashion to any standard discriminative model. Generating multiple samples in this manner yields an empirical distribution $\hat{\mathcal{X}}_{\theta}|\mathbf{y}^*$, which we can use to assess if a given counterfactual $\mathbf{x}^{\prime}$ conforms with the model $M_{\theta}$ (Definition~\ref{def:conformal}). \textbf{TBD} @@ -197,11 +199,11 @@ Above we have defined plausibility (\ref{def:plausible}) and conformity (\ref{def:conformal}) for Counterfactual Explanations. In this subsection, we introduce evaluation measures that facilitate a quantitative evaluation of counterfactuals for these objectives. 
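Before turning to these evaluation measures, we sketch how the conditional sampling step in Equation~\ref{eq:sgld} could be implemented post hoc for a differentiable classifier. The snippet below is only an illustration (in PyTorch): the classifier \texttt{model}, the target index \texttt{target}, the fixed step size and the number of iterations are assumptions made for this example, and the polynomial decay of $\epsilon$ is omitted for brevity.

\begin{verbatim}
import torch

def sgld_sample(model, target, x_init, n_steps=1000, eps=0.01):
    """Approximate sample from p_theta(x | y*) via SGLD (Equation eq:sgld),
    using the negative target logit as the energy function."""
    x = x_init.clone().requires_grad_(True)
    for _ in range(n_steps):
        # E(x | y*) = -M_theta(x)[y*], summed over the batch dimension
        energy = -model(x)[:, target].sum()
        grad = torch.autograd.grad(energy, x)[0]
        with torch.no_grad():
            x -= 0.5 * eps ** 2 * grad        # gradient step on the energy
            x += eps * torch.randn_like(x)    # stochastic term r_j ~ N(0, I)
    return x.detach()
\end{verbatim}

Generating multiple such samples yields the empirical distribution $\hat{\mathcal{X}}_{\theta}|\mathbf{y}^*$ referred to above.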
-Firstly, in order to assess the plausibility of counterfactuals we adapt the implausibility metric proposed in \citet{guidotti2022counterfactual}. The authors propose to evaluate plausibility in terms of the distance of the counterfactual $x^{\prime}$ from its nearest neighbour in the target class $t$: the smaller this distance, the more plausible the counterfactual. Instead of focusing only on the nearest neighbour of $x^{\prime}$, we suggest computing the average over distances from multiple (possibly all) observed instances in the target class. Formally, for a single counterfactual, we have: +Firstly, in order to assess the plausibility of counterfactuals we adapt the implausibility metric proposed in \citet{guidotti2022counterfactual}. The authors propose to evaluate plausibility in terms of the distance of the counterfactual $\mathbf{x}^{\prime}$ from its nearest neighbour in the target class $\mathbf{y}^*$: the smaller this distance, the more plausible the counterfactual. Instead of focusing only on the nearest neighbour of $\mathbf{x}^{\prime}$, we suggest computing the average over distances from multiple (possibly all) observed instances in the target class. Formally, for a single counterfactual, we have: \begin{equation}\label{eq:impl} \begin{aligned} - \text{impl} = \frac{1}{|x \in \mathcal{X}|t|} \sum_{x \in \mathcal{X}|t} \text{dist}(x^{\prime},x) + \text{impl} = \frac{1}{\lvert\mathbf{x} \in \mathcal{X}|\mathbf{y}^*\rvert} \sum_{\mathbf{x} \in \mathcal{X}|\mathbf{y}^*} \text{dist}(\mathbf{x}^{\prime},\mathbf{x}) \end{aligned} \end{equation} @@ -209,7 +211,7 @@ This measure is straightforward to compute and should be less sensitive to outli \begin{equation}\label{eq:conf} \begin{aligned} - \text{conf} = \frac{1}{|x \in \mathcal{X}_{\theta}|t|} \sum_{x \in \mathcal{X}_{\theta}|t} \text{dist}(x^{\prime},x) + \text{conf} = \frac{1}{\lvert\mathbf{x} \in \mathcal{X}_{\theta}|\mathbf{y}^*\rvert} \sum_{\mathbf{x} \in \mathcal{X}_{\theta}|\mathbf{y}^*} \text{dist}(\mathbf{x}^{\prime},\mathbf{x}) \end{aligned} \end{equation} @@ -221,7 +223,7 @@ Now that we have a framework for evaluating Counterfactual Explanations in terms \subsection{Plausible Counterfactuals through Minimal Uncertainty} -Our proposed methodology is built on the findings presented in~\citet{schut2021generating}. The authors demonstrate that it is not only possible but remarkably easy to generate plausible counterfactuals for Black Box Models that provide predictive uncertainty estimates. Their proposed algorithm solves Equation~\ref{eq:general} by greedily applying JSMA in the feature space with standard cross-entropy loss and no penalty at all. They show that this is equivalent to minimizing predictive uncertainty and hence yields counterfactuals for which the model $M_{\theta}$ predicts the target label $t$ with high confidence. Provided the model is well-calibrated, these counterfactuals are plausible which the authors demonstrate empirically through benchmarks \citep{schut2021generating}. +Our proposed methodology is built on the findings presented in~\citet{schut2021generating}. The authors demonstrate that it is not only possible but remarkably easy to generate plausible counterfactuals for Black Box Models that provide predictive uncertainty estimates. Their proposed algorithm solves Equation~\ref{eq:general} by greedily applying JSMA in the feature space with standard cross-entropy loss and no penalty at all. 
They show that this is equivalent to minimizing predictive uncertainty and hence yields counterfactuals for which the model $M_{\theta}$ predicts the target label $\mathbf{y}^*$ with high confidence. Provided the model is well-calibrated, these counterfactuals are plausible, which the authors demonstrate empirically through benchmarks \citep{schut2021generating}. Unfortunately, this idea hinges on the crucial assumption that the Black Box Model provides predictive uncertainty estimates. The authors argue that in light of rapid advances in Bayesian Deep Learning (DL), this assumption is overall less costly than the engineering overhead induced by using surrogate models. This is even more true today, as recent work has put Laplace Approximation back on the map for truly effortless Bayesian DL \citep{immer2020improving,daxberger2021laplace,antoran2023sampling}. Nonetheless, the need for Bayesian methods may be too restrictive in some cases. @@ -233,19 +235,19 @@ Conformal Prediction (CP) is a scalable and statistically rigorous approach to p Intuitively, CP works under the premise of turning heuristic notions of uncertainty into rigorous uncertainty estimates by repeatedly sifting through the data. It can be used to generate prediction intervals for regression models and prediction sets for classification models \citep{altmeyer2022conformal}. Since the literature on CE and AR is typically concerned with classification problems, we focus on the latter. A particular variant of CP called Split Conformal Prediction (SCP) is well-suited for our purposes because it imposes only minimal restrictions on model training. -Specifically, SCP involves splitting the data $\mathcal{D}_n=\{(X_i,Y_i)\}_{i=1,...,n}$ into a proper training set $\mathcal{D}_{\text{train}}$ and a calibration set $\mathcal{D}_{\text{cal}}$. The former is used to train the classifier in any conventional fashion: $\widehat{M}_{\theta}(X_i,Y_i)$, $i\in\mathcal{D}_{\text{train}}$. The latter is then used to compute so-called nonconformity scores: $\mathcal{S}=\{s(X_i,Y_i)\}_{i \in \mathcal{D}_{\text{cal}}}$ where $s: (\mathcal{X},\mathcal{Y}) \mapsto \mathbb{R}$ is referred to as \textit{score function}. In the context of classification, a common choice for the score function is just $s_i=1-\widehat{M}_{\theta}(X_i)[Y_i]$, that is one minus the logit corresponding to the observed label $Y_i$ \citep{angelopoulos2021gentle}. +Specifically, SCP involves splitting the data $\mathcal{D}_n=\{(\mathbf{x}_i,\mathbf{y}_i)\}_{i=1,...,n}$ into a proper training set $\mathcal{D}_{\text{train}}$ and a calibration set $\mathcal{D}_{\text{cal}}$. The former is used to train the classifier in any conventional fashion. The latter is then used to compute so-called nonconformity scores: $\mathcal{S}=\{s(\mathbf{x}_i,\mathbf{y}_i)\}_{i \in \mathcal{D}_{\text{cal}}}$ where $s: (\mathcal{X},\mathcal{Y}) \mapsto \mathbb{R}$ is referred to as \textit{score function}. In the context of classification, a common choice for the score function is just $s_i=1-M_{\theta}(\mathbf{x}_i)[\mathbf{y}_i]$, that is, one minus the softmax output corresponding to the observed label $\mathbf{y}_i$ \citep{angelopoulos2021gentle}. 
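The calibration step itself is cheap. The following minimal sketch (again in PyTorch) illustrates it, anticipating the set construction formalized in Equation~\ref{eq:scp} below; the variable names, the use of softmax outputs as scores and the omission of the finite-sample correction of the quantile are simplifying assumptions.

\begin{verbatim}
import torch

def calibrate(model, X_cal, y_cal, alpha=0.05):
    """Nonconformity scores s_i = 1 - softmax(x_i)[y_i] on the calibration set
    and their (1 - alpha)-quantile q_hat."""
    with torch.no_grad():
        probs = torch.softmax(model(X_cal), dim=1)
    scores = 1.0 - probs[torch.arange(len(y_cal)), y_cal]
    return torch.quantile(scores, 1.0 - alpha)

def prediction_set(model, x, q_hat):
    """Prediction set C(x) = {y : 1 - softmax(x)[y] <= q_hat}."""
    with torch.no_grad():
        probs = torch.softmax(model(x.unsqueeze(0)), dim=1).squeeze(0)
    return torch.nonzero(1.0 - probs <= q_hat).flatten()
\end{verbatim}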
Finally, classification sets are formed as follows, \begin{equation}\label{eq:scp} \begin{aligned} - C_{\theta}(X_i;\alpha)=\{y: s(X_i,y) \le \hat{q}\} + C_{\theta}(\mathbf{x}_i;\alpha)=\{\mathbf{y}: s(\mathbf{x}_i,\mathbf{y}) \le \hat{q}\} \end{aligned} \end{equation} -where $\hat{q}$ denotes the $(1-\alpha)$-quantile of $\mathcal{S}$ and $\alpha$ is a predetermined error rate. As the size of the calibration set increases, the probability that the classification set $C(X_{\text{test}})$ for a newly arrived sample $X_i$ does not cover the true test label $Y_i$ approaches $\alpha$ \citep{angelopoulos2021gentle}. +where $\hat{q}$ denotes the $(1-\alpha)$-quantile of $\mathcal{S}$ and $\alpha$ is a predetermined error rate. As the size of the calibration set increases, the probability that the classification set $C(\mathbf{x}_{\text{test}})$ for a newly arrived sample $\mathbf{x}_{\text{test}}$ does not cover the true test label $\mathbf{y}_{\text{test}}$ approaches $\alpha$ \citep{angelopoulos2021gentle}. -Observe from Equation~\ref{eq:scp} that Conformal Prediction works on an instance-level basis, much like Counterfactual Explanations are local. The prediction set for an individual instance $X_i$ depends only on the characteristics of that sample and the specified error rate. Intuitively, the set is more likely to include multiple labels for samples that are difficult to classify, so the set size is indicative of predictive uncertainty. To see why this effect is exacerbated by small choices for $\alpha$ consider the case of $\alpha=0$, which requires that the true label is covered by the prediction set with probability equal to one. +Observe from Equation~\ref{eq:scp} that Conformal Prediction works on an instance-level basis, much like Counterfactual Explanations are local. The prediction set for an individual instance $\mathbf{x}_i$ depends only on the characteristics of that sample and the specified error rate. Intuitively, the set is more likely to include multiple labels for samples that are difficult to classify, so the set size is indicative of predictive uncertainty. To see why this effect is exacerbated by small choices for $\alpha$, consider the case of $\alpha=0$, which requires that the true label is covered by the prediction set with probability equal to one. \subsection{Conformal Counterfactual Explanations} @@ -253,23 +255,23 @@ The fact that conformal classifiers produce set-valued predictions introduces a \begin{equation}\label{eq:setsize} \begin{aligned} - \Omega(C_{\theta}(x;\alpha))&=\max \left(0, \sum_{y\in\mathcal{Y}}C_{\theta,y}(X_i;\alpha) - \kappa \right) + \Omega(C_{\theta}(\mathbf{x}_i;\alpha))&=\max \left(0, \sum_{\mathbf{y}\in\mathcal{Y}}C_{\theta,\mathbf{y}}(\mathbf{x}_i;\alpha) - \kappa \right) \end{aligned} \end{equation} -where $\kappa \in \{0,1\}$ is a hyper-parameter and $C_{\theta,y}(X_i;\alpha)$ can be interpreted as the probability of label $y$ being included in the prediction set. +where $\kappa \in \{0,1\}$ is a hyper-parameter and $C_{\theta,\mathbf{y}}(\mathbf{x}_i;\alpha)$ can be interpreted as the probability of label $\mathbf{y}$ being included in the prediction set. 
Formally, it is defined as $C_{\theta,\mathbf{y}}(\mathbf{x}_i;\alpha):=\sigma\left((s(\mathbf{x}_i,\mathbf{y})-\alpha) T^{-1}\right)$ for $\mathbf{y}\in\mathcal{Y}$ where $\sigma$ is the sigmoid function and $T$ is a hyper-parameter used for temperature scaling \citep{stutz2022learning}. Penalizing the set size in this way is in principle enough to train efficient conformal classifiers \citep{stutz2022learning}. As we explained above, the set size is also closely linked to predictive uncertainty at the local level. This makes the smooth penalty defined in Equation~\ref{eq:setsize} useful in the context of meeting our objective of generating plausible counterfactuals. In particular, we adapt Equation~\ref{eq:general} to define the baseline objective for Conformal Counterfactual Explanations (CCE): \begin{equation}\label{eq:cce} \begin{aligned} - \mathbf{s}^\prime &= \arg \min_{\mathbf{s}^\prime \in \mathcal{S}} \left\{ {\text{yloss}(M_{\theta}(f(\mathbf{s}^\prime)),y^*)}+ \lambda \Omega(C_{\theta}(f(\mathbf{s}^\prime);\alpha)) \right\} + \mathbf{Z}^\prime &= \arg \min_{\mathbf{Z}^\prime \in \mathcal{Z}^M} \left\{ {\text{yloss}(M_{\theta}(f(\mathbf{Z}^\prime)),\mathbf{y}^*)}+ \lambda \Omega(C_{\theta}(f(\mathbf{Z}^\prime);\alpha)) \right\} \end{aligned} \end{equation} -Since we can still retrieve unperturbed logits from our conformal classifier $M_{\theta}$, we are still free work with any loss function of our choice. For example, we could use standard cross-entropy for $\text{yloss}$. +Since we can still retrieve unperturbed softmax outputs from our conformal classifier $M_{\theta}$, we are free to work with any loss function of our choice. For example, we could use standard cross-entropy for $\text{yloss}$. -In order to generate prediction sets $C_{\theta}(f(\mathbf{s}^\prime);\alpha)$ for any Black Box Model we merely need to perform a single calibration pass through a holdout set $\mathcal{D}_{\text{cal}}$. Arguably, data is typically abundant and in most applications practitioners tend to hold out a test data set anyway. Our proposed approach for CCE therefore removes the restriction on the family of predictive models, at the small cost of reserving a subset of the available data for calibration. +In order to generate prediction sets $C_{\theta}(f(\mathbf{Z}^\prime);\alpha)$ for any Black Box Model, we merely need to perform a single calibration pass through a holdout set $\mathcal{D}_{\text{cal}}$. Arguably, data is typically abundant and in most applications practitioners tend to hold out a test data set anyway. Our proposed approach for CCE therefore removes the restriction on the family of predictive models, at the small cost of reserving a subset of the available data for calibration. \section{Experiments}
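As an illustration of how the smooth set size penalty in Equation~\ref{eq:setsize} and the CCE objective in Equation~\ref{eq:cce} from the previous section might be combined in practice, consider the following sketch (PyTorch again). It treats the counterfactual state as living directly in the feature space, i.e.\ $f$ is the identity, and implements soft set membership as a sigmoid relaxation of Equation~\ref{eq:scp} with an assumed temperature; all names and hyper-parameter values are illustrative rather than prescriptive.

\begin{verbatim}
import torch

def smooth_set_size(model, x, q_hat, kappa=1.0, temp=0.1):
    """Smooth set size penalty: soft membership is close to one whenever the
    nonconformity score 1 - softmax(x)[y] falls below q_hat."""
    probs = torch.softmax(model(x), dim=1)
    scores = 1.0 - probs                                # s(x, y) for every label y
    soft_set = torch.sigmoid((q_hat - scores) / temp)   # relaxed version of Equation eq:scp
    return torch.clamp(soft_set.sum(dim=1) - kappa, min=0.0)

def cce_objective(model, x_prime, target, q_hat, lam=0.1):
    """Baseline CCE objective: cross-entropy towards the target class
    plus the smooth set size penalty."""
    yloss = torch.nn.functional.cross_entropy(model(x_prime), target)
    return yloss + lam * smooth_set_size(model, x_prime, q_hat).mean()
\end{verbatim}

Gradient descent on \texttt{x\_prime} with this objective then corresponds to the counterfactual search described above.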