diff --git a/notebooks/tables.Rmd b/notebooks/tables.Rmd index 185837755ec264d29b4d5149d0d9ce2b1007c31c..1480ea7db975bc73d20ad5fd33a8e1fda4d25d7e 100644 --- a/notebooks/tables.Rmd +++ b/notebooks/tables.Rmd @@ -141,7 +141,7 @@ chosen_data <- c( "MNIST", "GMSC" ) -tab_i <- tab_valid +tab_i <- tab # Logic: tab_i <- tab_i[variable %in% measures] @@ -155,7 +155,7 @@ col_names <- c( rep(measure_names,length(chosen_data)) ) caption <- sprintf( - "Results for %s datasets: sample averages +/- one standard deviation over all valid counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \\label{tab:results-%s} \\newline", + "Results for %s datasets: sample averages +/- one standard deviation across counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \\label{tab:results-%s} \\newline", chosen_source, chosen_source ) @@ -192,7 +192,7 @@ chosen_data <- c( "Moons", "Circles" ) -tab_i <- tab_valid +tab_i <- tab # Logic: tab_i <- tab_i[variable %in% measures] @@ -206,7 +206,7 @@ col_names <- c( rep(measure_names,length(chosen_data)) ) caption <- sprintf( - "Results for %s datasets: sample averages +/- one standard deviation over all valid counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \\label{tab:results-%s} \\newline", + "Results for %s datasets: sample averages +/- one standard deviation across counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \\label{tab:results-%s} \\newline", chosen_source, chosen_source ) diff --git a/paper/contents/table-real-world.tex b/paper/contents/table-real-world.tex index 2ec0473a558a83cd0adb362261ab7e41b68db7c3..d2631979d2d4b1c2af1e08f66b82493626339048 100644 --- a/paper/contents/table-real-world.tex +++ b/paper/contents/table-real-world.tex @@ -1,6 +1,6 @@ \begin{table} -\caption{Results for real-world datasets: sample averages +/- one standard deviation over all valid counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \label{tab:results-real-world} \newline} +\caption{Results for real-world datasets: sample averages +/- one standard deviation across counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \label{tab:results-real-world} \newline} \centering \resizebox{\linewidth}{!}{ \begin{tabular}[t]{llcccc} @@ -9,37 +9,37 @@ \cmidrule(l{3pt}r{3pt}){3-4} \cmidrule(l{3pt}r{3pt}){5-6} Model & Generator & Unfaithfulness ↓ & Implausibility ↓ & Unfaithfulness ↓ & Implausibility ↓\\ \midrule - & ECCCo & \textbf{19.27 ± 5.02}** & 314.54 ± 32.54*\hphantom{*} & \textbf{79.16 ± 11.67}** & 18.26 ± 4.92**\\ + & ECCCo & \textbf{19.28 ± 5.01}** & 314.76 ± 32.36*\hphantom{*} & \textbf{79.16 ± 11.67}** & 18.26 ± 4.92**\\ - & REVISE & 188.54 ± 26.22*\hphantom{*} & \textbf{254.32 ± 41.55}** & 186.40 ± 28.06\hphantom{*}\hphantom{*} & \textbf{5.34 ± 2.38}**\\ + & REVISE & 188.70 ± 26.18*\hphantom{*} & \textbf{255.26 ± 41.50}** & 186.40 ± 28.06\hphantom{*}\hphantom{*} & \textbf{5.34 ± 2.38}**\\ - & Schut & 199.70 ± 28.43\hphantom{*}\hphantom{*} & 273.01 ± 39.60** & 200.98 ± 28.49\hphantom{*}\hphantom{*} & 6.50 ± 2.01**\\ + & Schut & 211.00 ± 27.21\hphantom{*}\hphantom{*} & 286.61 ± 39.85*\hphantom{*} & 200.98 ± 28.49\hphantom{*}\hphantom{*} & 6.50 ± 2.01**\\ -\multirow{-4}{*}{\raggedright\arraybackslash JEM} & Wachter & 222.81 ± 26.22\hphantom{*}\hphantom{*} & 361.38 ± 39.55\hphantom{*}\hphantom{*} & 214.08 ± 45.35\hphantom{*}\hphantom{*} & 61.04 ± 2.58\hphantom{*}\hphantom{*}\\ +\multirow{-4}{*}{\raggedright\arraybackslash JEM} & Wachter & 222.90 ± 26.56\hphantom{*}\hphantom{*} & 361.88 ± 39.74\hphantom{*}\hphantom{*} & 214.08 ± 45.35\hphantom{*}\hphantom{*} & 61.04 ± 2.58\hphantom{*}\hphantom{*}\\ \cmidrule{1-6} & ECCCo & \textbf{15.99 ± 3.06}** & 294.72 ± 30.75** & \textbf{83.28 ± 13.26}** & 17.21 ± 4.46**\\ - & REVISE & 173.05 ± 20.38** & \textbf{246.20 ± 37.74}** & 194.24 ± 35.41\hphantom{*}\hphantom{*} & \textbf{4.95 ± 1.26}**\\ + & REVISE & 173.59 ± 20.65** & \textbf{246.32 ± 37.46}** & 194.24 ± 35.41\hphantom{*}\hphantom{*} & \textbf{4.95 ± 1.26}**\\ - & Schut & 186.91 ± 22.98*\hphantom{*} & 264.68 ± 37.58** & 208.45 ± 34.60\hphantom{*}\hphantom{*} & 6.12 ± 1.91**\\ + & Schut & 205.33 ± 24.07\hphantom{*}\hphantom{*} & 287.39 ± 39.33*\hphantom{*} & 208.45 ± 34.60\hphantom{*}\hphantom{*} & 6.12 ± 1.91**\\ -\multirow{-4}{*}{\raggedright\arraybackslash JEM Ensemble} & Wachter & 217.37 ± 23.93\hphantom{*}\hphantom{*} & 362.91 ± 39.40\hphantom{*}\hphantom{*} & 186.19 ± 33.88\hphantom{*}\hphantom{*} & 60.70 ± 44.32\hphantom{*}\hphantom{*}\\ +\multirow{-4}{*}{\raggedright\arraybackslash JEM Ensemble} & Wachter & 217.67 ± 23.78\hphantom{*}\hphantom{*} & 363.23 ± 39.24\hphantom{*}\hphantom{*} & 186.19 ± 33.88\hphantom{*}\hphantom{*} & 60.70 ± 44.32\hphantom{*}\hphantom{*}\\ \cmidrule{1-6} - & ECCCo & \textbf{41.95 ± 6.50}** & 591.58 ± 36.24\hphantom{*}\hphantom{*} & \textbf{75.93 ± 14.27}** & 17.20 ± 3.15\hphantom{*}\hphantom{*}\\ + & ECCCo & \textbf{41.95 ± 6.50}** & 591.58 ± 36.24\hphantom{*}\hphantom{*} & \textbf{75.93 ± 14.27}** & 17.20 ± 3.15**\\ - & REVISE & 365.69 ± 14.90*\hphantom{*} & 245.36 ± 39.69** & 196.75 ± 41.25\hphantom{*}\hphantom{*} & \textbf{4.84 ± 0.60}**\\ + & REVISE & 365.82 ± 15.35*\hphantom{*} & \textbf{249.49 ± 41.55}** & 196.75 ± 41.25\hphantom{*}\hphantom{*} & \textbf{4.84 ± 0.60}**\\ - & Schut & 371.12 ± 19.99\hphantom{*}\hphantom{*} & \textbf{245.11 ± 35.72}** & 212.00 ± 41.15\hphantom{*}\hphantom{*} & 6.44 ± 1.34\hphantom{*}\hphantom{*}\\ + & Schut & 382.44 ± 17.81\hphantom{*}\hphantom{*} & 285.98 ± 42.48*\hphantom{*} & 212.00 ± 41.15\hphantom{*}\hphantom{*} & 6.44 ± 1.34**\\ -\multirow{-4}{*}{\raggedright\arraybackslash MLP} & Wachter & 384.76 ± 16.52\hphantom{*}\hphantom{*} & 359.21 ± 42.03\hphantom{*}\hphantom{*} & 184.03 ± 48.16\hphantom{*}\hphantom{*} & 7.49 ± 0.89\hphantom{*}\hphantom{*}\\ +\multirow{-4}{*}{\raggedright\arraybackslash MLP} & Wachter & 386.05 ± 16.60\hphantom{*}\hphantom{*} & 361.83 ± 42.18\hphantom{*}\hphantom{*} & 218.34 ± 53.26\hphantom{*}\hphantom{*} & 45.84 ± 39.39\hphantom{*}\hphantom{*}\\ \cmidrule{1-6} - & ECCCo & \textbf{31.43 ± 3.91}** & 490.88 ± 27.19\hphantom{*}\hphantom{*} & \textbf{73.86 ± 14.63}** & 17.92 ± 4.17\hphantom{*}\hphantom{*}\\ + & ECCCo & \textbf{31.43 ± 3.91}** & 490.88 ± 27.19\hphantom{*}\hphantom{*} & \textbf{73.86 ± 14.63}** & 17.92 ± 4.17**\\ - & REVISE & 337.21 ± 11.68*\hphantom{*} & \textbf{244.84 ± 37.17}** & 207.21 ± 43.20\hphantom{*}\hphantom{*} & \textbf{5.78 ± 2.10}**\\ + & REVISE & 337.74 ± 11.89*\hphantom{*} & \textbf{247.67 ± 38.36}** & 207.21 ± 43.20\hphantom{*}\hphantom{*} & \textbf{5.78 ± 2.10}**\\ - & Schut & 344.60 ± 13.64*\hphantom{*} & 252.53 ± 37.92** & 205.36 ± 32.11\hphantom{*}\hphantom{*} & 7.00 ± 2.15*\hphantom{*}\\ + & Schut & 359.54 ± 14.52\hphantom{*}\hphantom{*} & 283.99 ± 41.08*\hphantom{*} & 205.36 ± 32.11\hphantom{*}\hphantom{*} & 7.00 ± 2.15**\\ -\multirow{-4}{*}{\raggedright\arraybackslash MLP Ensemble} & Wachter & 358.51 ± 13.18\hphantom{*}\hphantom{*} & 352.63 ± 39.93\hphantom{*}\hphantom{*} & 177.20 ± 25.86\hphantom{*}\hphantom{*} & 10.27 ± 3.21\hphantom{*}\hphantom{*}\\ +\multirow{-4}{*}{\raggedright\arraybackslash MLP Ensemble} & Wachter & 360.79 ± 14.39\hphantom{*}\hphantom{*} & 357.73 ± 42.55\hphantom{*}\hphantom{*} & 213.71 ± 54.17\hphantom{*}\hphantom{*} & 73.09 ± 64.50\hphantom{*}\hphantom{*}\\ \bottomrule \end{tabular}} \end{table} diff --git a/paper/contents/table-synthetic.tex b/paper/contents/table-synthetic.tex index aa6abedfad4ecba082d5a096448830a31bba4356..df1746fbf5c7717ecbfc890e2f0e7551ab9834d8 100644 --- a/paper/contents/table-synthetic.tex +++ b/paper/contents/table-synthetic.tex @@ -1,6 +1,6 @@ \begin{table} -\caption{Results for synthetic datasets: sample averages +/- one standard deviation over all valid counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \label{tab:results-synthetic} \newline} +\caption{Results for synthetic datasets: sample averages +/- one standard deviation across counterfactuals. Best outcomes are highlighted in bold. Asterisks indicate that the given value is more than one (*) or two (**) standard deviations away from the baseline (Wachter). \label{tab:results-synthetic} \newline} \centering \resizebox{\linewidth}{!}{ \begin{tabular}[t]{llcccccc} @@ -15,21 +15,21 @@ Model & Generator & Unfaithfulness ↓ & Implausibility ↓ & Unfaithfulness ↓ & ECCCo (no EBM) & 0.16 ± 0.11\hphantom{*}\hphantom{*} & 0.34 ± 0.19\hphantom{*}\hphantom{*} & 0.91 ± 0.32\hphantom{*}\hphantom{*} & 1.71 ± 0.25\hphantom{*}\hphantom{*} & 0.70 ± 0.33\hphantom{*}\hphantom{*} & 1.30 ± 0.37\hphantom{*}\hphantom{*}\\ - & REVISE & 0.15 ± 0.00** & 0.41 ± 0.01** & 0.78 ± 0.23\hphantom{*}\hphantom{*} & 1.57 ± 0.26\hphantom{*}\hphantom{*} & \textbf{0.33 ± 0.01}** & \textbf{0.64 ± 0.00}**\\ + & REVISE & 0.19 ± 0.03\hphantom{*}\hphantom{*} & 0.41 ± 0.01** & 0.78 ± 0.23\hphantom{*}\hphantom{*} & 1.57 ± 0.26\hphantom{*}\hphantom{*} & \textbf{0.48 ± 0.16}*\hphantom{*} & \textbf{0.95 ± 0.32}*\hphantom{*}\\ - & Schut & 0.39 ± 0.07\hphantom{*}\hphantom{*} & 0.73 ± 0.17\hphantom{*}\hphantom{*} & 0.66 ± 0.25\hphantom{*}\hphantom{*} & 1.47 ± 0.10** & 0.54 ± 0.43\hphantom{*}\hphantom{*} & 1.28 ± 0.53\hphantom{*}\hphantom{*}\\ + & Schut & 0.39 ± 0.07\hphantom{*}\hphantom{*} & 0.73 ± 0.17\hphantom{*}\hphantom{*} & 0.67 ± 0.27\hphantom{*}\hphantom{*} & 1.50 ± 0.22*\hphantom{*} & 0.54 ± 0.43\hphantom{*}\hphantom{*} & 1.28 ± 0.53\hphantom{*}\hphantom{*}\\ -\multirow{-6}{*}{\raggedright\arraybackslash JEM} & Wachter & 0.18 ± 0.10\hphantom{*}\hphantom{*} & 0.44 ± 0.17\hphantom{*}\hphantom{*} & 0.78 ± 0.23\hphantom{*}\hphantom{*} & 1.75 ± 0.19\hphantom{*}\hphantom{*} & 0.68 ± 0.34\hphantom{*}\hphantom{*} & 1.33 ± 0.32\hphantom{*}\hphantom{*}\\ +\multirow{-6}{*}{\raggedright\arraybackslash JEM} & Wachter & 0.18 ± 0.10\hphantom{*}\hphantom{*} & 0.44 ± 0.17\hphantom{*}\hphantom{*} & 0.80 ± 0.27\hphantom{*}\hphantom{*} & 1.78 ± 0.24\hphantom{*}\hphantom{*} & 0.68 ± 0.34\hphantom{*}\hphantom{*} & 1.33 ± 0.32\hphantom{*}\hphantom{*}\\ \cmidrule{1-8} & ECCCo & \textbf{0.29 ± 0.05}** & 0.23 ± 0.06** & 0.80 ± 0.62\hphantom{*}\hphantom{*} & 1.69 ± 0.40\hphantom{*}\hphantom{*} & 0.65 ± 0.53\hphantom{*}\hphantom{*} & 1.17 ± 0.41\hphantom{*}\hphantom{*}\\ - & ECCCo (no CP) & 0.29 ± 0.05** & \textbf{0.23 ± 0.07}** & \textbf{0.79 ± 0.62}\hphantom{*}\hphantom{*} & 1.68 ± 0.42\hphantom{*}\hphantom{*} & 0.49 ± 0.35\hphantom{*}\hphantom{*} & 1.19 ± 0.44\hphantom{*}\hphantom{*}\\ + & ECCCo (no CP) & 0.29 ± 0.05** & \textbf{0.23 ± 0.07}** & \textbf{0.79 ± 0.62}\hphantom{*}\hphantom{*} & 1.68 ± 0.42\hphantom{*}\hphantom{*} & \textbf{0.49 ± 0.35}\hphantom{*}\hphantom{*} & 1.19 ± 0.44\hphantom{*}\hphantom{*}\\ & ECCCo (no EBM) & 0.46 ± 0.05\hphantom{*}\hphantom{*} & 0.28 ± 0.04** & 1.34 ± 0.47\hphantom{*}\hphantom{*} & 1.68 ± 0.47\hphantom{*}\hphantom{*} & 0.84 ± 0.51\hphantom{*}\hphantom{*} & 1.23 ± 0.31\hphantom{*}\hphantom{*}\\ - & REVISE & 0.52 ± 0.04\hphantom{*}\hphantom{*} & 0.41 ± 0.01\hphantom{*}\hphantom{*} & 1.45 ± 0.44\hphantom{*}\hphantom{*} & 1.64 ± 0.31\hphantom{*}\hphantom{*} & \textbf{0.06 ± 0.01}** & \textbf{0.64 ± 0.00}**\\ + & REVISE & 0.56 ± 0.05\hphantom{*}\hphantom{*} & 0.41 ± 0.01\hphantom{*}\hphantom{*} & 1.45 ± 0.44\hphantom{*}\hphantom{*} & \textbf{1.64 ± 0.31}\hphantom{*}\hphantom{*} & 0.58 ± 0.52\hphantom{*}\hphantom{*} & \textbf{0.95 ± 0.32}\hphantom{*}\hphantom{*}\\ - & Schut & 0.43 ± 0.06*\hphantom{*} & 0.47 ± 0.36\hphantom{*}\hphantom{*} & 1.39 ± 0.50\hphantom{*}\hphantom{*} & \textbf{1.59 ± 0.26}\hphantom{*}\hphantom{*} & 0.58 ± 0.37\hphantom{*}\hphantom{*} & 1.23 ± 0.43\hphantom{*}\hphantom{*}\\ + & Schut & 0.43 ± 0.06*\hphantom{*} & 0.47 ± 0.36\hphantom{*}\hphantom{*} & 1.45 ± 0.55\hphantom{*}\hphantom{*} & 1.73 ± 0.48\hphantom{*}\hphantom{*} & 0.58 ± 0.37\hphantom{*}\hphantom{*} & 1.23 ± 0.43\hphantom{*}\hphantom{*}\\ \multirow{-6}{*}{\raggedright\arraybackslash MLP} & Wachter & 0.51 ± 0.04\hphantom{*}\hphantom{*} & 0.40 ± 0.08\hphantom{*}\hphantom{*} & 1.32 ± 0.41\hphantom{*}\hphantom{*} & 1.69 ± 0.32\hphantom{*}\hphantom{*} & 0.83 ± 0.50\hphantom{*}\hphantom{*} & 1.24 ± 0.29\hphantom{*}\hphantom{*}\\ \bottomrule diff --git a/paper/paper.pdf b/paper/paper.pdf index 806370bf6da6c608028610cbbc69a2366cf56aea..5e1c3ebd24dd6a5df708ceb85fb3fa34e7587a42 100644 Binary files a/paper/paper.pdf and b/paper/paper.pdf differ