Commit 502ff47

Add a bias detector based on optimal transport (#434)
1 parent 4b33888 commit 502ff47

File tree

6 files changed: +1826 -1 lines changed


aif360/metrics/ot_metric.py

Lines changed: 225 additions & 0 deletions
@@ -0,0 +1,225 @@
from typing import Union
import pandas as pd
import numpy as np
import ot
from sklearn.preprocessing import LabelEncoder

def _normalize(distribution1, distribution2):
    """
    Normalize the two distributions so that each sums to 1. If either contains
    negative values, both are first shifted up by the absolute value of the smallest entry.

    Args:
        distribution1 (numpy array): raw (untreated) distribution
        distribution2 (numpy array): raw (untreated) distribution
    """
    if np.minimum(np.min(distribution1), np.min(distribution2)) < 0:
        extra = -np.minimum(np.min(distribution1), np.min(distribution2))
        distribution1 += extra
        distribution2 += extra

    total_of_distribution1 = np.sum(distribution1)
    if total_of_distribution1 != 0:
        distribution1 /= total_of_distribution1
    total_of_distribution2 = np.sum(distribution2)
    if total_of_distribution2 != 0:
        distribution2 /= total_of_distribution2

def _transform(ground_truth, classifier, cost_matrix=None):
    """
    Convert the given pandas distributions to numpy arrays and _normalize them,
    so that each allocates a total mass of one.
    Builds the distance matrix with entries |ground_truth[i] - classifier[j]|,
    unless an explicit cost_matrix is supplied.

    Args:
        ground_truth (series): ground truth (correct) target values
        classifier (series, dataframe, optional): pandas series of estimated targets
            as returned by a model for binary, continuous and ordinal modes.

    Returns:
        initial_distribution: the processed ground_truth (numpy array)
        required_distribution: the processed classifier (numpy array)
        matrix_distance: the distances between the cells of the distributions (2d numpy array)
    """
    initial_distribution = ground_truth.to_numpy().astype(float)
    required_distribution = classifier.to_numpy().astype(float)

    _normalize(initial_distribution, required_distribution)

    if cost_matrix is not None:
        matrix_distance = cost_matrix
    else:
        matrix_distance = np.array([abs(i - required_distribution) for i in initial_distribution], dtype=float)
    return initial_distribution, required_distribution, matrix_distance

def _evaluate(
    ground_truth: pd.Series,
    classifier: pd.Series,
    prot_attr: pd.Series = None,
    num_iters=1e5,
    cost_matrix: np.ndarray = None,
    **kwargs):
    """Calculate the Wasserstein distance between `ground_truth` and `classifier`,
    either overall or per group defined by `prot_attr`.

    Args:
        ground_truth (pd.Series, str): ground truth (correct) target values
        classifier (pd.Series): estimated target values
        prot_attr (pd.Series, str): pandas series of sensitive attribute values
        num_iters (int, optional): number of iterations (random restarts). Should be positive.

    Returns:
        float or dict: Earth mover's distance, or a dictionary of distances keyed by protected-attribute value
    """

    # Calculate just the EMD between ground_truth and classifier
    if prot_attr is None:
        initial_distribution, required_distribution, matrix_distance = _transform(ground_truth, classifier, cost_matrix)
        return ot.emd2(a=initial_distribution, b=required_distribution, M=matrix_distance, numItermax=num_iters)

    if not ground_truth.nunique() == 2:
        raise ValueError(f"Expected to have exactly 2 target values, got {ground_truth.nunique()}.")

    # Calculate EMD between ground truth distribution and distribution of each group
    emds = {}
    for sa_val in sorted(prot_attr.unique()):
        initial_distribution = ground_truth[prot_attr == sa_val]
        required_distribution = classifier[prot_attr == sa_val]
        initial_distribution, required_distribution, matrix_distance = _transform(initial_distribution, required_distribution, cost_matrix)
        emds[sa_val] = ot.emd2(a=initial_distribution, b=required_distribution, M=matrix_distance, numItermax=num_iters)

    return emds


def ot_distance(
    ground_truth: pd.Series,
    classifier: Union[pd.Series, pd.DataFrame],
    prot_attr: pd.Series = None,
    favorable_value: Union[str, float] = None,
    scoring: str = "Wasserstein1",
    num_iters: int = 1e5,
    penalty: float = 1e-17,
    mode: str = "binary",
    cost_matrix: np.ndarray = None,
    **kwargs,
):
    """Normalize and calculate the Wasserstein distance between groups defined by `prot_attr` in `ground_truth` and `classifier`.

    Args:
        ground_truth (pd.Series, str): ground truth (correct) target values.
        classifier (pd.Series, pd.DataFrame, str): estimated target values.
            If `mode` is nominal, must be a dataframe with columns containing predictions for each nominal class.
            If `None`, the model is assumed to be a dummy model that predicts the mean of the targets,
            or 1/(number of categories) for nominal mode.
        prot_attr (pd.Series, str): sensitive attribute values.
            If `None`, all samples are assumed to belong to the same protected group.
        favorable_value (str, float, optional): either "high", "low", or a float value if the mode is binary, ordinal, or continuous.
            If a float, the value has to be the minimum or the maximum in the ground_truth column.
            Defaults to "high" if None for these modes.
            Support for floats is kept so the intuition stays clear in binary classification tasks.
            If `mode` is nominal, the favorable value should be one of the unique categories in the ground_truth.
            Defaults to a one-vs-all scan if None for nominal mode.
        scoring (str or class): only 'Wasserstein1' is supported.
        num_iters (int, optional): number of iterations (random restarts) for EMD. Should be positive.
        penalty (float, optional): penalty term. Should be positive. As with any regularization parameter, the penalty
            may need to be tuned for a particular use case. The higher the penalty, the higher the influence of the entropy regularizer.
        mode: one of ['binary', 'continuous', 'nominal', 'ordinal']. Defaults to binary.
            In nominal mode, up to 10 categories are supported by default.
            To increase this, pass in the keyword argument max_nominal = integer value.
        cost_matrix (np.ndarray): cost matrix for the Wasserstein distance. Defaults to absolute difference between samples.

    Returns:
        float or dict: Earth mover's distance, or a dictionary of distances for each class of the classifier.

    Raises:
        ValueError: if `mode` is 'binary' but `ground_truth` contains less than 1 or more than 2 unique values.
    """

    # Assert correct mode passed
    if mode not in ['binary', 'continuous', 'nominal', 'ordinal']:
        raise ValueError(f"Expected one of {['binary', 'continuous', 'nominal', 'ordinal']}, got {mode}.")

    # Assert correct types passed to ground_truth, classifier and prot_attr
    if not isinstance(ground_truth, (pd.Series, str)):
        raise TypeError(f"ground_truth: expected pd.Series or str, got {type(ground_truth)}")
    if classifier is not None:
        if mode in ["binary", "continuous"] and not isinstance(classifier, pd.Series):
            raise TypeError(f"classifier: expected pd.Series for {mode} mode, got {type(classifier)}")
        if mode in ["nominal", "ordinal"] and not isinstance(classifier, pd.DataFrame):
            raise TypeError(f"classifier: expected pd.DataFrame for {mode} mode, got {type(classifier)}")
    if prot_attr is not None and not isinstance(prot_attr, (pd.Series, str)):
        raise TypeError(f"prot_attr: expected pd.Series or str, got {type(prot_attr)}")

    # Assert correct type passed to cost_matrix
    if cost_matrix is not None and not isinstance(cost_matrix, np.ndarray):
        raise TypeError(f"cost_matrix: expected numpy.ndarray, got {type(cost_matrix)}")

    # Assert scoring is "Wasserstein1"
    if not scoring == "Wasserstein1":
        raise ValueError(f"Scoring mode can only be \"Wasserstein1\", got {scoring}")

    grt = ground_truth.copy()

    if classifier is not None:
        cls = classifier.copy()
        if prot_attr is not None:
            cls.index = grt.index
    else:
        cls = None

    if prot_attr is not None:
        sat = prot_attr.copy()
        sat.index = grt.index
    else:
        sat = None

    uniques = list(grt.unique())
    if mode == "binary":
        if len(uniques) > 2:
            raise ValueError(f"Only 2 unique values allowed in ground_truth for binary mode, got {uniques}")

    # Encode variables
    if not np.issubdtype(grt.dtype, np.number):
        grt_encoder = LabelEncoder().fit(grt)
        grt = pd.Series(grt_encoder.transform(grt))

    # Set correct favorable value (this tells us if higher or lower is better)
    min_val, max_val = grt.min(), grt.max()

    if favorable_value == 'high':
        favorable_value = max_val
    elif favorable_value == 'low':
        favorable_value = min_val
    elif favorable_value is None:
        if mode in ["binary", "ordinal", "continuous"]:
            favorable_value = max_val  # Default to higher is better
        elif mode == "nominal":
            favorable_value = "flag-all"  # Default to scan through all categories

    if favorable_value not in [min_val, max_val, "flag-all", *uniques]:
        raise ValueError(f"Favorable_value should be high, low, or one of categories {uniques}, got {favorable_value}.")

    if mode == "binary":  # Flip ground truth if favorable_value is 0 in binary mode.
        grt = pd.Series(grt == favorable_value, dtype=int)
        if cls is None:
            cls = pd.Series(grt.mean(), index=grt.index)
        emds = _evaluate(grt, cls, sat, num_iters, cost_matrix, **kwargs)

    elif mode == "continuous":
        if cls is None:
            cls = pd.Series(grt.mean(), index=grt.index)
        emds = _evaluate(grt, cls, sat, num_iters, cost_matrix, **kwargs)

    ## TODO: rework ordinal mode to take into account distance between pred and true
    elif mode in ["nominal", "ordinal"]:
        if cls is None:  # Set classifier to 1/(num of categories) for nominal mode
            cls = pd.DataFrame([pd.Series(1 / grt.nunique(), index=grt.index)] * grt.nunique())
        if grt.nunique() != cls.shape[-1]:
            raise ValueError(
                f"classifier must have a column for each class. Expected shape [:, {grt.nunique()}], got {cls.shape}")
        emds = {}
        for class_label in uniques:
            grt_cl = grt.map({class_label: 1}).fillna(0)
            cls_cl = cls[class_label]
            emds[class_label] = _evaluate(grt_cl, cls_cl, sat, num_iters, cost_matrix, **kwargs)

    return emds
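
Not part of the diff, but for orientation: a minimal usage sketch of the new metric. The toy labels, scores, and group values below are hypothetical; with a protected attribute, the function returns one Earth mover's distance per group, computed by ot.emd2 on the normalized label and score distributions with the default absolute-difference cost matrix.

    import pandas as pd

    from aif360.metrics import ot_metric

    # Hypothetical toy data: binary labels, model scores, and a two-valued protected attribute.
    y_true = pd.Series([1, 0, 1, 1, 0, 0, 1, 0])
    y_score = pd.Series([0.9, 0.2, 0.6, 0.8, 0.4, 0.1, 0.3, 0.7])
    group = pd.Series(["a", "a", "a", "a", "b", "b", "b", "b"])

    # One Earth mover's distance per protected group ('a' and 'b').
    emds = ot_metric.ot_distance(ground_truth=y_true, classifier=y_score,
                                 prot_attr=group, mode="binary")
    print(emds)  # {'a': <distance>, 'b': <distance>}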

aif360/sklearn/metrics/metrics.py

Lines changed: 61 additions & 1 deletion
@@ -1,4 +1,5 @@
 from itertools import permutations
+from typing import Union

 import numpy as np
 import pandas as pd
@@ -10,10 +11,12 @@
 from sklearn.utils import check_X_y
 from sklearn.utils.deprecation import deprecated

+from aif360.metrics import ot_metric
 from aif360.sklearn.utils import check_inputs, check_groups
 from aif360.detectors.mdss.ScoringFunctions import BerkJones, Bernoulli
 from aif360.detectors.mdss.MDSS import MDSS

+
 __all__ = [
     # meta-metrics
     'difference', 'ratio', 'intersection', 'one_vs_rest',
@@ -24,7 +27,7 @@
     'specificity_score', 'base_rate', 'selection_rate', 'smoothed_base_rate',
     'smoothed_selection_rate', 'generalized_fpr', 'generalized_fnr',
     # group fairness
-    'statistical_parity_difference', 'disparate_impact_ratio',
+    'ot_distance', 'statistical_parity_difference', 'disparate_impact_ratio',
     'equal_opportunity_difference', 'average_odds_difference', 'average_predictive_value_difference',
     'average_odds_error', 'class_imbalance', 'kl_divergence',
     'conditional_demographic_disparity', 'smoothed_edf',
499502

500503

501504
# ============================ GROUP FAIRNESS ==================================
505+
def ot_distance(
506+
y_true: pd.Series,
507+
y_pred: Union[pd.Series, pd.DataFrame],
508+
prot_attr: pd.Series = None,
509+
pos_label: Union[str, float] = None,
510+
scoring: str = "Wasserstein1",
511+
num_iters: int = 1e5,
512+
penalty: float = 1e-17,
513+
mode: str = "binary",
514+
cost_matrix: np.ndarray=None,
515+
**kwargs,
516+
):
517+
"""Normalize and calculate Wasserstein distance between groups defined by `prot_attr` in `y_true` and `y_pred`.
518+
519+
Args:
520+
y_true (pd.Series): ground truth (correct) target values.
521+
y_pred (pd.Series, pd.DataFrame): estimated target values.
522+
If `mode` is nominal, must be a `pd.DataFrame` with columns containing predictions for each nominal class,
523+
or list of corresponding column names in `data`.
524+
If `None`, model is assumed to be a dummy model that predicts the mean of the targets
525+
or 1/(number of categories) for nominal mode.
526+
prot_attr (pd.Series): sensitive attribute values.
527+
If `None`, assume all samples belong to the same protected group.
528+
pos_label(str, float, optional): Either "high", "low" or a float value if the mode in [binary, ordinal, or continuous].
529+
If float, value has to be the minimum or the maximum in the ground_truth column.
530+
Defaults to high if None for these modes.
531+
Support for float left in to keep the intuition clear in binary classification tasks.
532+
If `mode` is nominal, favorable values should be one of the unique categories in the ground_truth.
533+
Defaults to a one-vs-all scan if None for nominal mode.
534+
scoring (str or class): only 'Wasserstein1'
535+
num_iters (int, optional): number of iterations (random restarts) for EMD. Should be positive.
536+
penalty (float, optional): penalty term. Should be positive. The penalty term as with any regularization parameter
537+
may need to be tuned for a particular use case. The higher the penalty, the higher the influence of entropy regualizer.
538+
mode: one of ['binary', 'continuous', 'nominal', 'ordinal']. Defaults to binary.
539+
In nominal mode, up to 10 categories are supported by default.
540+
To increase this, pass in keyword argument max_nominal = integer value.
541+
cost_matrix (np.ndarray): cost matrix for the Wasserstein distance. Defaults to absolute difference between samples.
542+
543+
Returns:
544+
ot.emd2 (float, dict): Earth mover's distance or dictionary of optimal transports for each of option of classifier
545+
546+
Raises:
547+
ValueError: if `mode` is 'binary' but `ground_truth` contains less than 1 or more than 2 unique values.
548+
"""
549+
return ot_metric.ot_distance(
550+
ground_truth=y_true,
551+
classifier=y_pred,
552+
prot_attr=prot_attr,
553+
favorable_value=pos_label,
554+
scoring=scoring,
555+
num_iters=num_iters,
556+
penalty=penalty,
557+
mode=mode,
558+
cost_matrix=cost_matrix,
559+
**kwargs
560+
)
561+
502562
def statistical_parity_difference(y_true, y_pred=None, *, prot_attr=None,
503563
priv_group=1, pos_label=1, sample_weight=None):
504564
r"""Difference in selection rates.
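
For completeness, a sketch of calling the new sklearn-style wrapper on the same hypothetical data as in the earlier example (y_true, y_score, and group are the toy Series from above); the diff adds 'ot_distance' to `__all__`, so it should be importable from `aif360.sklearn.metrics`, with `pos_label` playing the role of `favorable_value`:

    from aif360.sklearn.metrics import ot_distance

    # Same hypothetical toy data as above; pos_label=1 marks the favorable outcome.
    emds = ot_distance(y_true=y_true, y_pred=y_score, prot_attr=group,
                       pos_label=1, mode="binary")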
