{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### This notebook demonstrates the use of a reweighing pre-processing algorithm for bias mitigation\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"# Load all necessary packages\n",
"import sys\n",
"sys.path.append(\"../\")\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"\n",
"from aif360.datasets import BinaryLabelDataset\n",
"from aif360.datasets import AdultDataset, GermanDataset, CompasDataset\n",
"from aif360.metrics import BinaryLabelDatasetMetric\n",
"from aif360.metrics import ClassificationMetric\n",
"from aif360.algorithms.preprocessing.reweighing import Reweighing\n",
"from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\\\n",
" import load_preproc_data_adult, load_preproc_data_german, load_preproc_data_compas\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"from IPython.display import Markdown, display\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from common_utils import compute_metrics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load dataset and set options"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"## import dataset\n",
"dataset_used = \"adult\" # \"adult\", \"german\", \"compas\"\n",
"protected_attribute_used = 1 # 1, 2\n",
"\n",
"\n",
"if dataset_used == \"adult\":\n",
"# dataset_orig = AdultDataset()\n",
" if protected_attribute_used == 1:\n",
" privileged_groups = [{'sex': 1}]\n",
" unprivileged_groups = [{'sex': 0}]\n",
" dataset_orig = load_preproc_data_adult(['sex'])\n",
" else:\n",
" privileged_groups = [{'race': 1}]\n",
" unprivileged_groups = [{'race': 0}]\n",
" dataset_orig = load_preproc_data_adult(['race'])\n",
" \n",
"elif dataset_used == \"german\":\n",
"# dataset_orig = GermanDataset()\n",
" if protected_attribute_used == 1:\n",
" privileged_groups = [{'sex': 1}]\n",
" unprivileged_groups = [{'sex': 0}]\n",
" dataset_orig = load_preproc_data_german(['sex'])\n",
" else:\n",
" privileged_groups = [{'age': 1}]\n",
" unprivileged_groups = [{'age': 0}]\n",
" dataset_orig = load_preproc_data_german(['age'])\n",
" \n",
"elif dataset_used == \"compas\":\n",
"# dataset_orig = CompasDataset()\n",
" if protected_attribute_used == 1:\n",
" privileged_groups = [{'sex': 1}]\n",
" unprivileged_groups = [{'sex': 0}]\n",
" dataset_orig = load_preproc_data_compas(['sex'])\n",
" else:\n",
" privileged_groups = [{'race': 1}]\n",
" unprivileged_groups = [{'race': 0}]\n",
" dataset_orig = load_preproc_data_compas(['race'])\n",
"\n",
"all_metrics = [\"Statistical parity difference\",\n",
" \"Average odds difference\",\n",
" \"Equal opportunity difference\"]\n",
"\n",
"#random seed for calibrated equal odds prediction\n",
"np.random.seed(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Split into train, and test"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Get the dataset and split into train and test\n",
"dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.7], shuffle=True)\n",
"dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5], shuffle=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Clean up training data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"#### Training Dataset shape"
],
"text/plain": [
"