# Source code for opendp.smartnoise.synthesizers.quail

import logging
import warnings

from functools import wraps

import numpy as np
import pandas as pd

from opendp.smartnoise.synthesizers.base import SDGYMBaseSynthesizer

# Module-level logger; ``fit`` reports the internal classifier's metrics through it.
logger = logging.getLogger(__name__)


class QUAILSynthesizer(SDGYMBaseSynthesizer):
    def __init__(
        self, epsilon, dp_synthesizer, dp_classifier, target, test_size=0.2, seed=None, eps_split=0.9
    ):
        """
        Quailified Architecture to Improve Labeling.

        Divide epsilon in a known classification task between a differentially
        private synthesizer and classifier. Train DP classifier on real, fit DP
        synthesizer to features (excluding the target label), and use synthetic
        data from the DP synthesizer with the DP classifier to create artificial
        labels. Produces complete synthetic data.

        More information here:
        Differentially Private Synthetic Data: Applied Evaluations and Enhancements
        https://arxiv.org/abs/2011.05537

        :param epsilon: Total epsilon used across the DP Synthesizer and DP Classifier
        :type epsilon: float
        :param dp_synthesizer: A function that returns an instance of a DP Synthesizer
            for a specified epsilon value
        :type dp_synthesizer: function (epsilon) -> SDGYMBaseSynthesizer
        :param dp_classifier: A function that returns an instance of a DP Classifier
            for a specified epsilon value
        :type dp_classifier: function (epsilon) -> classifier
        :param target: The column name of the target column
        :type target: str
        :param test_size: Percent of the data that should be used for the test set,
            defaults to 0.2
        :type test_size: float, optional
        :param seed: Seed for controlling randomness for testing, defaults to None
        :type seed: int, optional
        :param eps_split: Percent of epsilon used for the classifier.
            1 - eps_split is used for the Synthesizer., defaults to 0.9
        :type eps_split: float, optional
        """
        self.epsilon = epsilon
        self.eps_split = eps_split
        self.dp_synthesizer = dp_synthesizer
        self.dp_classifier = dp_classifier
        self.target = target
        self.test_size = test_size
        self.seed = seed

        # Trained components; populated by fit().
        self.private_model = None
        self.private_synth = None

        # Pandas bookkeeping; populated by fit() when a DataFrame is passed in.
        self.pandas = False
        self.pd_cols = None
        self.pd_index = None

    @wraps(SDGYMBaseSynthesizer.fit)
    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple(), verbose=None):
        """
        Takes a dataset and fits the synthesizer/learning model to it, using the
        epsilon split specified in the init.

        :param data: Data
        :type data: pd.DataFrame or np.array
        :raises TypeError: if ``data`` is not a pandas DataFrame
        """
        if verbose is not None:
            warnings.warn("verbose is deprecated. Use logging.setLevel instead")

        # sklearn is imported lazily so the module can load without it.
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import classification_report
        from sklearn.metrics import accuracy_score

        if isinstance(data, pd.DataFrame):
            self.pandas = True
            # NOTE(review): this mutates the caller's DataFrame in place, and
            # ``errors="ignore"`` is deprecated in recent pandas — confirm intent.
            for col in data.columns:
                data[col] = pd.to_numeric(data[col], errors="ignore")
            self.data = data
            self.pd_cols = data.columns
            self.pd_index = data.index
        else:
            # BUG FIX: the original did ``raise ("...")``, which raises
            # ``TypeError: exceptions must derive from BaseException`` instead
            # of the intended message. Raise a proper exception type.
            raise TypeError("Only pandas dataframes for data as of now.")

        private_features = data.loc[:, data.columns != self.target]
        private_target = data.loc[:, data.columns == self.target]
        x_train, x_test, y_train, y_test = train_test_split(
            private_features, private_target, test_size=self.test_size, random_state=self.seed
        )

        # Here we train a differentially private model on the real
        # data. We report on the accuracy for now to give a sense of
        # the upper bound on performance in the sampling step.
        self.private_model = self.dp_classifier(epsilon=(self.epsilon * self.eps_split))
        self.private_model.fit(x_train, y_train.values.ravel())

        predictions = self.private_model.predict(x_test)
        self.class_report = classification_report(
            np.ravel(y_test), predictions, labels=np.unique(predictions)
        )
        self.target_accuracy = accuracy_score(np.ravel(y_test), predictions)

        log_level = logger.level
        if verbose:
            log_level = logging.INFO
        # BUG FIX: the original logged through the root logger (``logging.log``)
        # even though the module defines its own ``logger``; use it consistently.
        logger.log(log_level, "Internal model report: ")
        logger.log(log_level, self.class_report)
        logger.log(log_level, self.target_accuracy)

        # We use the features (without the target label) in our synthesis.
        self.private_synth = self.dp_synthesizer(epsilon=(self.epsilon * (1 - self.eps_split)))
        self.private_synth.fit(
            data=private_features,
            categorical_columns=categorical_columns,
            ordinal_columns=ordinal_columns,
        )

        # Surface classifier internals (when available) for debugging.
        if hasattr(self.private_model, "coef_"):
            logger.log(log_level, self.private_model.coef_)
        if hasattr(self.private_model, "intercept_"):
            logger.log(log_level, self.private_model.intercept_)
        if hasattr(self.private_model, "classes_"):
            logger.log(log_level, self.private_model.classes_)

    @wraps(SDGYMBaseSynthesizer.sample)
    def sample(self, samples):
        """
        Sample from the synthesizer model: draw synthetic features from the DP
        synthesizer, then label them with the DP classifier.

        :param samples: The number of samples to create
        :type samples: int
        :return: A dataframe of length samples
        :rtype: pd.Dataframe
        """
        sampled_features = self.private_synth.sample(samples)
        y_values = self.private_model.predict(sampled_features)
        sampled_features[self.target] = y_values
        return sampled_features