Source code for deep_lincs.models.single_classifier

import pandas as pd
import numpy as np
import altair as alt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import confusion_matrix

from .base_network import BaseNetwork


class SingleClassifier(BaseNetwork):
    """Represents a classifier for a single metadata field.

    See the usage sketch at the end of this module for a minimal example.

    Parameters
    ----------
    dataset : ``Dataset``
        An instance of a ``Dataset`` intended to train and evaluate a model.
    target : ``str``
        Valid metadata field defining the classification task.
    test_sizes : ``tuple``, (optional: default ``(0.2, 0.2)``)
        Sizes of the test splits used to divide the dataset into
        training, validation, and testing sets.

    Attributes
    ----------
    target : ``str``
        Target task of the model.
    train : ``Dataset``
        Dataset used to train the model.
    val : ``Dataset``
        Dataset used during training as validation.
    test : ``Dataset``
        Dataset used to evaluate the model.
    model : ``tensorflow.keras.Model``
        Compiled and trained model.
    in_size : ``int``
        Size of inputs (generally 978 for L1000 landmark genes).
    out_size : ``int``
        Total number of target categories.
    """
    def __init__(self, dataset, target, **kwargs):
        # Cast the target column to categorical so the label encoding stays
        # consistent across the train/validation/test splits.
        dataset._data[target] = pd.Categorical(dataset._data[target])
        super(SingleClassifier, self).__init__(
            dataset=dataset, target=target, **kwargs
        )
        self.in_size, self.out_size = self._get_in_out_size(dataset, target)
    def compile_model(
        self, hidden_layers, dropout_rate=0.0, activation="relu", optimizer="adam"
    ):
        """Defines how the model is built and compiled.

        Parameters
        ----------
        hidden_layers : ``list(int)``
            A list describing the sizes of the hidden layers.
        dropout_rate : ``float``, (optional: default ``0.0``)
            Dropout rate used during training. Applied to all hidden layers.
        activation : ``str``, (optional: default ``"relu"``)
            Activation function used in hidden layers.
        optimizer : ``str``, (optional: default ``"adam"``)
            Optimizer used during training.

        Returns
        -------
        ``None``
        """
        model = Sequential()
        # Dropout is applied to the inputs as well as after each hidden layer.
        model.add(Dropout(dropout_rate, input_shape=(self.in_size,)))
        for nunits in hidden_layers:
            model.add(Dense(nunits, activation=activation))
            model.add(Dropout(dropout_rate))
        # Softmax output over all target categories.
        model.add(Dense(self.out_size, activation="softmax"))
        model.compile(
            optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
        )
        self.model = model
    def plot_confusion_matrix(
        self, normalize=True, zero_diag=False, size=300, color_scheme="lightgreyteal"
    ):
        """Evaluates the model and plots a confusion matrix of classification results.

        Parameters
        ----------
        normalize : ``bool``, (optional: default ``True``)
            Whether to normalize counts to frequencies.
        zero_diag : ``bool``, (optional: default ``False``)
            Whether to zero the diagonal of the matrix. Useful for examining
            which categories are most frequently misidentified.
        size : ``int``, (optional: default ``300``)
            Size of the plot in pixels.
        color_scheme : ``str``, (optional: default ``"lightgreyteal"``)
            Color scheme of the heatmap. Can be any from
            https://vega.github.io/vega/docs/schemes/.

        Returns
        -------
        ``altair.Chart`` object
        """
        y_dummies = pd.get_dummies(self.test.sample_meta[self.target])
        classes = y_dummies.columns.tolist()
        y_test = y_dummies.values
        y_pred = self.predict()
        cm = confusion_matrix(y_test.argmax(1), y_pred.argmax(1))
        if zero_diag:
            np.fill_diagonal(cm, 0)
        if normalize:
            # Normalize each row (true class) so entries are frequencies.
            cm = cm / cm.sum(axis=1)[:, np.newaxis]
        df = (
            pd.DataFrame(cm, columns=classes, index=classes)
            .reset_index()
            .melt(id_vars="index")
            .round(2)
        )
        base = alt.Chart(df).encode(
            x=alt.X("index", title=None),
            y=alt.Y("variable", title=None),
            tooltip=["value"],
        )
        heatmap = base.mark_rect().encode(
            color=alt.Color("value", scale=alt.Scale(scheme=color_scheme))
        )
        text = base.mark_text(size=0.5 * (size / len(classes))).encode(
            text=alt.Text("value")
        )
        return (heatmap + text).properties(width=size, height=size)
    def _get_in_out_size(self, dataset, target):
        unique_targets = dataset.sample_meta[target].unique().tolist()
        if np.nan in unique_targets:
            raise Exception(
                f"Dataset contains np.nan entry in '{target}'. "
                f"You can drop these samples to train the "
                f"classifier with Dataset.drop_na('{target}')."
            )
        in_size = dataset.data.shape[1]
        out_size = len(unique_targets)
        return in_size, out_size

    def __repr__(self):
        return (
            f"<SingleClassifier: "
            f"(target: '{self.target}', "
            f"input_size: {self.in_size}, "
            f"output_size: {self.out_size})>"
        )
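
# ---------------------------------------------------------------------------
# Usage sketch (not part of the library): a minimal end-to-end example of
# SingleClassifier. It assumes ``dataset`` is a deep_lincs ``Dataset`` whose
# target column has no missing values, and that ``fit`` is inherited from
# ``BaseNetwork``. The function name, target field, and hyperparameters below
# are illustrative only.
def _example_usage(dataset, target="cell_id", epochs=10):
    clf = SingleClassifier(dataset, target=target)

    # Two hidden layers (128 -> 64 units) with 10% dropout throughout.
    clf.compile_model(hidden_layers=[128, 64], dropout_rate=0.1)
    clf.fit(epochs=epochs)  # assumed to be provided by BaseNetwork

    # Row-normalized confusion matrix on the held-out test split.
    return clf.plot_confusion_matrix(normalize=True, size=400)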