# -*- coding: utf-8 -*-
"""
Visualization tools used without Keras.
Makes performance graphs for training and validating.
"""
import os
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


class TrainValPlotter:
    """
    Class for plotting train/val curves.

    Instructions
    ------------
    1. Use tvp.plot_curves(train, val) once or more on pairs of
       train/val data.
    2. When all lines are plotted, use tvp.apply_layout() once
       for proper scaling, ylims, etc.

    """

    def __init__(self):
        # White space added below and above points
        self.y_lim_padding = [0.10, 0.25]
        # Store all plotted points for setting x/y lims
        self._xpoints_train = np.array([])
        self._xpoints_val = np.array([])
        self._ypoints_train = np.array([])
        self._ypoints_val = np.array([])
    def plot_curves(
        self,
        train_data,
        val_data=None,
        train_label="training",
        val_label="validation",
        color=None,
        smooth_sigma=None,
        tlw=0.5,
        vlw=0.5,
        vms=3,
    ):
        """
        Plot a training and optionally a validation line.

        The data can contain nan's.

        Parameters
        ----------
        train_data : List
            X data [0] and y data [1] of the train curve.
            Will be plotted as a faint solid line.
        val_data : List, optional
            Optional X data [0] and y data [1] of the validation curve.
            Will be plotted as connected dots in the same color as the
            train curve.
        train_label : str, optional
            Label for the train line in the legend.
        val_label : str, optional
            Label for the validation line in the legend.
        color : str, optional
            Color used for the train/val line.
        smooth_sigma : int, optional
            Apply gaussian blur to the train curve with given sigma.
        tlw : float
            Linewidth of the train curve.
        vlw : float
            Linewidth of the val curve.
        vms : float
            Markersize of the val curve.

        """
        if train_data is None and val_data is None:
            raise ValueError("Can not plot when no train and val data is given.")

        if train_data is not None:
            epoch, y_data = train_data
            if smooth_sigma is not None:
                y_data = gaussian_smooth(y_data, smooth_sigma)
            self._xpoints_train = np.concatenate((self._xpoints_train, epoch))
            self._ypoints_train = np.concatenate((self._ypoints_train, y_data))

            train_plot = plt.plot(
                epoch,
                y_data,
                color=color,
                ls="-",
                zorder=3,
                label=train_label,
                lw=tlw,
                alpha=0.5,
            )
            train_color = train_plot[0].get_color()
        else:
            train_color = color

        if val_data is not None:
            self._xpoints_val = np.concatenate((self._xpoints_val, val_data[0]))
            self._ypoints_val = np.concatenate((self._ypoints_val, val_data[1]))
            val_data_clean = skip_nans(val_data)

            # val plot always has the same color as the train plot
            plt.plot(
                val_data_clean[0],
                val_data_clean[1],
                color=train_color,
                marker="o",
                zorder=3,
                lw=vlw,
                markersize=vms,
                label=val_label,
            )
    def apply_layout(
        self,
        title=None,
        x_label="Epoch",
        y_label=None,
        grid=True,
        legend=True,
        x_lims=None,
        y_lims="auto",
        x_ticks="auto",
        logy=False,
    ):
        """
        Apply the given layout.

        Can calculate good y_lims and x_ticks automatically.

        Parameters
        ----------
        title : str
            Title of the plot.
        x_label : str
            X label of the plot.
        y_label : str
            Y label of the plot.
        grid : bool
            If true, show a grid.
        legend : bool
            If true, show a legend.
        x_lims : List
            X limits of the data.
        y_lims : List or str
            Y limits of the data. "auto" for auto-calculation.
        x_ticks : List
            Positions of the major x ticks.
        logy : bool
            If true, make the y axis logarithmic.

        """
        if logy:
            plt.yscale("log")

        if x_ticks is not None:
            if x_ticks == "auto":
                all_x_points = np.concatenate(
                    (self._xpoints_train, self._xpoints_val)
                )
                x_ticks = get_epoch_xticks(all_x_points)
            plt.xticks(x_ticks)

        if x_lims is not None:
            plt.xlim(x_lims)

        if y_lims is not None:
            if y_lims == "auto":
                y_lims = get_ylims(
                    self._ypoints_train,
                    self._ypoints_val,
                    fraction=self.y_lim_padding,
                )
            plt.ylim(y_lims)

        if legend:
            plt.legend(loc="upper right")

        plt.xlabel(x_label)
        plt.ylabel(y_label)
        if title is not None:
            title = plt.title(title)
            title.set_position([0.5, 1.04])
        if grid:
            plt.grid(True, zorder=0, linestyle="dotted")
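
# Usage sketch (added for illustration, not part of the original module):
# the two-step workflow from the class docstring, run on made-up demo data.
# The helper name _demo_train_val_plotter and the output file name are
# hypothetical.
def _demo_train_val_plotter():
    rng = np.random.default_rng(42)
    epochs = np.linspace(0.1, 5, 50)
    train_y = np.exp(-epochs) + rng.normal(0, 0.05, 50)  # noisy train loss
    val_x = np.arange(1, 6)
    val_y = np.exp(-val_x) + 0.02  # one validation point per epoch

    tvp = TrainValPlotter()
    # step 1: plot one or more train/val pairs
    tvp.plot_curves([epochs, train_y], [val_x, val_y], smooth_sigma=2)
    # step 2: apply the layout once at the end
    tvp.apply_layout(title="Demo", y_label="loss")
    plt.savefig("demo_summary.png")
    plt.close()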
def gaussian_smooth(y, sigma, truncate=4):
    """Smooth a 1d ndarray with a gaussian filter."""
    # kernel_width = 2 * sigma * truncate + 1
    kernel_x = np.arange(-truncate * sigma, truncate * sigma + 1)
    kernel = _gauss(kernel_x, 0, sigma)
    # pad with edge values so the convolution keeps the input length
    y = np.pad(np.asarray(y), int(len(kernel) / 2), "edge")
    blurred = np.convolve(y, kernel, "valid")
    return blurred
def _gauss(x, mu=0, sigma=1):
    """Normalized 1d gaussian, used as the smoothing kernel."""
    return (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(
        -np.power(x - mu, 2.0) / (2 * np.power(sigma, 2.0))
    )
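
# Sketch (added, not in the original source) showing that gaussian_smooth
# preserves the input length thanks to the edge padding, so a smoothed
# train curve can still be plotted against the same x values.
def _demo_gaussian_smooth():
    rng = np.random.default_rng(0)
    noisy = np.sin(np.linspace(0, 6, 100)) + rng.normal(0, 0.3, 100)
    smoothed = gaussian_smooth(noisy, sigma=3)
    assert len(smoothed) == len(noisy)  # same length as the input
    return smoothed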
def plot_history(
    train_data,
    val_data=None,
    train_label="training",
    val_label="validation",
    color=None,
    **kwargs
):
    """
    Plot the train/val curves in a single plot.

    Kept for backward compatibility; the functionality has moved
    to TrainValPlotter.
    """
    tvp = TrainValPlotter()
    tvp.plot_curves(
        train_data,
        val_data,
        train_label=train_label,
        val_label=val_label,
        color=color,
    )
    tvp.apply_layout(**kwargs)
def skip_nans(data):
    """
    Skip over nan values, so that all dots are connected.

    Parameters
    ----------
    data : List
        Contains x and y data as ndarrays. The y values may contain nans.

    Returns
    -------
    data_clean : List
        Contains x and y data as ndarrays. Points with y=nan are skipped.

    """
    not_nan = ~np.isnan(data[1])
    data_clean = data[0][not_nan], data[1][not_nan]
    return data_clean
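
# Tiny illustration (added; the helper name is hypothetical): points whose
# y value is nan are dropped, so the remaining dots can be drawn as one
# connected line.
def _demo_skip_nans():
    x = np.array([1.0, 2.0, 3.0, 4.0])
    y = np.array([0.5, np.nan, 0.3, 0.2])
    x_clean, y_clean = skip_nans([x, y])
    # x_clean -> [1., 3., 4.], y_clean -> [0.5, 0.3, 0.2]
    return x_clean, y_clean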
def get_ylims(y_points_train, y_points_val=None, fraction=0.25):
    """
    Get the y limits for the summary plot.

    For the training data, limits are calculated while ignoring data
    points which are far from the median (in terms of the median
    distance from the median). This is because there are sometimes
    outliers in the training data, especially early on in the training.

    Parameters
    ----------
    y_points_train : List
        Y data of the train curve.
    y_points_val : List or None
        Y data of the validation curve.
    fraction : float or List
        How much whitespace of the total y range is added below and
        above the lines.

    Returns
    -------
    y_lims : tuple
        Minimum, maximum of the data.

    """
    assert not (
        y_points_train is None and y_points_val is None
    ), "train and val data are None"

    def reject_outliers(data, threshold):
        d = np.abs(data - np.median(data))
        mdev = np.median(d)
        # if the median deviation is 0, keep all points
        s = d / mdev if mdev else 0.0
        no_outliers = data[s < threshold]
        lims = np.amin(no_outliers), np.amax(no_outliers)
        return lims

    mins, maxs = [], []
    if y_points_train is not None and len(y_points_train) != 0:
        y_train = y_points_train[~np.isnan(y_points_train)]
        y_lims_train = reject_outliers(y_train, 5)
        mins.append(y_lims_train[0])
        maxs.append(y_lims_train[1])

    if y_points_val is not None and len(y_points_val) != 0:
        y_val = y_points_val[~np.isnan(y_points_val)]
        if len(y_val) == 1:
            y_lim_val = y_val[0], y_val[0]
        else:
            y_lim_val = np.amin(y_val), np.amax(y_val)
        mins.append(y_lim_val[0])
        maxs.append(y_lim_val[1])

    if len(mins) == 1:
        y_lims = (mins[0], maxs[0])
    else:
        y_lims = np.amin(mins), np.amax(maxs)

    if y_lims[0] == y_lims[1]:
        y_range = 0.1 * y_lims[0]
    else:
        y_range = y_lims[1] - y_lims[0]

    try:
        fraction = float(fraction)
        padding = [fraction, fraction]
    except TypeError:
        # fraction is a list of [below, above] paddings
        padding = fraction

    if padding != [0.0, 0.0]:
        y_lims = (
            y_lims[0] - padding[0] * y_range,
            y_lims[1] + padding[1] * y_range,
        )
    return y_lims
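
# Sketch (added, hypothetical helper name) of the outlier-robust limits:
# the early training spike at 50.0 is far from the median in units of the
# median deviation, so it does not blow up the y range.
def _demo_get_ylims():
    train_y = np.array([50.0, 1.2, 1.0, 0.9, 0.8, 0.85, 0.75])
    val_y = np.array([1.1, 0.95])
    low, high = get_ylims(train_y, val_y, fraction=0.25)
    # limits span roughly [0.75, 1.2] plus 25% padding, ignoring the 50.0
    return low, high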
def get_epoch_xticks(x_points):
    """
    Calculate the xticks for the train and validation summary plot.

    One tick per epoch; fewer ticks are used the more epochs there are.

    Parameters
    ----------
    x_points : List
        A list of the x coordinates of all points.

    Returns
    -------
    x_ticks_major : numpy.ndarray
        Array containing the ticks.

    """
    if len(x_points) == 0:
        raise ValueError("x-coordinates are empty!")

    minimum, maximum = np.amin(x_points), np.amax(x_points)
    if maximum - minimum > 0.5:
        # for longer trainings
        start_epoch, end_epoch = np.floor(minimum), np.ceil(maximum)
        # use fewer xticks if there are many epochs
        n_epochs = end_epoch - start_epoch
        x_ticks_stepsize = 1 + np.floor(n_epochs / 20.0)
        x_ticks_major = np.arange(
            start_epoch, end_epoch + x_ticks_stepsize, x_ticks_stepsize
        )
    else:
        # for early peeks into a training
        start_epoch = np.floor(minimum)
        end_epoch = maximum + minimum - start_epoch
        x_ticks_major = np.linspace(start_epoch, end_epoch, 6)
    return x_ticks_major
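
# Illustration (added, hypothetical helper name): for long trainings the
# tick stepsize grows, so that roughly at most ~20 major ticks are drawn.
def _demo_get_epoch_xticks():
    x = np.linspace(0.0, 50.0, 200)  # 50 epochs of data
    ticks = get_epoch_xticks(x)
    # stepsize = 1 + floor(50 / 20) = 3 -> ticks 0, 3, 6, ..., 51
    return ticks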
def update_summary_plot(orga):
    """
    Plot and save all metrics of the given validation- and train-data
    into a pdf file, each metric in its own plot.

    If metric pairs of a variable and its error are found (e.g. e_loss
    and e_err_loss), they will have the same color and appear back to
    back in the plot.

    Parameters
    ----------
    orga : orcanet.core.Organizer
        Contains all the configurable options in the OrcaNet scripts.

    """
    plt.ioff()
    pdf_name = orga.io.get_subfolder("plots", create=True) + "/summary_plot.pdf"

    # Extract the names of the metrics and sort them, so that errors
    # appear right after their variable
    all_metrics = sort_metrics(orga.history.get_metrics())
    # Plot them with a custom color cycle
    colors = [
        "#000000", "#332288", "#88CCEE", "#44AA99", "#117733",
        "#999933", "#DDCC77", "#CC6677", "#882255", "#AA4499",
        "#661100", "#6699CC", "#AA4466", "#4477AA",
    ]  # ref. personal.sron.nl/~pault/
    color_counter = 0
    with PdfPages(pdf_name) as pdf:
        for metric_no, metric in enumerate(all_metrics):
            # If this metric is an err metric of a variable,
            # color it the same
            if all_metrics[metric_no - 1] == metric.replace("_err", ""):
                color_counter -= 1
            orga.history.plot_metric(
                metric, color=colors[color_counter % len(colors)]
            )
            plt.suptitle(os.path.basename(os.path.abspath(orga.cfg.output_folder)))
            color_counter += 1
            pdf.savefig()
            plt.clf()

        orga.history.plot_lr()
        pdf.savefig()
        plt.close()
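
# Hypothetical call site (added for illustration; the Organizer constructor
# arguments are assumptions, see the orcanet.core documentation):
#
#     from orcanet.core import Organizer
#     orga = Organizer("output_folder", list_file="list.toml")
#     # ... train and validate ...
#     update_summary_plot(orga)  # writes plots/summary_plot.pdf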
def sort_metrics(metric_names):
    """
    Sort a list of metrics, so that errors are right after their variable.

    The format of the metric names has to be e.g. e_loss and e_err_loss
    for this to work.

    Example
    -------
    >>> sort_metrics(['e_loss', 'loss', 'e_err_loss', 'dx_err_loss'])
    ['e_loss', 'e_err_loss', 'loss', 'dx_err_loss']

    Parameters
    ----------
    metric_names : List
        List of metric names.

    Returns
    -------
    sorted_metrics : List
        List of sorted metric names with the same length as the input.

    """
    sorted_metrics = [0] * len(metric_names)
    counter = 0
    for metric_name in metric_names:
        if "err_" in metric_name:
            # err metrics whose variable is missing get appended as-is;
            # all others are inserted right after their variable below
            if metric_name.replace("err_", "") not in metric_names:
                sorted_metrics[counter] = metric_name
                counter += 1
            continue
        sorted_metrics[counter] = metric_name
        counter += 1
        err_loss = metric_name.split("_loss")[0] + "_err_loss"
        if err_loss in metric_names:
            sorted_metrics[counter] = err_loss
            counter += 1
    assert 0 not in sorted_metrics, (
        "Something went wrong with the sorting of metrics! "
        "Given was {}, output was {}.".format(metric_names, sorted_metrics)
    )
    return sorted_metrics