# Source code for orcanet_contrib.orca_handler_util

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Michael's orcanet utility stuff.

"""
import numpy as np
import toml

from orcanet_contrib.custom_objects import get_custom_objects


def update_objects(orga, model_file):
    """
    Update the organizer with the modifiers listed in a model toml file.

    Reads the ``[orca_modifiers]`` section of the file and installs the
    matching sample-, label- and dataset-modifier functions on the
    organizer's cfg, plus the custom objects. Modifiers that are absent
    from the file are left untouched.

    Parameters
    ----------
    orga : object Organizer
        Contains all the configurable options in the OrcaNet scripts.
    model_file : str
        Path to a toml file which has the infos about which modifiers
        to use.

    """
    orca_modifiers = toml.load(model_file)["orca_modifiers"]
    # Map toml key --> factory producing the modifier function; the cfg
    # attribute name is identical to the toml key.
    factories = {
        "sample_modifier": orca_sample_modifiers,
        "label_modifier": orca_label_modifiers,
        "dataset_modifier": orca_dataset_modifiers,
    }
    for key, factory in factories.items():
        chosen = orca_modifiers.get(key)
        if chosen is not None:
            print("Using orga " + key.replace("_", " ") + ": ", chosen)
            setattr(orga.cfg, key, factory(chosen))
    print("Using orga custom objects")
    orga.cfg.custom_objects = get_custom_objects()
def orca_sample_modifiers(name):
    """
    Returns one of the sample modifiers used for Orca networks.

    They will permute columns, and/or add permuted columns to xs.

    The input to the returned functions is:
        xs_files : dict
            Dict that contains the input samples from the file(s).
            The keys are the names of the inputs in the toml list file.
            The values are a single batch of data from each corresponding
            file.

    The output is:
        xs_layer : dict
            Dict that contains the input samples for a Keras NN.
            The keys are the names of the input layers of the network.
            The values are a single batch of data for each input layer.

    Parameters
    ----------
    name : None/str
        Name of the sample modifier to return.

    Returns
    -------
    sample_modifier : function
        The sample modifier function.

    """
    # Axis orders for transposing; the input is assumed to be bxyzt.
    axis_orders = {'yzt-x': (0, 2, 3, 4, 1),
                   'xyt-z': (0, 1, 2, 4, 3),
                   't-xyz': (0, 4, 1, 2, 3),
                   'tyz-x': (0, 4, 2, 3, 1)}

    if name in axis_orders:
        order = axis_orders[name]

        def sample_modifier(xs_files):
            """Transpose the (single) input to the requested axis order."""
            first_key = next(iter(xs_files))
            return {first_key: np.transpose(xs_files[first_key], order)}

    elif name == "sum_last":
        def sample_modifier(xs_files):
            """Sum over the last axis, e.g. (10, 20, 30) --> (10, 20, 1)."""
            return {layer: np.sum(arr, axis=-1, keepdims=True)
                    for layer, arr in xs_files.items()}

    elif name == 'xyz-t_and_yzt-x':
        def sample_modifier(xs_files):
            """Feed xyz-t unchanged, plus a yzt-x transposed copy of it."""
            xyzt = xs_files['xyz-t']
            return {'xyz-t': xyzt,
                    'yzt-x': np.transpose(xyzt, axis_orders['yzt-x'])}

    elif name == 'xyz-t_and_xyz-c_single_input_and_yzt-x':
        def sample_modifier(xs_files):
            """Concatenate xyz-t and xyz-c into one input; also feed a
            yzt-x transposed copy of xyz-t."""
            merged = np.concatenate(
                [xs_files['xyz-t'], xs_files['xyz-c']], axis=-1)
            return {'xyz-t_and_xyz-c_single_input_net_0': merged,
                    'input_1_net_1': np.transpose(xs_files['xyz-t'],
                                                  axis_orders['yzt-x'])}

    elif name == 'xyz-t_and_yzt-x_multi_input_single_train_tight-1_tight-2':
        def sample_modifier(xs_files):
            """xyz-t in two different time cuts, each also transposed to
            yzt-x."""
            tight_1 = xs_files['xyz-t_tight-1']
            tight_2 = xs_files['xyz-t_tight-2']
            permute = axis_orders['yzt-x']
            return {'xyz-t_tight-1': tight_1,
                    'xyz-t_tight-2': tight_2,
                    'yzt-x_tight-1': np.transpose(tight_1, permute),
                    'yzt-x_tight-2': np.transpose(tight_2, permute)}

    elif name == 'xyz-t_and_xyz-c_single_input':
        def sample_modifier(xs_files):
            """Concatenate xyz-t and xyz-c into a single input."""
            merged = np.concatenate(
                [xs_files['xyz-t'], xs_files['xyz-c']], axis=-1)
            return {'xyz-t_and_xyz-c_single_input': merged}

    else:
        raise ValueError('Unknown input_type: ' + str(name))

    return sample_modifier
def orca_label_modifiers(name):
    """
    Returns one of the label modifiers used for Orca networks.

    CAREFUL: y_values is a structured numpy array! If you use advanced
    numpy indexing, this may lead to errors. Let's suppose you want to
    assign a particular value to one or multiple elements of the y_values
    array.

    E.g.
    y_values[1]['bjorkeny'] = 5
    This works, since it is basic indexing.

    Likewise,
    y_values[1:3]['bjorkeny'] = 5
    works as well, because basic indexing gives you a view (!).

    Advanced indexing though, gives you a copy.
    So this
    y_values[[1,2,4]]['bjorkeny'] = 5
    will NOT work! Same with boolean indexing, like

    bool_idx = np.array([True,False,False,True,False]) # if len(y_values) = 5
    y_values[bool_idx]['bjorkeny'] = 10
    This will NOT work as well!!

    Instead, use
    np.place(y_values['bjorkeny'], bool_idx, 10)
    This works.

    Parameters
    ----------
    name : str
        Name of the label modifier that should be used.

    Returns
    -------
    label_modifier : function
        The label modifier function.

    """
    if name == 'energy_dir_bjorken-y_vtx_errors':
        def label_modifier(y_values):
            ys = dict()
            particle_type, is_cc = y_values['particle_type'], y_values['is_cc']
            elec_nc_bool_idx = np.logical_and(np.abs(particle_type) == 12,
                                              is_cc == 0)
            # correct energy to visible energy (elec NC only deposits the
            # bjorkeny fraction of the neutrino energy)
            visible_energy = (y_values[elec_nc_bool_idx]['energy'] *
                              y_values[elec_nc_bool_idx]['bjorkeny'])
            # make a copy of the y_values array, since we modify it now
            y_values_copy = np.copy(y_values)
            # fix energy to visible energy; np.place is needed because
            # boolean advanced indexing would write to a copy (see above)
            np.place(y_values_copy['energy'], elec_nc_bool_idx, visible_energy)
            # set bjorkeny label of nc events to 1
            np.place(y_values_copy['bjorkeny'], elec_nc_bool_idx, 1)

            # the *_err labels carry the same values as the labels;
            # presumably the err losses combine them with the network's
            # uncertainty output -- TODO confirm against the loss functions
            ys['dx'], ys['dx_err'] = y_values_copy['dir_x'], y_values_copy['dir_x']
            ys['dy'], ys['dy_err'] = y_values_copy['dir_y'], y_values_copy['dir_y']
            ys['dz'], ys['dz_err'] = y_values_copy['dir_z'], y_values_copy['dir_z']
            ys['e'], ys['e_err'] = y_values_copy['energy'], y_values_copy['energy']
            ys['by'], ys['by_err'] = y_values_copy['bjorkeny'], y_values_copy['bjorkeny']
            ys['vx'], ys['vx_err'] = y_values_copy['vertex_pos_x'], y_values_copy['vertex_pos_x']
            ys['vy'], ys['vy_err'] = y_values_copy['vertex_pos_y'], y_values_copy['vertex_pos_y']
            ys['vz'], ys['vz_err'] = y_values_copy['vertex_pos_z'], y_values_copy['vertex_pos_z']
            ys['vt'], ys['vt_err'] = y_values_copy['time_residual_vertex'], y_values_copy['time_residual_vertex']

            for key_label in ys:
                ys[key_label] = ys[key_label].astype(np.float32)
            return ys

    elif name == 'ts_classifier':
        def label_modifier(y_values):
            # for every sample, [0,1] for shower, or [1,0] for track
            # {(12, 0): 0, (12, 1): 1, (14, 1): 2, (16, 1): 3}
            # 0: elec_NC, 1: elec_CC, 2: muon_CC, 3: tau_CC
            # label is always shower, except if muon-CC
            ys = dict()
            particle_type, is_cc = y_values['particle_type'], y_values['is_cc']
            is_muon_cc = np.logical_and(np.abs(particle_type) == 14, is_cc == 1)
            is_not_muon_cc = np.invert(is_muon_cc)

            batchsize = y_values.shape[0]
            # categorical [shower, track] -> [1,0] = shower, [0,1] = track
            categorical_ts = np.zeros((batchsize, 2), dtype='bool')
            categorical_ts[:, 0] = is_not_muon_cc
            categorical_ts[:, 1] = is_muon_cc

            ys['ts_output'] = categorical_ts.astype(np.float32)
            return ys

    elif name == 'bg_classifier':
        def label_modifier(y_values):
            # for every sample, [1,0,0] for neutrinos, [0,1,0] for mupage
            # and [0,0,1] for random_noise
            # particle types: mupage: abs() = 13, random_noise: 0,
            # neutrinos: everything else
            ys = dict()
            particle_type = y_values['particle_type']
            is_mupage = np.abs(particle_type) == 13
            # BUGFIX: was np.abs(particle_type == 0), i.e. abs of a boolean
            # mask; the parenthesis belongs around particle_type only
            is_random_noise = np.abs(particle_type) == 0
            is_not_mupage_nor_rn = np.invert(np.logical_or(is_mupage,
                                                           is_random_noise))

            batchsize = y_values.shape[0]
            categorical_bg = np.zeros((batchsize, 3), dtype='bool')
            categorical_bg[:, 0] = is_not_mupage_nor_rn
            categorical_bg[:, 1] = is_mupage
            categorical_bg[:, 2] = is_random_noise

            ys['bg_output'] = categorical_bg.astype(np.float32)
            return ys

    elif name == 'bg_classifier_2_class':
        def label_modifier(y_values):
            # for every sample, [1,0] for neutrinos and [0,1] for anything
            # else (mupage or random_noise)
            # particle types: mupage: abs() = 13, random_noise: 0,
            # neutrinos: everything else
            ys = dict()
            particle_type = y_values['particle_type']
            is_mupage = np.abs(particle_type) == 13
            # BUGFIX: was np.abs(particle_type == 0), i.e. abs of a boolean
            # mask; the parenthesis belongs around particle_type only
            is_random_noise = np.abs(particle_type) == 0
            is_not_mupage_nor_rn = np.invert(np.logical_or(is_mupage,
                                                           is_random_noise))

            batchsize = y_values.shape[0]
            categorical_bg = np.zeros((batchsize, 2), dtype='bool')
            # neutrino
            categorical_bg[:, 0] = is_not_mupage_nor_rn
            # is not neutrino
            categorical_bg[:, 1] = np.invert(is_not_mupage_nor_rn)

            ys['bg_output'] = categorical_bg.astype(np.float32)
            return ys

    else:
        raise ValueError("Unknown output_type: " + str(name))

    return label_modifier
def orca_dataset_modifiers(name):
    """
    Returns one of the dataset modifiers used for predicting with OrcaNet.

    Parameters
    ----------
    name : str
        Name of the dataset modifier that should be used.

    """
    def _columns_to_recarray(arr, field_names):
        # Spread the columns of a 2d array over named fields of a new
        # structured array; one field per column, dtype taken from arr.
        dtypes = np.dtype([(fname, arr.dtype) for fname in field_names])
        out = np.empty(arr.shape[0], dtype=dtypes)
        for col_no, fname in enumerate(field_names):
            out[fname] = arr[:, col_no]
        return out

    if name == "struc_arr":
        # Multi-purpose conversion to rec array
        #
        # Output from network: Dict with 2darrays, shapes (x, y_i)
        # Transform this into a recarray with shape (x, y_1 + y_2 + ...):
        #   y_pred = {"foo": ndarray, "bar": ndarray}
        #   --> dtypes = [foo_1, foo_2, ..., bar_1, bar_2, ... ]
        def dataset_modifier(info_blob):
            datasets = {"pred": dict_to_recarray(info_blob["y_pred"])}
            if info_blob["y_true"] is not None:
                datasets["true"] = dict_to_recarray(info_blob["y_true"])
            if info_blob["y_values"] is not None:
                # y_values is already a structured array
                datasets['mc_info'] = info_blob["y_values"]
            return datasets

    elif name == 'bg_classifier':
        def dataset_modifier(mc_info, y_true, y_pred):
            # y_pred and y_true are dicts with keys for each output;
            # the bg classifier has only the single 'bg_output' output
            pred_cols = y_pred['bg_output']
            true_cols = y_true['bg_output']
            datasets = {'mc_info': mc_info}  # already a structured array
            datasets['pred'] = _columns_to_recarray(
                pred_cols,
                ('prob_neutrino', 'prob_muon', 'prob_random_noise'))
            datasets['true'] = _columns_to_recarray(
                true_cols,
                ('cat_neutrino', 'cat_muon', 'cat_random_noise'))
            return datasets

    elif name == 'bg_classifier_2_class':
        def dataset_modifier(mc_info, y_true, y_pred):
            # y_pred and y_true are dicts with keys for each output;
            # the bg classifier has only the single 'bg_output' output
            pred_cols = y_pred['bg_output']
            true_cols = y_true['bg_output']
            datasets = {'mc_info': mc_info}  # already a structured array
            datasets['pred'] = _columns_to_recarray(
                pred_cols, ('prob_neutrino', 'prob_not_neutrino'))
            datasets['true'] = _columns_to_recarray(
                true_cols, ('cat_neutrino', 'cat_not_neutrino'))
            return datasets

    elif name == 'ts_classifier':
        def dataset_modifier(mc_info, y_true, y_pred):
            # y_pred and y_true are dicts with keys for each output;
            # the ts classifier has only the single 'ts_output' output
            pred_cols = y_pred['ts_output']
            true_cols = y_true['ts_output']
            datasets = {'mc_info': mc_info}  # already a structured array
            datasets['pred'] = _columns_to_recarray(
                pred_cols, ('prob_shower', 'prob_track'))
            datasets['true'] = _columns_to_recarray(
                true_cols, ('cat_shower', 'cat_track'))
            return datasets

    elif name == 'regression_energy_dir_bjorken-y_vtx_errors':
        def dataset_modifier(mc_info, y_true, y_pred):
            # y_pred and y_true are dicts with one key per regression
            # variable
            datasets = {'mc_info': mc_info}  # already a structured array

            pred_columns = [('pred_energy', 'e'),
                            ('pred_dir_x', 'dx'),
                            ('pred_dir_y', 'dy'),
                            ('pred_dir_z', 'dz'),
                            ('pred_bjorkeny', 'by'),
                            ('pred_vtx_x', 'vx'),
                            ('pred_vtx_y', 'vy'),
                            ('pred_vtx_z', 'vz'),
                            ('pred_vtx_t', 'vt'),
                            ('pred_err_energy', 'e_err'),
                            ('pred_err_dir_x', 'dx_err'),
                            ('pred_err_dir_y', 'dy_err'),
                            ('pred_err_dir_z', 'dz_err'),
                            ('pred_err_bjorkeny', 'by_err'),
                            ('pred_err_vtx_x', 'vx_err'),
                            ('pred_err_vtx_y', 'vy_err'),
                            ('pred_err_vtx_z', 'vz_err'),
                            ('pred_err_vtx_t', 'vt_err')]
            dtypes_pred = [(col, y_pred[out].dtype)
                           for col, out in pred_columns]
            n_evts = y_pred['e'].shape[0]
            pred = np.empty(n_evts, dtype=dtypes_pred)
            for col, out in pred_columns:
                if 'err' in out:
                    # the err outputs have shape (bs, 2) with
                    # (pred_label, pred_label_err); keep only the
                    # pred_label_err column
                    pred[col] = y_pred[out][:, 1]
                else:
                    # reshape (bs, 1) to (bs,)
                    pred[col] = np.squeeze(y_pred[out], axis=1)
            datasets['pred'] = pred

            true_columns = [('true_energy', 'e'),
                            ('true_dir_x', 'dx'),
                            ('true_dir_y', 'dy'),
                            ('true_dir_z', 'dz'),
                            ('true_bjorkeny', 'by'),
                            ('true_vtx_x', 'vx'),
                            ('true_vtx_y', 'vy'),
                            ('true_vtx_z', 'vz'),
                            ('true_vtx_t', 'vt'),
                            ('true_err_energy', 'e_err'),
                            ('true_err_dir_x', 'dx_err'),
                            ('true_err_dir_y', 'dy_err'),
                            ('true_err_dir_z', 'dz_err'),
                            ('true_err_bjorkeny', 'by_err'),
                            ('true_err_vtx_x', 'vx_err'),
                            ('true_err_vtx_y', 'vy_err'),
                            ('true_err_vtx_z', 'vz_err'),
                            ('true_err_vtx_t', 'vt_err')]
            dtypes_true = [(col, y_true[out].dtype)
                           for col, out in true_columns]
            true = np.empty(n_evts, dtype=dtypes_true)
            for col, out in true_columns:
                true[col] = y_true[out]
            datasets['true'] = true

            return datasets

    else:
        raise ValueError('Unknown dataset modifier: ' + str(name))

    return dataset_modifier
def dict_to_recarray(data_dict):
    """
    Convert a dict with 2d np arrays to a structured array, with column
    names derived from the dict keys.

    Parameters
    ----------
    data_dict : dict
        Keys: name of the output layer.
        Values: 2d arrays; the first dimension (number of events) must
        match across all values.

    Returns
    -------
    recarray : ndarray
        Structured array with one field per input column, named
        "<output_name>_<column index starting at 1>".

    """
    # Build the field names: one per column of each output, 1-indexed.
    column_names = []
    for output_name, columns in data_dict.items():
        for i in range(columns.shape[1]):
            column_names.append(output_name + "_" + str(i + 1))
    # np.rec.fromrecords accepts the names as a comma-separated string.
    # (was ",".join([name for name in column_names]) -- redundant
    # comprehension removed)
    names = ",".join(column_names)
    # Concatenate along the column axis; dict order determines field order.
    # (the loop variable was previously also called `data`, shadowing this)
    merged = np.concatenate(list(data_dict.values()), axis=1)
    # np.rec.fromrecords is the public alias of np.core.records.fromrecords;
    # the np.core path is non-public and removed in NumPy 2.x.
    recarray = np.rec.fromrecords(merged, names=names)
    return recarray
def orca_learning_rates(name, total_file_no):
    """
    Returns one of the learning rate schedules used for Orca networks.

    Parameters
    ----------
    name : str
        Name of the schedule.
    total_file_no : int
        How many files there are to train on.

    Returns
    -------
    learning_rate : function
        The learning rate schedule.

    """
    def _triple_decay(lr_start, decay_high, decay_mid, decay_low):
        # Build a schedule that decays the lr once per trained file, with
        # the decay fraction depending on the current lr region:
        #   decay_high for lr > 0.0003
        #   decay_mid  for 0.0003 >= lr > 0.0001
        #   decay_low  for 0.0001 >= lr
        def learning_rate(n_epoch, n_file):
            """
            Calculate the current learning rate from the number of
            already trained epochs/files.

            Parameters
            ----------
            n_epoch : int
                The number of the current epoch which is used to
                calculate the new learning rate.
            n_file : int
                The number of the current filenumber which is used to
                calculate the new learning rate.

            Returns
            -------
            lr_temp : float
                Calculated learning rate for this epoch.

            """
            n_decays = (n_epoch - 1) * total_file_no + (n_file - 1)
            lr_temp = lr_start  # * n_gpu TODO think about multi gpu lr
            for _ in range(n_decays):
                if lr_temp > 0.0003:
                    fraction = decay_high
                elif lr_temp > 0.0001:
                    fraction = decay_mid
                else:
                    fraction = decay_low
                lr_temp = lr_temp * (1 - fraction)
            return lr_temp

        return learning_rate

    if name == "triple_decay":
        # standard decays for regression (PID would use 0.02/0.01/0.005)
        return _triple_decay(0.005, 0.07, 0.04, 0.02)
    if name == "triple_decay_weaker":
        # same scheme with a smaller start lr and gentler decays
        return _triple_decay(0.003, 0.02, 0.01, 0.005)
    raise NameError("Unknown orca learning rate name", name)