Coverage for orcasong/extractors/neutrino_chain.py: 55%
156 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-10-03 18:23 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2024-10-03 18:23 +0000
1"""
2Functions that extract info from a blob for the mc_info / y datafield
3in the h5 files.
5These are made for the specific given runs. They might not be
6applicable to other data, and could cause errors or produce unexpected
7results when used on data other then the specified. Check for example the
8primary position in the mc_tracks.
10"""
12import warnings
13import numpy as np
14from km3pipe.io.hdf5 import HDF5Header
15from h5py import File
16import os, re
18__author__ = "Daniel Guderian"
21def get_std_reco(blob, rec_types, rec_parameters_names):
23 """
24 Function to extract std reco info. This implementation requires h5 files
25 to be processed with the option "--best_tracks" which adds the selection
26 of best tracks for each reco type to the output using the km3io tools.
28 Returns
29 -------
30 std_reco_info : dict
31 Dict with the std reco info of the best tracks.
33 """
34 # this dict will be filled up
35 std_reco_info = {}
37 # all known reco types to iterate over
38 reco_type_dict = {
39 "BestJmuon": ("jmuon_", "best_jmuon"),
40 "BestJshower": ("jshower_", "best_jshower"),
41 "BestDusjshower": ("dusjshower_", "best_dusjshower"),
42 "BestAashower": ("aashower_", "best_aashower"),
43 }
45 for name_in_blob, (identifier, best_track_name) in reco_type_dict.items():
47 # always write out something for the generally present rec types
48 if best_track_name in rec_types:
50 # specific names are with the prefix from the rec type
51 specific_reco_names = np.core.defchararray.add(
52 identifier, rec_parameters_names
53 )
55 # extract actually present info
56 if name_in_blob in blob:
58 # get the previously identified best track
59 bt = blob[name_in_blob]
61 # get all its values
62 values = bt.item()
63 values_list = list(values)
64 # reco_names = bt.dtype.names #in case the fitinf and stuff will be tailored to the reco types
65 # at some point, get the names directly like this
67 # in case there is no reco for this event but the reco type was done in general
68 else:
70 # fill all values with nan's
71 values_array = np.empty(len(specific_reco_names))
72 values_array[:] = np.nan
73 values_list = values_array.tolist()
75 # create a dict out of them
76 keys_list = list(specific_reco_names)
78 zip_iterator = zip(keys_list, values_list)
79 reco_dict = dict(zip_iterator)
81 # add this dict to the complete std reco collection
82 std_reco_info.update(reco_dict)
84 return std_reco_info
87def get_rec_types_in_file(file):
89 """
90 Checks and returns which rec types are in the file and thus need to be present
91 in all best track and their fitinf information later.
92 """
94 # the known rec types
95 rec_type_names = ["best_jmuon", "best_jshower", "best_dusjshower", "best_aashower"]
97 # all reco related objects in the file
98 reco_objects_in_file = file["reco"].keys()
100 # check which ones are in there
101 rec_types_in_file = []
102 for rec_type in rec_type_names:
103 if rec_type in reco_objects_in_file:
104 rec_types_in_file.append(rec_type)
106 # also get from here the list of dtype names that is share for all recos
107 rec_parameters_names = file["reco"][rec_type].dtype.names
109 return rec_types_in_file, rec_parameters_names
112def get_real_data_info_extr(input_file):
114 """
115 Wrapper function that includes the actual mc_info_extr
116 for real data. There are no n_gen like in the neutrino case.
118 Parameters
119 ----------
120 input_file : km3net data file
121 Can be online or offline format.
123 Returns
124 -------
125 mc_info_extr : function
126 The actual mc_info_extr function that holds the extractions.
128 """
130 # check if std reco is present
131 f = File(input_file, "r")
132 has_std_reco = "reco" in f.keys()
134 if has_std_reco:
135 # also check, which rec types are present
136 rec_types, rec_parameters_names = get_rec_types_in_file(f)
138 def mc_info_extr(blob):
140 """
141 Processes a blob and creates the y with mc_info and, if existing, std reco.
143 For this real data case it is only general event info, like the id.
145 Parameters
146 ----------
147 blob : dict
148 The blob from the pipeline.
150 Returns
151 -------
152 track : dict
153 Containing all the specified info the y should have.
155 """
157 event_info = blob["EventInfo"][0]
159 # add n_hits info for the cut
160 n_hits = len(blob["Hits"])
161 n_trig_hits = np.count_nonzero(blob["Hits"]["triggered"])
163 track = {
164 "event_id": event_info.event_id,
165 "run_id": event_info.run_id,
166 "trigger_mask": event_info.trigger_mask,
167 "n_hits": n_hits,
168 "n_trig_hits": n_trig_hits,
169 }
171 # get all the std reco info
172 if has_std_reco:
174 std_reco_info = get_std_reco(blob, rec_types, rec_parameters_names)
176 track.update(std_reco_info)
178 return track
180 return mc_info_extr
183def get_random_noise_mc_info_extr(input_file):
185 """
186 Wrapper function that includes the actual mc_info_extr
187 for random noise simulations. There are no n_gen like in the neutrino case.
189 Parameters
190 ----------
191 input_file : km3net data file
192 Can be online or offline format.
194 Returns
195 -------
196 mc_info_extr : function
197 The actual mc_info_extr function that holds the extractions.
199 """
201 # check if std reco is present
202 f = File(input_file, "r")
203 has_std_reco = "reco" in f.keys()
205 if has_std_reco:
206 # also check, which rec types are present
207 rec_types, rec_parameters_names = get_rec_types_in_file(f)
209 # an identifier for what the part of the mc simulation this was
210 # this way, events can later be unambiguously identified
211 input_filename_string = os.path.basename(input_file)
212 part_number = re.findall(r"\d+", input_filename_string)[
213 -2
214 ] # second last because of .h5
216 # dummy value for concatenation
217 tau_topology = 3
219 def mc_info_extr(blob):
221 """
222 Processes a blob and creates the y with mc_info and, if existing, std reco.
224 For this random noise case it is only general event info, like the id.
226 Parameters
227 ----------
228 blob : dict
229 The blob from the pipeline.
231 Returns
232 -------
233 track : dict
234 Containing all the specified info the y should have.
236 """
237 event_info = blob["EventInfo"]
239 # add also the nhits info
240 n_hits = len(blob["Hits"])
241 n_trig_hits = np.count_nonzero(blob["Hits"]["triggered"])
243 track = {
244 "event_id": event_info.event_id[0],
245 "run_id": event_info.run_id[0],
246 "particle_type": 0,
247 "part_number": part_number,
248 "n_hits": n_hits,
249 "n_trig_hits": n_trig_hits,
250 "tau_topology": tau_topology,
251 }
253 # get all the std reco info
254 if has_std_reco:
256 std_reco_info = get_std_reco(blob, rec_types, rec_parameters_names)
258 track.update(std_reco_info)
260 return track
262 return mc_info_extr
265def get_neutrino_mc_info_extr(input_file,prod_identifier=999):
267 """
268 Wrapper function that includes the actual mc_info_extr
269 for neutrino simulations. The n_gen parameter, needed for neutrino weighting
270 is extracted from the header of the file.
272 Parameters
273 ----------
274 input_file : km3net data file
275 Can be online or offline format.
276 prod_identifier : int
277 An internal, unofficial identifier to mark the neutrino production. This has to be
278 defined in a dict before.
280 Returns
281 -------
282 mc_info_extr : function
283 The actual mc_info_extr function that holds the extractions.
285 """
287 # check if std reco is present
288 f = File(input_file, "r")
289 has_std_reco = "reco" in f.keys()
291 if has_std_reco:
292 # also check, which rec types are present
293 rec_types, rec_parameters_names = get_rec_types_in_file(f)
295 # get the n_gen
296 header = HDF5Header.from_hdf5(input_file)
297 n_gen = header.genvol.numberOfEvents
299 # an identifier for what the part of the mc simulation this was
300 # this way, events can later be unambiguously identified
301 input_filename_string = os.path.basename(input_file)
302 try:
303 part_number = re.findall(r"\d+", input_filename_string)[
304 -2
305 ] # second last because of .h5 - works only for officially named files
306 except IndexError:
307 part_number = 0
309 def mc_info_extr(blob):
311 """
312 Processes a blob and creates the y with mc_info and, if existing, std reco.
314 For this neutrino case it is the full mc info for the primary neutrino; there are the several "McTracks":
315 check the simulation which index "p" the neutrino has.
317 Parameters
318 ----------
319 blob : dict
320 The blob from the pipeline.
322 Returns
323 -------
324 track : dict
325 Containing all the specified info the y should have.
327 """
329 # get general info about the event
330 event_info = blob["EventInfo"]
331 event_id = event_info.event_id[0]
332 run_id = event_info.run_id[0]
334 # weights for neutrino analysis
335 weight_w1 = event_info.weight_w1[0]
336 weight_w2 = event_info.weight_w2[0]
337 weight_w3 = event_info.weight_w3[0]
339 is_cc = event_info.W2LIST_GSEAGEN_CC[0]
340 bjorkeny = event_info.W2LIST_GSEAGEN_BY[0]
342 # first, look for the particular neutrino index of the production
343 p = 0 # for ORCA4 (and probably subsequent productions)
345 primary_mc_track = blob["McTracks"][p]
347 # some track mc truth info
348 particle_type = primary_mc_track.pdgid # sometimes type, sometimes pdgid
349 energy = primary_mc_track.energy
350 dir_x, dir_y, dir_z = (
351 primary_mc_track.dir_x,
352 primary_mc_track.dir_y,
353 primary_mc_track.dir_z,
354 )
355 time_interaction = (
356 primary_mc_track.time
357 ) # actually always 0 for primary neutrino, measured in MC time
358 vertex_pos_x, vertex_pos_y, vertex_pos_z = (
359 primary_mc_track.pos_x,
360 primary_mc_track.pos_y,
361 primary_mc_track.pos_z,
362 )
364 # for (muon) NC interactions, the visible energy is different
365 if np.abs(particle_type) == 14 and is_cc == 3:
366 visible_energy = energy * bjorkeny
367 else:
368 visible_energy = energy
370 # for tau CC it is not clear what the second interaction is; 1 for shower, 2 for track, 3 for nothing
371 tau_topology = 3
372 if np.abs(particle_type) == 16:
373 if 13 in np.abs(blob["McTracks"].pdgid):
374 tau_topology = 2
375 else:
376 tau_topology = 1
378 # add also the nhits info
379 n_hits = len(blob["Hits"])
380 n_trig_hits = np.count_nonzero(blob["Hits"]["triggered"])
382 track = {
383 "event_id": event_id,
384 "particle_type": particle_type,
385 "energy": energy,
386 "visible_energy": visible_energy,
387 "is_cc": is_cc,
388 "bjorkeny": bjorkeny,
389 "dir_x": dir_x,
390 "dir_y": dir_y,
391 "dir_z": dir_z,
392 "time_interaction": time_interaction,
393 "run_id": run_id,
394 "vertex_pos_x": vertex_pos_x,
395 "vertex_pos_y": vertex_pos_y,
396 "vertex_pos_z": vertex_pos_z,
397 "n_hits": n_hits,
398 "n_trig_hits": n_trig_hits,
399 "weight_w1": weight_w1,
400 "weight_w2": weight_w2,
401 "weight_w3": weight_w3,
402 "n_gen": n_gen,
403 "part_number": part_number,
404 "tau_topology": tau_topology,
405 "prod_identifier": prod_identifier,
406 }
408 # get all the std reco info
409 if has_std_reco:
411 std_reco_info = get_std_reco(blob, rec_types, rec_parameters_names)
413 track.update(std_reco_info)
415 return track
417 return mc_info_extr
420# function used by Stefan to identify which muons leave how many mc hits in the (active) detector.
421def get_mchits_per_muon(blob, inactive_du=None):
423 """
424 For each muon in McTracks, get the number of McHits.
425 Parameters
426 ----------
427 blob
428 The blob.
429 inactive_du : int, optional
430 McHits in this DU will not be counted.
432 Returns
433 -------
434 np.array
435 n_mchits, len = number of muons
437 """
438 ids = blob["McTracks"]["id"]
440 # Origin of each mchit (as int) in the active line
441 origin = blob["McHits"]["origin"]
443 if inactive_du:
444 # only hits in active line
445 origin = origin[blob["McHits"]["du"] != inactive_du]
447 # get how many mchits were produced per muon in the bundle
448 origin_dict = dict(zip(*np.unique(origin, return_counts=True)))
450 return np.array([origin_dict.get(i, 0) for i in ids])
453def get_muon_mc_info_extr(input_file, inactive_du=None):
455 """
456 Wrapper function that includes the actual mc_info_extr
457 for atm. muon simulations. There are no n_gen like in the neutrino case.
459 Parameters
460 ----------
461 input_file : km3net data file
462 Can be online or offline format.
464 Returns
465 -------
466 mc_info_extr : function
467 The actual mc_info_extr function that holds the extractions.
469 """
471 # check if std reco is present
472 f = File(input_file, "r")
473 has_std_reco = "reco" in f.keys()
475 if has_std_reco:
476 # also check, which rec types are present
477 rec_types, rec_parameters_names = get_rec_types_in_file(f)
479 # no n_gen here, but needed for concatenation
480 n_gen = 1
482 # an identifier for what the part of the mc simulation this was
483 # this way, events can later be unambiguously identified
484 input_filename_string = os.path.basename(input_file)
485 part_number = re.findall(r"\d+", input_filename_string)[
486 -2
487 ] # second last because of .h5
489 # dummy value for concatenation
490 tau_topology = 3
492 def mc_info_extr(blob):
494 """
495 Processes a blob and creates the y with mc_info and, if existing, std reco.
497 For this atm. muon case it is the full mc info for the primary; there are the several "McTracks":
498 check the simulation to understand what "p" you want. Muons come in bundles that have the same direction.
499 For energy: sum of all muons in a bundle,
500 for vertex: weighted (energy) mean of the individual vertices .
502 Parameters
503 ----------
504 blob : dict
505 The blob from the pipeline.
507 Returns
508 -------
509 track : dict
510 Containing all the specified info the y should have.
512 """
514 event_id = blob["EventInfo"].event_id[0]
515 run_id = blob["EventInfo"].run_id[0]
517 p = 0 # for ORCA4 (and probably subsequent productions)
519 primary_mc_track = blob["McTracks"][p]
521 particle_type = 13 # fixed for all muons
522 is_cc = 0 # set to 0
523 bjorkeny = 0 # set to zero
525 time_interaction = primary_mc_track.time # same for all muons in a bundle
527 # sum up the energy from all muons that have at least x mc hits
528 n_hits_per_muon = get_mchits_per_muon(
529 blob, inactive_du=inactive_du
530 ) # DU1 in ORCA4 is in the detx but not powered
532 # dont consider muons with less than 15 mc hits
533 suficient_hits_mask = n_hits_per_muon >= 15
534 energy = np.sum(blob["McTracks"][suficient_hits_mask].energy)
535 # instead, assign them a small energy for consistency
536 if energy == 0:
537 energy = 40
539 # also add a visible energy here, which is not further defined
540 visible_energy = energy
542 # all muons in a bundle are parallel, so just take dir of first muon
543 dir_x, dir_y, dir_z = (
544 primary_mc_track.dir_x,
545 primary_mc_track.dir_y,
546 primary_mc_track.dir_z,
547 )
549 # vertex is the weighted (energy) mean of the individual vertices
550 vertex_pos_x = np.average(
551 blob["McTracks"][p:].pos_x, weights=blob["McTracks"][p:].energy
552 )
553 vertex_pos_y = np.average(
554 blob["McTracks"][p:].pos_y, weights=blob["McTracks"][p:].energy
555 )
556 vertex_pos_z = np.average(
557 blob["McTracks"][p:].pos_z, weights=blob["McTracks"][p:].energy
558 )
560 # add also the nhits info
561 n_hits = len(blob["Hits"])
562 n_trig_hits = np.count_nonzero(blob["Hits"]["triggered"])
564 # this is only relevant for neutrinos, though dummy info is needed for the concatenation
565 weight_w1 = 10
566 weight_w2 = 10
567 weight_w3 = 10
569 track = {
570 "event_id": event_id,
571 "particle_type": particle_type,
572 "energy": energy,
573 "visible_energy": visible_energy,
574 "is_cc": is_cc,
575 "bjorkeny": bjorkeny,
576 "dir_x": dir_x,
577 "dir_y": dir_y,
578 "dir_z": dir_z,
579 "time_interaction": time_interaction,
580 "run_id": run_id,
581 "vertex_pos_x": vertex_pos_x,
582 "vertex_pos_y": vertex_pos_y,
583 "vertex_pos_z": vertex_pos_z,
584 "n_hits": n_hits,
585 "n_trig_hits": n_trig_hits,
586 "weight_w1": weight_w1,
587 "weight_w2": weight_w2,
588 "weight_w3": weight_w3,
589 "n_gen": n_gen,
590 "part_number": part_number,
591 "tau_topology": tau_topology,
592 }
594 # get all the std reco info
595 if has_std_reco:
597 std_reco_info = get_std_reco(blob, rec_types, rec_parameters_names)
599 track.update(std_reco_info)
601 return track
603 return mc_info_extr