Source code for freeforestml.variable

from abc import ABC, abstractmethod

import base64
import os
import dill
import numpy as np
import h5py

from freeforestml.cut import Cut
from freeforestml.helpers import python_to_str, str_to_python

class BlindingStrategy(ABC):
    """
    Abstract interface for blinding strategies.

    A blinding strategy is a callable object which produces the extra
    selection needed to blind a process. This class cannot be instantiated
    directly; concrete sub-classes have to provide the __call__ method.
    """

    @abstractmethod
    def __call__(self, dataframe, variable, bins, range=None):
        """
        Returns the additional selection in order to blind a process.

        Arguments:
          dataframe -- the dataframe to operate on.
          variable  -- the variable whose histogram should be blinded.
          bins      -- same meaning as for the hist method.
          range     -- same meaning as for the hist method.

        Sub-classes might use bins and range to align the blinding cuts to
        bin borders.

        NOTE(review): RangeBlindingStrategy.__call__ in this module takes
        (variable, bins, range) without the leading dataframe argument --
        confirm which of the two signatures callers actually rely on.
        """
        raise NotImplementedError
class RangeBlindingStrategy(BlindingStrategy):
    """
    Concrete blinding strategy which removes all events between a certain
    x-axis range. The range might be extended to match the bin borders.
    """

    def __init__(self, start, end):
        """
        Returns a new RangeBlindingStrategy object. When the object is
        called, it returns a selection removing all events that lie between
        start and end. The range might be extended to match bin borders.
        """
        self.start = start
        self.end = end

    def __call__(self, variable, bins, range=None):
        """
        Returns the additional selection (a Cut) that keeps only the events
        outside of the blinded range.

        Arguments:
          variable -- callable returning the values to cut on when it is
                      passed a dataframe.
          bins     -- number of bins (if range is given) or a sequence of
                      bin borders (if range is None).
          range    -- optional tuple (low, high) spanned by the bins.

        If the start (end) of the blinded range lies strictly inside the
        binning, it is moved down (up) to the nearest bin border, so that
        only whole bins are blinded. If bins is a bare bin count and no
        range is given, the bin borders are not known here and the limits
        are used as given.

        Raises TypeError if range is given and bins is not an integer, and
        ValueError if range is not a tuple of two items. (The previous code
        referred to an 'err' module which is not imported in this file, so
        these checks ended in a NameError.)

        NOTE(review): the abstract base class declares a leading
        'dataframe' parameter which this method does not take -- confirm
        which signature callers use before harmonising them.
        """
        if range is not None:
            # Build the bin borders from the bin count and the range.
            if not isinstance(bins, (int, np.integer)):
                raise TypeError("When range is given, bins must be int.")
            if not isinstance(range, tuple) or len(range) != 2:
                raise ValueError("Range argument must be a "
                                 "tuple of two numbers.")
            borders = np.linspace(range[0], range[1], bins + 1)
        else:
            # Accept any array-like and force floats: lists have no
            # min()/max() methods and integer arrays cannot be compared
            # safely against arbitrary float limits later on.
            borders = np.asarray(bins, dtype=float)

        start = self.start
        end = self.end

        # Alignment is only possible when real bin borders are known.
        if borders.ndim == 1 and borders.size > 1:
            lowest = borders.min()
            highest = borders.max()

            if lowest < start < highest:
                # Move the start down to the closest border at or below it.
                start = borders[borders <= start].max()

            if lowest < end < highest:
                # Move the end up to the closest border at or above it.
                end = borders[borders >= end].min()

        def outside_blinded_range(d):
            # Evaluate the variable only once per dataframe.
            values = variable(d)
            return (values < start) | (values > end)

        return Cut(outside_blinded_range)
class Variable:
    """
    Representation of a quantity derived from the columns of a dataframe. The
    variable can also directly represent a column of the dataframe.

    The variable object defines a human-readable name for the variable and
    its physical unit. The name and the unit are used for plotting and
    labeling of axes.

    >>> Variable("MMC", "ditau_mmc_mlm_m", "GeV")
    <Variable 'MMC' [GeV]>
    """

    def __init__(self, name, definition, unit=None, blinding=None):
        r"""
        Returns a new variable object. The first argument is a human-readable
        name (potentially using latex). The second argument defines the value
        of the variable. This can be a string naming the column of the
        dataframe or a callable that computes the value when a dataframe is
        passed to it.

        >>> Variable("MMC", "ditau_mmc_mlm_m", "GeV")
        <Variable 'MMC' [GeV]>

        >>> Variable("$\\Delta \\eta$", lambda df: df.jet_0_eta - df.jet_1_eta)
        <Variable '$\\Delta \\eta$'>

        The optional argument unit defines the unit of the variable. This
        information is used for plotting, especially for labeling axes.

        The optional blinding argument accepts a blinding object implementing
        the blinding strategy. A TypeError is raised if it is neither None
        nor a BlindingStrategy instance. (The previous code raised an
        undefined 'InvalidBlinding' name, i.e. a NameError.)
        """
        if isinstance(definition, str):
            # Wrap column string by lambda
            self.definition = lambda d: getattr(d, definition)
        else:
            self.definition = definition

        self.name = name
        self.unit = unit

        if blinding is not None \
                and not isinstance(blinding, BlindingStrategy):
            raise TypeError("Blinding object must inherit from "
                            "BlindingStrategy class.")
        self.blinding = blinding

    def __call__(self, dataframe):
        """
        Returns an array or series of variable computed from the given
        dataframe. This method does not apply the blinding!
        """
        return self.definition(dataframe)

    def __repr__(self):
        """
        Returns a string representation.
        """
        if self.unit is None:
            return "<Variable %s>" % repr(self.name)
        return "<Variable %s [%s]>" % (repr(self.name), self.unit)

    def __eq__(self, other):
        """
        Compare if two variables are the same.

        Two variables are equal if the other object is an instance of this
        object's class and the name, the unit and the serialized definition
        agree. The blinding strategy is not compared yet.

        Note that defining __eq__ without __hash__ makes instances
        unhashable.
        """
        if not isinstance(other, self.__class__):
            return False

        # TODO compare blinding strategy
        return (self.name == other.name
                and self.unit == other.unit
                and python_to_str(self.definition)
                == python_to_str(other.definition))

    def save_to_h5(self, path, key, overwrite=False):
        """
        Save variable definition to a hdf5 file.
        'path' is the file path and 'key' is the path inside the hdf5 file.
        If overwrite is true then already existing file contents are
        overwritten (the file is opened in "w" mode); otherwise the file is
        opened in "a" mode and the new group is added to it.

        The name, the unit and the serialized definition are stored as
        byte-string attributes of the group. load_from_h5() maps a stored
        unit text of "None" back to None.
        """
        open_mode = "w" if overwrite else "a"

        with h5py.File(path, open_mode) as output_file:
            group = output_file.create_group(key)
            # np.bytes_ replaces np.string_, which was removed in NumPy 2.0.
            group.attrs["name"] = np.bytes_(self.name)
            group.attrs["unit"] = np.bytes_(self.unit)
            group.attrs["definition"] = np.bytes_(
                python_to_str(self.definition))
            # TODO save blinding strategy

    @classmethod
    def load_from_h5(cls, path, key):
        """
        Create a new Variable instance from an hdf5 file.
        'path' is the file path and 'key' is the path inside the hdf5 file.

        NOTE(review): the definition is rebuilt with str_to_python, which
        presumably deserializes executable code -- confirm, and only load
        files from trusted sources.
        """
        with h5py.File(path, "r") as input_file:
            attrs = input_file[key].attrs
            name = attrs["name"].decode()
            unit = attrs["unit"].decode()
            if unit == "None":
                # A missing unit was stored as the text "None".
                unit = None
            definition = str_to_python(attrs["definition"].decode())

        # TODO load blinding strategy
        return cls(name, definition, unit)