Updates and init correction
GilesStrong committed Mar 15, 2019
1 parent 2a79aee commit be92f5f
Showing 11 changed files with 33 additions and 31 deletions.
6 changes: 3 additions & 3 deletions examples/Binary_Classification.ipynb
@@ -644,7 +644,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
@@ -1158,7 +1158,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Let's see which of the high-level features the RFs find useful. `rf_rank_features` wraps rfpimp and initially trains a RF using the data and eveluates it on the validation set. It then sequentially makes a copy of the data and randomises the order of one of the fetaures and then reevaluates the model performance, and compares it to the original score. If their is a large decrease in the score, then the feature which was randomised was very important in the model. If there is only a small decrease (or increase) in the score, then the feature was not important.\n",
+"Let's see which of the high-level features the RFs find useful. `rf_rank_features` wraps rfpimp and initially trains a RF using the data and evaluates it on the validation set. It then sequentially makes a copy of the data and randomises the order of one of the features and then reevaluates the model performance, and compares it to the original score. If there is a large decrease in the score, then the feature which was randomised was very important in the model. If there is only a small decrease (or increase) in the score, then the feature was not important.\n",
 "\n",
 "Having established the importance of the features, `rf_rank_features` then runs another training only using the features with importances higher than the `cut` argument. If there is no decrease in the score, the ignored features can probably be safely removed."
 ]
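The permutation-importance idea described in that cell can be sketched in a few lines; a minimal standalone example with scikit-learn on toy data (`rf_rank_features` itself delegates to rfpimp):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Toy data: fit on the first 800 rows, validate on the rest
x, y = make_classification(n_samples=1000, n_features=5, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(x[:800], y[:800])
x_val, y_val = x[800:], y[800:]

base_score = rf.score(x_val, y_val)  # accuracy on the untouched validation data
for i in range(x_val.shape[1]):
    x_tmp = x_val.copy()
    x_tmp[:, i] = np.random.permutation(x_tmp[:, i])  # randomise one feature only
    print(f'feature {i}: importance = {base_score - rf.score(x_tmp, y_val):.3f}')
    # a large drop in score means the permuted feature was important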
@@ -10048,7 +10048,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.4"
+"version": "3.6.5"
 },
 "toc": {
 "base_numbering": 1,
2 changes: 1 addition & 1 deletion examples/Multi_Target_Regression.ipynb
@@ -610,7 +610,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
4 changes: 2 additions & 2 deletions examples/Multiclass_Classification.ipynb
@@ -588,7 +588,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
@@ -10806,7 +10806,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.5"
+"version": "3.6.4"
 },
 "toc": {
 "base_numbering": 1,
2 changes: 1 addition & 1 deletion examples/Single_Target_Regression.ipynb
@@ -601,7 +601,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
11 changes: 8 additions & 3 deletions lumin/data_processing/hep_proc.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Union
 
 '''
 Todo:
@@ -63,6 +63,10 @@ def add_abs_mom(df:pd.DataFrame, vec:str, z:bool=True) -> None:
     else: df[f'{vec}_absp'] = np.sqrt(np.square(df[f'{vec}_px'])+np.square(df[f'{vec}_py']))
 
 
+def add_mass(df:pd.DataFrame, vec:str) -> None:
+    df[f'{vec}_mass'] = np.sqrt(np.square(df[f'{vec}_E'])-np.square(df[f'{vec}_absp']))
+
+
 def add_energy(df:pd.DataFrame, vec:str) -> None:
     if f'{vec}_absp' not in df.columns: add_abs_mom(df, vec)
     df[f'{vec}_E'] = np.sqrt(np.square(df[f'{vec}_mass'])+np.square(df[f'{vec}_absp']))
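The new `add_mass` helper inverts the relation used by `add_energy` (m^2 = E^2 - |p|^2); note that, unlike `add_energy`, it assumes the `{vec}_absp` column is already present. A quick check with hypothetical values:

import numpy as np
import pandas as pd

# Hypothetical four-vector with energy and |p| columns (GeV), named per the
# {vec}_E / {vec}_absp convention above
df = pd.DataFrame({'jet_E': [50.0], 'jet_absp': [49.8]})
df['jet_mass'] = np.sqrt(np.square(df['jet_E']) - np.square(df['jet_absp']))
print(df['jet_mass'][0])  # ~4.47, from m^2 = E^2 - |p|^2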
@@ -151,9 +155,10 @@ def proc_event(df:pd.DataFrame, fix_phi:bool=False, fix_y=False, fix_z=False, us
         df.drop(columns=[f'{f}keep'], inplace=True)
 
 
-def calc_pair_mass(df:pd.DataFrame, masses:Tuple[float,float], feat_map:Dict[str,str]) -> np.ndarray:
+def calc_pair_mass(df:pd.DataFrame, masses:Union[Tuple[float,float],Tuple[np.ndarray,np.ndarray]], feat_map:Dict[str,str]) -> np.ndarray:
     '''Compute invariant mass of pair of particles with given masses, using 3-momenta.
-    feat_map maps requested momentum components to the features in df'''
+    feat_map maps requested momentum components to the features in df
+    TODO: no need for dataframe anymore'''
     tmp = pd.DataFrame()
     tmp['0_E'] = np.sqrt((masses[0]**2)+np.square(df.loc[:, feat_map['0_px']])+np.square(df.loc[:, feat_map['0_py']])+np.square(df.loc[:, feat_map['0_pz']]))
     tmp['1_E'] = np.sqrt((masses[1]**2)+np.square(df.loc[:, feat_map['1_px']])+np.square(df.loc[:, feat_map['1_py']])+np.square(df.loc[:, feat_map['1_pz']]))
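`calc_pair_mass` applies the standard invariant-mass relation m^2 = (E0+E1)^2 - |p0+p1|^2, reconstructing each energy from the given mass and 3-momentum. A minimal standalone version with hypothetical momenta:

import numpy as np

# Hypothetical 3-momenta (px, py, pz) in GeV and particle masses
p0, p1 = np.array([30.0, 0.0, 10.0]), np.array([-20.0, 5.0, 0.0])
m0 = m1 = 0.106  # e.g. the muon mass in GeV

e0 = np.sqrt(m0**2 + p0 @ p0)  # on-shell energy from mass and 3-momentum
e1 = np.sqrt(m1**2 + p1 @ p1)
p_sum = p0 + p1
print(np.sqrt((e0 + e1)**2 - p_sum @ p_sum))  # invariant mass of the pair, ~50 GeV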
17 changes: 7 additions & 10 deletions lumin/data_processing/pre_proc.py
@@ -1,6 +1,5 @@
 import pandas as pd
-from typing import List, Optional, Tuple, Union
-from pathlib import Path
+from typing import List, Optional, Tuple
 import pickle
 from collections import OrderedDict

@@ -26,23 +25,21 @@ def get_pre_proc_pipes(norm_in=False, norm_out=False, pca=False, whiten=False, w
     return input_pipe, output_pipe
 
 
-def fit_input_pipe(df:pd.DataFrame, cont_feats:List[str], savepath:Optional[Union[Path, str]]=None) -> Pipeline:
+def fit_input_pipe(df:pd.DataFrame, cont_feats:List[str], savename:str=None) -> Pipeline:
     '''Fit pipeline to continuous features and optionally save to savepath'''
-    if isinstance(savepath, str): savepath = Path(savepath)
     input_pipe, _ = get_pre_proc_pipes(norm_in=True)
     input_pipe.fit(df[cont_feats].values.astype('float32'))
-    if savepath is not None:
-        with open(savepath/'input_pipe.pkl', 'wb') as fout: pickle.dump(input_pipe, fout)
+    if savename is not None:
+        with open(f'{savename}.pkl', 'wb') as fout: pickle.dump(input_pipe, fout)
     return input_pipe
 
 
-def fit_output_pipe(df:pd.DataFrame, targ_feats:List[str], savepath:Optional[Union[Path, str]]=None) -> Pipeline:
+def fit_output_pipe(df:pd.DataFrame, targ_feats:List[str], savename:str=None) -> Pipeline:
     '''Fit pipeline to targets and optionally save to savepath. Have you thought about using a y_range for regression instead?'''
-    if isinstance(savepath, str): savepath = Path(savepath)
     _, output_pipe = get_pre_proc_pipes(norm_out=True)
     output_pipe.fit(df[targ_feats].values.astype('float32'))
-    if savepath is not None:
-        with open(savepath/'output_pipe.pkl', 'wb') as fout: pickle.dump(output_pipe, fout)
+    if savename is not None:
+        with open(f'{savename}.pkl', 'wb') as fout: pickle.dump(output_pipe, fout)
     return output_pipe


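The `savepath` to `savename` change is what the notebook updates above reflect: callers now pass the full file stem and the function appends `.pkl` itself. A usage sketch with toy data and a hypothetical output directory:

from pathlib import Path
import pandas as pd
from lumin.data_processing.pre_proc import fit_input_pipe

PATH = Path('data')  # hypothetical directory, assumed to exist
train_df = pd.DataFrame({'feat_0': [0.1, 0.4, 0.3], 'feat_1': [1.2, 0.7, 0.9]})

# Saves the fitted pipeline to data/input_pipe.pkl; previously only a
# directory could be passed and the file name was fixed to input_pipe.pkl
input_pipe = fit_input_pipe(train_df, ['feat_0', 'feat_1'], PATH/'input_pipe')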
6 changes: 3 additions & 3 deletions lumin/evaluation/ams.py
@@ -30,7 +30,7 @@ def ams_scan_quick(df:pd.DataFrame, wgt_factor:float=1, br:float=0, syst_unc_b:f
         ams = calc_ams(max(0, s*wgt_factor), max(0, b*wgt_factor), br, syst_unc_b)
         if ams > max_ams: max_ams, threshold = ams, cut
         if df[targ_name].values[i]: s -= df[wgt_name].values[i]
-    else: b -= df[wgt_name].values[i]
+        else: b -= df[wgt_name].values[i]
     return max_ams, threshold


@@ -45,11 +45,11 @@ def ams_scan_slow(df:pd.DataFrame, wgt_factor:float=1, br:float=0, syst_unc_b:fl
     syst_unc_b2 = np.square(syst_unc_b)
 
     for i, cut in enumerate(progress_bar(df.loc[df[pred_name] >= start_cut, pred_name].values, display=show_prog, leave=show_prog)):
-        bkg_pass = bkg.loc[(bkg[pred_name] >= cut), 'gen_weight']
+        bkg_pass = bkg.loc[(bkg[pred_name] >= cut), wgt_name]
         n_bkg = len(bkg_pass)
         if n_bkg < min_events: continue
 
-        s = np.sum(sig.loc[(sig[pred_name] >= cut), 'gen_weight'])
+        s = np.sum(sig.loc[(sig[pred_name] >= cut), wgt_name])
         b = np.sum(bkg_pass)
         if use_stat_unc: unc_b = np.sqrt(syst_unc_b2+(1/n_bkg))
         else: unc_b = syst_unc_b
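Both scans maximise the value returned by `calc_ams`, presumably the approximate median significance from the HiggsML challenge, which (ignoring the systematic term) can be sketched as:

import numpy as np

def approx_median_significance(s: float, b: float, br: float = 0.0) -> float:
    # AMS as defined for the HiggsML challenge, with regularisation term br;
    # a sketch of what calc_ams is assumed to compute when syst_unc_b is zero,
    # not the actual lumin implementation
    return np.sqrt(2 * ((s + b + br) * np.log(1 + s / (b + br)) - s))

print(approx_median_significance(s=100.0, b=1000.0, br=10.0))  # ~3.10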
Empty file removed lumin/optimisation/__init.py
4 changes: 2 additions & 2 deletions lumin/plotting/interpretation.py
@@ -21,8 +21,8 @@ def plot_importance(df:pd.DataFrame, feat_name:str='Feature', imp_name:str='Impo
     fig, ax = plt.subplots(figsize=(settings.w_large, (0.75)*settings.lbl_sz))
     xerr = None if unc_name not in df else 'Uncertainty'
     df.plot(feat_name, imp_name, 'barh', ax=ax, legend=False, xerr=xerr, error_kw={'elinewidth': 3})
-    ax.set_xlabel('Importance via feature permutation', fontsize=16, color='black')
-    ax.set_ylabel('Feature', fontsize=16, color='black')
+    ax.set_xlabel('Importance via feature permutation', fontsize=settings.lbl_sz, color=settings.lbl_col)
+    ax.set_ylabel('Feature', fontsize=settings.lbl_sz, color=settings.lbl_col)
     plt.xticks(fontsize=settings.tk_sz, color=settings.tk_col)
     plt.yticks(fontsize=settings.tk_sz, color=settings.tk_col)
     if savename is not None: plt.savefig(settings.savepath/f'{savename}{settings.format}')
8 changes: 4 additions & 4 deletions lumin/plotting/plot_settings.py
@@ -4,10 +4,10 @@
 class PlotSettings:
     '''Class to provide control over plot appearances'''
     def __init__(self, **kargs):
-        self.style = 'whitegrid' if 'style' not in kargs else kargs['style']
-        self.cat_palette = 'colorblind' if 'cat_palette' not in kargs else kargs['cat_palette']
-        self.div_palette = 'RdBu_r' if 'div_palette' not in kargs else kargs['div_palette']
-        self.seq_palette = 'viridis' if 'seq_palette' not in kargs else kargs['seq_palette']
+        self.style = 'whitegrid' if 'style' not in kargs else kargs['style']
+        self.cat_palette = 'tab10' if 'cat_palette' not in kargs else kargs['cat_palette']
+        self.div_palette = 'RdBu_r' if 'div_palette' not in kargs else kargs['div_palette']
+        self.seq_palette = 'viridis' if 'seq_palette' not in kargs else kargs['seq_palette']
 
         self.tk_sz = 16 if 'tk_sz' not in kargs else kargs['tk_sz']
         self.tk_col = 'black' if 'tk_col' not in kargs else kargs['tk_col']
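Because every attribute falls back to the same `kargs` lookup, any default can be overridden at construction, e.g. to recover the previous colour-blind-friendly palette:

from lumin.plotting.plot_settings import PlotSettings

settings = PlotSettings(cat_palette='colorblind', tk_sz=14)
print(settings.cat_palette, settings.style)  # colorblind whitegrid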
4 changes: 2 additions & 2 deletions lumin/plotting/results.py
@@ -134,7 +134,7 @@ def plot_sample_pred(df:pd.DataFrame, pred_name='pred', targ_name:str='gen_targe
     else:
         width_scale = 1
 
-    with sns.axes_style(settings.style), sns.color_palette(settings.cat_palette, len(bkg_samples)):
+    with sns.axes_style(settings.style), sns.color_palette(settings.cat_palette, 1+max([sample2col[x] for x in sample2col])):
         fig, ax = plt.subplots(figsize=(settings.w_mid, settings.h_mid)) if zoom_args is None else plt.subplots(figsize=(width_scale*settings.w_mid, settings.h_mid))
         if zoom_args is not None: axins = inset_axes(ax, width_zoom, height_zoom, loc='right', bbox_to_anchor=anchor, bbox_transform=ax.figure.transFigure)
         ax.hist([df[df[sample_name] == sample][pred_name] for sample in bkg_samples],
@@ -180,4 +180,4 @@ def plot_sample_pred(df:pd.DataFrame, pred_name='pred', targ_name:str='gen_targe
             axins.grid(True, which="both")
         ax.set_title(settings.title, fontsize=settings.title_sz, color=settings.title_col, loc=settings.title_loc)
         if savename is not None: plt.savefig(settings.savepath/f'{savename}{settings.format}')
-        fig.show()
\ No newline at end of file
+        fig.show()
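The palette in `plot_sample_pred` is now sized from the largest mapped colour index rather than the number of background samples, so a sample mapped past the background columns (e.g. the signal) still receives its own colour. A toy illustration with a hypothetical mapping:

# Hypothetical sample-to-colour-index mapping
sample2col = {'ttbar': 0, 'wjets': 1, 'signal': 2}
bkg_samples = ['ttbar', 'wjets']

print(len(bkg_samples))                              # 2: too few colours for index 2
print(1 + max([sample2col[x] for x in sample2col]))  # 3: covers every mapped index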
