Updates and init correction
GilesStrong committed Mar 15, 2019
1 parent 2a79aee commit be92f5f
Showing 11 changed files with 33 additions and 31 deletions.
6 changes: 3 additions & 3 deletions examples/Binary_Classification.ipynb
@@ -644,7 +644,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
@@ -1158,7 +1158,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Let's see which of the high-level features the RFs find useful. `rf_rank_features` wraps rfpimp and initially trains a RF using the data and eveluates it on the validation set. It then sequentially makes a copy of the data and randomises the order of one of the fetaures and then reevaluates the model performance, and compares it to the original score. If their is a large decrease in the score, then the feature which was randomised was very important in the model. If there is only a small decrease (or increase) in the score, then the feature was not important.\n",
+"Let's see which of the high-level features the RFs find useful. `rf_rank_features` wraps rfpimp and initially trains a RF using the data and evaluates it on the validation set. It then sequentially makes a copy of the data and randomises the order of one of the features and then reevaluates the model performance, and compares it to the original score. If there is a large decrease in the score, then the feature which was randomised was very important in the model. If there is only a small decrease (or increase) in the score, then the feature was not important.\n",
 "\n",
 "Having established the importance of the features, `rf_rank_features` then runs another training only using the features with importances higher than the `cut` argument. If there is no decrease in the score, the ignored features can probably be safely removed."
 ]
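The permutation-importance idea described in that cell can be sketched in a few lines; a minimal standalone example with scikit-learn on toy data (`rf_rank_features` itself delegates to rfpimp):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Toy data: fit on the first 800 rows, validate on the rest
x, y = make_classification(n_samples=1000, n_features=5, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(x[:800], y[:800])
x_val, y_val = x[800:], y[800:]

base_score = rf.score(x_val, y_val)  # accuracy on the untouched validation data
for i in range(x_val.shape[1]):
    x_tmp = x_val.copy()
    x_tmp[:, i] = np.random.permutation(x_tmp[:, i])  # randomise one feature only
    print(f'feature {i}: importance = {base_score - rf.score(x_tmp, y_val):.3f}')
    # a large drop in score means the permuted feature was important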
@@ -10048,7 +10048,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.4"
+"version": "3.6.5"
 },
 "toc": {
 "base_numbering": 1,
2 changes: 1 addition & 1 deletion examples/Multi_Target_Regression.ipynb
@@ -610,7 +610,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
4 changes: 2 additions & 2 deletions examples/Multiclass_Classification.ipynb
@@ -588,7 +588,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
@@ -10806,7 +10806,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.5"
+"version": "3.6.4"
 },
 "toc": {
 "base_numbering": 1,
2 changes: 1 addition & 1 deletion examples/Single_Target_Regression.ipynb
@@ -601,7 +601,7 @@
 "outputs": [],
 "source": [
 "from lumin.data_processing.pre_proc import fit_input_pipe\n",
-"input_pipe = fit_input_pipe(train_df, cont_feats, PATH)"
+"input_pipe = fit_input_pipe(train_df, cont_feats, PATH/'input_pipe')"
 ]
 },
 {
11 changes: 8 additions & 3 deletions lumin/data_processing/hep_proc.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Union
 
 '''
 Todo:
@@ -63,6 +63,10 @@ def add_abs_mom(df:pd.DataFrame, vec:str, z:bool=True) -> None:
     else: df[f'{vec}_absp'] = np.sqrt(np.square(df[f'{vec}_px'])+np.square(df[f'{vec}_py']))
 
 
+def add_mass(df:pd.DataFrame, vec:str) -> None:
+    df[f'{vec}_mass'] = np.sqrt(np.square(df[f'{vec}_E'])-np.square(df[f'{vec}_absp']))
+
+
 def add_energy(df:pd.DataFrame, vec:str) -> None:
     if f'{vec}_absp' not in df.columns: add_abs_mom(df, vec)
     df[f'{vec}_E'] = np.sqrt(np.square(df[f'{vec}_mass'])+np.square(df[f'{vec}_absp']))
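The new `add_mass` helper inverts the relation used by `add_energy` (m^2 = E^2 - |p|^2); note that, unlike `add_energy`, it assumes the `{vec}_absp` column is already present. A quick check with hypothetical values:

import numpy as np
import pandas as pd

# Hypothetical four-vector with energy and |p| columns (GeV), named per the
# {vec}_E / {vec}_absp convention above
df = pd.DataFrame({'jet_E': [50.0], 'jet_absp': [49.8]})
df['jet_mass'] = np.sqrt(np.square(df['jet_E']) - np.square(df['jet_absp']))
print(df['jet_mass'][0])  # ~4.47, from m^2 = E^2 - |p|^2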
@@ -151,9 +155,10 @@ def proc_event(df:pd.DataFrame, fix_phi:bool=False, fix_y=False, fix_z=False, us
         df.drop(columns=[f'{f}keep'], inplace=True)
 
 
-def calc_pair_mass(df:pd.DataFrame, masses:Tuple[float,float], feat_map:Dict[str,str]) -> np.ndarray:
+def calc_pair_mass(df:pd.DataFrame, masses:Union[Tuple[float,float],Tuple[np.ndarray,np.ndarray]], feat_map:Dict[str,str]) -> np.ndarray:
     '''Compute invariant mass of pair of particles with given masses, using 3-momenta.
-    feat_map maps requested momentum components to the features in df'''
+    feat_map maps requested momentum components to the features in df
+    TODO: no need for dataframe anymore'''
     tmp = pd.DataFrame()
     tmp['0_E'] = np.sqrt((masses[0]**2)+np.square(df.loc[:, feat_map['0_px']])+np.square(df.loc[:, feat_map['0_py']])+np.square(df.loc[:, feat_map['0_pz']]))
     tmp['1_E'] = np.sqrt((masses[1]**2)+np.square(df.loc[:, feat_map['1_px']])+np.square(df.loc[:, feat_map['1_py']])+np.square(df.loc[:, feat_map['1_pz']]))
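`calc_pair_mass` applies the standard invariant-mass relation m^2 = (E0+E1)^2 - |p0+p1|^2, reconstructing each energy from the given mass and 3-momentum. A minimal standalone version with hypothetical momenta:

import numpy as np

# Hypothetical 3-momenta (px, py, pz) in GeV and particle masses
p0, p1 = np.array([30.0, 0.0, 10.0]), np.array([-20.0, 5.0, 0.0])
m0 = m1 = 0.106  # e.g. the muon mass in GeV

e0 = np.sqrt(m0**2 + p0 @ p0)  # on-shell energy from mass and 3-momentum
e1 = np.sqrt(m1**2 + p1 @ p1)
p_sum = p0 + p1
print(np.sqrt((e0 + e1)**2 - p_sum @ p_sum))  # invariant mass of the pair, ~50 GeV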
17 changes: 7 additions & 10 deletions lumin/data_processing/pre_proc.py
@@ -1,6 +1,5 @@
 import pandas as pd
-from typing import List, Optional, Tuple, Union
-from pathlib import Path
+from typing import List, Optional, Tuple
 import pickle
 from collections import OrderedDict

@@ -26,23 +25,21 @@ def get_pre_proc_pipes(norm_in=False, norm_out=False, pca=False, whiten=False, w
     return input_pipe, output_pipe
 
 
-def fit_input_pipe(df:pd.DataFrame, cont_feats:List[str], savepath:Optional[Union[Path, str]]=None) -> Pipeline:
+def fit_input_pipe(df:pd.DataFrame, cont_feats:List[str], savename:str=None) -> Pipeline:
     '''Fit pipeline to continuous features and optionally save to savepath'''
-    if isinstance(savepath, str): savepath = Path(savepath)
     input_pipe, _ = get_pre_proc_pipes(norm_in=True)
     input_pipe.fit(df[cont_feats].values.astype('float32'))
-    if savepath is not None:
-        with open(savepath/'input_pipe.pkl', 'wb') as fout: pickle.dump(input_pipe, fout)
+    if savename is not None:
+        with open(f'{savename}.pkl', 'wb') as fout: pickle.dump(input_pipe, fout)
     return input_pipe
 
 
-def fit_output_pipe(df:pd.DataFrame, targ_feats:List[str], savepath:Optional[Union[Path, str]]=None) -> Pipeline:
+def fit_output_pipe(df:pd.DataFrame, targ_feats:List[str], savename:str=None) -> Pipeline:
     '''Fit pipeline to targets and optionally save to savepath. Have you thought about using a y_range for regression instead?'''
-    if isinstance(savepath, str): savepath = Path(savepath)
     _, output_pipe = get_pre_proc_pipes(norm_out=True)
     output_pipe.fit(df[targ_feats].values.astype('float32'))
-    if savepath is not None:
-        with open(savepath/'output_pipe.pkl', 'wb') as fout: pickle.dump(output_pipe, fout)
+    if savename is not None:
+        with open(f'{savename}.pkl', 'wb') as fout: pickle.dump(output_pipe, fout)
     return output_pipe


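The `savepath` to `savename` change is what the notebook updates above reflect: callers now pass the full file stem and the function appends `.pkl` itself. A usage sketch with toy data and a hypothetical output directory:

from pathlib import Path
import pandas as pd
from lumin.data_processing.pre_proc import fit_input_pipe

PATH = Path('data')  # hypothetical directory, assumed to exist
train_df = pd.DataFrame({'feat_0': [0.1, 0.4, 0.3], 'feat_1': [1.2, 0.7, 0.9]})

# Saves the fitted pipeline to data/input_pipe.pkl; previously only a
# directory could be passed and the file name was fixed to input_pipe.pkl
input_pipe = fit_input_pipe(train_df, ['feat_0', 'feat_1'], PATH/'input_pipe')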
6 changes: 3 additions & 3 deletions lumin/evaluation/ams.py
@@ -30,7 +30,7 @@ def ams_scan_quick(df:pd.DataFrame, wgt_factor:float=1, br:float=0, syst_unc_b:f
         ams = calc_ams(max(0, s*wgt_factor), max(0, b*wgt_factor), br, syst_unc_b)
         if ams > max_ams: max_ams, threshold = ams, cut
         if df[targ_name].values[i]: s -= df[wgt_name].values[i]
-    else: b -= df[wgt_name].values[i]
+        else: b -= df[wgt_name].values[i]
     return max_ams, threshold


@@ -45,11 +45,11 @@ def ams_scan_slow(df:pd.DataFrame, wgt_factor:float=1, br:float=0, syst_unc_b:fl
     syst_unc_b2 = np.square(syst_unc_b)
 
     for i, cut in enumerate(progress_bar(df.loc[df[pred_name] >= start_cut, pred_name].values, display=show_prog, leave=show_prog)):
-        bkg_pass = bkg.loc[(bkg[pred_name] >= cut), 'gen_weight']
+        bkg_pass = bkg.loc[(bkg[pred_name] >= cut), wgt_name]
         n_bkg = len(bkg_pass)
         if n_bkg < min_events: continue
 
-        s = np.sum(sig.loc[(sig[pred_name] >= cut), 'gen_weight'])
+        s = np.sum(sig.loc[(sig[pred_name] >= cut), wgt_name])
         b = np.sum(bkg_pass)
         if use_stat_unc: unc_b = np.sqrt(syst_unc_b2+(1/n_bkg))
         else: unc_b = syst_unc_b
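Both scans maximise the value returned by `calc_ams`, presumably the approximate median significance from the HiggsML challenge, which (ignoring the systematic term) can be sketched as:

import numpy as np

def approx_median_significance(s: float, b: float, br: float = 0.0) -> float:
    # AMS as defined for the HiggsML challenge, with regularisation term br;
    # a sketch of what calc_ams is assumed to compute when syst_unc_b is zero,
    # not the actual lumin implementation
    return np.sqrt(2 * ((s + b + br) * np.log(1 + s / (b + br)) - s))

print(approx_median_significance(s=100.0, b=1000.0, br=10.0))  # ~3.10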
Empty file removed lumin/optimisation/__init.py
4 changes: 2 additions & 2 deletions lumin/plotting/interpretation.py
@@ -21,8 +21,8 @@ def plot_importance(df:pd.DataFrame, feat_name:str='Feature', imp_name:str='Impo
     fig, ax = plt.subplots(figsize=(settings.w_large, (0.75)*settings.lbl_sz))
     xerr = None if unc_name not in df else 'Uncertainty'
     df.plot(feat_name, imp_name, 'barh', ax=ax, legend=False, xerr=xerr, error_kw={'elinewidth': 3})
-    ax.set_xlabel('Importance via feature permutation', fontsize=16, color='black')
-    ax.set_ylabel('Feature', fontsize=16, color='black')
+    ax.set_xlabel('Importance via feature permutation', fontsize=settings.lbl_sz, color=settings.lbl_col)
+    ax.set_ylabel('Feature', fontsize=settings.lbl_sz, color=settings.lbl_col)
     plt.xticks(fontsize=settings.tk_sz, color=settings.tk_col)
     plt.yticks(fontsize=settings.tk_sz, color=settings.tk_col)
     if savename is not None: plt.savefig(settings.savepath/f'{savename}{settings.format}')
8 changes: 4 additions & 4 deletions lumin/plotting/plot_settings.py
@@ -4,10 +4,10 @@
 class PlotSettings:
     '''Class to provide control over plot appearances'''
     def __init__(self, **kargs):
-        self.style = 'whitegrid' if 'style' not in kargs else kargs['style']
-        self.cat_palette = 'colorblind' if 'cat_palette' not in kargs else kargs['cat_palette']
-        self.div_palette = 'RdBu_r' if 'div_palette' not in kargs else kargs['div_palette']
-        self.seq_palette = 'viridis' if 'seq_palette' not in kargs else kargs['seq_palette']
+        self.style = 'whitegrid' if 'style' not in kargs else kargs['style']
+        self.cat_palette = 'tab10' if 'cat_palette' not in kargs else kargs['cat_palette']
+        self.div_palette = 'RdBu_r' if 'div_palette' not in kargs else kargs['div_palette']
+        self.seq_palette = 'viridis' if 'seq_palette' not in kargs else kargs['seq_palette']
 
         self.tk_sz = 16 if 'tk_sz' not in kargs else kargs['tk_sz']
         self.tk_col = 'black' if 'tk_col' not in kargs else kargs['tk_col']
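Because every attribute falls back to the same `kargs` lookup, any default can be overridden at construction, e.g. to recover the previous colour-blind-friendly palette:

from lumin.plotting.plot_settings import PlotSettings

settings = PlotSettings(cat_palette='colorblind', tk_sz=14)
print(settings.cat_palette, settings.style)  # colorblind whitegrid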
4 changes: 2 additions & 2 deletions lumin/plotting/results.py
@@ -134,7 +134,7 @@ def plot_sample_pred(df:pd.DataFrame, pred_name='pred', targ_name:str='gen_targe
     else:
         width_scale = 1
 
-    with sns.axes_style(settings.style), sns.color_palette(settings.cat_palette, len(bkg_samples)):
+    with sns.axes_style(settings.style), sns.color_palette(settings.cat_palette, 1+max([sample2col[x] for x in sample2col])):
         fig, ax = plt.subplots(figsize=(settings.w_mid, settings.h_mid)) if zoom_args is None else plt.subplots(figsize=(width_scale*settings.w_mid, settings.h_mid))
         if zoom_args is not None: axins = inset_axes(ax, width_zoom, height_zoom, loc='right', bbox_to_anchor=anchor, bbox_transform=ax.figure.transFigure)
         ax.hist([df[df[sample_name] == sample][pred_name] for sample in bkg_samples],
@@ -180,4 +180,4 @@ def plot_sample_pred(df:pd.DataFrame, pred_name='pred', targ_name:str='gen_targe
             axins.grid(True, which="both")
         ax.set_title(settings.title, fontsize=settings.title_sz, color=settings.title_col, loc=settings.title_loc)
         if savename is not None: plt.savefig(settings.savepath/f'{savename}{settings.format}')
-        fig.show()
\ No newline at end of file
+        fig.show()
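The palette in `plot_sample_pred` is now sized from the largest mapped colour index rather than the number of background samples, so a sample mapped past the background columns (e.g. the signal) still receives its own colour. A toy illustration with a hypothetical mapping:

# Hypothetical sample-to-colour-index mapping
sample2col = {'ttbar': 0, 'wjets': 1, 'signal': 2}
bkg_samples = ['ttbar', 'wjets']

print(len(bkg_samples))                              # 2: too few colours for index 2
print(1 + max([sample2col[x] for x in sample2col]))  # 3: covers every mapped index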
