diff --git a/pyproject.toml b/pyproject.toml index ef0cc662..cf4ffd24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "statistical_methods_library" -version = "5.0.1" +version = "5.0.2" description = "" authors = ["Your Name <you@example.com>"] license = "MIT" diff --git a/statistical_methods_library/estimation.py b/statistical_methods_library/estimation.py index 6cd2e5c5..5cf415fb 100644 --- a/statistical_methods_library/estimation.py +++ b/statistical_methods_library/estimation.py @@ -199,7 +199,9 @@ def ht_ratio( input_df.select(period_col, strata_col).distinct().count() != input_df.select(period_col, strata_col, h_value_col).distinct().count() ): - raise ValidationError("The h value must be the same per period and stratum.") + raise ValidationError( + f"The {h_value_col} must be the same per period and stratum." + ) # --- prepare our working data frame --- col_list = [ @@ -257,7 +259,9 @@ def ht_ratio( ) > 0 ): - raise ValidationError("The death count must be less than sample count.") + raise ValidationError( + f"The {death_marker_col} count must be less than {sample_marker_col} count." + ) # --- Expansion estimation --- # If we've got a death marker and h value then we'll use these, otherwise diff --git a/statistical_methods_library/outliering.py b/statistical_methods_library/outliering.py index ebb841ec..73a18130 100644 --- a/statistical_methods_library/outliering.py +++ b/statistical_methods_library/outliering.py @@ -132,6 +132,20 @@ def winsorise( if input_df.filter(col(col_name).isNull()).count() > 0: raise ValidationError(f"Column {col_name} must not contain null values.") + if input_df.filter(col(design_col) < 1).count() > 0: + raise ValidationError( + f"Column {design_col} must not contain values smaller than one." + ) + if input_df.filter(col(l_value_col) < 0).count() > 0: + raise ValidationError(f"Column {l_value_col} must not contain negative values.") + + if calibration_col is not None and ( + input_df.filter(col(calibration_col) <= 0).count() > 0 + ): + raise ValidationError( + f"Column {calibration_col} must not contain zero or negative values." + ) + col_list = [ col(reference_col).alias("reference"), col(period_col).alias("period"), diff --git a/tests/fixture_data/outliering/winsorise/unit/design_weight_smaller_than_one.csv b/tests/fixture_data/outliering/winsorise/unit/design_weight_smaller_than_one.csv new file mode 100644 index 00000000..a4efd0a0 --- /dev/null +++ b/tests/fixture_data/outliering/winsorise/unit/design_weight_smaller_than_one.csv @@ -0,0 +1,19 @@ +period,ref,grouping,target,design_weight,l_value +202201,345671,1,17.6,0.999,400 +202201,345672,1,18.4,100,400 +202201,345673,1,20,100,400 +202201,345674,1,21.6,100,400 +202201,345675,1,22.4,100,400 +202201,345676,1,23.2,100,400 +202201,345677,2,40,60,400 +202201,345678,2,25.6,60,400 +202201,345679,2,26.4,60,400 +202201,345680,2,28,60,400 +202201,345681,2,28.8,60,400 +202201,345682,2,29.6,60,400 +202201,345683,3,32.8,40,400 +202201,345684,3,34.4,40,400 +202201,345685,3,36.8,40,400 +202201,345686,3,37.6,40,400 +202201,345687,3,39.2,40,400 +202201,345688,3,38,1,400 diff --git a/tests/fixture_data/outliering/winsorise/unit/negative_calibration_weight.csv b/tests/fixture_data/outliering/winsorise/unit/negative_calibration_weight.csv new file mode 100644 index 00000000..cdd923b4 --- /dev/null +++ b/tests/fixture_data/outliering/winsorise/unit/negative_calibration_weight.csv @@ -0,0 +1,10 @@ +ref,period,grouping,target,design_weight,l_value,calibration_weight,auxiliary +1,202001,1,10,1,5,-0.0001,8 +1,202002,1,10,2,5,4,8 +1,202003,1,15,3,5,6,8 +2,202001,1,20,1,5,2,8 +2,202002,1,20,2,5,4,8 +2,202003,1,25,3,5,6,8 +3,202001,2,30,1,10,2,8 +3,202002,2,30,2,10,4,8 +3,202003,2,35,3,10,6,8 diff --git a/tests/fixture_data/outliering/winsorise/unit/negative_l_value.csv b/tests/fixture_data/outliering/winsorise/unit/negative_l_value.csv new file mode 100644 index 00000000..33236124 --- /dev/null +++ b/tests/fixture_data/outliering/winsorise/unit/negative_l_value.csv @@ -0,0 +1,10 @@ +ref,period,grouping,target,design_weight,l_value,calibration_weight,auxiliary +1,202001,1,10,1,-0.05,4,8 +1,202002,1,10,2,-0.05,4,8 +1,202003,1,15,3,-0.05,6,8 +2,202001,1,20,1,-0.05,2,8 +2,202002,1,20,2,-0.05,4,8 +2,202003,1,25,3,-0.05,6,8 +3,202001,2,30,1,10,2,8 +3,202002,2,30,2,10,4,8 +3,202003,2,35,3,10,6,8 diff --git a/tests/test_outlering.py b/tests/test_outlering.py index ab7c392b..076b9d9f 100644 --- a/tests/test_outlering.py +++ b/tests/test_outlering.py @@ -332,6 +332,76 @@ def test_winsorise_different_stratum_l_values_in_same_period_fails(fxt_load_test ) +@pytest.mark.dependency() +def test_winsorise_negative_calibration_weight_fails(fxt_load_test_csv): + test_dataframe = fxt_load_test_csv( + dataframe_columns, + dataframe_types, + "outliering", + "winsorise", + "unit", + "negative_calibration_weight", + ) + + with pytest.raises( + outliering.ValidationError, + match=rf"Column {calibration_weight_col} must " + + "not contain zero or negative values.", + ): + additional_params = [ + *default_params, + outlier_weight_col, + calibration_weight_col, + auxiliary_col, + ] + outliering.winsorise( + test_dataframe, + *additional_params, + ) + + +@pytest.mark.dependency() +def test_winsorise_negative_l_value_fails(fxt_load_test_csv): + test_dataframe = fxt_load_test_csv( + dataframe_columns, + dataframe_types, + "outliering", + "winsorise", + "unit", + "negative_l_value", + ) + + with pytest.raises( + outliering.ValidationError, + match=rf"Column {l_value_col} must not contain negative values.", + ): + outliering.winsorise( + test_dataframe, + *default_params, + ) + + +@pytest.mark.dependency() +def test_winsorise_design_weight_smaller_than_one_fails(fxt_load_test_csv): + test_dataframe = fxt_load_test_csv( + dataframe_columns, + dataframe_types, + "outliering", + "winsorise", + "unit", + "design_weight_smaller_than_one", + ) + + with pytest.raises( + outliering.ValidationError, + match=rf"Column {design_weight_col} must not contain values smaller than one.", + ): + outliering.winsorise( + test_dataframe, + *default_params, + ) + + @pytest.mark.dependency() def test_winsorise_different_stratum_l_values_in_different_periods_succeeds( fxt_load_test_csv,