ONSdigital · vidhyamanisankar · Jul 25, 2022 · Jul 21, 2022 · Jul 22, 2022 · Jul 25, 2022
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "statistical_methods_library"
-version = "5.0.1"
+version = "5.0.2"
 description = ""
 authors = ["Your Name <[email protected]>"]
 license = "MIT"

@@ -199,7 +199,9 @@ def ht_ratio(
         input_df.select(period_col, strata_col).distinct().count()
         != input_df.select(period_col, strata_col, h_value_col).distinct().count()
     ):
-        raise ValidationError("The h value must be the same per period and stratum.")
+        raise ValidationError(
+            f"The {h_value_col} must be the same per period and stratum."
+        )
 
     # --- prepare our working data frame ---
     col_list = [
@@ -257,7 +259,9 @@ def ht_ratio(
         )
         > 0
     ):
-        raise ValidationError("The death count must be less than sample count.")
+        raise ValidationError(
+            f"The {death_marker_col} count must be less than {sample_marker_col} count."
+        )
 
     # --- Expansion estimation ---
     # If we've got a death marker and h value then we'll use these, otherwise

@@ -132,6 +132,20 @@ def winsorise(
         if input_df.filter(col(col_name).isNull()).count() > 0:
             raise ValidationError(f"Column {col_name} must not contain null values.")
 
+    if input_df.filter(col(design_col) < 1).count() > 0:
+        raise ValidationError(
+            f"Column {design_col} must not contain values smaller than one."
+        )
+    if input_df.filter(col(l_value_col) < 0).count() > 0:
+        raise ValidationError(f"Column {l_value_col} must not contain negative values.")
+
+    if calibration_col is not None and (
+        input_df.filter(col(calibration_col) <= 0).count() > 0
+    ):
+        raise ValidationError(
+            f"Column {calibration_col} must not contain zero or negative values."
+        )
+
     col_list = [
         col(reference_col).alias("reference"),
         col(period_col).alias("period"),

@@ -0,0 +1,19 @@
+period,ref,grouping,target,design_weight,l_value
+202201,345671,1,17.6,0.999,400
+202201,345672,1,18.4,100,400
+202201,345673,1,20,100,400
+202201,345674,1,21.6,100,400
+202201,345675,1,22.4,100,400
+202201,345676,1,23.2,100,400
+202201,345677,2,40,60,400
+202201,345678,2,25.6,60,400
+202201,345679,2,26.4,60,400
+202201,345680,2,28,60,400
+202201,345681,2,28.8,60,400
+202201,345682,2,29.6,60,400
+202201,345683,3,32.8,40,400
+202201,345684,3,34.4,40,400
+202201,345685,3,36.8,40,400
+202201,345686,3,37.6,40,400
+202201,345687,3,39.2,40,400
+202201,345688,3,38,1,400
@@ -0,0 +1,10 @@
+ref,period,grouping,target,design_weight,l_value,calibration_weight,auxiliary
+1,202001,1,10,1,5,-0.0001,8
+1,202002,1,10,2,5,4,8
+1,202003,1,15,3,5,6,8
+2,202001,1,20,1,5,2,8
+2,202002,1,20,2,5,4,8
+2,202003,1,25,3,5,6,8
+3,202001,2,30,1,10,2,8
+3,202002,2,30,2,10,4,8
+3,202003,2,35,3,10,6,8
@@ -0,0 +1,10 @@
+ref,period,grouping,target,design_weight,l_value,calibration_weight,auxiliary
+1,202001,1,10,1,-0.05,4,8
+1,202002,1,10,2,-0.05,4,8
+1,202003,1,15,3,-0.05,6,8
+2,202001,1,20,1,-0.05,2,8
+2,202002,1,20,2,-0.05,4,8
+2,202003,1,25,3,-0.05,6,8
+3,202001,2,30,1,10,2,8
+3,202002,2,30,2,10,4,8
+3,202003,2,35,3,10,6,8
@@ -332,6 +332,76 @@ def test_winsorise_different_stratum_l_values_in_same_period_fails(fxt_load_test
         )
 
 
+@pytest.mark.dependency()
+def test_winsorise_negative_calibration_weight_fails(fxt_load_test_csv):  # winsorise must reject the "negative_calibration_weight" fixture data
+    test_dataframe = fxt_load_test_csv(  # presumably loads the named CSV fixture as a dataframe; confirm against the fixture definition
+        dataframe_columns,
+        dataframe_types,
+        "outliering",
+        "winsorise",
+        "unit",
+        "negative_calibration_weight",
+    )
+
+    with pytest.raises(  # passes only if the body raises ValidationError with a matching message
+        outliering.ValidationError,
+        match=rf"Column {calibration_weight_col} must "  # pytest applies match as a regex search on the message
+        + "not contain zero or negative values.",
+    ):
+        additional_params = [  # default arguments plus the outlier weight, calibration weight and auxiliary columns
+            *default_params,
+            outlier_weight_col,
+            calibration_weight_col,
+            auxiliary_col,
+        ]
+        outliering.winsorise(  # call under test
+            test_dataframe,
+            *additional_params,
+        )
+
+
+@pytest.mark.dependency()
+def test_winsorise_negative_l_value_fails(fxt_load_test_csv):  # winsorise must reject the "negative_l_value" fixture data
+    test_dataframe = fxt_load_test_csv(  # presumably loads the named CSV fixture as a dataframe; confirm against the fixture definition
+        dataframe_columns,
+        dataframe_types,
+        "outliering",
+        "winsorise",
+        "unit",
+        "negative_l_value",
+    )
+
+    with pytest.raises(  # passes only if the body raises ValidationError with a matching message
+        outliering.ValidationError,
+        match=rf"Column {l_value_col} must not contain negative values.",  # pytest applies match as a regex search on the message
+    ):
+        outliering.winsorise(  # call under test, using only the default arguments
+            test_dataframe,
+            *default_params,
+        )
+
+
+@pytest.mark.dependency()
+def test_winsorise_design_weight_smaller_than_one_fails(fxt_load_test_csv):  # winsorise must reject the "design_weight_smaller_than_one" fixture data
+    test_dataframe = fxt_load_test_csv(  # presumably loads the named CSV fixture as a dataframe; confirm against the fixture definition
+        dataframe_columns,
+        dataframe_types,
+        "outliering",
+        "winsorise",
+        "unit",
+        "design_weight_smaller_than_one",
+    )
+
+    with pytest.raises(  # passes only if the body raises ValidationError with a matching message
+        outliering.ValidationError,
+        match=rf"Column {design_weight_col} must not contain values smaller than one.",
+    ):
+        outliering.winsorise(  # call under test, using only the default arguments
+            test_dataframe,
+            *default_params,
+        )
+
+
 @pytest.mark.dependency()
 def test_winsorise_different_stratum_l_values_in_different_periods_succeeds(
     fxt_load_test_csv,