Merge pull request #89 from ONSdigital/spp_5971_death_count_validation

vidhyamanisankar · web-flow · commit baa9c993b586 · 2022-07-01T13:29:09.000+01:00
Spp 5971 death count validation
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "statistical_methods_library"
-version = "4.4.0"
+version = "4.5.0"
 description = ""
 authors = ["Your Name <you@example.com>"]
 license = "MIT"
diff --git a/statistical_methods_library/estimation.py b/statistical_methods_library/estimation.py
@@ -215,6 +215,22 @@ def estimate(
     ):
         raise ValidationError("The h value must be the same per period and stratum.")
 
+    # Per period and stratum: deaths (death_marker=1) <= sample (sample_marker=1).
+    if (
+        death_marker_col is not None
+        and (
+            input_df.groupBy([period_col, strata_col])
+            .agg(
+                sum(col(death_marker_col)).alias("sum_death_marker"),
+                sum(col(sample_marker_col)).alias("sum_sample_marker"),
+            )
+            .filter(col("sum_death_marker") > col("sum_sample_marker"))
+            .count()
+        )
+        > 0
+    ):
+        raise ValidationError("The death count must be less than sample count.")
+
     # --- prepare our working data frame ---
     col_list = [
         col(period_col).alias("period"),
diff --git a/tests/fixture_data/estimation/unit/large_death_count.csv b/tests/fixture_data/estimation/unit/large_death_count.csv
@@ -0,0 +1,37 @@
+reference,period,strata,sample_inclusion_marker,death_marker,H,auxiliary,calibration_group
+1,202009,10,1,0,True,10,10
+2,202009,10,1,0,True,10,10
+3,202009,10,1,1,True,10,10
+4,202009,10,0,1,True,10,10
+5,202009,10,0,1,True,10,10
+6,202009,10,0,1,True,10,10
+7,202009,11,1,0,True,15,11
+8,202009,11,1,0,True,15,11
+9,202009,11,1,1,True,15,11
+10,202009,11,0,0,True,15,11
+11,202009,11,0,0,True,15,11
+12,202009,11,0,1,True,15,11
+13,202009,12,1,0,False,20,12
+14,202009,12,1,0,False,20,12
+15,202009,12,1,1,False,20,12
+16,202009,12,0,0,False,20,12
+17,202009,12,0,0,False,20,12
+18,202009,12,0,1,False,20,12
+1,202010,10,0,0,True,10,10
+2,202010,10,0,0,True,10,10
+3,202010,10,0,1,True,10,10
+4,202010,10,0,0,True,10,10
+5,202010,10,0,0,True,10,10
+6,202010,10,0,1,True,10,10
+7,202010,11,1,1,True,15,11
+8,202010,11,1,1,True,15,11
+9,202010,11,1,1,True,15,11
+10,202010,11,0,1,True,15,11
+11,202010,11,0,1,True,15,11
+12,202010,11,0,1,True,15,11
+13,202010,12,1,0,False,20,12
+14,202010,12,1,0,False,20,12
+15,202010,12,1,1,False,20,12
+16,202010,12,0,0,False,20,12
+17,202010,12,0,0,False,20,12
+18,202010,12,0,1,False,20,12
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
@@ -174,6 +174,20 @@ def test_dataframe_non_boolean_markers(fxt_load_test_csv):
         estimation.estimate(test_dataframe, *params)
 
 
+@pytest.mark.dependency()
+def test_dataframe_large_death_count(fxt_load_test_csv):
+    """Check estimate() rejects strata where deaths exceed the sample count."""
+    test_dataframe = fxt_load_test_csv(
+        dataframe_columns,
+        dataframe_types,
+        "estimation",
+        "unit",
+        "large_death_count",
+    )
+    with pytest.raises(estimation.ValidationError):
+        estimation.estimate(test_dataframe, *params, death_col, h_col)
+
+
 # --- Test validation fail if mixed h values in a strata  ---
 @pytest.mark.dependency()
 def test_dataframe_mixed_h_values_in_strata(fxt_load_test_csv):