ONSdigital · Jul 1, 2022
diff --git a/‎poetry.lock
+39-39 b/‎poetry.lock
+39-39
diff --git a/‎pyproject.toml
+1-1 b/‎pyproject.toml
+1-1
diff --git a/‎statistical_methods_library/estimation.py
+69-11 b/‎statistical_methods_library/estimation.py
+69-11
diff --git a/‎tests/fixture_data/estimation/methodology_scenarios/04_out_of_scope_full_with_unadjusted_input.csv
+27 b/‎tests/fixture_data/estimation/methodology_scenarios/04_out_of_scope_full_with_unadjusted_input.csv
+27
diff --git a/‎tests/fixture_data/estimation/methodology_scenarios/04_out_of_scope_full_with_unadjusted_output.csv
+7 b/‎tests/fixture_data/estimation/methodology_scenarios/04_out_of_scope_full_with_unadjusted_output.csv
+7
diff --git a/‎tests/fixture_data/estimation/methodology_scenarios/05_out_of_scope_partial_with_unadjusted_input.csv
+27 b/‎tests/fixture_data/estimation/methodology_scenarios/05_out_of_scope_partial_with_unadjusted_input.csv
+27
diff --git a/‎tests/fixture_data/estimation/methodology_scenarios/05_out_of_scope_partial_with_unadjusted_output.csv
+7 b/‎tests/fixture_data/estimation/methodology_scenarios/05_out_of_scope_partial_with_unadjusted_output.csv
+7
diff --git a/‎tests/test_estimation.py
+14-2 b/‎tests/test_estimation.py
+14-2
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "statistical_methods_library"
-version = "4.3.0"
+version = "4.4.0"
 description = ""
 authors = ["Your Name <you@example.com>"]
 license = "MIT"
 
@@ -24,6 +24,8 @@ def estimate(
     sample_marker_col: str,
     death_marker_col: typing.Optional[str] = None,
     h_value_col: typing.Optional[str] = None,
+    out_of_scope_marker_col: typing.Optional[str] = None,
+    out_of_scope_full: typing.Optional[bool] = None,
     auxiliary_col: typing.Optional[str] = None,
     calibration_group_col: typing.Optional[str] = None,
     unadjusted_design_weight_col: typing.Optional[str] = None,
@@ -35,28 +37,37 @@ def estimate(
     Ratio estimation.
 
     ###Arguments
-    * input_df: The input data frame.
-    * unique_identifier_col: The name of the column containing the unique identifier
+    * `input_df`: The input data frame.
+    * `unique_identifier_col`: The name of the column containing the unique identifier
       for the contributors.
-    * period_col: The name of the column containing the period information for
+    * `period_col`: The name of the column containing the period information for
       the contributor.
-    * strata_col: The name of the column containing the strata of the contributor.
-    * sample_marker_col: The name of the column containing a marker
+    * `strata_col`: The name of the column containing the strata of the contributor.
+    * `sample_marker_col`: The name of the column containing a marker
       for whether to include the contributor in the sample or only in the
       population. This column must only contain values of 0 or 1 where 0 means
       to exclude the contributor from the sample and 1 means the contributor
       will be included in the sample count.
-    * death_marker_col: The name of the column containing a marker for whether
+    * `death_marker_col`: The name of the column containing a marker for whether
       the contributor is dead. This column must only contain the values 0
       meaning the contributor is not dead and 1 meaning that the contributor is dead.
-    * h_value_col: The name of the column containing the h value for the strata.
+    * `h_value_col`: The name of the column containing the h value for the strata.
+    * `out_of_scope_marker_col`: The name of the column containing a marker for
+      whether the contributor is out of scope. This column must only contain
+      the values 0 meaning the contributor is not out of scope and 1 meaning
+      that the contributor is out of scope.
+    * `out_of_scope_full`: Selects how out of scope contributors are applied
+      when `out_of_scope_marker_col` is provided. True means the out of scope
+      count is used in both the numerator and the denominator of the
+      adjustment fraction. False means the out of scope count is used only
+      in the denominator of the adjustment fraction.
     * auxiliary_col: The name of the column containing the auxiliary value for
       the contributor.
     * calibration_group_col: The name of the column containing the calibration
       group for the contributor.
     * unadjusted_design_weight_col: The name of the column which will contain
-      the unadjusted design weight for the contributor.
-      Defaults to None, this will mean the column isn't output unless a name is provided.
+      the unadjusted design weight for the contributor. The column isn't
+      output unless a name is provided.
     * design_weight_col: The name of the column which will contain the
       design weight for the contributor. Defaults to `design_weight`.
     * calibration_weight_col: The name of the column which will contain the
@@ -93,6 +104,10 @@ def estimate(
     is per-stratum, the `h_value_col` must not change within a given period and
     stratum.
 
+    If `out_of_scope_marker_col` is specified, the `out_of_scope_full`
+    parameter must also be set, and both `death_marker_col` and `h_value_col`
+    must be provided.
+
     If `auxiliary_col` is specified then one of Separate Ratio or Combined Ratio
     estimation is performed. This depends on whether `calibration_group_col`
     is specified. If so then Combined Ratio estimation is performed, otherwise
@@ -118,6 +133,19 @@ def estimate(
             "Either both or none of death_marker_col and h_value_col must be specified."
         )
 
+    # Unlike the death_cols check, count None values explicitly here: a
+    # truthiness test such as all() fails when out_of_scope_full is False.
+    out_of_scope_cols = (out_of_scope_marker_col, out_of_scope_full)
+    if out_of_scope_cols.count(None) == 1:
+        raise TypeError(
+            "Either both or none of out_of_scope_marker_col "
+            + "and out_of_scope_full must be specified."
+        )
+    if any(out_of_scope_cols) and not any(death_cols):
+        raise TypeError(
+            "For out of scope, death_marker_col and h_value_col must be specified."
+        )
+
     if calibration_group_col is not None and auxiliary_col is None:
         raise TypeError(
             "If calibration_group_col is specified then auxiliary_col must be provided."
@@ -132,6 +160,9 @@ def estimate(
     if death_marker_col is not None:
         expected_cols += [death_marker_col, h_value_col]
 
+    if out_of_scope_marker_col is not None:
+        expected_cols.append(out_of_scope_marker_col)
+
     if auxiliary_col is not None:
         expected_cols.append(auxiliary_col)
 
@@ -168,6 +199,8 @@ def estimate(
     marker_cols = [sample_marker_col]
     if death_marker_col is not None:
         marker_cols.append(death_marker_col)
+    if out_of_scope_marker_col is not None:
+        marker_cols.append(out_of_scope_marker_col)
 
     for col_name in marker_cols:
         if input_df.filter((col(col_name) != 0) & (col(col_name) != 1)).count() > 0:
@@ -220,6 +253,20 @@ def estimate(
     else:
         col_list += [lit(0).alias("death_marker"), lit(0.0).alias("h_value")]
 
+    if out_of_scope_marker_col is not None:
+        col_list.append(
+            col(out_of_scope_marker_col).alias("out_of_scope_marker_denominator")
+        )
+        if out_of_scope_full:
+            col_list.append(
+                col(out_of_scope_marker_col).alias("out_of_scope_marker_numerator")
+            )
+        else:
+            col_list.append(lit(0).alias("out_of_scope_marker_numerator"))
+    else:
+        col_list.append(lit(0).alias("out_of_scope_marker_numerator"))
+        col_list.append(lit(0).alias("out_of_scope_marker_denominator"))
+
     if auxiliary_col is not None:
         col_list.append(col(auxiliary_col).alias("auxiliary"))
 
@@ -243,6 +290,8 @@ def estimate(
             sum(col("sample_marker")),
             sum(col("death_marker")),
             first(col("h_value").cast("integer")).alias("first(h_value)"),
+            sum(col("out_of_scope_marker_numerator")),
+            sum(col("out_of_scope_marker_denominator")),
             count(col("sample_marker")),
         )
         .withColumn(
@@ -257,8 +306,15 @@ def estimate(
                     1
                     + (
                         col("first(h_value)")
-                        * col("sum(death_marker)")
-                        / (col("sum(sample_marker)") - col("sum(death_marker)"))
+                        * (
+                            col("sum(death_marker)")
+                            + col("sum(out_of_scope_marker_numerator)")
+                        )
+                        / (
+                            col("sum(sample_marker)")
+                            - col("sum(death_marker)")
+                            - col("sum(out_of_scope_marker_denominator)")
+                        )
                     )
                 )
             ),
@@ -267,6 +323,8 @@ def estimate(
             "sum(sample_marker)",
             "sum(death_marker)",
             "first(h_value)",
+            "sum(out_of_scope_marker_numerator)",
+            "sum(out_of_scope_marker_denominator)",
             "count(sample_marker)",
         )
     )
 
@@ -0,0 +1,27 @@
+period,reference,strata,calibration_group,auxiliary,out_of_scope_marker,death_marker,H,sample_inclusion_marker
+2010,49001,1,200,2,0,0,True,1
+2010,49002,1,200,4,1,0,True,1
+2010,49003,1,200,3,0,0,True,1
+2010,49004,1,200,6,0,1,True,1
+2010,49005,1,200,7,0,0,True,0
+2010,49006,2,200,10,0,0,True,1
+2010,49007,2,200,15,0,0,True,1
+2010,49008,2,200,12,0,0,True,0
+2010,49009,2,200,13,0,0,True,1
+2010,49010,2,200,19,0,0,True,0
+2010,50001,3,300,24,0,0,False,1
+2010,50002,3,300,27,0,0,False,1
+2010,50003,3,300,34,0,0,False,1
+2011,49001,1,200,1,0,0,True,1
+2011,49002,1,200,5,0,0,True,0
+2011,49003,1,200,3,0,0,True,1
+2011,49004,1,200,4,0,0,True,0
+2011,49005,1,200,7,0,0,True,0
+2011,49006,2,200,10,0,0,True,1
+2011,49007,2,200,15,0,0,True,1
+2011,49008,2,200,12,0,1,True,1
+2011,49009,2,200,13,0,0,True,1
+2011,49010,2,200,19,0,0,True,0
+2011,50001,3,300,24,0,0,False,1
+2011,50002,3,300,27,0,0,False,1
+2011,50003,3,300,34,0,0,False,1
@@ -0,0 +1,7 @@
+period,strata,calibration_group,unadjusted_design_weight,design_weight,calibration_weight
+2010,1,200,1.25,2.5,1.1086294416
+2010,2,200,1.6666666667,1.6666666667,1.1086294416
+2010,3,300,1,1,1
+2011,1,200,2.5,2.5,1.2275862069
+2011,2,200,1.25,1.6666666667,1.2275862069
+2011,3,300,1,1,1
@@ -0,0 +1,27 @@
+period,reference,strata,calibration_group,auxiliary,out_of_scope_marker,death_marker,H,sample_inclusion_marker
+2010,49001,1,200,2,0,0,True,1
+2010,49002,1,200,4,1,0,True,1
+2010,49003,1,200,3,0,0,True,1
+2010,49004,1,200,6,0,1,True,1
+2010,49005,1,200,7,0,0,True,0
+2010,49006,2,200,10,0,0,True,1
+2010,49007,2,200,15,0,0,True,1
+2010,49008,2,200,12,0,0,True,0
+2010,49009,2,200,13,0,0,True,1
+2010,49010,2,200,19,0,0,True,0
+2010,50001,3,300,24,0,0,False,1
+2010,50002,3,300,27,0,0,False,1
+2010,50003,3,300,34,0,0,False,1
+2011,49001,1,200,1,0,0,True,1
+2011,49002,1,200,5,0,0,True,0
+2011,49003,1,200,3,0,0,True,1
+2011,49004,1,200,4,0,0,True,0
+2011,49005,1,200,7,0,0,True,0
+2011,49006,2,200,10,0,0,True,1
+2011,49007,2,200,15,0,0,True,1
+2011,49008,2,200,12,0,1,True,1
+2011,49009,2,200,13,0,0,True,1
+2011,49010,2,200,19,0,0,True,0
+2011,50001,3,300,24,0,0,False,1
+2011,50002,3,300,27,0,0,False,1
+2011,50003,3,300,34,0,0,False,1
@@ -0,0 +1,7 @@
+period,strata,calibration_group,unadjusted_design_weight,design_weight,calibration_weight
+2010,1,200,1.25,1.875,1.1086294416
+2010,2,200,1.6666666667,1.6666666667,1.1086294416
+2010,3,300,1,1,1
+2011,1,200,2.5,2.5,1.2275862069
+2011,2,200,1.25,1.6666666667,1.2275862069
+2011,3,300,1,1,1
@@ -14,6 +14,7 @@
 sample_col = "sample_inclusion_marker"
 death_col = "death_marker"
 h_col = "H"
+out_of_scope_col = "out_of_scope_marker"
 auxiliary_col = "auxiliary"
 calibration_group_col = "calibration_group"
 unadjusted_design_weight_col = "unadjusted_design_weight"
@@ -27,6 +28,7 @@
     sample_col,
     death_col,
     h_col,
+    out_of_scope_col,
     auxiliary_col,
     calibration_group_col,
     design_weight_col,
@@ -41,6 +43,7 @@
     sample_col: "int",
     death_col: "int",
     h_col: "boolean",
+    out_of_scope_col: "int",
     auxiliary_col: "double",
     calibration_group_col: "string",
     design_weight_col: "double",
@@ -316,6 +319,13 @@ def test_calculations(fxt_load_test_csv, scenario_type, scenario):
         estimation_kwargs["death_marker_col"] = death_col
         estimation_kwargs["h_value_col"] = h_col
 
+    if out_of_scope_col in test_dataframe.columns:
+        estimation_kwargs["out_of_scope_marker_col"] = out_of_scope_col
+        if "full" in scenario:
+            estimation_kwargs["out_of_scope_full"] = True
+        else:
+            estimation_kwargs["out_of_scope_full"] = False
+
     if auxiliary_col in test_dataframe.columns:
         estimation_kwargs["auxiliary_col"] = auxiliary_col
 
@@ -335,13 +345,15 @@ def test_calculations(fxt_load_test_csv, scenario_type, scenario):
 
     ret_val = estimation.estimate(test_dataframe, **estimation_kwargs)
 
+    assert isinstance(ret_val, type(test_dataframe))
     sort_col_list = ["period", "strata"]
+    select_cols = list(set(dataframe_columns) & set(exp_val.columns))
     if calibration_group_col in test_dataframe.columns:
         sort_col_list.append(calibration_group_col)
 
     assert_approx_df_equality(
-        ret_val.sort(sort_col_list),
-        exp_val.sort(sort_col_list),
+        ret_val.sort(sort_col_list).select(select_cols),
+        exp_val.sort(sort_col_list).select(select_cols),
         0.01,
         ignore_nullable=True,
     )