Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 08eea28

Browse files
authoredJul 1, 2022
Merge pull request #87 from ONSdigital/SPP-6807
Spp 6807
2 parents 9cd75c9 + b893429 commit 08eea28

8 files changed

+191
-53
lines changed
 

‎poetry.lock

+39-39
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "statistical_methods_library"
3-
version = "4.3.0"
3+
version = "4.4.0"
44
description = ""
55
authors = ["Your Name <you@example.com>"]
66
license = "MIT"

‎statistical_methods_library/estimation.py

+69-11
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def estimate(
2424
sample_marker_col: str,
2525
death_marker_col: typing.Optional[str] = None,
2626
h_value_col: typing.Optional[str] = None,
27+
out_of_scope_marker_col: typing.Optional[str] = None,
28+
out_of_scope_full: typing.Optional[bool] = None,
2729
auxiliary_col: typing.Optional[str] = None,
2830
calibration_group_col: typing.Optional[str] = None,
2931
unadjusted_design_weight_col: typing.Optional[str] = None,
@@ -35,28 +37,37 @@ def estimate(
3537
Ratio estimation.
3638
3739
###Arguments
38-
* input_df: The input data frame.
39-
* unique_identifier_col: The name of the column containing the unique identifier
40+
* `input_df`: The input data frame.
41+
* `unique_identifier_col`: The name of the column containing the unique identifier
4042
for the contributors.
41-
* period_col: The name of the column containing the period information for
43+
* `period_col`: The name of the column containing the period information for
4244
the contributor.
43-
* strata_col: The name of the column containing the strata of the contributor.
44-
* sample_marker_col: The name of the column containing a marker
45+
* `strata_col`: The name of the column containing the strata of the contributor.
46+
* `sample_marker_col`: The name of the column containing a marker
4547
for whether to include the contributor in the sample or only in the
4648
population. This column must only contain values of 0 or 1 where 0 means
4749
to exclude the contributor from the sample and 1 means the contributor
4850
will be included in the sample count.
49-
* death_marker_col: The name of the column containing a marker for whether
51+
* `death_marker_col`: The name of the column containing a marker for whether
5052
the contributor is dead. This column must only contain the values 0
5153
meaning the contributor is not dead and 1 meaning that the contributor is dead.
52-
* h_value_col: The name of the column containing the h value for the strata.
54+
* `h_value_col`: The name of the column containing the h value for the strata.
55+
* `out_of_scope_marker_col`: The name of the column containing a marker for
56+
whether the contributor is out of scope. This column must only contain
57+
the values 0 meaning the contributor is not out of scope and 1 meaning
58+
that the contributor is out of scope.
59+
* `out_of_scope_full`: A parameter that specifies what type of out of scope
60+
to run when an `out_of_scope_marker_col` is provided. True specifies
61+
that the out of scope is used in both the numerator and the denominator of the adjustment fraction.
62+
False specifies that the out of scope is used only on the denominator of
63+
the adjustment fraction.
5364
* auxiliary_col: The name of the column containing the auxiliary value for
5465
the contributor.
5566
* calibration_group_col: The name of the column containing the calibration
5667
group for the contributor.
5768
* unadjusted_design_weight_col: The name of the column which will contain
58-
the unadjusted design weight for the contributor.
59-
Defaults to None, this will mean the column isn't output unless a name is provided.
69+
the unadjusted design weight for the contributor. The column isn't
70+
output unless a name is provided.
6071
* design_weight_col: The name of the column which will contain the
6172
design weight for the contributor. Defaults to `design_weight`.
6273
* calibration_weight_col: The name of the column which will contain the
@@ -93,6 +104,10 @@ def estimate(
93104
is per-stratum, the `h_value_col` must not change within a given period and
94105
stratum.
95106
107+
If `out_of_scope_marker_col` is specified the `out_of_scope_full`
108+
parameter must also be set. In addition `death_marker_col` and `h_value_col`
109+
must be provided.
110+
96111
If `auxiliary_col` is specified then one of Separate Ratio or Combined Ratio
97112
estimation is performed. This depends on whether `calibration_group_col`
98113
is specified. If so then Combined Ratio estimation is performed, otherwise
@@ -118,6 +133,19 @@ def estimate(
118133
"Either both or none of death_marker_col and h_value_col must be specified."
119134
)
120135

136+
# Not the same check as for death_cols: out_of_scope_full may legitimately be
137+
# False, so a truthiness test such as all() would misfire; count the Nones instead.
138+
out_of_scope_cols = (out_of_scope_marker_col, out_of_scope_full)
139+
if out_of_scope_cols.count(None) == 1:
140+
raise TypeError(
141+
"Either both or none of out_of_scope_marker_col "
142+
+ "and out_of_scope_full must be specified."
143+
)
144+
if any(out_of_scope_cols) and not any(death_cols):
145+
raise TypeError(
146+
"For out of scope, death_marker_col and h_value_col must be specified."
147+
)
148+
121149
if calibration_group_col is not None and auxiliary_col is None:
122150
raise TypeError(
123151
"If calibration_group_col is specified then auxiliary_col must be provided."
@@ -132,6 +160,9 @@ def estimate(
132160
if death_marker_col is not None:
133161
expected_cols += [death_marker_col, h_value_col]
134162

163+
if out_of_scope_marker_col is not None:
164+
expected_cols.append(out_of_scope_marker_col)
165+
135166
if auxiliary_col is not None:
136167
expected_cols.append(auxiliary_col)
137168

@@ -168,6 +199,8 @@ def estimate(
168199
marker_cols = [sample_marker_col]
169200
if death_marker_col is not None:
170201
marker_cols.append(death_marker_col)
202+
if out_of_scope_marker_col is not None:
203+
marker_cols.append(out_of_scope_marker_col)
171204

172205
for col_name in marker_cols:
173206
if input_df.filter((col(col_name) != 0) & (col(col_name) != 1)).count() > 0:
@@ -198,6 +231,20 @@ def estimate(
198231
else:
199232
col_list += [lit(0).alias("death_marker"), lit(0.0).alias("h_value")]
200233

234+
if out_of_scope_marker_col is not None:
235+
col_list.append(
236+
col(out_of_scope_marker_col).alias("out_of_scope_marker_denominator")
237+
)
238+
if out_of_scope_full:
239+
col_list.append(
240+
col(out_of_scope_marker_col).alias("out_of_scope_marker_numerator")
241+
)
242+
else:
243+
col_list.append(lit(0).alias("out_of_scope_marker_numerator"))
244+
else:
245+
col_list.append(lit(0).alias("out_of_scope_marker_numerator"))
246+
col_list.append(lit(0).alias("out_of_scope_marker_denominator"))
247+
201248
if auxiliary_col is not None:
202249
col_list.append(col(auxiliary_col).alias("auxiliary"))
203250

@@ -221,6 +268,8 @@ def estimate(
221268
sum(col("sample_marker")),
222269
sum(col("death_marker")),
223270
first(col("h_value").cast("integer")).alias("first(h_value)"),
271+
sum(col("out_of_scope_marker_numerator")),
272+
sum(col("out_of_scope_marker_denominator")),
224273
count(col("sample_marker")),
225274
)
226275
.withColumn(
@@ -235,8 +284,15 @@ def estimate(
235284
1
236285
+ (
237286
col("first(h_value)")
238-
* col("sum(death_marker)")
239-
/ (col("sum(sample_marker)") - col("sum(death_marker)"))
287+
* (
288+
col("sum(death_marker)")
289+
+ col("sum(out_of_scope_marker_numerator)")
290+
)
291+
/ (
292+
col("sum(sample_marker)")
293+
- col("sum(death_marker)")
294+
- col("sum(out_of_scope_marker_denominator)")
295+
)
240296
)
241297
)
242298
),
@@ -245,6 +301,8 @@ def estimate(
245301
"sum(sample_marker)",
246302
"sum(death_marker)",
247303
"first(h_value)",
304+
"sum(out_of_scope_marker_numerator)",
305+
"sum(out_of_scope_marker_denominator)",
248306
"count(sample_marker)",
249307
)
250308
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
period,reference,strata,calibration_group,auxiliary,out_of_scope_marker,death_marker,H,sample_inclusion_marker
2+
2010,49001,1,200,2,0,0,True,1
3+
2010,49002,1,200,4,1,0,True,1
4+
2010,49003,1,200,3,0,0,True,1
5+
2010,49004,1,200,6,0,1,True,1
6+
2010,49005,1,200,7,0,0,True,0
7+
2010,49006,2,200,10,0,0,True,1
8+
2010,49007,2,200,15,0,0,True,1
9+
2010,49008,2,200,12,0,0,True,0
10+
2010,49009,2,200,13,0,0,True,1
11+
2010,49010,2,200,19,0,0,True,0
12+
2010,50001,3,300,24,0,0,False,1
13+
2010,50002,3,300,27,0,0,False,1
14+
2010,50003,3,300,34,0,0,False,1
15+
2011,49001,1,200,1,0,0,True,1
16+
2011,49002,1,200,5,0,0,True,0
17+
2011,49003,1,200,3,0,0,True,1
18+
2011,49004,1,200,4,0,0,True,0
19+
2011,49005,1,200,7,0,0,True,0
20+
2011,49006,2,200,10,0,0,True,1
21+
2011,49007,2,200,15,0,0,True,1
22+
2011,49008,2,200,12,0,1,True,1
23+
2011,49009,2,200,13,0,0,True,1
24+
2011,49010,2,200,19,0,0,True,0
25+
2011,50001,3,300,24,0,0,False,1
26+
2011,50002,3,300,27,0,0,False,1
27+
2011,50003,3,300,34,0,0,False,1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
period,strata,calibration_group,unadjusted_design_weight,design_weight,calibration_weight
2+
2010,1,200,1.25,2.5,1.1086294416
3+
2010,2,200,1.6666666667,1.6666666667,1.1086294416
4+
2010,3,300,1,1,1
5+
2011,1,200,2.5,2.5,1.2275862069
6+
2011,2,200,1.25,1.6666666667,1.2275862069
7+
2011,3,300,1,1,1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
period,reference,strata,calibration_group,auxiliary,out_of_scope_marker,death_marker,H,sample_inclusion_marker
2+
2010,49001,1,200,2,0,0,True,1
3+
2010,49002,1,200,4,1,0,True,1
4+
2010,49003,1,200,3,0,0,True,1
5+
2010,49004,1,200,6,0,1,True,1
6+
2010,49005,1,200,7,0,0,True,0
7+
2010,49006,2,200,10,0,0,True,1
8+
2010,49007,2,200,15,0,0,True,1
9+
2010,49008,2,200,12,0,0,True,0
10+
2010,49009,2,200,13,0,0,True,1
11+
2010,49010,2,200,19,0,0,True,0
12+
2010,50001,3,300,24,0,0,False,1
13+
2010,50002,3,300,27,0,0,False,1
14+
2010,50003,3,300,34,0,0,False,1
15+
2011,49001,1,200,1,0,0,True,1
16+
2011,49002,1,200,5,0,0,True,0
17+
2011,49003,1,200,3,0,0,True,1
18+
2011,49004,1,200,4,0,0,True,0
19+
2011,49005,1,200,7,0,0,True,0
20+
2011,49006,2,200,10,0,0,True,1
21+
2011,49007,2,200,15,0,0,True,1
22+
2011,49008,2,200,12,0,1,True,1
23+
2011,49009,2,200,13,0,0,True,1
24+
2011,49010,2,200,19,0,0,True,0
25+
2011,50001,3,300,24,0,0,False,1
26+
2011,50002,3,300,27,0,0,False,1
27+
2011,50003,3,300,34,0,0,False,1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
period,strata,calibration_group,unadjusted_design_weight,design_weight,calibration_weight
2+
2010,1,200,1.25,1.875,1.1086294416
3+
2010,2,200,1.6666666667,1.6666666667,1.1086294416
4+
2010,3,300,1,1,1
5+
2011,1,200,2.5,2.5,1.2275862069
6+
2011,2,200,1.25,1.6666666667,1.2275862069
7+
2011,3,300,1,1,1

‎tests/test_estimation.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
sample_col = "sample_inclusion_marker"
1515
death_col = "death_marker"
1616
h_col = "H"
17+
out_of_scope_col = "out_of_scope_marker"
1718
auxiliary_col = "auxiliary"
1819
calibration_group_col = "calibration_group"
1920
unadjusted_design_weight_col = "unadjusted_design_weight"
@@ -27,6 +28,7 @@
2728
sample_col,
2829
death_col,
2930
h_col,
31+
out_of_scope_col,
3032
auxiliary_col,
3133
calibration_group_col,
3234
design_weight_col,
@@ -41,6 +43,7 @@
4143
sample_col: "int",
4244
death_col: "int",
4345
h_col: "boolean",
46+
out_of_scope_col: "int",
4447
auxiliary_col: "double",
4548
calibration_group_col: "string",
4649
design_weight_col: "double",
@@ -302,6 +305,13 @@ def test_calculations(fxt_load_test_csv, scenario_type, scenario):
302305
estimation_kwargs["death_marker_col"] = death_col
303306
estimation_kwargs["h_value_col"] = h_col
304307

308+
if out_of_scope_col in test_dataframe.columns:
309+
estimation_kwargs["out_of_scope_marker_col"] = out_of_scope_col
310+
if "full" in scenario:
311+
estimation_kwargs["out_of_scope_full"] = True
312+
else:
313+
estimation_kwargs["out_of_scope_full"] = False
314+
305315
if auxiliary_col in test_dataframe.columns:
306316
estimation_kwargs["auxiliary_col"] = auxiliary_col
307317

@@ -321,13 +331,15 @@ def test_calculations(fxt_load_test_csv, scenario_type, scenario):
321331

322332
ret_val = estimation.estimate(test_dataframe, **estimation_kwargs)
323333

334+
assert isinstance(ret_val, type(test_dataframe))
324335
sort_col_list = ["period", "strata"]
336+
select_cols = list(set(dataframe_columns) & set(exp_val.columns))
325337
if calibration_group_col in test_dataframe.columns:
326338
sort_col_list.append(calibration_group_col)
327339

328340
assert_approx_df_equality(
329-
ret_val.sort(sort_col_list),
330-
exp_val.sort(sort_col_list),
341+
ret_val.sort(sort_col_list).select(select_cols),
342+
exp_val.sort(sort_col_list).select(select_cols),
331343
0.01,
332344
ignore_nullable=True,
333345
)

0 commit comments

Comments
 (0)
Please sign in to comment.