Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit b348cd6

Browse files
committedJul 1, 2022
Merge branch 'main' into spp_5971_death_count_validation
2 parents 5898ae7 + 08eea28 commit b348cd6

8 files changed

+191
-53
lines changed
 

‎poetry.lock

+39-39
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "statistical_methods_library"
3-
version = "4.3.0"
3+
version = "4.4.0"
44
description = ""
55
authors = ["Your Name <you@example.com>"]
66
license = "MIT"

‎statistical_methods_library/estimation.py

+69-11
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def estimate(
2424
sample_marker_col: str,
2525
death_marker_col: typing.Optional[str] = None,
2626
h_value_col: typing.Optional[str] = None,
27+
out_of_scope_marker_col: typing.Optional[str] = None,
28+
out_of_scope_full: typing.Optional[bool] = None,
2729
auxiliary_col: typing.Optional[str] = None,
2830
calibration_group_col: typing.Optional[str] = None,
2931
unadjusted_design_weight_col: typing.Optional[str] = None,
@@ -35,28 +37,37 @@ def estimate(
3537
Ratio estimation.
3638
3739
###Arguments
38-
* input_df: The input data frame.
39-
* unique_identifier_col: The name of the column containing the unique identifier
40+
* `input_df`: The input data frame.
41+
* `unique_identifier_col`: The name of the column containing the unique identifier
4042
for the contributors.
41-
* period_col: The name of the column containing the period information for
43+
* `period_col`: The name of the column containing the period information for
4244
the contributor.
43-
* strata_col: The name of the column containing the strata of the contributor.
44-
* sample_marker_col: The name of the column containing a marker
45+
* `strata_col`: The name of the column containing the strata of the contributor.
46+
* `sample_marker_col`: The name of the column containing a marker
4547
for whether to include the contributor in the sample or only in the
4648
population. This column must only contain values of 0 or 1 where 0 means
4749
to exclude the contributor from the sample and 1 means the contributor
4850
will be included in the sample count.
49-
* death_marker_col: The name of the column containing a marker for whether
51+
* `death_marker_col`: The name of the column containing a marker for whether
5052
the contributor is dead. This column must only contain the values 0
5153
meaning the contributor is not dead and 1 meaning that the contributor is dead.
52-
* h_value_col: The name of the column containing the h value for the strata.
54+
* `h_value_col`: The name of the column containing the h value for the strata.
55+
* `out_of_scope_marker_col`: The name of the column containing a marker for
56+
whether the contributor is out of scope. This column must only contain
57+
the values 0 meaning the contributor is not out of scope and 1 meaning
58+
that the contributor is out of scope.
59+
* `out_of_scope_full`: A parameter that specifies what type of out of scope
60+
to run when an `out_of_scope_marker_col` is provided. True specifies
61+
that the out of scope is used on both sides of the adjustment fraction.
62+
False specifies that the out of scope is used only on the denominator of
63+
the adjustment fraction.
5364
* auxiliary_col: The name of the column containing the auxiliary value for
5465
the contributor.
5566
* calibration_group_col: The name of the column containing the calibration
5667
group for the contributor.
5768
* unadjusted_design_weight_col: The name of the column which will contain
58-
the unadjusted design weight for the contributor.
59-
Defaults to None, this will mean the column isn't output unless a name is provided.
69+
the unadjusted design weight for the contributor. The column isn't
70+
output unless a name is provided.
6071
* design_weight_col: The name of the column which will contain the
6172
design weight for the contributor. Defaults to `design_weight`.
6273
* calibration_weight_col: The name of the column which will contain the
@@ -93,6 +104,10 @@ def estimate(
93104
is per-stratum, the `h_value_col` must not change within a given period and
94105
stratum.
95106
107+
If `out_of_scope_marker_col` is specified the `out_of_scope_full`
108+
parameter must also be set. In addition `death_marker_col` and `h_value_col`
109+
must be provided.
110+
96111
If `auxiliary_col` is specified then one of Separate Ratio or Combined Ratio
97112
estimation is performed. This depends on whether `calibration_group_col`
98113
is specified. If so then Combined Ratio estimation is performed, otherwise
@@ -118,6 +133,19 @@ def estimate(
118133
"Either both or none of death_marker_col and h_value_col must be specified."
119134
)
120135

136+
# Not checked the same way as death_cols: out_of_scope_full may legitimately be
137+
# False, which would make an all() check fail, so None values are counted instead.
138+
out_of_scope_cols = (out_of_scope_marker_col, out_of_scope_full)
139+
if out_of_scope_cols.count(None) == 1:
140+
raise TypeError(
141+
"Either both or none of out_of_scope_marker_col "
142+
+ "and out_of_scope_full must be specified."
143+
)
144+
if any(out_of_scope_cols) and not any(death_cols):
145+
raise TypeError(
146+
"For out of scope, death_marker_col and h_value_col must be specified."
147+
)
148+
121149
if calibration_group_col is not None and auxiliary_col is None:
122150
raise TypeError(
123151
"If calibration_group_col is specified then auxiliary_col must be provided."
@@ -132,6 +160,9 @@ def estimate(
132160
if death_marker_col is not None:
133161
expected_cols += [death_marker_col, h_value_col]
134162

163+
if out_of_scope_marker_col is not None:
164+
expected_cols.append(out_of_scope_marker_col)
165+
135166
if auxiliary_col is not None:
136167
expected_cols.append(auxiliary_col)
137168

@@ -168,6 +199,8 @@ def estimate(
168199
marker_cols = [sample_marker_col]
169200
if death_marker_col is not None:
170201
marker_cols.append(death_marker_col)
202+
if out_of_scope_marker_col is not None:
203+
marker_cols.append(out_of_scope_marker_col)
171204

172205
for col_name in marker_cols:
173206
if input_df.filter((col(col_name) != 0) & (col(col_name) != 1)).count() > 0:
@@ -220,6 +253,20 @@ def estimate(
220253
else:
221254
col_list += [lit(0).alias("death_marker"), lit(0.0).alias("h_value")]
222255

256+
if out_of_scope_marker_col is not None:
257+
col_list.append(
258+
col(out_of_scope_marker_col).alias("out_of_scope_marker_denominator")
259+
)
260+
if out_of_scope_full:
261+
col_list.append(
262+
col(out_of_scope_marker_col).alias("out_of_scope_marker_numerator")
263+
)
264+
else:
265+
col_list.append(lit(0).alias("out_of_scope_marker_numerator"))
266+
else:
267+
col_list.append(lit(0).alias("out_of_scope_marker_numerator"))
268+
col_list.append(lit(0).alias("out_of_scope_marker_denominator"))
269+
223270
if auxiliary_col is not None:
224271
col_list.append(col(auxiliary_col).alias("auxiliary"))
225272

@@ -243,6 +290,8 @@ def estimate(
243290
sum(col("sample_marker")),
244291
sum(col("death_marker")),
245292
first(col("h_value").cast("integer")).alias("first(h_value)"),
293+
sum(col("out_of_scope_marker_numerator")),
294+
sum(col("out_of_scope_marker_denominator")),
246295
count(col("sample_marker")),
247296
)
248297
.withColumn(
@@ -257,8 +306,15 @@ def estimate(
257306
1
258307
+ (
259308
col("first(h_value)")
260-
* col("sum(death_marker)")
261-
/ (col("sum(sample_marker)") - col("sum(death_marker)"))
309+
* (
310+
col("sum(death_marker)")
311+
+ col("sum(out_of_scope_marker_numerator)")
312+
)
313+
/ (
314+
col("sum(sample_marker)")
315+
- col("sum(death_marker)")
316+
- col("sum(out_of_scope_marker_denominator)")
317+
)
262318
)
263319
)
264320
),
@@ -267,6 +323,8 @@ def estimate(
267323
"sum(sample_marker)",
268324
"sum(death_marker)",
269325
"first(h_value)",
326+
"sum(out_of_scope_marker_numerator)",
327+
"sum(out_of_scope_marker_denominator)",
270328
"count(sample_marker)",
271329
)
272330
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
period,reference,strata,calibration_group,auxiliary,out_of_scope_marker,death_marker,H,sample_inclusion_marker
2+
2010,49001,1,200,2,0,0,True,1
3+
2010,49002,1,200,4,1,0,True,1
4+
2010,49003,1,200,3,0,0,True,1
5+
2010,49004,1,200,6,0,1,True,1
6+
2010,49005,1,200,7,0,0,True,0
7+
2010,49006,2,200,10,0,0,True,1
8+
2010,49007,2,200,15,0,0,True,1
9+
2010,49008,2,200,12,0,0,True,0
10+
2010,49009,2,200,13,0,0,True,1
11+
2010,49010,2,200,19,0,0,True,0
12+
2010,50001,3,300,24,0,0,False,1
13+
2010,50002,3,300,27,0,0,False,1
14+
2010,50003,3,300,34,0,0,False,1
15+
2011,49001,1,200,1,0,0,True,1
16+
2011,49002,1,200,5,0,0,True,0
17+
2011,49003,1,200,3,0,0,True,1
18+
2011,49004,1,200,4,0,0,True,0
19+
2011,49005,1,200,7,0,0,True,0
20+
2011,49006,2,200,10,0,0,True,1
21+
2011,49007,2,200,15,0,0,True,1
22+
2011,49008,2,200,12,0,1,True,1
23+
2011,49009,2,200,13,0,0,True,1
24+
2011,49010,2,200,19,0,0,True,0
25+
2011,50001,3,300,24,0,0,False,1
26+
2011,50002,3,300,27,0,0,False,1
27+
2011,50003,3,300,34,0,0,False,1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
period,strata,calibration_group,unadjusted_design_weight,design_weight,calibration_weight
2+
2010,1,200,1.25,2.5,1.1086294416
3+
2010,2,200,1.6666666667,1.6666666667,1.1086294416
4+
2010,3,300,1,1,1
5+
2011,1,200,2.5,2.5,1.2275862069
6+
2011,2,200,1.25,1.6666666667,1.2275862069
7+
2011,3,300,1,1,1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
period,reference,strata,calibration_group,auxiliary,out_of_scope_marker,death_marker,H,sample_inclusion_marker
2+
2010,49001,1,200,2,0,0,True,1
3+
2010,49002,1,200,4,1,0,True,1
4+
2010,49003,1,200,3,0,0,True,1
5+
2010,49004,1,200,6,0,1,True,1
6+
2010,49005,1,200,7,0,0,True,0
7+
2010,49006,2,200,10,0,0,True,1
8+
2010,49007,2,200,15,0,0,True,1
9+
2010,49008,2,200,12,0,0,True,0
10+
2010,49009,2,200,13,0,0,True,1
11+
2010,49010,2,200,19,0,0,True,0
12+
2010,50001,3,300,24,0,0,False,1
13+
2010,50002,3,300,27,0,0,False,1
14+
2010,50003,3,300,34,0,0,False,1
15+
2011,49001,1,200,1,0,0,True,1
16+
2011,49002,1,200,5,0,0,True,0
17+
2011,49003,1,200,3,0,0,True,1
18+
2011,49004,1,200,4,0,0,True,0
19+
2011,49005,1,200,7,0,0,True,0
20+
2011,49006,2,200,10,0,0,True,1
21+
2011,49007,2,200,15,0,0,True,1
22+
2011,49008,2,200,12,0,1,True,1
23+
2011,49009,2,200,13,0,0,True,1
24+
2011,49010,2,200,19,0,0,True,0
25+
2011,50001,3,300,24,0,0,False,1
26+
2011,50002,3,300,27,0,0,False,1
27+
2011,50003,3,300,34,0,0,False,1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
period,strata,calibration_group,unadjusted_design_weight,design_weight,calibration_weight
2+
2010,1,200,1.25,1.875,1.1086294416
3+
2010,2,200,1.6666666667,1.6666666667,1.1086294416
4+
2010,3,300,1,1,1
5+
2011,1,200,2.5,2.5,1.2275862069
6+
2011,2,200,1.25,1.6666666667,1.2275862069
7+
2011,3,300,1,1,1

‎tests/test_estimation.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
sample_col = "sample_inclusion_marker"
1515
death_col = "death_marker"
1616
h_col = "H"
17+
out_of_scope_col = "out_of_scope_marker"
1718
auxiliary_col = "auxiliary"
1819
calibration_group_col = "calibration_group"
1920
unadjusted_design_weight_col = "unadjusted_design_weight"
@@ -27,6 +28,7 @@
2728
sample_col,
2829
death_col,
2930
h_col,
31+
out_of_scope_col,
3032
auxiliary_col,
3133
calibration_group_col,
3234
design_weight_col,
@@ -41,6 +43,7 @@
4143
sample_col: "int",
4244
death_col: "int",
4345
h_col: "boolean",
46+
out_of_scope_col: "int",
4447
auxiliary_col: "double",
4548
calibration_group_col: "string",
4649
design_weight_col: "double",
@@ -316,6 +319,13 @@ def test_calculations(fxt_load_test_csv, scenario_type, scenario):
316319
estimation_kwargs["death_marker_col"] = death_col
317320
estimation_kwargs["h_value_col"] = h_col
318321

322+
if out_of_scope_col in test_dataframe.columns:
323+
estimation_kwargs["out_of_scope_marker_col"] = out_of_scope_col
324+
if "full" in scenario:
325+
estimation_kwargs["out_of_scope_full"] = True
326+
else:
327+
estimation_kwargs["out_of_scope_full"] = False
328+
319329
if auxiliary_col in test_dataframe.columns:
320330
estimation_kwargs["auxiliary_col"] = auxiliary_col
321331

@@ -335,13 +345,15 @@ def test_calculations(fxt_load_test_csv, scenario_type, scenario):
335345

336346
ret_val = estimation.estimate(test_dataframe, **estimation_kwargs)
337347

348+
assert isinstance(ret_val, type(test_dataframe))
338349
sort_col_list = ["period", "strata"]
350+
select_cols = list(set(dataframe_columns) & set(exp_val.columns))
339351
if calibration_group_col in test_dataframe.columns:
340352
sort_col_list.append(calibration_group_col)
341353

342354
assert_approx_df_equality(
343-
ret_val.sort(sort_col_list),
344-
exp_val.sort(sort_col_list),
355+
ret_val.sort(sort_col_list).select(select_cols),
356+
exp_val.sort(sort_col_list).select(select_cols),
345357
0.01,
346358
ignore_nullable=True,
347359
)

0 commit comments

Comments
 (0)
Please sign in to comment.