@@ -24,6 +24,8 @@ def estimate(
24
24
sample_marker_col : str ,
25
25
death_marker_col : typing .Optional [str ] = None ,
26
26
h_value_col : typing .Optional [str ] = None ,
27
+ out_of_scope_marker_col : typing .Optional [str ] = None ,
28
+ out_of_scope_full : typing .Optional [bool ] = None ,
27
29
auxiliary_col : typing .Optional [str ] = None ,
28
30
calibration_group_col : typing .Optional [str ] = None ,
29
31
unadjusted_design_weight_col : typing .Optional [str ] = None ,
@@ -35,28 +37,37 @@ def estimate(
35
37
Ratio estimation.
36
38
37
39
###Arguments
38
- * input_df: The input data frame.
39
- * unique_identifier_col: The name of the column containing the unique identifier
40
+ * `input_df`: The input data frame.
41
+ * `unique_identifier_col`: The name of the column containing the unique identifier
40
42
for the contributors.
41
- * period_col: The name of the column containing the period information for
43
+ * `period_col`: The name of the column containing the period information for
42
44
the contributor.
43
- * strata_col: The name of the column containing the strata of the contributor.
44
- * sample_marker_col: The name of the column containing a marker
45
+ * `strata_col`: The name of the column containing the strata of the contributor.
46
+ * `sample_marker_col`: The name of the column containing a marker
45
47
for whether to include the contributor in the sample or only in the
46
48
population. This column must only contain values of 0 or 1 where 0 means
47
49
to exclude the contributor from the sample and 1 means the contributor
48
50
will be included in the sample count.
49
- * death_marker_col: The name of the column containing a marker for whether
51
+ * `death_marker_col`: The name of the column containing a marker for whether
50
52
the contributor is dead. This column must only contain the values 0
51
53
meaning the contributor is not dead and 1 meaning that the contributor is dead.
52
- * h_value_col: The name of the column containing the h value for the strata.
54
+ * `h_value_col`: The name of the column containing the h value for the strata.
55
+ * `out_of_scope_marker_col`: The name of the column containing a marker for
56
+ whether the contributor is out of scope. This column must only contain
57
+ the values 0 meaning the contributor is not out of scope and 1 meaning
58
+ that the contributor is out of scope.
59
+ * `out_of_scope_full`: A parameter that specifies what type of out of scope
60
+ to run when an `out_of_scope_marker_col` is provided. True specifies
61
+ that the out of scope is used on both sides of the adjustment fraction.
62
+ False specifies that the out of scope is used only on the denominator of
63
+ the adjustment fraction.
53
64
* auxiliary_col: The name of the column containing the auxiliary value for
54
65
the contributor.
55
66
* calibration_group_col: The name of the column containing the calibration
56
67
group for the contributor.
57
68
* unadjusted_design_weight_col: The name of the column which will contain
58
- the unadjusted design weight for the contributor.
59
- Defaults to None, this will mean the column isn't output unless a name is provided.
69
+ the unadjusted design weight for the contributor. The column isn't
70
+ output unless a name is provided.
60
71
* design_weight_col: The name of the column which will contain the
61
72
design weight for the contributor. Defaults to `design_weight`.
62
73
* calibration_weight_col: The name of the column which will contain the
@@ -93,6 +104,10 @@ def estimate(
93
104
is per-stratum, the `h_value_col` must not change within a given period and
94
105
stratum.
95
106
107
+ If `out_of_scope_marker_col` is specified the `out_of_scope_full`
108
+ parameter must also be set. In addition `death_marker_col` and `h_value_col`
109
+ must be provided.
110
+
96
111
If `auxiliary_col` is specified then one of Separate Ratio or Combined Ratio
97
112
estimation is performed. This depends on whether `calibration_group_col`
98
113
is specified. If so then Combined Ratio estimation is performed, otherwise
@@ -118,6 +133,19 @@ def estimate(
118
133
"Either both or none of death_marker_col and h_value_col must be specified."
119
134
)
120
135
136
+ # Not checked the same way as death_cols: out_of_scope_full may legitimately
137
+ # be False, which would make an all() test fail, so count the None values instead.
138
+ out_of_scope_cols = (out_of_scope_marker_col , out_of_scope_full )
139
+ if out_of_scope_cols .count (None ) == 1 :
140
+ raise TypeError (
141
+ "Either both or none of out_of_scope_marker_col "
142
+ + "and out_of_scope_full must be specified."
143
+ )
144
+ if any (out_of_scope_cols ) and not any (death_cols ):
145
+ raise TypeError (
146
+ "For out of scope, death_marker_col and h_value_col must be specified."
147
+ )
148
+
121
149
if calibration_group_col is not None and auxiliary_col is None :
122
150
raise TypeError (
123
151
"If calibration_group_col is specified then auxiliary_col must be provided."
@@ -132,6 +160,9 @@ def estimate(
132
160
if death_marker_col is not None :
133
161
expected_cols += [death_marker_col , h_value_col ]
134
162
163
+ if out_of_scope_marker_col is not None :
164
+ expected_cols .append (out_of_scope_marker_col )
165
+
135
166
if auxiliary_col is not None :
136
167
expected_cols .append (auxiliary_col )
137
168
@@ -168,6 +199,8 @@ def estimate(
168
199
marker_cols = [sample_marker_col ]
169
200
if death_marker_col is not None :
170
201
marker_cols .append (death_marker_col )
202
+ if out_of_scope_marker_col is not None :
203
+ marker_cols .append (out_of_scope_marker_col )
171
204
172
205
for col_name in marker_cols :
173
206
if input_df .filter ((col (col_name ) != 0 ) & (col (col_name ) != 1 )).count () > 0 :
@@ -198,6 +231,20 @@ def estimate(
198
231
else :
199
232
col_list += [lit (0 ).alias ("death_marker" ), lit (0.0 ).alias ("h_value" )]
200
233
234
+ if out_of_scope_marker_col is not None :
235
+ col_list .append (
236
+ col (out_of_scope_marker_col ).alias ("out_of_scope_marker_denominator" )
237
+ )
238
+ if out_of_scope_full :
239
+ col_list .append (
240
+ col (out_of_scope_marker_col ).alias ("out_of_scope_marker_numerator" )
241
+ )
242
+ else :
243
+ col_list .append (lit (0 ).alias ("out_of_scope_marker_numerator" ))
244
+ else :
245
+ col_list .append (lit (0 ).alias ("out_of_scope_marker_numerator" ))
246
+ col_list .append (lit (0 ).alias ("out_of_scope_marker_denominator" ))
247
+
201
248
if auxiliary_col is not None :
202
249
col_list .append (col (auxiliary_col ).alias ("auxiliary" ))
203
250
@@ -221,6 +268,8 @@ def estimate(
221
268
sum (col ("sample_marker" )),
222
269
sum (col ("death_marker" )),
223
270
first (col ("h_value" ).cast ("integer" )).alias ("first(h_value)" ),
271
+ sum (col ("out_of_scope_marker_numerator" )),
272
+ sum (col ("out_of_scope_marker_denominator" )),
224
273
count (col ("sample_marker" )),
225
274
)
226
275
.withColumn (
@@ -235,8 +284,15 @@ def estimate(
235
284
1
236
285
+ (
237
286
col ("first(h_value)" )
238
- * col ("sum(death_marker)" )
239
- / (col ("sum(sample_marker)" ) - col ("sum(death_marker)" ))
287
+ * (
288
+ col ("sum(death_marker)" )
289
+ + col ("sum(out_of_scope_marker_numerator)" )
290
+ )
291
+ / (
292
+ col ("sum(sample_marker)" )
293
+ - col ("sum(death_marker)" )
294
+ - col ("sum(out_of_scope_marker_denominator)" )
295
+ )
240
296
)
241
297
)
242
298
),
@@ -245,6 +301,8 @@ def estimate(
245
301
"sum(sample_marker)" ,
246
302
"sum(death_marker)" ,
247
303
"first(h_value)" ,
304
+ "sum(out_of_scope_marker_numerator)" ,
305
+ "sum(out_of_scope_marker_denominator)" ,
248
306
"count(sample_marker)" ,
249
307
)
250
308
)
0 commit comments