Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Imputation manual construction #186

Merged
merged 19 commits into from
May 23, 2024
Merged
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 127 additions & 9 deletions statistical_methods_library/imputation/engine.py
Original file line number Diff line number Diff line change
@@ -43,6 +43,12 @@ class Marker(Enum):
FORWARD_IMPUTE_FROM_CONSTRUCTION = "FIC"
"""The value has been forward imputed from a constructed value."""

MANUAL_CONSTRUCTION = "MC"
"""The value is manual construction."""

FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION = "FIMC"
"""The value has been forward imputed from a manual construction."""


def impute(
*,
@@ -78,6 +84,7 @@ def impute(
unweighted_forward_link_col: Optional[str] = "forward_unweighted",
unweighted_backward_link_col: Optional[str] = "backward_unweighted",
unweighted_construction_link_col: Optional[str] = "construction_unweighted",
manual_construction_col: Optional[str] = None,
**ratio_calculator_params,
) -> DataFrame:
"""
@@ -188,7 +195,6 @@ def impute(
link_cols = [forward_link_col, backward_link_col]
if any(link_cols) and not all(link_cols):
raise TypeError("Either all or no link columns must be specified")

input_params = {
"ref": reference_col,
"period": period_col,
@@ -233,6 +239,14 @@ def impute(
"output": output_col,
"marker": marker_col,
}
# Add the manual construction parameter
# only if manual_construction_col has been provided.
if manual_construction_col:
input_params["manual_const"] = manual_construction_col

if back_data_df:
if not isinstance(back_data_df, DataFrame):
raise TypeError("Input is not a DataFrame")

if weight is not None:
if not isinstance(weight, Decimal):
@@ -266,6 +280,7 @@ def impute(
"forward_unweighted": DecimalType,
"backward_unweighted": DecimalType,
"construction_unweighted": DecimalType,
"manual_const": DecimalType,
}

if link_filter:
@@ -289,7 +304,7 @@ def impute(
input_params,
type_mapping,
["ref", "period", "grouping"],
["target"],
["target", "manual_const"],
)
.withColumnRenamed("target", "output")
.withColumn("marker", when(~col("output").isNull(), Marker.RESPONSE.value))
@@ -301,13 +316,43 @@ def impute(
prior_period_df = prepared_df.selectExpr(
"min(previous_period) AS prior_period"
).localCheckpoint(eager=False)
if manual_construction_col:
# Set manual construction value as output
# and set marker as MC
df_with_mc_data = prepared_df.withColumn(
"marker",
when(
(col("manual_const").isNotNull()) & (col("output").isNull()),
lit(Marker.MANUAL_CONSTRUCTION.value),
).otherwise(col("marker")),
).withColumn(
"output",
when(
(col("manual_const").isNotNull()) & (col("output").isNull()),
col("manual_const"),
).otherwise(col("output")),
)
manual_construction_data = df_with_mc_data.filter(
(col("marker") == Marker.MANUAL_CONSTRUCTION.value)
| (col("marker") == Marker.FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION.value)
)
# Filter out the MC and FIMC data so that
# it is not included in the link calculations.

prepared_df = df_with_mc_data.filter(
col("marker").isNull()
| (
~(col("marker") == Marker.MANUAL_CONSTRUCTION.value)
& ~(
col("marker")
== Marker.FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION.value
)
)
)

if back_data_df:
validated_back_data_df = validate_dataframe(
back_data_df,
back_input_params,
type_mapping,
["ref", "period", "grouping"],
back_data_df, back_input_params, type_mapping, ["ref", "period", "grouping"]
).localCheckpoint(eager=False)
back_data_period_df = (
validated_back_data_df.select(
@@ -325,6 +370,7 @@ def impute(
)
.localCheckpoint(eager=False)
)

prepared_df = prepared_df.unionByName(
back_data_period_df.filter(col("marker") == lit(Marker.RESPONSE.value)),
allowMissingColumns=True,
@@ -535,6 +581,52 @@ def calculate_weighted_link(link_name):
)

calculate_ratios()
if manual_construction_col:
# populate link, count, default information
# for manual_construction data
manual_construction_data = (
manual_construction_data.alias("mc")
.join(prepared_df, ["period", "grouping"], "leftouter")
.select(
"mc.ref",
"mc.period",
"mc.grouping",
"mc.aux",
"mc.manual_const",
"mc.previous_period",
"mc.next_period",
"mc.output",
"mc.marker",
when(col("forward").isNull(), lit(1).cast("long"))
.otherwise(col("forward"))
.alias("forward"),
when(col("backward").isNull(), lit(1).cast("long"))
.otherwise(col("backward"))
.alias("backward"),
when(col("construction").isNull(), lit(1).cast("long"))
.otherwise(col("construction"))
.alias("construction"),
when(col("count_forward").isNull(), lit(0).cast("int"))
.otherwise(col("count_forward"))
.alias("count_forward"),
when(col("count_backward").isNull(), lit(0).cast("int"))
.otherwise(col("count_backward"))
.alias("count_backward"),
when(col("count_construction").isNull(), lit(0).cast("int"))
.otherwise(col("count_construction"))
.alias("count_construction"),
when(col("default_forward").isNull(), lit(True))
.otherwise(col("default_forward"))
.alias("default_forward"),
when(col("default_backward").isNull(), lit(True))
.otherwise(col("default_backward"))
.alias("default_backward"),
when(col("default_construction").isNull(), lit(True))
.otherwise(col("default_construction"))
.alias("default_construction"),
)
.distinct()
)

# Caching for both imputed and unimputed data.
imputed_df = None
@@ -565,7 +657,6 @@ def impute_helper(
"forward",
"backward",
)

# Anything which isn't null is already imputed or a response and thus
# can be imputed from. Note that in the case of backward imputation
# this still holds since it always happens after forward imputation
@@ -628,7 +719,6 @@ def impute_helper(
["ref", "period", "grouping"],
"leftanti",
).localCheckpoint(eager=True)

# We should now have an output column which is as fully populated as
# this phase of imputation can manage. As such replace the existing
# output column with our one. Same goes for the marker column.
@@ -653,6 +743,29 @@ def forward_impute_from_response(df: DataFrame) -> DataFrame:
def backward_impute(df: DataFrame) -> DataFrame:
return impute_helper(df, "backward", Marker.BACKWARD_IMPUTE, False)

# --- Forward impute from manual construction ---
def forward_impute_from_manual_construction(df: DataFrame) -> DataFrame:
nonlocal imputed_df
nonlocal null_response_df
imputed_df = None
null_response_df = None
if back_data_df:
# Add the MC and FIMC from the back data
df = df.unionByName(
back_data_period_df.filter(
(col("marker") == lit(Marker.MANUAL_CONSTRUCTION.value))
| (
col("marker")
== lit(Marker.FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION.value)
)
),
allowMissingColumns=True,
)

return impute_helper(
df, "forward", Marker.FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION, True
)

# --- Construction functions ---
def construct_values(df: DataFrame) -> DataFrame:
if back_data_df:
@@ -725,11 +838,16 @@ def forward_impute_from_construction(df: DataFrame) -> DataFrame:
for stage in (
forward_impute_from_response,
backward_impute,
forward_impute_from_manual_construction,
construct_values,
forward_impute_from_construction,
):
if manual_construction_col and stage == forward_impute_from_manual_construction:
# Add the mc data
df = df.unionByName(manual_construction_data, allowMissingColumns=True)

df = stage(df).localCheckpoint(eager=False)
if df.filter(col("output").isNull()).count() == 0:
if df.filter(col("output").isNull()).count() == 0 and stage == construct_values:
break

return df.join(prior_period_df, [col("prior_period") < col("period")]).select(
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
identifier,date,group,output,other,marker
1234,"202104","900",10,2,MC
1235,"202104","100",20,2,FIMC
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
identifier,date,group,question,other,manual_construction
1234,"202105",900,,78,
1235,"202105",100,,81,
1236,"202105",100,2113,81,
1237,"202105",200,,81,3189
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
identifier,date,group,output,marker,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
1234,202105,900,10,FIMC,1,1,1,0,0,0,true,true,true
1235,202105,100,20,FIMC,1,1,26.08641975,0,0,1,true,true,false
1236,202105,100,2113,R,1,1,26.08641975,0,0,1,true,true,false
1237,202105,200,3189,MC,1,1,1,0,0,0,true,true,true
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,question,other,manual_construction
30001,202001,100,8444,51,
30001,202002,100,7476,51,
30001,202003,100,2003,51,
30002,202001,100,9343,72,
30002,202002,100,7818,72,
30002,202003,100,4897,72,
30003,202001,100,7511,7,
30003,202002,100,1761,7,
30003,202003,100,6492,7,
30004,202001,100,,81,4321
30004,202002,100,2113,81,
30004,202003,100,,81,3189
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
30001,202001,100,,1.129481006,1,2.196577972,194.6,8444,R,0,3,3,true,false,false
30001,202002,100,0.885362387,3.732401398,0.652198238,1.866715325,90.8436019,7476,R,3,3,4,false,false,false
30001,202003,100,0.267924024,,1.526946931,1,103.0153846,2003,R,3,0,3,false,true,false
30002,202001,100,,1.195062676,1,2.196577972,194.6,9343,R,0,3,3,true,false,false
30002,202002,100,0.836776196,1.596487645,0.652198238,1.866715325,90.8436019,7818,R,3,3,4,false,false,false
30002,202003,100,0.626375032,,1.526947,1,103.0153846,4897,R,3,0,3,false,true,false
30003,202001,100,,4.265190233,1,2.196577972,194.6,7511,R,0,3,3,true,false,false
30003,202002,100,0.234456131,0.271256932,0.652198238,1.866715325,90.8436019,1761,R,3,3,4,false,false,false
30003,202003,100,3.686541738,,1.526947,1,103.0153846,6492,R,3,0,3,false,true,false
30004,202001,100,,,1,2.196577972,194.6,4321,MC,0,3,3,true,false,false
30004,202002,100,,,0.652198238,1.866715325,90.8436019,2113,R,3,3,4,false,false,false
30004,202003,100,,,1.526947,1,103.0153846,3189,MC,3,0,3,false,true,false
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
40001,202001,100,9491,35,
40001,202002,100,4783,35,
40001,202003,100,7902,35,
40001,202004,100,4911,35,
40002,202001,100,2095,63,
40002,202002,100,442,63,
40002,202003,100,3136,63,
40002,202004,100,2115,63,
40003,202001,100,7863,16,
40003,202002,100,8121,16,
40003,202003,100,2151,16,
40003,202004,100,1377,16,
40004,202001,100,5131,78,
40004,202002,100,9836,78,
40004,202003,100,,78,7525
40004,202004,100,,78,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
40001,202001,100,,1.984319465,1,2.053506032,128.0208333,9491,R,0,4,4,true,false,false
40001,202002,100,0.503951112,0.6052898,0.916179196,1.507228985,120.7395833,4783,R,4,3,4,false,false,false
40001,202003,100,1.652101192,1.609040929,3.003997558,1.551291583,115.6929824,7902,R,3,3,3,false,false,false
40001,202004,100,0.621488231,,0.645360538,1,73.71052632,4911,R,3,0,3,false,true,false
40002,202001,100,,4.739819005,1,2.053506032,128.0208333,2095,R,0,4,4,true,false,false
40002,202002,100,0.21097852,0.140943878,0.916179196,1.507228985,120.7395833,442,R,4,3,4,false,false,false
40002,202003,100,7.095022624,1.482742317,3.003997558,1.551291583,115.6929824,3136,R,3,3,3,false,false,false
40002,202004,100,0.67442602,,0.645360538,1,73.71052632,2115,R,3,0,3,false,true,false
40003,202001,100,,0.968230513,1,2.053506032,128.0208333,7863,R,0,4,4,true,false,false
40003,202002,100,1.032811904,3.775453278,0.916179196,1.507228985,120.7395833,8121,R,4,3,4,false,false,false
40003,202003,100,0.264868859,1.562091503,3.003997558,1.551291583,115.6929824,2151,R,3,3,3,false,false,false
40003,202004,100,0.640167364,,0.645360538,1,73.71052632,1377,R,3,0,3,false,true,false
40004,202001,100,,0.521655144,1,2.053506032,128.0208333,5131,R,0,4,4,true,false,false
40004,202002,100,1.916975248,,0.916179196,1.507228985,120.7395833,9836,R,4,3,4,false,false,false
40004,202003,100,,,3.003997558,1.551291583,115.6929824,7525,MC,3,3,3,false,false,false
40004,202004,100,,,0.645360538,1,73.71052632,4856.338052,FIMC,3,0,3,false,true,false
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
identifier,date,group,question,other,manual_construction
50001,202001,100,6362,59,
50001,202002,100,8542,59,
50001,202003,100,5623,59,
50001,202004,100,7769,59,
50001,202005,100,4687,59,
50002,202001,100,4851,36,
50002,202002,100,8894,36,
50002,202003,100,3372,36,
50002,202004,100,3522,36,
50002,202005,100,2327,36,
50003,202001,100,2238,76,
50003,202002,100,769,76,
50003,202003,100,7722,76,
50003,202004,100,6445,76,
50003,202005,100,1521,76,
50004,202001,100,688,30,
50004,202002,100,3245,30,
50004,202003,100,,30,1487
50004,202004,100,,30,
50004,202005,100,,30,
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
50001,202001,100,,0.744790447,1,1.103126475,70.34328358,6362,R,0,4,4,TRUE,FALSE,FALSE
50001,202002,100,1.342659541,1.519117909,2.05906902,1.418769101,106.7164179,8542,R,4,3,4,FALSE,FALSE,FALSE
50001,202003,100,0.65827675,0.723773973,3.693007078,0.959774209,97.76023392,5623,R,3,3,3,FALSE,FALSE,FALSE
50001,202004,100,1.381646808,1.657563473,1.086919709,2.469481356,103.7192982,7769,R,3,3,3,FALSE,FALSE,FALSE
50001,202005,100,0.603295147,,0.49999873,1,49.9122807,4687,R,3,0,3,FALSE,TRUE,FALSE
50002,202001,100,,0.545423881,1,1.103126475,70.34328358,4851,R,0,4,4,TRUE,FALSE,FALSE
50002,202002,100,1.833436405,2.637603796,2.05906902,1.418769101,106.7164179,8894,R,4,3,4,FALSE,FALSE,FALSE
50002,202003,100,0.379131999,0.957410562,3.693007078,0.959774209,97.76023392,3372,R,3,3,3,FALSE,FALSE,FALSE
50002,202004,100,1.044483986,1.513536743,1.086919709,2.469481356,103.7192982,3522,R,3,3,3,FALSE,FALSE,FALSE
50002,202005,100,0.660704145,,0.49999873,1,49.9122807,2327,R,3,0,3,FALSE,TRUE,FALSE
50003,202001,100,,2.910273082,1,1.103126475,70.34328358,2238,R,0,4,4,TRUE,FALSE,FALSE
50003,202002,100,0.343610366,0.0995856,2.05906902,1.418769101,106.7164179,769,R,4,3,4,FALSE,FALSE,FALSE
50003,202003,100,10.04161248,1.198138092,3.693007078,0.959774209,97.76023392,7722,R,3,3,3,FALSE,FALSE,FALSE
50003,202004,100,0.834628335,4.237343853,1.086919709,2.469481356,103.7192982,6445,R,3,3,3,FALSE,FALSE,FALSE
50003,202005,100,0.235996897,,0.49999873,1,49.9122807,1521,R,3,0,3,FALSE,TRUE,FALSE
50004,202001,100,,0.21201849,1,1.103126475,70.34328358,688,R,0,4,4,TRUE,FALSE,FALSE
50004,202002,100,4.716569767,,2.05906902,1.418769101,106.7164179,3245,R,4,3,4,FALSE,FALSE,FALSE
50004,202003,100,,,3.693007078,0.959774209,97.76023392,1487,MC,3,3,3,FALSE,FALSE,FALSE
50004,202004,100,,,1.086919709,2.469481356,103.7192982,1616.249608,FIMC,3,3,3,FALSE,FALSE,FALSE
50004,202005,100,,,0.49999873,1,49.9122807,808.122751,FIMC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
100001,202001,100,3074,26,
100001,202002,100,9529,26,
100001,202003,100,615,26,
100001,202004,100,3540,26,
100002,202001,100,8084,19,
100002,202002,100,2422,19,
100002,202003,100,3058,19,
100002,202004,100,5608,19,
100003,202001,100,5161,46,
100003,202002,100,3648,46,
100003,202003,100,205,46,
100003,202004,100,2594,46,
100004,202001,100,,86,
100004,202002,100,,86,
100004,202003,100,,86,
100004,202004,100,,86,9352
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
100001,202001,100,,0.322594186,1,1.691693133,179.3296703,3074,R,0,3,3,TRUE,FALSE,FALSE
100001,202002,100,3.099869876,15.49430894,1.368771264,11.36048394,171.4175824,9529,R,3,3,3,FALSE,FALSE,FALSE
100001,202003,100,0.064539826,0.173728814,0.4611093,0.266016593,42.61538462,615,R,3,3,3,FALSE,FALSE,FALSE
100001,202004,100,5.756097561,,6.74787815,1,129.032967,3540,R,3,0,3,FALSE,TRUE,FALSE
100002,202001,100,,3.337737407,1,1.691693133,179.3296703,8084,R,0,3,3,TRUE,FALSE,FALSE
100002,202002,100,0.299604156,0.792020929,1.368771264,11.36048394,171.4175824,2422,R,3,3,3,FALSE,FALSE,FALSE
100002,202003,100,1.262592898,0.545292439,0.4611093,0.266016593,42.61538462,3058,R,3,3,3,FALSE,FALSE,FALSE
100002,202004,100,1.833878352,,6.74787815,1,129.032967,5608,R,3,0,3,FALSE,TRUE,FALSE
100003,202001,100,,1.414747807,1,1.691693133,179.3296703,5161,R,0,3,3,TRUE,FALSE,FALSE
100003,202002,100,0.70683976,17.79512195,1.368771264,11.36048394,171.4175824,3648,R,3,3,3,FALSE,FALSE,FALSE
100003,202003,100,0.056195175,0.079028527,0.4611093,0.266016593,42.61538462,205,R,3,3,3,FALSE,FALSE,FALSE
100003,202004,100,12.65365854,,6.74787815,1,129.032967,2594,R,3,0,3,FALSE,TRUE,FALSE
100004,202001,100,,,1,1.691693133,179.3296703,15422.351648,C,0,3,3,TRUE,FALSE,FALSE
100004,202002,100,,,1.368771264,11.36048394,171.4175824,21109.671762,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202003,100,,,0.4611093,0.266016593,42.61538462,9733.865967,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202004,100,,,6.74787815,1,129.032967,9352,MC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
100001,202001,100,3074,26,
100001,202002,100,9529,26,
100001,202003,100,615,26,
100001,202004,100,3540,26,
100002,202001,100,8084,19,
100002,202002,100,2422,19,
100002,202003,100,3058,19,
100002,202004,100,5608,19,
100003,202001,100,5161,46,
100003,202002,100,3648,46,
100003,202003,100,205,46,
100003,202004,100,2594,46,
100004,202001,100,,86,
100004,202002,100,,86,
100004,202003,100,,86,9352
100004,202004,100,8762,86,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
100001,202001,100,,0.322594186,1,1.691693133,179.3296703,3074,R,0,3,3,TRUE,FALSE,FALSE
100001,202002,100,3.099869876,15.49430894,1.368771264,11.36048394,171.4175824,9529,R,3,3,3,FALSE,FALSE,FALSE
100001,202003,100,0.064539826,0.173728814,0.4611093,0.266016593,42.61538462,615,R,3,3,3,FALSE,FALSE,FALSE
100001,202004,100,5.756097561,,6.74787815,1,115.8418079,3540,R,3,0,4,FALSE,TRUE,FALSE
100002,202001,100,,3.337737407,1,1.691693133,179.3296703,8084,R,0,3,3,TRUE,FALSE,FALSE
100002,202002,100,0.299604156,0.792020929,1.368771264,11.36048394,171.4175824,2422,R,3,3,3,FALSE,FALSE,FALSE
100002,202003,100,1.262592898,0.545292439,0.4611093,0.266016593,42.61538462,3058,R,3,3,3,FALSE,FALSE,FALSE
100002,202004,100,1.833878352,,6.74787815,1,115.8418079,5608,R,3,0,4,FALSE,TRUE,FALSE
100003,202001,100,,1.414747807,1,1.691693133,179.3296703,5161,R,0,3,3,TRUE,FALSE,FALSE
100003,202002,100,0.70683976,17.79512195,1.368771264,11.36048394,171.4175824,3648,R,3,3,3,FALSE,FALSE,FALSE
100003,202003,100,0.056195175,0.079028527,0.4611093,0.266016593,42.61538462,205,R,3,3,3,FALSE,FALSE,FALSE
100003,202004,100,12.65365854,,6.74787815,1,115.8418079,2594,R,3,0,4,FALSE,TRUE,FALSE
100004,202001,100,,,1,1.691693133,179.3296703,15422.351648,C,0,3,3,TRUE,FALSE,FALSE
100004,202002,100,,,1.368771264,11.36048394,171.4175824,21109.671762,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202003,100,,,0.4611093,0.266016593,42.61538462,9352,MC,3,3,3,FALSE,FALSE,FALSE
100004,202004,100,,,6.74787815,1,115.8418079,8762,R,3,0,4,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
identifier,date,group,question,other,manual_construction
20001,202001,100,2536,35,
20001,202002,100,8283,35,
20002,202001,100,9113,72,
20002,202002,100,2970,72,
20003,202001,100,5644,77,
20003,202002,100,989,77,
20004,202001,100,,30,
20004,202002,100,,30,3021
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
20001,202001,100,,0.306169262,1,3.027097983,93.98369565,2536,R,0,3,3,TRUE,FALSE,FALSE
20001,202002,100,3.266167192,,1.255768523,1,66.5326087,8283,R,3,0,3,FALSE,TRUE,FALSE
20002,202001,100,,3.068350168,1,3.027097983,93.98369565,9113,R,0,3,3,TRUE,FALSE,FALSE
20002,202002,100,0.325908043,,1.255768523,1,66.5326087,2970,R,3,0,3,FALSE,TRUE,FALSE
20003,202001,100,,5.70677452,1,3.027097983,93.98369565,5644,R,0,3,3,TRUE,FALSE,FALSE
20003,202002,100,0.175230333,,1.255768523,1,66.5326087,989,R,3,0,3,FALSE,TRUE,FALSE
20004,202001,100,,,1,3.027097983,93.98369565,2819.51087,C,0,3,3,TRUE,FALSE,FALSE
20004,202002,100,,,1.255768523,1,66.5326087,3021,MC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
40001,202001,100,9491,35,
40001,202002,100,4783,35,
40001,202003,100,7902,35,
40001,202004,100,4911,35,
40002,202001,100,2095,63,
40002,202002,100,442,63,
40002,202003,100,3136,63,
40002,202004,100,2115,63,
40003,202001,100,7863,16,
40003,202002,100,8121,16,
40003,202003,100,2151,16,
40003,202004,100,1377,16,
40004,202001,100,5131,78,
40004,202002,100,,78,
40004,202003,100,,78,7525
40004,202004,100,,78,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
40001,202001,100,,1.984319465,1,2.564122994,128.0208333,9491,R,0,3,4,TRUE,FALSE,FALSE
40001,202002,100,0.503951112,0.6052898,0.582580512,1.507228985,117.0701754,4783,R,3,3,3,FALSE,FALSE,FALSE
40001,202003,100,1.652101192,1.609040929,3.003997558,1.551291583,115.692982,7902,R,3,3,3,FALSE,FALSE,FALSE
40001,202004,100,0.621488231,,0.645360538,1,73.71052632,4911,R,3,0,3,FALSE,TRUE,FALSE
40002,202001,100,,4.739819005,1,2.564122994,128.0208333,2095,R,0,3,4,TRUE,FALSE,FALSE
40002,202002,100,0.21097852,0.140943878,0.582580512,1.507228985,117.0701754,442,R,3,3,3,FALSE,FALSE,FALSE
40002,202003,100,7.095022624,1.482742317,3.003997558,1.551291583,115.692982,3136,R,3,3,3,FALSE,FALSE,FALSE
40002,202004,100,0.67442602,,0.645360538,1,73.71052632,2115,R,3,0,3,FALSE,TRUE,FALSE
40003,202001,100,,0.968230513,1,2.564122994,128.0208333,7863,R,0,3,4,TRUE,FALSE,FALSE
40003,202002,100,1.032811904,3.775453278,0.582580512,1.507228985,117.0701754,8121,R,3,3,3,FALSE,FALSE,FALSE
40003,202003,100,0.264868859,1.562091503,3.003997558,1.551291583,115.692982,2151,R,3,3,3,FALSE,FALSE,FALSE
40003,202004,100,0.640167364,,0.645360538,1,73.71052632,1377,R,3,0,3,FALSE,TRUE,FALSE
40004,202001,100,,,1,2.564122994,128.0208333,5131,R,0,3,4,TRUE,FALSE,FALSE
40004,202002,100,,,0.582580512,1.507228985,117.0701754,2989.220607,FIR,3,3,3,FALSE,FALSE,FALSE
40004,202003,100,,,3.003997558,1.551291583,115.692982,7525,MC,3,3,3,FALSE,FALSE,FALSE
40004,202004,100,,,0.645360538,1,73.71052632,4856.338052,FIMC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
100001,202001,100,3074,26,
100001,202002,100,9529,26,
100001,202003,100,615,26,
100001,202004,100,3540,26,
100002,202001,100,8084,19,
100002,202002,100,2422,19,
100002,202003,100,3058,19,
100002,202004,100,5608,19,
100003,202001,100,5161,46,
100003,202002,100,3648,46,
100003,202003,100,205,46,
100003,202004,100,2594,46,
100004,202001,100,,86,
100004,202002,100,,86,
100004,202003,100,,86,11085
100004,202004,100,,86,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
100001,202001,100,,0.322594186,1,1.691693133,179.3296703,3074,R,0,3,3,TRUE,FALSE,FALSE
100001,202002,100,3.099869876,15.49430894,1.368771264,11.36048394,171.4175824,9529,R,3,3,3,FALSE,FALSE,FALSE
100001,202003,100,0.064539826,0.173728814,0.4611093,0.266016593,42.61538462,615,R,3,3,3,FALSE,FALSE,FALSE
100001,202004,100,5.756097561,,6.74787815,1,129.032967,3540,R,3,0,3,FALSE,TRUE,FALSE
100002,202001,100,,3.337737407,1,1.691693133,179.3296703,8084,R,0,3,3,TRUE,FALSE,FALSE
100002,202002,100,0.299604156,0.792020929,1.368771264,11.36048394,171.4175824,2422,R,3,3,3,FALSE,FALSE,FALSE
100002,202003,100,1.262592898,0.545292439,0.4611093,0.266016593,42.61538462,3058,R,3,3,3,FALSE,FALSE,FALSE
100002,202004,100,1.833878352,,6.74787815,1,129.032967,5608,R,3,0,3,FALSE,TRUE,FALSE
100003,202001,100,,1.414747807,1,1.691693133,179.3296703,5161,R,0,3,3,TRUE,FALSE,FALSE
100003,202002,100,0.70683976,17.79512195,1.368771264,11.36048394,171.4175824,3648,R,3,3,3,FALSE,FALSE,FALSE
100003,202003,100,0.056195175,0.079028527,0.4611093,0.266016593,42.61538462,205,R,3,3,3,FALSE,FALSE,FALSE
100003,202004,100,12.65365854,,6.74787815,1,129.032967,2594,R,3,0,3,FALSE,TRUE,FALSE
100004,202001,100,,,1,1.691693133,179.3296703,15422.351648,C,0,3,3,TRUE,FALSE,FALSE
100004,202002,100,,,1.368771264,11.36048394,171.4175824,21109.671762,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202003,100,,,0.4611093,0.266016593,42.61538462,11085,MC,3,3,3,FALSE,FALSE,FALSE
100004,202004,100,,,6.74787815,1,129.032967,74800.229291,FIMC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,question,other,manual_construction
60001,202001,100,5077,15,
60001,202002,100,7830,15,
60001,202003,100,1046,15,
60002,202001,100,1588,71,
60002,202002,100,1213,71,
60002,202003,100,3807,71,
60003,202001,100,6541,26,
60003,202002,100,336,26,
60003,202003,100,6351,26,
60004,202001,100,,3,839
60004,202002,100,,3,
60004,202003,100,401,3,
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
60001,202001,100,,0.648403576,1,7.141605449,117.9107143,5077,R,0,3,3,TRUE,FALSE,FALSE
60001,202002,100,1.54224936,7.485659656,0.785823852,2.619062766,83.74107143,7830,R,3,3,3,FALSE,FALSE,FALSE
60001,202003,100,0.133588761,,7.391291354,1,100.9130434,1046,R,3,0,4,FALSE,TRUE,FALSE
60002,202001,100,,1.309150866,1,7.141605449,117.9107143,1588,R,0,3,3,TRUE,FALSE,FALSE
60002,202002,100,0.763853904,0.318623588,0.785823852,2.619062766,83.74107143,1213,R,3,3,3,FALSE,FALSE,FALSE
60002,202003,100,3.138499588,,7.391291354,1,100.9130434,3807,R,3,0,4,FALSE,TRUE,FALSE
60003,202001,100,,19.4672619,1,7.141605449,117.9107143,6541,R,0,3,3,TRUE,FALSE,FALSE
60003,202002,100,0.051368292,0.052905054,0.785823852,2.619062766,83.74107143,336,R,3,3,3,FALSE,FALSE,FALSE
60003,202003,100,18.90178571,,7.391291354,1,100.9130434,6351,R,3,0,4,FALSE,TRUE,FALSE
60004,202001,100,,,1,7.141605449,117.9107143,839,MC,0,3,3,TRUE,FALSE,FALSE
60004,202002,100,,,0.785823852,2.619062766,83.74107143,1050.244169,BI,3,3,3,FALSE,FALSE,FALSE
60004,202003,100,,,7.391291354,1,100.9130434,401,R,3,0,4,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,question,other,manual_construction
30001,202001,100,8444,51,
30001,202002,100,7476,51,
30001,202003,100,2003,51,
30002,202001,100,9343,72,
30002,202002,100,7818,72,
30002,202003,100,4897,72,
30003,202001,100,7511,7,
30003,202002,100,1761,7,
30003,202003,100,6492,7,
30004,202001,100,64,81,4321
30004,202002,100,2113,81,
30004,202003,100,,81,
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
30001,202001,100,,1.129481006,1,1.655005651,120.199052,8444,R,0,4,4,TRUE,FALSE,FALSE
30001,202002,100,0.885362387,3.732401398,8.743054929,1.866715325,90.843602,7476,R,4,3,4,FALSE,FALSE,FALSE
30001,202003,100,0.267924024,,1.526947,1,103.015385,2003,R,3,0,3,FALSE,TRUE,FALSE
30002,202001,100,,1.195062676,1,1.655006,120.199052,9343,R,0,4,4,TRUE,FALSE,FALSE
30002,202002,100,0.836776196,1.596487645,8.743054929,1.866715325,90.843602,7818,R,4,3,4,FALSE,FALSE,FALSE
30002,202003,100,0.626375032,,1.526947,1,103.015385,4897,R,3,0,3,FALSE,TRUE,FALSE
30003,202001,100,,4.265190233,1,1.655006,120.199052,7511,R,0,4,4,TRUE,FALSE,FALSE
30003,202002,100,0.234456131,0.271256932,8.743054929,1.866715325,90.843602,1761,R,4,3,4,FALSE,FALSE,FALSE
30003,202003,100,3.686541738,,1.526947,1,103.015385,6492,R,3,0,3,FALSE,TRUE,FALSE
30004,202001,100,,0.030288689,1,1.655006,120.199052,64,R,0,4,4,TRUE,FALSE,FALSE
30004,202002,100,33.015625,,8.743054929,1.866715325,90.843602,2113,R,4,3,4,FALSE,FALSE,FALSE
30004,202003,100,,,1.526946931056,1,103.015385,3226.438865,FIR,3,0,3,FALSE,TRUE,FALSE
30 changes: 30 additions & 0 deletions tests/imputation/mean_of_ratios.toml
Original file line number Diff line number Diff line change
@@ -195,3 +195,33 @@ weight_periodicity_multiplier = 12
[scenarios.70_C_FI_FI_65_weight]
weight = "0.65"
weight_periodicity_multiplier = 12

[scenarios.71_MC_R_MC]
manual_construction_col = "manual_construction"

[scenarios.72_MC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.73_R_R_MC_FIMC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.74_C_FIC_FIC_MC]
manual_construction_col = "manual_construction"

[scenarios.75_C_FIC_MC_R]
manual_construction_col = "manual_construction"

[scenarios.76_C_MC]
manual_construction_col = "manual_construction"

[scenarios.77_R_FIR_MC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.78_C_FIC_MC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.79_MC_BI_R]
manual_construction_col = "manual_construction"

[scenarios.80_MC_R_FIR]
manual_construction_col = "manual_construction"
72 changes: 69 additions & 3 deletions tests/imputation/test_engine.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pytest
from pyspark.sql.functions import lit
from pyspark.sql.types import DecimalType, LongType, StringType
from pyspark.sql.functions import lit, col
from pyspark.sql.types import DecimalType, LongType, StringType, BooleanType

from statistical_methods_library.imputation import impute, ratio_of_means
from statistical_methods_library.utilities.exceptions import ValidationError
from tests.helpers import check_df_equality

auxiliary_col = "other"
backward_col = "backward"
@@ -18,6 +19,10 @@
count_forward_col = "count_forward"
count_backward_col = "count_backward"
count_construction_col = "count_construction"
manual_construction_col = "manual_construction"
default_forward_col = "default_forward"
default_backward_col = "default_backward"
default_construction_col = "default_construction"

decimal_type = DecimalType(15, 6)

@@ -34,6 +39,10 @@
count_forward_type = LongType()
count_backward_type = LongType()
count_construction_type = LongType()
manual_construction_type = decimal_type
default_forward_type = BooleanType()
default_backward_type = BooleanType()
default_construction_type = BooleanType()

# Columns we expect in either our input or output test dataframes and their
# respective types
@@ -51,6 +60,10 @@
count_forward_col,
count_backward_col,
count_construction_col,
manual_construction_col,
default_forward_col,
default_backward_col,
default_construction_col,
)

dataframe_types = {
@@ -67,6 +80,10 @@
count_forward_col: count_forward_type,
count_backward_col: count_backward_type,
count_construction_col: count_construction_type,
manual_construction_col: manual_construction_type,
default_forward_col: default_forward_type,
default_backward_col: default_backward_type,
default_construction_col: default_construction_type,
}

bad_dataframe_types = dataframe_types.copy()
@@ -338,7 +355,6 @@ def test_back_data_drops_link_cols_when_present(fxt_load_test_csv, fxt_spark_ses
)

ret_val = impute(input_df=test_dataframe, **params, back_data_df=back_data)

assert ret_val.count() == 1


@@ -407,3 +423,53 @@ def test_input_data_contains_nulls(fxt_load_test_csv, fxt_spark_session):

with pytest.raises(ValidationError):
impute(input_df=test_dataframe, **params)


def test_back_data_fimc(fxt_load_test_csv, fxt_spark_session):
    """
    Check imputation with a manual construction column and back data.

    Loads the manual construction input, back data and expected output
    fixtures, runs ``impute`` with ``manual_construction_col`` set and
    compares the result (decimal columns cast to ``decimal(15, 6)``) to the
    expected dataframe.
    """
    test_dataframe = fxt_load_test_csv(
        dataframe_columns,
        dataframe_types,
        "imputation",
        "engine",
        "unit",
        "manual_construction_input",
    )

    back_data = fxt_load_test_csv(
        dataframe_columns,
        dataframe_types,
        "imputation",
        "engine",
        "unit",
        "manual_construction_back_data",
    )

    expected_data = fxt_load_test_csv(
        dataframe_columns,
        dataframe_types,
        "imputation",
        "engine",
        "unit",
        "manual_construction_output",
    )

    # Build a per-test copy rather than calling params.update(): params is
    # shared at module level, so mutating it would leak
    # manual_construction_col into every test that runs after this one.
    mc_params = {**params, "manual_construction_col": manual_construction_col}

    scenario_actual_output = impute(
        input_df=test_dataframe, **mc_params, back_data_df=back_data
    )

    # Normalise decimal precision so the comparison with the expected data
    # is not affected by differing decimal scales.
    for field_name, field_type in scenario_actual_output.dtypes:
        if field_type.startswith("decimal"):
            scenario_actual_output = scenario_actual_output.withColumn(
                field_name, col(field_name).cast("decimal(15, 6)")
            )

    sort_cols = [
        mc_params["reference_col"],
        mc_params["period_col"],
        mc_params["grouping_col"],
    ]
    check_df_equality(
        actual=scenario_actual_output.sort(sort_cols),
        expected=expected_data.sort(sort_cols),
        keep_cols=sort_cols,
    )
7 changes: 5 additions & 2 deletions tests/imputation/test_scenarios.py
Original file line number Diff line number Diff line change
@@ -78,7 +78,12 @@ def test_calculations(fxt_load_test_csv, ratio_calculator, scenario_type, scenar
field_types = default_config["field_types"]
field_types.update(test_config.get("field_types", {}))
field_types.update(scenario_config.get("field_types", {}))

imputation_kwargs.update(scenario_config)
if "manual_construction_col" in imputation_kwargs:
field_types.update({"manual_construction_col": "decimal(15,6)"})
fields.update({"manual_construction_col": "manual_construction"})

types = {fields[k]: v for k, v in field_types.items()}
scenario_file_type = scenario_type.replace("back_data_", "")
scenario_input = fxt_load_test_csv(
@@ -112,7 +117,6 @@ def test_calculations(fxt_load_test_csv, ratio_calculator, scenario_type, scenar
scenario_expected_output = scenario_expected_output.filter(
col(fields["period_col"]) >= starting_period
)

scenario_actual_output = imputation.impute(
input_df=scenario_input, **imputation_kwargs
)
@@ -121,7 +125,6 @@ def test_calculations(fxt_load_test_csv, ratio_calculator, scenario_type, scenar
scenario_actual_output = scenario_actual_output.withColumn(
field_name, col(field_name).cast("decimal(15, 6)")
)

sort_cols = [
fields["reference_col"],
fields["period_col"],