Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Imputation manual construction #186

Merged
merged 19 commits into from
May 23, 2024
Merged
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 107 additions & 13 deletions statistical_methods_library/imputation/engine.py
Original file line number Diff line number Diff line change
@@ -43,6 +43,12 @@ class Marker(Enum):
FORWARD_IMPUTE_FROM_CONSTRUCTION = "FIC"
"""The value has been forward imputed from a constructed value."""

MANUAL_CONSTRUCTION = "MC"
"""The value is manual construction."""

FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION = "FIMC"
"""The value has been forward imputed from a manual construction."""


def impute(
*,
@@ -78,6 +84,7 @@ def impute(
unweighted_forward_link_col: Optional[str] = "forward_unweighted",
unweighted_backward_link_col: Optional[str] = "backward_unweighted",
unweighted_construction_link_col: Optional[str] = "construction_unweighted",
manual_construction_col: Optional[str] = None,
**ratio_calculator_params,
) -> DataFrame:
"""
@@ -188,7 +195,6 @@ def impute(
link_cols = [forward_link_col, backward_link_col]
if any(link_cols) and not all(link_cols):
raise TypeError("Either all or no link columns must be specified")

input_params = {
"ref": reference_col,
"period": period_col,
@@ -233,6 +239,15 @@ def impute(
"output": output_col,
"marker": marker_col,
}
# Add the manual construction parameter
# only if manual_construction_col is set.
if manual_construction_col:
input_params["manual_const"] = manual_construction_col
fill_values_mc = {}

if back_data_df:
if not isinstance(back_data_df, DataFrame):
raise TypeError("Input is not a DataFrame")

if weight is not None:
if not isinstance(weight, Decimal):
@@ -266,6 +281,7 @@ def impute(
"forward_unweighted": DecimalType,
"backward_unweighted": DecimalType,
"construction_unweighted": DecimalType,
"manual_const": DecimalType,
}

if link_filter:
@@ -289,7 +305,7 @@ def impute(
input_params,
type_mapping,
["ref", "period", "grouping"],
["target"],
["target", "manual_const"],
)
.withColumnRenamed("target", "output")
.withColumn("marker", when(~col("output").isNull(), Marker.RESPONSE.value))
@@ -301,13 +317,34 @@ def impute(
prior_period_df = prepared_df.selectExpr(
"min(previous_period) AS prior_period"
).localCheckpoint(eager=False)

if manual_construction_col:
# Set manual construction value as output
# and set marker as MC
mc_df = prepared_df.withColumn(
"marker",
when(
(col("manual_const").isNotNull()) & (col("output").isNull()),
lit(Marker.MANUAL_CONSTRUCTION.value),
).otherwise(col("marker")),
).withColumn(
"output",
when(
(col("manual_const").isNotNull()) & (col("output").isNull()),
col("manual_const"),
).otherwise(col("output")),
)
manual_construction_df = mc_df.filter(
(col("marker") == Marker.MANUAL_CONSTRUCTION.value)
)
# Filter out the MC data so
# it will not be included in the link calculations
prepared_df = mc_df.filter(
col("marker").isNull()
| (~(col("marker") == Marker.MANUAL_CONSTRUCTION.value))
)
if back_data_df:
validated_back_data_df = validate_dataframe(
back_data_df,
back_input_params,
type_mapping,
["ref", "period", "grouping"],
back_data_df, back_input_params, type_mapping, ["ref", "period", "grouping"]
).localCheckpoint(eager=False)
back_data_period_df = (
validated_back_data_df.select(
@@ -325,6 +362,7 @@ def impute(
)
.localCheckpoint(eager=False)
)

prepared_df = prepared_df.unionByName(
back_data_period_df.filter(col("marker") == lit(Marker.RESPONSE.value)),
allowMissingColumns=True,
@@ -333,6 +371,7 @@ def impute(
def calculate_ratios():
# This allows us to return early if we have nothing to do
nonlocal prepared_df
nonlocal fill_values_mc
ratio_calculators = []
if "forward" in prepared_df.columns:
prepared_df = (
@@ -430,8 +469,9 @@ def calculate_ratios():
fill_values.update(result.fill_values)
output_col_mapping.update(result.additional_outputs)

for fill_column, fill_value in fill_values.items():
prepared_df = prepared_df.fillna(fill_value, fill_column)
prepared_df = prepared_df.fillna(fill_values)

fill_values_mc = fill_values

if link_filter:
prepared_df = prepared_df.join(
@@ -536,6 +576,31 @@ def calculate_weighted_link(link_name):

calculate_ratios()

if manual_construction_col:
# populate link, count, default information
# for manual_construction data
unique_grp_prd = prepared_df.dropDuplicates(["period", "grouping"])
# Get the required additional output columns
mc_cols = manual_construction_df.columns
mc_additional_cols = []
for key in output_col_mapping.keys():
# Remove growth_forward and growth_backward
# as it should be null for non responder
if (key not in mc_cols) and (
key not in ["growth_forward", "growth_backward"]
):
mc_additional_cols.append(key)
manual_construction_df = (
manual_construction_df.alias("mc")
.join(unique_grp_prd, ["period", "grouping"], "leftouter")
.select(
*(f"mc.{name}" for name in mc_cols),
*mc_additional_cols,
)
)
# Fill null additional columns value with default value.
manual_construction_df = manual_construction_df.fillna(fill_values_mc)

# Caching for both imputed and unimputed data.
imputed_df = None
null_response_df = None
@@ -565,7 +630,6 @@ def impute_helper(
"forward",
"backward",
)

# Anything which isn't null is already imputed or a response and thus
# can be imputed from. Note that in the case of backward imputation
# this still holds since it always happens after forward imputation
@@ -628,7 +692,6 @@ def impute_helper(
["ref", "period", "grouping"],
"leftanti",
).localCheckpoint(eager=True)

# We should now have an output column which is as fully populated as
# this phase of imputation can manage. As such replace the existing
# output column with our one. Same goes for the marker column.
@@ -653,6 +716,29 @@ def forward_impute_from_response(df: DataFrame) -> DataFrame:
def backward_impute(df: DataFrame) -> DataFrame:
    """Run the backward imputation stage over ``df``.

    Delegates to ``impute_helper`` using the "backward" link, the
    ``Marker.BACKWARD_IMPUTE`` marker and ``False`` as the final argument.
    """
    # NOTE(review): the final positional flag presumably selects the
    # imputation direction (forward stages pass True) - confirm against
    # impute_helper's signature.
    backward_imputed_df = impute_helper(
        df, "backward", Marker.BACKWARD_IMPUTE, False
    )
    return backward_imputed_df

# --- Forward impute from manual construction ---
def forward_impute_from_manual_construction(df: DataFrame) -> DataFrame:
nonlocal imputed_df
nonlocal null_response_df
imputed_df = None
null_response_df = None
if back_data_df:
# Add the MC and FIMC from the back data
df = df.unionByName(
back_data_period_df.filter(
(col("marker") == lit(Marker.MANUAL_CONSTRUCTION.value))
| (
col("marker")
== lit(Marker.FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION.value)
)
),
allowMissingColumns=True,
)

return impute_helper(
df, "forward", Marker.FORWARD_IMPUTE_FROM_MANUAL_CONSTRUCTION, True
)

# --- Construction functions ---
def construct_values(df: DataFrame) -> DataFrame:
if back_data_df:
@@ -725,13 +811,21 @@ def forward_impute_from_construction(df: DataFrame) -> DataFrame:
for stage in (
forward_impute_from_response,
backward_impute,
forward_impute_from_manual_construction,
construct_values,
forward_impute_from_construction,
):
if manual_construction_col and stage == forward_impute_from_manual_construction:
# Add the mc data
df = df.unionByName(manual_construction_df, allowMissingColumns=True)

df = stage(df).localCheckpoint(eager=False)
if df.filter(col("output").isNull()).count() == 0:
break

if df.filter(col("output").isNull()).count() == 0:
if (not manual_construction_col) or (
manual_construction_col and stage == construct_values
):
break
return df.join(prior_period_df, [col("prior_period") < col("period")]).select(
[
col(k).alias(output_col_mapping[k])
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
identifier,date,group,output,other,marker
1234,"202104","900",10,2,MC
1235,"202104","100",20,2,FIMC
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
identifier,date,group,question,other,manual_construction
1234,"202105",900,,78,
1235,"202105",100,,81,
1236,"202105",100,2113,81,
1237,"202105",200,,81,3189
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
identifier,date,group,output,marker,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
1234,202105,900,10,FIMC,1,1,1,0,0,0,true,true,true
1235,202105,100,20,FIMC,1,1,26.08641975,0,0,1,true,true,false
1236,202105,100,2113,R,1,1,26.08641975,0,0,1,true,true,false
1237,202105,200,3189,MC,1,1,1,0,0,0,true,true,true
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,question,other,manual_construction
30001,202001,100,8444,51,
30001,202002,100,7476,51,
30001,202003,100,2003,51,
30002,202001,100,9343,72,
30002,202002,100,7818,72,
30002,202003,100,4897,72,
30003,202001,100,7511,7,
30003,202002,100,1761,7,
30003,202003,100,6492,7,
30004,202001,100,,81,4321
30004,202002,100,2113,81,
30004,202003,100,,81,3189
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
30001,202001,100,,1.129481006,1,2.196577972,194.6,8444,R,0,3,3,true,false,false
30001,202002,100,0.885362387,3.732401398,0.652198238,1.866715325,90.8436019,7476,R,3,3,4,false,false,false
30001,202003,100,0.267924024,,1.526946931,1,103.0153846,2003,R,3,0,3,false,true,false
30002,202001,100,,1.195062676,1,2.196577972,194.6,9343,R,0,3,3,true,false,false
30002,202002,100,0.836776196,1.596487645,0.652198238,1.866715325,90.8436019,7818,R,3,3,4,false,false,false
30002,202003,100,0.626375032,,1.526947,1,103.0153846,4897,R,3,0,3,false,true,false
30003,202001,100,,4.265190233,1,2.196577972,194.6,7511,R,0,3,3,true,false,false
30003,202002,100,0.234456131,0.271256932,0.652198238,1.866715325,90.8436019,1761,R,3,3,4,false,false,false
30003,202003,100,3.686541738,,1.526947,1,103.0153846,6492,R,3,0,3,false,true,false
30004,202001,100,,,1,2.196577972,194.6,4321,MC,0,3,3,true,false,false
30004,202002,100,,,0.652198238,1.866715325,90.8436019,2113,R,3,3,4,false,false,false
30004,202003,100,,,1.526947,1,103.0153846,3189,MC,3,0,3,false,true,false
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
40001,202001,100,9491,35,
40001,202002,100,4783,35,
40001,202003,100,7902,35,
40001,202004,100,4911,35,
40002,202001,100,2095,63,
40002,202002,100,442,63,
40002,202003,100,3136,63,
40002,202004,100,2115,63,
40003,202001,100,7863,16,
40003,202002,100,8121,16,
40003,202003,100,2151,16,
40003,202004,100,1377,16,
40004,202001,100,5131,78,
40004,202002,100,9836,78,
40004,202003,100,,78,7525
40004,202004,100,,78,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
40001,202001,100,,1.984319465,1,2.053506032,128.0208333,9491,R,0,4,4,true,false,false
40001,202002,100,0.503951112,0.6052898,0.916179196,1.507228985,120.7395833,4783,R,4,3,4,false,false,false
40001,202003,100,1.652101192,1.609040929,3.003997558,1.551291583,115.6929824,7902,R,3,3,3,false,false,false
40001,202004,100,0.621488231,,0.645360538,1,73.71052632,4911,R,3,0,3,false,true,false
40002,202001,100,,4.739819005,1,2.053506032,128.0208333,2095,R,0,4,4,true,false,false
40002,202002,100,0.21097852,0.140943878,0.916179196,1.507228985,120.7395833,442,R,4,3,4,false,false,false
40002,202003,100,7.095022624,1.482742317,3.003997558,1.551291583,115.6929824,3136,R,3,3,3,false,false,false
40002,202004,100,0.67442602,,0.645360538,1,73.71052632,2115,R,3,0,3,false,true,false
40003,202001,100,,0.968230513,1,2.053506032,128.0208333,7863,R,0,4,4,true,false,false
40003,202002,100,1.032811904,3.775453278,0.916179196,1.507228985,120.7395833,8121,R,4,3,4,false,false,false
40003,202003,100,0.264868859,1.562091503,3.003997558,1.551291583,115.6929824,2151,R,3,3,3,false,false,false
40003,202004,100,0.640167364,,0.645360538,1,73.71052632,1377,R,3,0,3,false,true,false
40004,202001,100,,0.521655144,1,2.053506032,128.0208333,5131,R,0,4,4,true,false,false
40004,202002,100,1.916975248,,0.916179196,1.507228985,120.7395833,9836,R,4,3,4,false,false,false
40004,202003,100,,,3.003997558,1.551291583,115.6929824,7525,MC,3,3,3,false,false,false
40004,202004,100,,,0.645360538,1,73.71052632,4856.338052,FIMC,3,0,3,false,true,false
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
identifier,date,group,question,other,manual_construction
50001,202001,100,6362,59,
50001,202002,100,8542,59,
50001,202003,100,5623,59,
50001,202004,100,7769,59,
50001,202005,100,4687,59,
50002,202001,100,4851,36,
50002,202002,100,8894,36,
50002,202003,100,3372,36,
50002,202004,100,3522,36,
50002,202005,100,2327,36,
50003,202001,100,2238,76,
50003,202002,100,769,76,
50003,202003,100,7722,76,
50003,202004,100,6445,76,
50003,202005,100,1521,76,
50004,202001,100,688,30,
50004,202002,100,3245,30,
50004,202003,100,,30,1487
50004,202004,100,,30,
50004,202005,100,,30,
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
50001,202001,100,,0.744790447,1,1.103126475,70.34328358,6362,R,0,4,4,TRUE,FALSE,FALSE
50001,202002,100,1.342659541,1.519117909,2.05906902,1.418769101,106.7164179,8542,R,4,3,4,FALSE,FALSE,FALSE
50001,202003,100,0.65827675,0.723773973,3.693007078,0.959774209,97.76023392,5623,R,3,3,3,FALSE,FALSE,FALSE
50001,202004,100,1.381646808,1.657563473,1.086919709,2.469481356,103.7192982,7769,R,3,3,3,FALSE,FALSE,FALSE
50001,202005,100,0.603295147,,0.49999873,1,49.9122807,4687,R,3,0,3,FALSE,TRUE,FALSE
50002,202001,100,,0.545423881,1,1.103126475,70.34328358,4851,R,0,4,4,TRUE,FALSE,FALSE
50002,202002,100,1.833436405,2.637603796,2.05906902,1.418769101,106.7164179,8894,R,4,3,4,FALSE,FALSE,FALSE
50002,202003,100,0.379131999,0.957410562,3.693007078,0.959774209,97.76023392,3372,R,3,3,3,FALSE,FALSE,FALSE
50002,202004,100,1.044483986,1.513536743,1.086919709,2.469481356,103.7192982,3522,R,3,3,3,FALSE,FALSE,FALSE
50002,202005,100,0.660704145,,0.49999873,1,49.9122807,2327,R,3,0,3,FALSE,TRUE,FALSE
50003,202001,100,,2.910273082,1,1.103126475,70.34328358,2238,R,0,4,4,TRUE,FALSE,FALSE
50003,202002,100,0.343610366,0.0995856,2.05906902,1.418769101,106.7164179,769,R,4,3,4,FALSE,FALSE,FALSE
50003,202003,100,10.04161248,1.198138092,3.693007078,0.959774209,97.76023392,7722,R,3,3,3,FALSE,FALSE,FALSE
50003,202004,100,0.834628335,4.237343853,1.086919709,2.469481356,103.7192982,6445,R,3,3,3,FALSE,FALSE,FALSE
50003,202005,100,0.235996897,,0.49999873,1,49.9122807,1521,R,3,0,3,FALSE,TRUE,FALSE
50004,202001,100,,0.21201849,1,1.103126475,70.34328358,688,R,0,4,4,TRUE,FALSE,FALSE
50004,202002,100,4.716569767,,2.05906902,1.418769101,106.7164179,3245,R,4,3,4,FALSE,FALSE,FALSE
50004,202003,100,,,3.693007078,0.959774209,97.76023392,1487,MC,3,3,3,FALSE,FALSE,FALSE
50004,202004,100,,,1.086919709,2.469481356,103.7192982,1616.249608,FIMC,3,3,3,FALSE,FALSE,FALSE
50004,202005,100,,,0.49999873,1,49.9122807,808.122751,FIMC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
100001,202001,100,3074,26,
100001,202002,100,9529,26,
100001,202003,100,615,26,
100001,202004,100,3540,26,
100002,202001,100,8084,19,
100002,202002,100,2422,19,
100002,202003,100,3058,19,
100002,202004,100,5608,19,
100003,202001,100,5161,46,
100003,202002,100,3648,46,
100003,202003,100,205,46,
100003,202004,100,2594,46,
100004,202001,100,,86,
100004,202002,100,,86,
100004,202003,100,,86,
100004,202004,100,,86,9352
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
100001,202001,100,,0.322594186,1,1.691693133,179.3296703,3074,R,0,3,3,TRUE,FALSE,FALSE
100001,202002,100,3.099869876,15.49430894,1.368771264,11.36048394,171.4175824,9529,R,3,3,3,FALSE,FALSE,FALSE
100001,202003,100,0.064539826,0.173728814,0.4611093,0.266016593,42.61538462,615,R,3,3,3,FALSE,FALSE,FALSE
100001,202004,100,5.756097561,,6.74787815,1,129.032967,3540,R,3,0,3,FALSE,TRUE,FALSE
100002,202001,100,,3.337737407,1,1.691693133,179.3296703,8084,R,0,3,3,TRUE,FALSE,FALSE
100002,202002,100,0.299604156,0.792020929,1.368771264,11.36048394,171.4175824,2422,R,3,3,3,FALSE,FALSE,FALSE
100002,202003,100,1.262592898,0.545292439,0.4611093,0.266016593,42.61538462,3058,R,3,3,3,FALSE,FALSE,FALSE
100002,202004,100,1.833878352,,6.74787815,1,129.032967,5608,R,3,0,3,FALSE,TRUE,FALSE
100003,202001,100,,1.414747807,1,1.691693133,179.3296703,5161,R,0,3,3,TRUE,FALSE,FALSE
100003,202002,100,0.70683976,17.79512195,1.368771264,11.36048394,171.4175824,3648,R,3,3,3,FALSE,FALSE,FALSE
100003,202003,100,0.056195175,0.079028527,0.4611093,0.266016593,42.61538462,205,R,3,3,3,FALSE,FALSE,FALSE
100003,202004,100,12.65365854,,6.74787815,1,129.032967,2594,R,3,0,3,FALSE,TRUE,FALSE
100004,202001,100,,,1,1.691693133,179.3296703,15422.351648,C,0,3,3,TRUE,FALSE,FALSE
100004,202002,100,,,1.368771264,11.36048394,171.4175824,21109.671762,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202003,100,,,0.4611093,0.266016593,42.61538462,9733.865967,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202004,100,,,6.74787815,1,129.032967,9352,MC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
100001,202001,100,3074,26,
100001,202002,100,9529,26,
100001,202003,100,615,26,
100001,202004,100,3540,26,
100002,202001,100,8084,19,
100002,202002,100,2422,19,
100002,202003,100,3058,19,
100002,202004,100,5608,19,
100003,202001,100,5161,46,
100003,202002,100,3648,46,
100003,202003,100,205,46,
100003,202004,100,2594,46,
100004,202001,100,,86,
100004,202002,100,,86,
100004,202003,100,,86,9352
100004,202004,100,8762,86,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
100001,202001,100,,0.322594186,1,1.691693133,179.3296703,3074,R,0,3,3,TRUE,FALSE,FALSE
100001,202002,100,3.099869876,15.49430894,1.368771264,11.36048394,171.4175824,9529,R,3,3,3,FALSE,FALSE,FALSE
100001,202003,100,0.064539826,0.173728814,0.4611093,0.266016593,42.61538462,615,R,3,3,3,FALSE,FALSE,FALSE
100001,202004,100,5.756097561,,6.74787815,1,115.8418079,3540,R,3,0,4,FALSE,TRUE,FALSE
100002,202001,100,,3.337737407,1,1.691693133,179.3296703,8084,R,0,3,3,TRUE,FALSE,FALSE
100002,202002,100,0.299604156,0.792020929,1.368771264,11.36048394,171.4175824,2422,R,3,3,3,FALSE,FALSE,FALSE
100002,202003,100,1.262592898,0.545292439,0.4611093,0.266016593,42.61538462,3058,R,3,3,3,FALSE,FALSE,FALSE
100002,202004,100,1.833878352,,6.74787815,1,115.8418079,5608,R,3,0,4,FALSE,TRUE,FALSE
100003,202001,100,,1.414747807,1,1.691693133,179.3296703,5161,R,0,3,3,TRUE,FALSE,FALSE
100003,202002,100,0.70683976,17.79512195,1.368771264,11.36048394,171.4175824,3648,R,3,3,3,FALSE,FALSE,FALSE
100003,202003,100,0.056195175,0.079028527,0.4611093,0.266016593,42.61538462,205,R,3,3,3,FALSE,FALSE,FALSE
100003,202004,100,12.65365854,,6.74787815,1,115.8418079,2594,R,3,0,4,FALSE,TRUE,FALSE
100004,202001,100,,,1,1.691693133,179.3296703,15422.351648,C,0,3,3,TRUE,FALSE,FALSE
100004,202002,100,,,1.368771264,11.36048394,171.4175824,21109.671762,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202003,100,,,0.4611093,0.266016593,42.61538462,9352,MC,3,3,3,FALSE,FALSE,FALSE
100004,202004,100,,,6.74787815,1,115.8418079,8762,R,3,0,4,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
identifier,date,group,question,other,manual_construction
20001,202001,100,2536,35,
20001,202002,100,8283,35,
20002,202001,100,9113,72,
20002,202002,100,2970,72,
20003,202001,100,5644,77,
20003,202002,100,989,77,
20004,202001,100,,30,
20004,202002,100,,30,3021
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
20001,202001,100,,0.306169262,1,3.027097983,93.98369565,2536,R,0,3,3,TRUE,FALSE,FALSE
20001,202002,100,3.266167192,,1.255768523,1,66.5326087,8283,R,3,0,3,FALSE,TRUE,FALSE
20002,202001,100,,3.068350168,1,3.027097983,93.98369565,9113,R,0,3,3,TRUE,FALSE,FALSE
20002,202002,100,0.325908043,,1.255768523,1,66.5326087,2970,R,3,0,3,FALSE,TRUE,FALSE
20003,202001,100,,5.70677452,1,3.027097983,93.98369565,5644,R,0,3,3,TRUE,FALSE,FALSE
20003,202002,100,0.175230333,,1.255768523,1,66.5326087,989,R,3,0,3,FALSE,TRUE,FALSE
20004,202001,100,,,1,3.027097983,93.98369565,2819.51087,C,0,3,3,TRUE,FALSE,FALSE
20004,202002,100,,,1.255768523,1,66.5326087,3021,MC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
40001,202001,100,9491,35,
40001,202002,100,4783,35,
40001,202003,100,7902,35,
40001,202004,100,4911,35,
40002,202001,100,2095,63,
40002,202002,100,442,63,
40002,202003,100,3136,63,
40002,202004,100,2115,63,
40003,202001,100,7863,16,
40003,202002,100,8121,16,
40003,202003,100,2151,16,
40003,202004,100,1377,16,
40004,202001,100,5131,78,
40004,202002,100,,78,
40004,202003,100,,78,7525
40004,202004,100,,78,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
40001,202001,100,,1.984319465,1,2.564122994,128.0208333,9491,R,0,3,4,TRUE,FALSE,FALSE
40001,202002,100,0.503951112,0.6052898,0.582580512,1.507228985,117.0701754,4783,R,3,3,3,FALSE,FALSE,FALSE
40001,202003,100,1.652101192,1.609040929,3.003997558,1.551291583,115.692982,7902,R,3,3,3,FALSE,FALSE,FALSE
40001,202004,100,0.621488231,,0.645360538,1,73.71052632,4911,R,3,0,3,FALSE,TRUE,FALSE
40002,202001,100,,4.739819005,1,2.564122994,128.0208333,2095,R,0,3,4,TRUE,FALSE,FALSE
40002,202002,100,0.21097852,0.140943878,0.582580512,1.507228985,117.0701754,442,R,3,3,3,FALSE,FALSE,FALSE
40002,202003,100,7.095022624,1.482742317,3.003997558,1.551291583,115.692982,3136,R,3,3,3,FALSE,FALSE,FALSE
40002,202004,100,0.67442602,,0.645360538,1,73.71052632,2115,R,3,0,3,FALSE,TRUE,FALSE
40003,202001,100,,0.968230513,1,2.564122994,128.0208333,7863,R,0,3,4,TRUE,FALSE,FALSE
40003,202002,100,1.032811904,3.775453278,0.582580512,1.507228985,117.0701754,8121,R,3,3,3,FALSE,FALSE,FALSE
40003,202003,100,0.264868859,1.562091503,3.003997558,1.551291583,115.692982,2151,R,3,3,3,FALSE,FALSE,FALSE
40003,202004,100,0.640167364,,0.645360538,1,73.71052632,1377,R,3,0,3,FALSE,TRUE,FALSE
40004,202001,100,,,1,2.564122994,128.0208333,5131,R,0,3,4,TRUE,FALSE,FALSE
40004,202002,100,,,0.582580512,1.507228985,117.0701754,2989.220607,FIR,3,3,3,FALSE,FALSE,FALSE
40004,202003,100,,,3.003997558,1.551291583,115.692982,7525,MC,3,3,3,FALSE,FALSE,FALSE
40004,202004,100,,,0.645360538,1,73.71052632,4856.338052,FIMC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,question,other,manual_construction
100001,202001,100,3074,26,
100001,202002,100,9529,26,
100001,202003,100,615,26,
100001,202004,100,3540,26,
100002,202001,100,8084,19,
100002,202002,100,2422,19,
100002,202003,100,3058,19,
100002,202004,100,5608,19,
100003,202001,100,5161,46,
100003,202002,100,3648,46,
100003,202003,100,205,46,
100003,202004,100,2594,46,
100004,202001,100,,86,
100004,202002,100,,86,
100004,202003,100,,86,11085
100004,202004,100,,86,
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
100001,202001,100,,0.322594186,1,1.691693133,179.3296703,3074,R,0,3,3,TRUE,FALSE,FALSE
100001,202002,100,3.099869876,15.49430894,1.368771264,11.36048394,171.4175824,9529,R,3,3,3,FALSE,FALSE,FALSE
100001,202003,100,0.064539826,0.173728814,0.4611093,0.266016593,42.61538462,615,R,3,3,3,FALSE,FALSE,FALSE
100001,202004,100,5.756097561,,6.74787815,1,129.032967,3540,R,3,0,3,FALSE,TRUE,FALSE
100002,202001,100,,3.337737407,1,1.691693133,179.3296703,8084,R,0,3,3,TRUE,FALSE,FALSE
100002,202002,100,0.299604156,0.792020929,1.368771264,11.36048394,171.4175824,2422,R,3,3,3,FALSE,FALSE,FALSE
100002,202003,100,1.262592898,0.545292439,0.4611093,0.266016593,42.61538462,3058,R,3,3,3,FALSE,FALSE,FALSE
100002,202004,100,1.833878352,,6.74787815,1,129.032967,5608,R,3,0,3,FALSE,TRUE,FALSE
100003,202001,100,,1.414747807,1,1.691693133,179.3296703,5161,R,0,3,3,TRUE,FALSE,FALSE
100003,202002,100,0.70683976,17.79512195,1.368771264,11.36048394,171.4175824,3648,R,3,3,3,FALSE,FALSE,FALSE
100003,202003,100,0.056195175,0.079028527,0.4611093,0.266016593,42.61538462,205,R,3,3,3,FALSE,FALSE,FALSE
100003,202004,100,12.65365854,,6.74787815,1,129.032967,2594,R,3,0,3,FALSE,TRUE,FALSE
100004,202001,100,,,1,1.691693133,179.3296703,15422.351648,C,0,3,3,TRUE,FALSE,FALSE
100004,202002,100,,,1.368771264,11.36048394,171.4175824,21109.671762,FIC,3,3,3,FALSE,FALSE,FALSE
100004,202003,100,,,0.4611093,0.266016593,42.61538462,11085,MC,3,3,3,FALSE,FALSE,FALSE
100004,202004,100,,,6.74787815,1,129.032967,74800.229291,FIMC,3,0,3,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,question,other,manual_construction
60001,202001,100,5077,15,
60001,202002,100,7830,15,
60001,202003,100,1046,15,
60002,202001,100,1588,71,
60002,202002,100,1213,71,
60002,202003,100,3807,71,
60003,202001,100,6541,26,
60003,202002,100,336,26,
60003,202003,100,6351,26,
60004,202001,100,,3,839
60004,202002,100,,3,
60004,202003,100,401,3,
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
60001,202001,100,,0.648403576,1,7.141605449,117.9107143,5077,R,0,3,3,TRUE,FALSE,FALSE
60001,202002,100,1.54224936,7.485659656,0.785823852,2.619062766,83.74107143,7830,R,3,3,3,FALSE,FALSE,FALSE
60001,202003,100,0.133588761,,7.391291354,1,100.9130434,1046,R,3,0,4,FALSE,TRUE,FALSE
60002,202001,100,,1.309150866,1,7.141605449,117.9107143,1588,R,0,3,3,TRUE,FALSE,FALSE
60002,202002,100,0.763853904,0.318623588,0.785823852,2.619062766,83.74107143,1213,R,3,3,3,FALSE,FALSE,FALSE
60002,202003,100,3.138499588,,7.391291354,1,100.9130434,3807,R,3,0,4,FALSE,TRUE,FALSE
60003,202001,100,,19.4672619,1,7.141605449,117.9107143,6541,R,0,3,3,TRUE,FALSE,FALSE
60003,202002,100,0.051368292,0.052905054,0.785823852,2.619062766,83.74107143,336,R,3,3,3,FALSE,FALSE,FALSE
60003,202003,100,18.90178571,,7.391291354,1,100.9130434,6351,R,3,0,4,FALSE,TRUE,FALSE
60004,202001,100,,,1,7.141605449,117.9107143,839,MC,0,3,3,TRUE,FALSE,FALSE
60004,202002,100,,,0.785823852,2.619062766,83.74107143,1050.244169,BI,3,3,3,FALSE,FALSE,FALSE
60004,202003,100,,,7.391291354,1,100.9130434,401,R,3,0,4,FALSE,TRUE,FALSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,question,other,manual_construction
30001,202001,100,8444,51,
30001,202002,100,7476,51,
30001,202003,100,2003,51,
30002,202001,100,9343,72,
30002,202002,100,7818,72,
30002,202003,100,4897,72,
30003,202001,100,7511,7,
30003,202002,100,1761,7,
30003,202003,100,6492,7,
30004,202001,100,64,81,4321
30004,202002,100,2113,81,
30004,202003,100,,81,
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
identifier,date,group,growth_forward,growth_backward,forward,backward,construction,output,marker,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction
30001,202001,100,,1.129481006,1,1.655005651,120.199052,8444,R,0,4,4,TRUE,FALSE,FALSE
30001,202002,100,0.885362387,3.732401398,8.743054929,1.866715325,90.843602,7476,R,4,3,4,FALSE,FALSE,FALSE
30001,202003,100,0.267924024,,1.526947,1,103.015385,2003,R,3,0,3,FALSE,TRUE,FALSE
30002,202001,100,,1.195062676,1,1.655006,120.199052,9343,R,0,4,4,TRUE,FALSE,FALSE
30002,202002,100,0.836776196,1.596487645,8.743054929,1.866715325,90.843602,7818,R,4,3,4,FALSE,FALSE,FALSE
30002,202003,100,0.626375032,,1.526947,1,103.015385,4897,R,3,0,3,FALSE,TRUE,FALSE
30003,202001,100,,4.265190233,1,1.655006,120.199052,7511,R,0,4,4,TRUE,FALSE,FALSE
30003,202002,100,0.234456131,0.271256932,8.743054929,1.866715325,90.843602,1761,R,4,3,4,FALSE,FALSE,FALSE
30003,202003,100,3.686541738,,1.526947,1,103.015385,6492,R,3,0,3,FALSE,TRUE,FALSE
30004,202001,100,,0.030288689,1,1.655006,120.199052,64,R,0,4,4,TRUE,FALSE,FALSE
30004,202002,100,33.015625,,8.743054929,1.866715325,90.843602,2113,R,4,3,4,FALSE,FALSE,FALSE
30004,202003,100,,,1.526946931056,1,103.015385,3226.438865,FIR,3,0,3,FALSE,TRUE,FALSE
30 changes: 30 additions & 0 deletions tests/imputation/mean_of_ratios.toml
Original file line number Diff line number Diff line change
@@ -195,3 +195,33 @@ weight_periodicity_multiplier = 12
[scenarios.70_C_FI_FI_65_weight]
weight = "0.65"
weight_periodicity_multiplier = 12

[scenarios.71_MC_R_MC]
manual_construction_col = "manual_construction"

[scenarios.72_MC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.73_R_R_MC_FIMC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.74_C_FIC_FIC_MC]
manual_construction_col = "manual_construction"

[scenarios.75_C_FIC_MC_R]
manual_construction_col = "manual_construction"

[scenarios.76_C_MC]
manual_construction_col = "manual_construction"

[scenarios.77_R_FIR_MC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.78_C_FIC_MC_FIMC]
manual_construction_col = "manual_construction"

[scenarios.79_MC_BI_R]
manual_construction_col = "manual_construction"

[scenarios.80_MC_R_FIR]
manual_construction_col = "manual_construction"
72 changes: 69 additions & 3 deletions tests/imputation/test_engine.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pytest
from pyspark.sql.functions import lit
from pyspark.sql.types import DecimalType, LongType, StringType
from pyspark.sql.functions import lit, col
from pyspark.sql.types import DecimalType, LongType, StringType, BooleanType

from statistical_methods_library.imputation import impute, ratio_of_means
from statistical_methods_library.utilities.exceptions import ValidationError
from tests.helpers import check_df_equality

auxiliary_col = "other"
backward_col = "backward"
@@ -18,6 +19,10 @@
count_forward_col = "count_forward"
count_backward_col = "count_backward"
count_construction_col = "count_construction"
manual_construction_col = "manual_construction"
default_forward_col = "default_forward"
default_backward_col = "default_backward"
default_construction_col = "default_construction"

decimal_type = DecimalType(15, 6)

@@ -34,6 +39,10 @@
count_forward_type = LongType()
count_backward_type = LongType()
count_construction_type = LongType()
manual_construction_type = decimal_type
default_forward_type = BooleanType()
default_backward_type = BooleanType()
default_construction_type = BooleanType()

# Columns we expect in either our input or output test dataframes and their
# respective types
@@ -51,6 +60,10 @@
count_forward_col,
count_backward_col,
count_construction_col,
manual_construction_col,
default_forward_col,
default_backward_col,
default_construction_col,
)

dataframe_types = {
@@ -67,6 +80,10 @@
count_forward_col: count_forward_type,
count_backward_col: count_backward_type,
count_construction_col: count_construction_type,
manual_construction_col: manual_construction_type,
default_forward_col: default_forward_type,
default_backward_col: default_backward_type,
default_construction_col: default_construction_type,
}

bad_dataframe_types = dataframe_types.copy()
@@ -338,7 +355,6 @@ def test_back_data_drops_link_cols_when_present(fxt_load_test_csv, fxt_spark_ses
)

ret_val = impute(input_df=test_dataframe, **params, back_data_df=back_data)

assert ret_val.count() == 1


@@ -407,3 +423,53 @@ def test_input_data_contains_nulls(fxt_load_test_csv, fxt_spark_session):

with pytest.raises(ValidationError):
impute(input_df=test_dataframe, **params)


def test_back_data_fimc(fxt_load_test_csv, fxt_spark_session):
    """Check imputation with a manual construction column and back data.

    Loads the ``manual_construction_*`` unit fixtures (input, back data and
    expected output), runs ``impute`` with ``manual_construction_col`` set
    and the back data supplied, then compares the sorted result against the
    expected output after casting decimal columns to ``decimal(15, 6)``.
    """

    def load_unit_csv(file_name):
        # All three fixtures share the same columns, types and directory.
        return fxt_load_test_csv(
            dataframe_columns,
            dataframe_types,
            "imputation",
            "engine",
            "unit",
            file_name,
        )

    test_dataframe = load_unit_csv("manual_construction_input")
    back_data = load_unit_csv("manual_construction_back_data")
    expected_data = load_unit_csv("manual_construction_output")

    # Build a per-test copy instead of calling params.update(): the shared
    # module-level ``params`` dict is splatted into other tests, so mutating
    # it in place would leak manual_construction_col into them.
    mc_params = {**params, "manual_construction_col": manual_construction_col}

    scenario_actual_output = impute(
        input_df=test_dataframe, **mc_params, back_data_df=back_data
    )
    # Normalise decimal precision so the comparison with the CSV-loaded
    # expected data is not affected by differing decimal scales.
    for field_name, field_type in scenario_actual_output.dtypes:
        if field_type.startswith("decimal"):
            scenario_actual_output = scenario_actual_output.withColumn(
                field_name, col(field_name).cast("decimal(15, 6)")
            )

    sort_cols = [
        mc_params["reference_col"],
        mc_params["period_col"],
        mc_params["grouping_col"],
    ]
    check_df_equality(
        actual=scenario_actual_output.sort(sort_cols),
        expected=expected_data.sort(sort_cols),
        keep_cols=sort_cols,
    )
7 changes: 5 additions & 2 deletions tests/imputation/test_scenarios.py
Original file line number Diff line number Diff line change
@@ -78,7 +78,12 @@ def test_calculations(fxt_load_test_csv, ratio_calculator, scenario_type, scenar
field_types = default_config["field_types"]
field_types.update(test_config.get("field_types", {}))
field_types.update(scenario_config.get("field_types", {}))

imputation_kwargs.update(scenario_config)
if "manual_construction_col" in imputation_kwargs:
field_types.update({"manual_construction_col": "decimal(15,6)"})
fields.update({"manual_construction_col": "manual_construction"})

types = {fields[k]: v for k, v in field_types.items()}
scenario_file_type = scenario_type.replace("back_data_", "")
scenario_input = fxt_load_test_csv(
@@ -112,7 +117,6 @@ def test_calculations(fxt_load_test_csv, ratio_calculator, scenario_type, scenar
scenario_expected_output = scenario_expected_output.filter(
col(fields["period_col"]) >= starting_period
)

scenario_actual_output = imputation.impute(
input_df=scenario_input, **imputation_kwargs
)
@@ -121,7 +125,6 @@ def test_calculations(fxt_load_test_csv, ratio_calculator, scenario_type, scenar
scenario_actual_output = scenario_actual_output.withColumn(
field_name, col(field_name).cast("decimal(15, 6)")
)

sort_cols = [
fields["reference_col"],
fields["period_col"],