Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spp 7071 g weight, design weight and l_value validations #94

Merged
merged 5 commits into from
Jul 25, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "statistical_methods_library"
version = "5.0.1"
version = "5.0.2"
description = ""
authors = ["Your Name <[email protected]>"]
license = "MIT"
8 changes: 6 additions & 2 deletions statistical_methods_library/estimation.py
Original file line number Diff line number Diff line change
@@ -199,7 +199,9 @@ def ht_ratio(
input_df.select(period_col, strata_col).distinct().count()
!= input_df.select(period_col, strata_col, h_value_col).distinct().count()
):
raise ValidationError("The h value must be the same per period and stratum.")
raise ValidationError(
f"The {h_value_col} must be the same per period and stratum."
)

# --- prepare our working data frame ---
col_list = [
@@ -257,7 +259,9 @@ def ht_ratio(
)
> 0
):
raise ValidationError("The death count must be less than sample count.")
raise ValidationError(
f"The {death_marker_col} count must be less than {sample_marker_col} count."
)

# --- Expansion estimation ---
# If we've got a death marker and h value then we'll use these, otherwise
14 changes: 14 additions & 0 deletions statistical_methods_library/outliering.py
Original file line number Diff line number Diff line change
@@ -132,6 +132,20 @@ def winsorise(
if input_df.filter(col(col_name).isNull()).count() > 0:
raise ValidationError(f"Column {col_name} must not contain null values.")

if input_df.filter(col(design_col) < 1).count() > 0:
raise ValidationError(
f"Column {design_col} must not contain values smaller than one."
)
if input_df.filter(col(l_value_col) < 0).count() > 0:
raise ValidationError(f"Column {l_value_col} must not contain negative values.")

if calibration_col is not None and (
input_df.filter(col(calibration_col) <= 0).count() > 0
):
raise ValidationError(
f"Column {calibration_col} must not contain zero or negative values."
)

col_list = [
col(reference_col).alias("reference"),
col(period_col).alias("period"),
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
period,ref,grouping,target,design_weight,l_value
202201,345671,1,17.6,0.999,400
202201,345672,1,18.4,100,400
202201,345673,1,20,100,400
202201,345674,1,21.6,100,400
202201,345675,1,22.4,100,400
202201,345676,1,23.2,100,400
202201,345677,2,40,60,400
202201,345678,2,25.6,60,400
202201,345679,2,26.4,60,400
202201,345680,2,28,60,400
202201,345681,2,28.8,60,400
202201,345682,2,29.6,60,400
202201,345683,3,32.8,40,400
202201,345684,3,34.4,40,400
202201,345685,3,36.8,40,400
202201,345686,3,37.6,40,400
202201,345687,3,39.2,40,400
202201,345688,3,38,1,400
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
ref,period,grouping,target,design_weight,l_value,calibration_weight,auxiliary
1,202001,1,10,1,5,-0.0001,8
1,202002,1,10,2,5,4,8
1,202003,1,15,3,5,6,8
2,202001,1,20,1,5,2,8
2,202002,1,20,2,5,4,8
2,202003,1,25,3,5,6,8
3,202001,2,30,1,10,2,8
3,202002,2,30,2,10,4,8
3,202003,2,35,3,10,6,8
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
ref,period,grouping,target,design_weight,l_value,calibration_weight,auxiliary
1,202001,1,10,1,-0.05,4,8
1,202002,1,10,2,-0.05,4,8
1,202003,1,15,3,-0.05,6,8
2,202001,1,20,1,-0.05,2,8
2,202002,1,20,2,-0.05,4,8
2,202003,1,25,3,-0.05,6,8
3,202001,2,30,1,10,2,8
3,202002,2,30,2,10,4,8
3,202003,2,35,3,10,6,8
70 changes: 70 additions & 0 deletions tests/test_outlering.py
Original file line number Diff line number Diff line change
@@ -332,6 +332,76 @@ def test_winsorise_different_stratum_l_values_in_same_period_fails(fxt_load_test
)


@pytest.mark.dependency()
def test_winsorise_negative_calibration_weight_fails(fxt_load_test_csv):
    """Check that winsorise rejects the negative calibration weight test data.

    Loads the "negative_calibration_weight" unit test data and expects
    ``outliering.winsorise`` to raise ``ValidationError`` with the message
    naming the calibration weight column.
    """
    import re  # local import; the module's import block is not modified here

    test_dataframe = fxt_load_test_csv(
        dataframe_columns,
        dataframe_types,
        "outliering",
        "winsorise",
        "unit",
        "negative_calibration_weight",
    )
    # Built outside the raises block so that only the winsorise call itself
    # can satisfy the expectation.
    additional_params = [
        *default_params,
        outlier_weight_col,
        calibration_weight_col,
        auxiliary_col,
    ]
    # pytest treats ``match`` as a regular expression, so the literal message
    # (column name and trailing full stop included) is escaped first.
    expected_message = (
        f"Column {calibration_weight_col} must "
        "not contain zero or negative values."
    )

    with pytest.raises(
        outliering.ValidationError,
        match=re.escape(expected_message),
    ):
        outliering.winsorise(
            test_dataframe,
            *additional_params,
        )


@pytest.mark.dependency()
def test_winsorise_negative_l_value_fails(fxt_load_test_csv):
    """Check that winsorise rejects the negative L-value test data.

    Loads the "negative_l_value" unit test data and expects
    ``outliering.winsorise`` to raise ``ValidationError`` with the message
    naming the L-value column.
    """
    import re  # local import; the module's import block is not modified here

    test_dataframe = fxt_load_test_csv(
        dataframe_columns,
        dataframe_types,
        "outliering",
        "winsorise",
        "unit",
        "negative_l_value",
    )
    # pytest treats ``match`` as a regular expression, so the literal message
    # (column name and trailing full stop included) is escaped first.
    expected_message = f"Column {l_value_col} must not contain negative values."

    with pytest.raises(
        outliering.ValidationError,
        match=re.escape(expected_message),
    ):
        outliering.winsorise(
            test_dataframe,
            *default_params,
        )


@pytest.mark.dependency()
def test_winsorise_design_weight_smaller_than_one_fails(fxt_load_test_csv):
    """Check that winsorise rejects the too-small design weight test data.

    Loads the "design_weight_smaller_than_one" unit test data and expects
    ``outliering.winsorise`` to raise ``ValidationError`` with the message
    naming the design weight column.
    """
    import re  # local import; the module's import block is not modified here

    test_dataframe = fxt_load_test_csv(
        dataframe_columns,
        dataframe_types,
        "outliering",
        "winsorise",
        "unit",
        "design_weight_smaller_than_one",
    )
    # pytest treats ``match`` as a regular expression, so the literal message
    # (column name and trailing full stop included) is escaped first.
    expected_message = (
        f"Column {design_weight_col} must not contain values smaller than one."
    )

    with pytest.raises(
        outliering.ValidationError,
        match=re.escape(expected_message),
    ):
        outliering.winsorise(
            test_dataframe,
            *default_params,
        )


@pytest.mark.dependency()
def test_winsorise_different_stratum_l_values_in_different_periods_succeeds(
fxt_load_test_csv,