-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhandstats.py
139 lines (110 loc) · 7.04 KB
/
handstats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# todo:
# 1. move Date to first column or so.
# 2. data table columns need to be ordered by importance. possibly eliminate some unimportant columns.
import streamlit as st
import pathlib
import pickle
import pyarrow.parquet as pq
import duckdb
import polars as pl
import altair as alt
import matplotlib.pyplot as plt
import time
import bridgestatslib
import sys
sys.path.append(str(pathlib.Path.cwd().joinpath('streamlitlib'))) # global
import streamlitlib # must be placed after sys.path.append. vscode re-format likes to move this to the top
def create_query(database_name, brs_regex, sample_size=100000):
query_select = "SELECT *"
query_from = f"FROM {database_name}"
query_brs_regex = '' if len(brs_regex)==0 else f"regexp_matches(board_record_string, '{brs_regex}')"
query_where_string = ' AND'.join(s for s in [query_brs_regex] if len(s))
query_where = '' if len(query_where_string)==0 else 'WHERE '+query_where_string
query_sample = f"USING SAMPLE {sample_size}"
query_limit = '' #if limit==0 else f"LIMIT {limit}"
query = ' '.join([query_select, query_from, query_where, query_sample, query_limit])
return query
def Stats(club_or_tournament, pair_or_player, chart_options, groupby):
st.set_page_config(layout="wide", initial_sidebar_state="expanded")
streamlitlib.widen_scrollbars()
st.header(f"Hand Record Statistics for ACBL {club_or_tournament.capitalize()} Pair Games")
st.sidebar.header("Settings for Hand Record Statistics")
key_prefix = club_or_tournament # pair_or_player isn't used here
rootPath = pathlib.Path('.')
dataPath = rootPath.joinpath('data')
acbl_hand_records_augmented_filename = f"acbl_{club_or_tournament}_hand_records_augmented_narrow.parquet"
acbl_hand_records_augmented_file = dataPath.joinpath(acbl_hand_records_augmented_filename)
# tournament data is as early as 2013? club data as early as 2019?
start_date = st.sidebar.text_input('Enter start date:', value='2000-01-01', key=key_prefix+'_HandRecord-Start_Date', help='Enter starting date in YYYY-MM-DD format. Earliest year is 2019')
end_date_default = time.strftime("%Y-%m-%d")
end_date = st.sidebar.text_input('Enter end date:', value=end_date_default, key=key_prefix+'_HandRecord-End_Date', help='Enter ending date in YYYY-MM-DD format.')
chart_options = ['Par_Score','CT_NS,CT_EW','DD_N_C,DD_N_D,DD_N_H,DD_N_S,DD_N_N','SL_N_C,SL_N_D,SL_N_H,SL_N_S','SL_N_ML_SJ','HCP_NS,HCP_EW','HCP_N,HCP_E,HCP_S,HCP_W','QT_N,QT_E,QT_S,QT_W','QT_NS,QT_EW','DP_N','DP_N_C,DP_N_D,DP_N_H,DP_N_S','DP_NS,DP_EW','LoTT_Tricks','LoTT_Suit_Length','LoTT_Variance','HCP_NS,DP_NS,DD_N_N','HCP_NS,QT_NS,DD_N_N']
selected_charts = st.sidebar.multiselect('Select charts to display', chart_options, default=chart_options, key=key_prefix+'_HandRecord-Charts')
st.sidebar.header("Advanced Settings")
# select using regex
brs_regex = st.sidebar.text_input('Restrict results to boards matching this regex:', value='', key=key_prefix+'_HandRecord-brs', help='Example: ^SAK.*$').strip() # e.g. ^SAK.*$
table_display_limit = 100 # streamlit gets choked up pretty quickly. need to limit table to 100.
sample_size = 100000
st.warning('Table and charts take up to 30 to 60 seconds to render.')
with st.spinner(text="Reading hand record data ..."):
start_time = time.time()
database_name = 'hand_records_arrow'
if club_or_tournament == 'club':
hand_records_arrow = bridgestatslib.load_club_hand_records(acbl_hand_records_augmented_file)
else:
hand_records_arrow = bridgestatslib.load_tournament_hand_records(acbl_hand_records_augmented_file)
hand_records_len = hand_records_arrow.collect().height
database_column_names = hand_records_arrow.collect().columns
end_time = time.time()
st.info(f"Data read completed in {round(end_time-start_time,2)} seconds. {hand_records_len} rows read.")
with st.spinner(text="Selecting database rows ..."):
start_time = time.time()
query = create_query(database_name, brs_regex, sample_size)
query = st.text_input('Sql query',value=query,label_visibility='hidden', key=key_prefix+'_HandRecord-query')
# Convert DuckDB result to Polars DataFrame - now collecting the LazyFrame first
selected_df = pl.from_arrow(
duckdb.arrow(hand_records_arrow.collect()).query('hand_records', query).arrow()
)
# Drop duplicates based on board_record_string
selected_df = selected_df.unique(subset=["board_record_string"])
uniques = selected_df.height
# Filter by date range
selected_df = selected_df.filter(
(pl.col('Date') >= start_date) & (pl.col('Date') <= end_date)
)
selected_df_len = selected_df.height
end_time = time.time()
st.info(f"Query completed in {round(end_time-start_time,2)} seconds. Database has {hand_records_len} rows. Sampling {sample_size} random rows. {uniques} unique hands found.")
# prepare data columns
# Remove columns starting with '__'
selected_df = selected_df.select([
col for col in selected_df.columns if not col.startswith('__')
])
# Round float columns to 2 decimal places
float_cols = [col for col in selected_df.columns if selected_df[col].dtype in [pl.Float32, pl.Float64]]
selected_df = selected_df.with_columns([
pl.col(col).round(2) for col in float_cols
])
table, chart = st.tabs(["Data Table", "Charts"])
with table:
with st.spinner(text="Creating data table ..."):
start_time = time.time()
# Sample rows and sort
table_df = selected_df.sample(n=min(table_display_limit, selected_df.height))
st.text(f"Table of Hand Records. {selected_df_len} random rows selected. Table display limited to {table_df.height} random rows.")
streamlitlib.ShowDataFrameTable(table_df)
del table_df
end_time = time.time()
st.info(f"Data table created in {round(end_time-start_time,2)} seconds.")
with chart:
with st.spinner(text="Creating Charts"):
start_time = time.time()
st.write(f"Abbreviations for Chart Type: CT is Contract Type (passed-out, partial, game, small slam, grand slam), DD is Double Dummy, DP is Distribution Points, HCP is High Card Points, LoTT is Law of Total Tricks, QT is Quick Tricks, SL is Suit Length")
st.write(f"Abbreviations for individual directions: N is North, S is South, E is East W is West.")
st.write(f"Abbreviations for pair direction: NS is North-South, EW is East-West.")
st.write(f"Abbreviations for strains (suits): C is Clubs, D is Diamonds, H is Hearts, S is Spades, N is No-Trump")
st.write(f"For example: DD_N_N is Double Dummy - North - No-Trump")
st.text(f"{selected_df_len} random rows selected.")
bridgestatslib.ShowCharts(selected_df, selected_charts)
end_time = time.time()
st.info(f"Charts created in {round(end_time-start_time,2)} seconds.")