diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..0dd6f0561
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,6 @@
+[*.py]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 5ca0973f8..000000000
--- a/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.DS_Store
-
diff --git a/ann/src/main/python/dataflow/faiss_index_bq_dataset.py b/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
index dd45070db..b9eca2fc3 100644
--- a/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
+++ b/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
@@ -3,230 +3,246 @@
 import os
 import pkgutil
 import sys
+from typing import Dict, List, Optional
 from urllib.parse import urlsplit
 
 import apache_beam as beam
-from apache_beam.options.pipeline_options import PipelineOptions
 import faiss
+from apache_beam.options.pipeline_options import PipelineOptions
 
 
-def parse_d6w_config(argv=None):
-  """Parse d6w config.
-  :param argv: d6w config
-  :return: dictionary containing d6w config
-  """
-
-  parser = argparse.ArgumentParser(
-    description="See https://docbird.twitter.biz/d6w/model.html for any parameters inherited from d6w job config"
-  )
-  parser.add_argument("--job_name", dest="job_name", required=True, help="d6w attribute")
-  parser.add_argument("--project", dest="project", required=True, help="d6w attribute")
-  parser.add_argument(
-    "--staging_location", dest="staging_location", required=True, help="d6w attribute"
-  )
-  parser.add_argument("--temp_location", dest="temp_location", required=True, help="d6w attribute")
-  parser.add_argument(
-    "--output_location",
-    dest="output_location",
-    required=True,
-    help="GCS bucket and path where resulting artifacts are uploaded",
-  )
-  parser.add_argument(
-    "--service_account_email", dest="service_account_email", required=True, help="d6w attribute"
-  )
-  parser.add_argument(
-    "--factory_string",
-    dest="factory_string",
-    required=False,
-    help="FAISS factory string describing index to build. See https://github.com/facebookresearch/faiss/wiki/The-index-factory",
-  )
-  parser.add_argument(
-    "--metric",
-    dest="metric",
-    required=True,
-    help="Metric used to compute distance between embeddings. Valid values are 'l2', 'ip', 'l1', 'linf'",
-  )
-  parser.add_argument(
-    "--use_gpu",
-    dest="gpu",
-    required=True,
-    help="--use_gpu=yes if you want to use GPU during index building",
-  )
-
-  known_args, unknown_args = parser.parse_known_args(argv)
-  d6w_config = vars(known_args)
-  d6w_config["gpu"] = d6w_config["gpu"].lower() == "yes"
-  d6w_config["metric"] = parse_metric(d6w_config)
-
-  """
-  WARNING: Currently, d6w (a Twitter tool used to deploy Dataflow jobs to GCP) and
-  PipelineOptions.for_dataflow_runner (a helper method in twitter.ml.common.apache_beam) do not
-  play nicely together. The helper method will overwrite some of the config specified in the d6w
-  file using the defaults in https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24.'
-  However, the d6w output message will still report that the config specified in the d6w file was used.
- """ - logging.warning( - f"The following d6w config parameters will be overwritten by the defaults in " - f"https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24\n" - f"{str(unknown_args)}" - ) - return d6w_config - - -def get_bq_query(): - """ - Query is expected to return rows with unique entityId - """ - return pkgutil.get_data(__name__, "bq.sql").decode("utf-8") - - -def parse_metric(config): - metric_str = config["metric"].lower() - if metric_str == "l2": - return faiss.METRIC_L2 - elif metric_str == "ip": - return faiss.METRIC_INNER_PRODUCT - elif metric_str == "l1": - return faiss.METRIC_L1 - elif metric_str == "linf": - return faiss.METRIC_Linf - else: - raise Exception(f"Unknown metric: {metric_str}") +def parse_d6w_config(argv: Optional[List[str]] = None): + """Parse d6w config. + :param argv: d6w config + :return: dictionary containing d6w config + """ - -def run_pipeline(argv=[]): - config = parse_d6w_config(argv) - argv_with_extras = argv - if config["gpu"]: - argv_with_extras.extend(["--experiments", "use_runner_v2"]) - argv_with_extras.extend( - ["--experiments", "worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver"] + parser = argparse.ArgumentParser( + description="See https://docbird.twitter.biz/d6w/model.html for any parameters inherited from d6w job config" ) - argv_with_extras.extend( - [ - "--worker_harness_container_image", - "gcr.io/twttr-recos-ml-prod/dataflow-gpu/beam2_39_0_py3_7", - ] + parser.add_argument( + "--job_name", dest="job_name", required=True, help="d6w attribute" ) - - options = PipelineOptions(argv_with_extras) - output_bucket_name = urlsplit(config["output_location"]).netloc - - with beam.Pipeline(options=options) as p: - input_data = p | "Read from BigQuery" >> beam.io.ReadFromBigQuery( - method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, - query=get_bq_query(), - use_standard_sql=True, + parser.add_argument( + "--project", dest="project", required=True, help="d6w attribute" + ) + parser.add_argument( + "--staging_location", + dest="staging_location", + required=True, + help="d6w attribute", + ) + parser.add_argument( + "--temp_location", dest="temp_location", required=True, help="d6w attribute" + ) + parser.add_argument( + "--output_location", + dest="output_location", + required=True, + help="GCS bucket and path where resulting artifacts are uploaded", + ) + parser.add_argument( + "--service_account_email", + dest="service_account_email", + required=True, + help="d6w attribute", + ) + parser.add_argument( + "--factory_string", + dest="factory_string", + required=False, + help="FAISS factory string describing index to build. See https://github.com/facebookresearch/faiss/wiki/The-index-factory", + ) + parser.add_argument( + "--metric", + dest="metric", + required=True, + help="Metric used to compute distance between embeddings. 
Valid values are 'l2', 'ip', 'l1', 'linf'", + ) + parser.add_argument( + "--use_gpu", + dest="gpu", + required=True, + help="--use_gpu=yes if you want to use GPU during index building", ) - index_built = input_data | "Build and upload index" >> beam.CombineGlobally( - MergeAndBuildIndex( - output_bucket_name, - config["output_location"], - config["factory_string"], - config["metric"], - config["gpu"], - ) + known_args, unknown_args = parser.parse_known_args(argv) + d6w_config = vars(known_args) + d6w_config["gpu"] = d6w_config["gpu"].lower() == "yes" + d6w_config["metric"] = parse_metric(d6w_config) + + """ + WARNING: Currently, d6w (a Twitter tool used to deploy Dataflow jobs to GCP) and + PipelineOptions.for_dataflow_runner (a helper method in twitter.ml.common.apache_beam) do not + play nicely together. The helper method will overwrite some of the config specified in the d6w + file using the defaults in https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24.' + However, the d6w output message will still report that the config specified in the d6w file was used. + """ + logging.warning( + f"The following d6w config parameters will be overwritten by the defaults in " + f"https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24\n" + f"{str(unknown_args)}" ) + return d6w_config + + +def get_bq_query(): + """ + Query is expected to return rows with unique entityId + """ + return pkgutil.get_data(__name__, "bq.sql").decode("utf-8") + + +def parse_metric(config: Dict[str, str]): + metric_str = config["metric"].lower() + if metric_str == "l2": + return faiss.METRIC_L2 + elif metric_str == "ip": + return faiss.METRIC_INNER_PRODUCT + elif metric_str == "l1": + return faiss.METRIC_L1 + elif metric_str == "linf": + return faiss.METRIC_Linf + raise Exception(f"Unknown metric: {metric_str}") + - # Make linter happy - index_built +def run_pipeline(argv: List[str] = []): + config = parse_d6w_config(argv) + argv_with_extras = argv + if config["gpu"]: + argv_with_extras.extend(["--experiments", "use_runner_v2"]) + argv_with_extras.extend( + [ + "--experiments", + "worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver", + ] + ) + argv_with_extras.extend( + [ + "--worker_harness_container_image", + "gcr.io/twttr-recos-ml-prod/dataflow-gpu/beam2_39_0_py3_7", + ] + ) + + options = PipelineOptions(argv_with_extras) + output_bucket_name = urlsplit(config["output_location"]).netloc + + with beam.Pipeline(options=options) as p: + input_data = p | "Read from BigQuery" >> beam.io.ReadFromBigQuery( + method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, + query=get_bq_query(), + use_standard_sql=True, + ) + + index_built = input_data | "Build and upload index" >> beam.CombineGlobally( + MergeAndBuildIndex( + output_bucket_name, + config["output_location"], + config["factory_string"], + config["metric"], + config["gpu"], + ) + ) # pylint: disable=unused-variable class MergeAndBuildIndex(beam.CombineFn): - def __init__(self, bucket_name, gcs_output_path, factory_string, metric, gpu): - self.bucket_name = bucket_name - self.gcs_output_path = gcs_output_path - self.factory_string = factory_string - self.metric = metric - self.gpu = gpu - - def create_accumulator(self): - return [] - - def add_input(self, accumulator, element): - accumulator.append(element) - return accumulator - - def merge_accumulators(self, accumulators): - merged = [] - for accum in accumulators: - 
merged.extend(accum) - return merged - - def extract_output(self, rows): - # Reimports are needed on workers - import glob - import subprocess - - import faiss - from google.cloud import storage - import numpy as np - - client = storage.Client() - bucket = client.get_bucket(self.bucket_name) - - logging.info("Building FAISS index") - logging.info(f"There are {len(rows)} rows") - - ids = np.array([x["entityId"] for x in rows]).astype("long") - embeds = np.array([x["embedding"] for x in rows]).astype("float32") - dimensions = len(embeds[0]) - N = ids.shape[0] - logging.info(f"There are {dimensions} dimensions") - - if self.factory_string is None: - M = 48 - - divideable_dimensions = (dimensions // M) * M - if divideable_dimensions != dimensions: - opq_prefix = f"OPQ{M}_{divideable_dimensions}" - else: - opq_prefix = f"OPQ{M}" - - clusters = N // 20 - self.factory_string = f"{opq_prefix},IVF{clusters},PQ{M}" - - logging.info(f"Factory string is {self.factory_string}, metric={self.metric}") - - if self.gpu: - logging.info("Using GPU") - - res = faiss.StandardGpuResources() - cpu_index = faiss.index_factory(dimensions, self.factory_string, self.metric) - cpu_index = faiss.IndexIDMap(cpu_index) - gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index) - gpu_index.train(embeds) - gpu_index.add_with_ids(embeds, ids) - cpu_index = faiss.index_gpu_to_cpu(gpu_index) - else: - logging.info("Using CPU") - - cpu_index = faiss.index_factory(dimensions, self.factory_string, self.metric) - cpu_index = faiss.IndexIDMap(cpu_index) - cpu_index.train(embeds) - cpu_index.add_with_ids(embeds, ids) - - logging.info("Built faiss index") - - local_path = "/indices" - logging.info(f"Writing indices to local {local_path}") - subprocess.run(f"mkdir -p {local_path}".strip().split()) - local_index_path = os.path.join(local_path, "result.index") - - faiss.write_index(cpu_index, local_index_path) - logging.info(f"Done writing indices to local {local_path}") - - logging.info(f"Uploading to GCS with path {self.gcs_output_path}") - assert os.path.isdir(local_path) - for local_file in glob.glob(local_path + "/*"): - remote_path = os.path.join( - self.gcs_output_path.split("/")[-1], local_file[1 + len(local_path) :] - ) - blob = bucket.blob(remote_path) - blob.upload_from_filename(local_file) + def __init__(self, bucket_name, gcs_output_path, factory_string, metric, gpu): + self.bucket_name = bucket_name + self.gcs_output_path = gcs_output_path + self.factory_string = factory_string + self.metric = metric + self.gpu = gpu + + def create_accumulator(self): + return [] + + def add_input(self, accumulator: List, element) -> List: + accumulator.append(element) + return accumulator + + def merge_accumulators(self, accumulators): + merged = [] + for accum in accumulators: + merged.extend(accum) + return merged + + def extract_output(self, rows): + # Reimports are needed on workers + import glob + import subprocess + + import faiss + import numpy as np + from google.cloud import storage + + client = storage.Client() + bucket = client.get_bucket(self.bucket_name) + + logging.info("Building FAISS index") + logging.info(f"There are {len(rows)} rows") + + ids = np.array([x["entityId"] for x in rows]).astype("long") + embeds = np.array([x["embedding"] for x in rows]).astype("float32") + dimensions = len(embeds[0]) + N = ids.shape[0] + logging.info(f"There are {dimensions} dimensions") + + if self.factory_string is None: + M = 48 + + divideable_dimensions = (dimensions // M) * M + if divideable_dimensions != dimensions: + opq_prefix = 
f"OPQ{M}_{divideable_dimensions}" + else: + opq_prefix = f"OPQ{M}" + + clusters = N // 20 + self.factory_string = f"{opq_prefix},IVF{clusters},PQ{M}" + + logging.info(f"Factory string is {self.factory_string}, metric={self.metric}") + + if self.gpu: + logging.info("Using GPU") + + res = faiss.StandardGpuResources() + cpu_index = faiss.index_factory( + dimensions, self.factory_string, self.metric + ) + cpu_index = faiss.IndexIDMap(cpu_index) + gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index) + gpu_index.train(embeds) + gpu_index.add_with_ids(embeds, ids) + cpu_index = faiss.index_gpu_to_cpu(gpu_index) + else: + logging.info("Using CPU") + + cpu_index = faiss.index_factory( + dimensions, self.factory_string, self.metric + ) + cpu_index = faiss.IndexIDMap(cpu_index) + cpu_index.train(embeds) + cpu_index.add_with_ids(embeds, ids) + + logging.info("Built faiss index") + + local_path = "/indices" + logging.info(f"Writing indices to local {local_path}") + subprocess.run(f"mkdir -p {local_path}".strip().split()) + local_index_path = os.path.join(local_path, "result.index") + + faiss.write_index(cpu_index, local_index_path) + logging.info(f"Done writing indices to local {local_path}") + + logging.info(f"Uploading to GCS with path {self.gcs_output_path}") + assert os.path.isdir(local_path) + for local_file in glob.glob(local_path + "/*"): + remote_path = os.path.join( + self.gcs_output_path.split("/")[-1], local_file[1 + len(local_path) :] + ) + blob = bucket.blob(remote_path) + blob.upload_from_filename(local_file) if __name__ == "__main__": - logging.getLogger().setLevel(logging.INFO) - run_pipeline(sys.argv) + logging.getLogger().setLevel(logging.INFO) + run_pipeline(sys.argv) diff --git a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py index 167756c01..7bc382b54 100644 --- a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py +++ b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py @@ -2,82 +2,82 @@ from twml.feature_config import FeatureConfigBuilder -def get_feature_config(data_spec_path, label): - return ( - FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) - .batch_add_features( - [ - ("ebd.author_specific_score", "A"), - ("ebd.has_diff_lang", "A"), - ("ebd.has_english_tweet_diff_ui_lang", "A"), - ("ebd.has_english_ui_diff_tweet_lang", "A"), - ("ebd.is_self_tweet", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.from_verified_account_flag", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - # ("encoded_tweet_features.has_consumer_video_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_link_flag", "A"), - ("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"), - # ("encoded_tweet_features.has_multiple_media_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", "A"), - ("encoded_tweet_features.has_trend_flag", "A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_offensive_flag", 
"A"), - ("encoded_tweet_features.is_reply_flag", "A"), - ("encoded_tweet_features.is_retweet_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - # ("encoded_tweet_features.is_user_new_flag", "A"), - ("encoded_tweet_features.language", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - # ("encoded_tweet_features.profile_is_egg_flag", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.embeds_impression_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - ("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - ("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ] +def get_feature_config(data_spec_path: str, label: str) -> FeatureConfigBuilder: + return ( + FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) + .batch_add_features( + [ + ("ebd.author_specific_score", "A"), + ("ebd.has_diff_lang", "A"), + ("ebd.has_english_tweet_diff_ui_lang", "A"), + ("ebd.has_english_ui_diff_tweet_lang", "A"), + ("ebd.is_self_tweet", "A"), + ("ebd.tweet_age_in_secs", "A"), + ("encoded_tweet_features.favorite_count", "A"), + ("encoded_tweet_features.from_verified_account_flag", "A"), + ("encoded_tweet_features.has_card_flag", "A"), + # ("encoded_tweet_features.has_consumer_video_flag", "A"), + ("encoded_tweet_features.has_image_url_flag", "A"), + ("encoded_tweet_features.has_link_flag", "A"), + ("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"), + # ("encoded_tweet_features.has_multiple_media_flag", "A"), + ("encoded_tweet_features.has_native_image_flag", "A"), + ("encoded_tweet_features.has_news_url_flag", "A"), + ("encoded_tweet_features.has_periscope_flag", "A"), + ("encoded_tweet_features.has_pro_video_flag", "A"), + ("encoded_tweet_features.has_quote_flag", "A"), + ("encoded_tweet_features.has_trend_flag", "A"), + ("encoded_tweet_features.has_video_url_flag", "A"), + ("encoded_tweet_features.has_vine_flag", "A"), + ("encoded_tweet_features.has_visible_link_flag", "A"), + ("encoded_tweet_features.is_offensive_flag", "A"), + ("encoded_tweet_features.is_reply_flag", "A"), + ("encoded_tweet_features.is_retweet_flag", "A"), + ("encoded_tweet_features.is_sensitive_content", "A"), + # ("encoded_tweet_features.is_user_new_flag", "A"), + ("encoded_tweet_features.language", "A"), + ("encoded_tweet_features.link_language", "A"), + 
("encoded_tweet_features.num_hashtags", "A"), + ("encoded_tweet_features.num_mentions", "A"), + # ("encoded_tweet_features.profile_is_egg_flag", "A"), + ("encoded_tweet_features.reply_count", "A"), + ("encoded_tweet_features.retweet_count", "A"), + ("encoded_tweet_features.text_score", "A"), + ("encoded_tweet_features.user_reputation", "A"), + ("extended_encoded_tweet_features.embeds_impression_count", "A"), + ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), + ("extended_encoded_tweet_features.embeds_url_count", "A"), + ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), + ("extended_encoded_tweet_features.favorite_count_v2", "A"), + ("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_dup_content_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_spam_flag", "A"), + ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.quote_count", "A"), + ("extended_encoded_tweet_features.reply_count_v2", "A"), + ("extended_encoded_tweet_features.retweet_count_v2", "A"), + ("extended_encoded_tweet_features.weighted_favorite_count", "A"), + ("extended_encoded_tweet_features.weighted_quote_count", "A"), + ("extended_encoded_tweet_features.weighted_reply_count", "A"), + ("extended_encoded_tweet_features.weighted_retweet_count", "A"), + ] + ) + .add_labels( + [ + label, # Tensor index: 0 + "recap.engagement.is_clicked", # Tensor index: 1 + "recap.engagement.is_favorited", # Tensor index: 2 + "recap.engagement.is_open_linked", # Tensor index: 3 + "recap.engagement.is_photo_expanded", # Tensor index: 4 + "recap.engagement.is_profile_clicked", # Tensor index: 5 + "recap.engagement.is_replied", # Tensor index: 6 + "recap.engagement.is_retweeted", # Tensor index: 7 + "recap.engagement.is_video_playback_50", # Tensor index: 8 + "timelines.earlybird_score", # Tensor index: 9 + ] + ) + .define_weight("meta.record_weight/type=earlybird") + .build() ) - .add_labels( - [ - label, # Tensor index: 0 - "recap.engagement.is_clicked", # Tensor index: 1 - "recap.engagement.is_favorited", # Tensor index: 2 - "recap.engagement.is_open_linked", # Tensor index: 3 - "recap.engagement.is_photo_expanded", # Tensor index: 4 - "recap.engagement.is_profile_clicked", # Tensor index: 5 - "recap.engagement.is_replied", # Tensor index: 6 - "recap.engagement.is_retweeted", # Tensor index: 7 - "recap.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ] - ) - .define_weight("meta.record_weight/type=earlybird") - .build() - ) diff --git a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py index 85b7d7f10..faec156c6 100644 --- a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py +++ b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py @@ -2,73 +2,78 @@ from twml.feature_config import FeatureConfigBuilder -def get_feature_config(data_spec_path, label): - return FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) \ - .batch_add_features( - [ - ("ebd.has_diff_lang", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.composer_source_is_camera_flag", "A"), - 
("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", "A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - ("encoded_tweet_features.is_user_spam_flag", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.decayed_favorite_count", "A"), - ("extended_encoded_tweet_features.decayed_quote_count", "A"), - ("extended_encoded_tweet_features.decayed_reply_count", "A"), - ("extended_encoded_tweet_features.decayed_retweet_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.fake_favorite_count", "A"), - ("extended_encoded_tweet_features.fake_quote_count", "A"), - ("extended_encoded_tweet_features.fake_reply_count", "A"), - ("extended_encoded_tweet_features.fake_retweet_count", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.periscope_exists", "A"), - ("extended_encoded_tweet_features.periscope_has_been_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_currently_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_from_quality_source", "A"), - ("extended_encoded_tweet_features.periscope_is_live", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - ("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - ("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ("timelines.earlybird.visible_token_ratio", "A") - ] - ).add_labels([ - label, # Tensor index: 0 - "itl.engagement.is_clicked", # Tensor index: 1 - "itl.engagement.is_favorited", # Tensor index: 2 - "itl.engagement.is_open_linked", # Tensor index: 3 - "itl.engagement.is_photo_expanded", # Tensor index: 4 - "itl.engagement.is_profile_clicked", # Tensor index: 5 - "itl.engagement.is_replied", # Tensor index: 6 - "itl.engagement.is_retweeted", # Tensor index: 7 - "itl.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ]) \ - .define_weight("meta.record_weight/type=earlybird") \ - .build() +def get_feature_config(data_spec_path: str, label: str) -> FeatureConfigBuilder: + return ( + 
FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) + .batch_add_features( + [ + ("ebd.has_diff_lang", "A"), + ("ebd.tweet_age_in_secs", "A"), + ("encoded_tweet_features.composer_source_is_camera_flag", "A"), + ("encoded_tweet_features.favorite_count", "A"), + ("encoded_tweet_features.has_card_flag", "A"), + ("encoded_tweet_features.has_image_url_flag", "A"), + ("encoded_tweet_features.has_native_image_flag", "A"), + ("encoded_tweet_features.has_news_url_flag", "A"), + ("encoded_tweet_features.has_periscope_flag", "A"), + ("encoded_tweet_features.has_pro_video_flag", "A"), + ("encoded_tweet_features.has_quote_flag", "A"), + ("encoded_tweet_features.has_video_url_flag", "A"), + ("encoded_tweet_features.has_vine_flag", "A"), + ("encoded_tweet_features.has_visible_link_flag", "A"), + ("encoded_tweet_features.is_sensitive_content", "A"), + ("encoded_tweet_features.is_user_spam_flag", "A"), + ("encoded_tweet_features.link_language", "A"), + ("encoded_tweet_features.num_hashtags", "A"), + ("encoded_tweet_features.num_mentions", "A"), + ("encoded_tweet_features.reply_count", "A"), + ("encoded_tweet_features.retweet_count", "A"), + ("encoded_tweet_features.text_score", "A"), + ("encoded_tweet_features.user_reputation", "A"), + ("extended_encoded_tweet_features.decayed_favorite_count", "A"), + ("extended_encoded_tweet_features.decayed_quote_count", "A"), + ("extended_encoded_tweet_features.decayed_reply_count", "A"), + ("extended_encoded_tweet_features.decayed_retweet_count", "A"), + ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), + ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), + ("extended_encoded_tweet_features.fake_favorite_count", "A"), + ("extended_encoded_tweet_features.fake_quote_count", "A"), + ("extended_encoded_tweet_features.fake_reply_count", "A"), + ("extended_encoded_tweet_features.fake_retweet_count", "A"), + ("extended_encoded_tweet_features.favorite_count_v2", "A"), + ("extended_encoded_tweet_features.label_dup_content_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.periscope_exists", "A"), + ("extended_encoded_tweet_features.periscope_has_been_featured", "A"), + ("extended_encoded_tweet_features.periscope_is_currently_featured","A"), + ("extended_encoded_tweet_features.periscope_is_from_quality_source", "A"), + ("extended_encoded_tweet_features.periscope_is_live", "A"), + ("extended_encoded_tweet_features.quote_count", "A"), + ("extended_encoded_tweet_features.reply_count_v2", "A"), + ("extended_encoded_tweet_features.retweet_count_v2", "A"), + ("extended_encoded_tweet_features.weighted_favorite_count", "A"), + ("extended_encoded_tweet_features.weighted_quote_count", "A"), + ("extended_encoded_tweet_features.weighted_reply_count", "A"), + ("extended_encoded_tweet_features.weighted_retweet_count", "A"), + ("timelines.earlybird.visible_token_ratio", "A"), + ] + ) + .add_labels( + [ + label, # Tensor index: 0 + "itl.engagement.is_clicked", # Tensor index: 1 + "itl.engagement.is_favorited", # Tensor index: 2 + "itl.engagement.is_open_linked", # Tensor index: 3 + "itl.engagement.is_photo_expanded", # Tensor index: 4 + "itl.engagement.is_profile_clicked", # Tensor index: 5 + "itl.engagement.is_replied", # Tensor index: 6 + "itl.engagement.is_retweeted", # Tensor index: 7 + "itl.engagement.is_video_playback_50", # Tensor index: 8 + 
"timelines.earlybird_score", # Tensor index: 9 + ] + ) + .define_weight("meta.record_weight/type=earlybird") + .build() + ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py index 57178b92c..d20fccb52 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py @@ -1,21 +1,30 @@ # checkstyle: noqa INDEX_BY_LABEL = { - "is_clicked": 1, - "is_favorited": 2, - "is_open_linked": 3, - "is_photo_expanded": 4, - "is_profile_clicked": 5, - "is_replied": 6, - "is_retweeted": 7, - "is_video_playback_50": 8 + "is_clicked": 1, + "is_favorited": 2, + "is_open_linked": 3, + "is_photo_expanded": 4, + "is_profile_clicked": 5, + "is_replied": 6, + "is_retweeted": 7, + "is_video_playback_50": 8, } TARGET_LABEL_IDX = 0 + EB_SCORE_IDX = 9 -LABEL_NAMES = [label_name for label_name, _ in sorted(INDEX_BY_LABEL.items(), key=lambda item: item[1])] +LABEL_NAMES = [ + label_name + for label_name, _ in sorted(INDEX_BY_LABEL.items(), key=lambda item: item[1]) +] -PREDICTED_CLASSES = \ - ["tf_target"] + ["tf_" + label_name for label_name in LABEL_NAMES] + ["tf_timelines.earlybird_score"] + \ - ["lolly_target"] + ["lolly_" + label_name for label_name in LABEL_NAMES] + ["lolly_timelines.earlybird_score"] +PREDICTED_CLASSES = ( + ["tf_target"] + + ["tf_" + label_name for label_name in LABEL_NAMES] + + ["tf_timelines.earlybird_score"] + + ["lolly_target"] + + ["lolly_" + label_name for label_name in LABEL_NAMES] + + ["lolly_timelines.earlybird_score"] +) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py index cf0c38ecc..f361c6cd5 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py @@ -1,43 +1,65 @@ # checkstyle: noqa +from typing import Dict, Final +from tensorflow import Tensor +from twml import DefaultSubcommandArgParse +import argparse import tensorflow.compat.v1 as tf + from .constants import INDEX_BY_LABEL, LABEL_NAMES # TODO: Read these from command line arguments, since they specify the existing example weights in the input data. -DEFAULT_WEIGHT_BY_LABEL = { - "is_clicked": 0.3, - "is_favorited": 1.0, - "is_open_linked": 0.1, - "is_photo_expanded": 0.03, - "is_profile_clicked": 1.0, - "is_replied": 9.0, - "is_retweeted": 1.0, - "is_video_playback_50": 0.01 + +DEFAULT_WEIGHT_BY_LABEL: Final[Dict[str, float]] = { + "is_clicked": 0.3, + "is_favorited": 1.0, + "is_open_linked": 0.1, + "is_photo_expanded": 0.03, + "is_profile_clicked": 1.0, + "is_replied": 9.0, + "is_retweeted": 1.0, + "is_video_playback_50": 0.01, } -def add_weight_arguments(parser): - for label_name in LABEL_NAMES: - parser.add_argument( - _make_weight_cli_argument_name(label_name), - type=float, - default=DEFAULT_WEIGHT_BY_LABEL[label_name], - dest=_make_weight_param_name(label_name) - ) - -def make_weights_tensor(input_weights, label, params): - ''' - Replaces the weights for each positive engagement and keeps the input weights for negative examples. 
-  '''
-  weight_tensors = [input_weights]
-  for label_name in LABEL_NAMES:
-    index, default_weight = INDEX_BY_LABEL[label_name], DEFAULT_WEIGHT_BY_LABEL[label_name]
-    weight_param_name =_make_weight_param_name(label_name)
-    weight_tensors.append(
-      tf.reshape(tf.math.scalar_mul(getattr(params, weight_param_name) - default_weight, label[:, index]), [-1, 1])
-    )
-  return tf.math.accumulate_n(weight_tensors)
-
-def _make_weight_cli_argument_name(label_name):
-  return f"--weight.{label_name}"
-
-def _make_weight_param_name(label_name):
-  return f"weight_{label_name}"
+def add_weight_arguments(parser: DefaultSubcommandArgParse) -> None:
+    """Adds command line arguments for example weights."""
+
+    for label_name in LABEL_NAMES:
+        parser.add_argument(
+            _make_weight_cli_argument_name(label_name),
+            type=float,
+            default=DEFAULT_WEIGHT_BY_LABEL[label_name],
+            dest=_make_weight_param_name(label_name),
+        )
+
+
+def make_weights_tensor(
+    input_weights: tf.Tensor, label: tf.Tensor, params
+) -> tf.Tensor:
+    """Replaces the weights for each positive engagement and keeps the input weights for negative examples."""
+
+    weight_tensors = [input_weights]
+    for label_name in LABEL_NAMES:
+        index, default_weight = (
+            INDEX_BY_LABEL[label_name],
+            DEFAULT_WEIGHT_BY_LABEL[label_name],
+        )
+        weight_param_name = _make_weight_param_name(label_name)
+        weight_tensors.append(
+            tf.reshape(
+                tf.math.scalar_mul(
+                    getattr(params, weight_param_name) - default_weight, label[:, index]
+                ),
+                [-1, 1],
+            )
+        )
+    return tf.math.accumulate_n(weight_tensors)
+
+def _make_weight_cli_argument_name(label_name: str) -> str:
+    """Returns the name of the command line argument that holds the weight for the given label."""
+
+    return f"--weight.{label_name}"
+
+def _make_weight_param_name(label_name: str) -> str:
+    """Returns the name of the parameter that holds the weight for the given label."""
+
+    return f"weight_{label_name}"
\ No newline at end of file
diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py
index 723dd626c..5d7e7124a 100644
--- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py
+++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py
@@ -1,23 +1,36 @@
 # checkstyle: noqa
 import tensorflow.compat.v1 as tf
+
 from ..constants import EB_SCORE_IDX
+
 # The rationale behind this logic is available at TQ-9678.
-def get_lolly_logits(labels):
-  '''
-  :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
-  :return: tf.Tensor of shape (batch size) with the extracted lolly logits.
-  '''
-  eb_lolly_scores = get_lolly_scores(labels)
-  inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores)
-  lolly_activations = tf.math.subtract(tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores))
-  return lolly_activations
-
-def get_lolly_scores(labels):
-  '''
-  :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
-  :return: tf.Tensor of shape (batch size) with the extracted lolly scores.
- ''' - logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) - eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) - return eb_lolly_scores +def get_lolly_logits(labels: tf.Tensor) -> tf.Tensor: + """ + Args: + labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. + + Returns: + tf.Tensor of shape (batch size) with the extracted lolly logits. + """ + + eb_lolly_scores = get_lolly_scores(labels) + inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores) + lolly_activations = tf.math.subtract( + tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores) + ) + return lolly_activations + + +def get_lolly_scores(labels: tf.Tensor) -> tf.Tensor: + """ + Args: + labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. + + Returns: + tf.Tensor of shape (batch size) with the extracted lolly scores. + """ + + logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) + eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) + return eb_lolly_scores diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py index cb39c67a7..b2454a870 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py @@ -1,145 +1,153 @@ import re +from typing import Tuple from twitter.deepbird.io.util import _get_feature_id class Parser(object): - def parse(self, line): - match = re.search(self.pattern(), line) - if match: - return self._parse_match(match) - return None + """Base class for parsers.""" - def pattern(self): - raise NotImplementedError + def parse(self, line: str) -> object: + match = re.search(self.pattern(), line) + if match: + return self._parse_match(match) + return None - def _parse_match(self, match): - raise NotImplementedError + def _parse_match(self, match: re.Match) -> float: + return float(match.group(1)) + + def pattern(self): + raise NotImplementedError class BiasParser(Parser): - ''' - Parses the bias feature available in lolly model tsv files. - ''' + """Parses the bias feature available in lolly model tsv files.""" - def pattern(self): - ''' - Matches lines like: - unified_engagement bias -0.935945 - :return: a RegEx that extracts feature weight. - ''' - return r"\t(bias)\t([^\s]+)" + def pattern(self) -> str: + """ + Matches lines like: + unified_engagement bias -0.935945 + :return: a RegEx that extracts feature weight. + """ + return r"\t(bias)\t([^\s]+)" - def _parse_match(self, match): - return float(match.group(2)) + def _parse_match(self, match: re.Match) -> float: + return float(match.group(2)) class BinaryFeatureParser(Parser): - ''' - Parses binary features available in lolly model tsv files. - ''' + """Parses binary features available in lolly model tsv files.""" - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 - :return: a RegEx that extracts feature name and weight. - ''' - return r"\t([\w\.]+)\t([^\s]+)" + def pattern(self) -> str: + """ + Matches lines like: + unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 + :return: a RegEx that extracts feature name and weight. 
+ """ + return r"\t([\w\.]+)\t([^\s]+)" - def _parse_match(self, match): - return (match.group(1), float(match.group(2))) + def _parse_match(self, match: re.Match) -> Tuple[str, float]: + return (match.group(1), float(match.group(2))) class DiscretizedFeatureParser(Parser): - ''' - Parses discretized features available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 - :return: a RegEx that extracts feature name, bin boundaries and weight. - ''' - return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" - - def _parse_match(self, match): - left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")] - return ( - match.group(1), - left_bin_side, - right_bin_side, - float(match.group(3)) - ) + """Parses discretized features available in lolly model tsv files.""" + + def pattern(self) -> str: + """ + Matches lines like: + unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 + :return: a RegEx that extracts feature name, bin boundaries and weight. + """ + return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" + + def _parse_match(self, match: re.Match) -> Tuple[str, float, float, float]: + left_bin_side, right_bin_side = [ + float(number) for number in match.group(2).split("_") + ] + return (match.group(1), left_bin_side, right_bin_side, float(match.group(3))) class LollyModelFeaturesParser(Parser): - def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()): - self._bias_parser = bias_parser - self._binary_feature_parser = binary_feature_parser - self._discretized_feature_parser = discretized_feature_parser - - def parse(self, lolly_model_reader): - parsed_features = { - "bias": None, - "binary": {}, - "discretized": {} - } - def process_line_fn(line): - bias_parser_result = self._bias_parser.parse(line) - if bias_parser_result: - parsed_features["bias"] = bias_parser_result - return - - binary_feature_parser_result = self._binary_feature_parser.parse(line) - if binary_feature_parser_result: - name, value = binary_feature_parser_result - parsed_features["binary"][name] = value - return - - discretized_feature_parser_result = self._discretized_feature_parser.parse(line) - if discretized_feature_parser_result: - name, left_bin, right_bin, weight = discretized_feature_parser_result - discretized_features = parsed_features["discretized"] - if name not in discretized_features: - discretized_features[name] = [] - discretized_features[name].append((left_bin, right_bin, weight)) - - lolly_model_reader.read(process_line_fn) - - return parsed_features + """Parses lolly model tsv files.""" + + def __init__( + self, + bias_parser: BiasParser = BiasParser(), + binary_feature_parser: BinaryFeatureParser = BinaryFeatureParser(), + discretized_feature_parser: DiscretizedFeatureParser = DiscretizedFeatureParser(), + ): + self._bias_parser = bias_parser + self._binary_feature_parser = binary_feature_parser + self._discretized_feature_parser = discretized_feature_parser + + def parse(self, lolly_model_reader: object) -> dict: + parsed_features = {"bias": None, "binary": {}, "discretized": {}} + + def process_line_fn(line: str) -> None: + bias_parser_result = self._bias_parser.parse(line) + if bias_parser_result: + parsed_features["bias"] = bias_parser_result + 
+                return
+
+            binary_feature_parser_result = self._binary_feature_parser.parse(line)
+            if binary_feature_parser_result:
+                name, value = binary_feature_parser_result
+                parsed_features["binary"][name] = value
+                return
+
+            discretized_feature_parser_result = self._discretized_feature_parser.parse(
+                line
+            )
+            if discretized_feature_parser_result:
+                name, left_bin, right_bin, weight = discretized_feature_parser_result
+                discretized_features = parsed_features["discretized"]
+                if name not in discretized_features:
+                    discretized_features[name] = []
+                discretized_features[name].append((left_bin, right_bin, weight))
+
+        lolly_model_reader.read(process_line_fn)
+
+        return parsed_features
 
 
 class DBv2DataExampleParser(Parser):
-  '''
-  Parses data records printed by the DBv2 train.py build_graph function.
-  Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]]
-  '''
-
-  def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()):
-    self.features = lolly_model_features_parser.parse(lolly_model_reader)
-    self.feature_name_by_dbv2_id = {}
-
-    for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()):
-      self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name
-
-  def pattern(self):
-    '''
-    :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values.
-    '''
-    return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. ]+)\]\]"
-
-  def _parse_match(self, match):
-    feature_ids = match.group(3).split(" ")
-    feature_values = match.group(4).split(" ")
-
-    value_by_feature_name = {}
-    for index in range(len(feature_ids)):
-      feature_id = feature_ids[index]
-      if feature_id not in self.feature_name_by_dbv2_id:
-        print("Missing feature with id: " + str(feature_id))
-        continue
-      value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index])
-
-    return value_by_feature_name
+    """
+    Parses data records printed by the DBv2 train.py build_graph function.
+    Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]]
+    """
+
+    def __init__(
+        self,
+        lolly_model_reader: object,
+        lolly_model_features_parser: LollyModelFeaturesParser = LollyModelFeaturesParser(),
+    ):
+        self.features = lolly_model_features_parser.parse(lolly_model_reader)
+        self.feature_name_by_dbv2_id = {}
+
+        for feature_name in list(self.features["binary"].keys()) + list(
+            self.features["discretized"].keys()
+        ):
+            self.feature_name_by_dbv2_id[
+                str(_get_feature_id(feature_name))
+            ] = feature_name
+
+    def pattern(self) -> str:
+        """
+        :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values.
+        """
]+)\]\]" + + def _parse_match(self, match) -> dict: + feature_ids = match.group(3).split(" ") + feature_values = match.group(4).split(" ") + + value_by_feature_name = dict() + for index in range(len(feature_ids)): + feature_id = feature_ids[index] + if feature_id not in self.feature_name_by_dbv2_id: + print("Missing feature with id: " + str(feature_id)) + continue + value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float( + feature_values[index] + ) + + return value_by_feature_name diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py index ab33ee4e7..3c76233af 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py @@ -1,8 +1,11 @@ +from typing import Callable + + class LollyModelReader(object): - def __init__(self, lolly_model_file_path): - self._lolly_model_file_path = lolly_model_file_path + def __init__(self, lolly_model_file_path: str): + self._lolly_model_file_path = lolly_model_file_path - def read(self, process_line_fn): - with open(self._lolly_model_file_path, "r") as file: - for line in file: - process_line_fn(line) + def read(self, process_line_fn: Callable[[str], None]): + with open(self._lolly_model_file_path, "r") as file: + for line in file: + process_line_fn(line) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py index 5692616c2..b018844c5 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py @@ -4,10 +4,8 @@ from .reader import LollyModelReader from .scorer import LollyModelScorer - if __name__ == "__main__": - lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) - lolly_model_scorer = LollyModelScorer(data_example_parser=DBv2DataExampleParser(lolly_model_reader)) - - score = lolly_model_scorer.score(data_example=sys.argv[2]) - print(score) + lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) + lolly_model_scorer = LollyModelScorer(DBv2DataExampleParser(lolly_model_reader)) + score = lolly_model_scorer.score(data_example=sys.argv[2]) + print(score) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py index 621c43388..932e3b51f 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py @@ -1,37 +1,56 @@ +from typing import List + +from python.twitter.deepbird.projects.timelines.scripts.models.earlybird.lolly.parsers import ( + DBv2DataExampleParser, +) + + class LollyModelScorer(object): - def __init__(self, data_example_parser): - self._data_example_parser = data_example_parser - - def score(self, data_example): - value_by_feature_name = self._data_example_parser.parse(data_example) - features = self._data_example_parser.features - return self._score(value_by_feature_name, features) - - def _score(self, value_by_feature_name, features): - score = features["bias"] - score += 
-    score += self._score_binary_features(features["binary"], value_by_feature_name)
-    score += self._score_discretized_features(features["discretized"], value_by_feature_name)
-    return score
-
-  def _score_binary_features(self, binary_features, value_by_feature_name):
-    score = 0.0
-    for binary_feature_name, binary_feature_weight in binary_features.items():
-      if binary_feature_name in value_by_feature_name:
-        score += binary_feature_weight
-    return score
-
-  def _score_discretized_features(self, discretized_features, value_by_feature_name):
-    score = 0.0
-    for discretized_feature_name, buckets in discretized_features.items():
-      if discretized_feature_name in value_by_feature_name:
-        feature_value = value_by_feature_name[discretized_feature_name]
-        score += self._find_matching_bucket_weight(buckets, feature_value)
-    return score
-
-  def _find_matching_bucket_weight(self, buckets, feature_value):
-    for left_side, right_side, weight in buckets:
-      # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
-      if feature_value >= left_side and feature_value < right_side:
-        return weight
-
-    raise LookupError("Couldn't find a matching bucket for the given feature value.")
+    def __init__(self, data_example_parser: DBv2DataExampleParser):
+        self._data_example_parser = data_example_parser
+
+    def score(self, data_example: str) -> float:
+        value_by_feature_name = self._data_example_parser.parse(data_example)
+        features = self._data_example_parser.features
+        return self._score(value_by_feature_name, features)
+
+    def _score(self, value_by_feature_name: dict, features: dict) -> float:
+        score = features["bias"]
+        score += self._score_binary_features(features["binary"], value_by_feature_name)
+        score += self._score_discretized_features(
+            features["discretized"], value_by_feature_name
+        )
+        return score
+
+    def _score_binary_features(
+        self, binary_features: dict, value_by_feature_name: dict
+    ) -> float:
+        score = 0.0
+        for binary_feature_name, binary_feature_weight in binary_features.items():
+            if binary_feature_name in value_by_feature_name:
+                score += binary_feature_weight
+        return score
+
+    def _score_discretized_features(
+        self, discretized_features: dict, value_by_feature_name: dict
+    ) -> float:
+        score = 0.0
+        for discretized_feature_name, buckets in discretized_features.items():
+            if discretized_feature_name in value_by_feature_name:
+                feature_value = value_by_feature_name[discretized_feature_name]
+                score += self._find_matching_bucket_weight(buckets, feature_value)
+        return score
+
+    def _find_matching_bucket_weight(
+        self,
+        buckets: List[Tuple[float, float, float]],
+        feature_value: float,
+    ) -> float:
+        for left_side, right_side, weight in buckets:
+            # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
+            if feature_value >= left_side and feature_value < right_side:
+                return weight
+
+        raise LookupError(
+            "Couldn't find a matching bucket for the given feature value."
+ ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py index 2d0342551..4a576d749 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py @@ -1,91 +1,97 @@ +from typing import Any, Dict, List, Tuple + from .parsers import LollyModelFeaturesParser class TFModelInitializerBuilder: - - def __init__(self, model_features_parser=LollyModelFeaturesParser()): - self._model_features_parser = model_features_parser - - def build(self, lolly_model_reader): - ''' - :param lolly_model_reader: LollyModelReader instance - :return: tf_model_initializer dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... - "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] + def __init__(self, model_features_parser=LollyModelFeaturesParser()): + self._model_features_parser = model_features_parser + + def build(self, lolly_model_reader: object) -> Dict[str, Dict[str, Any]]: + """ + :param lolly_model_reader: LollyModelReader instance + :return: tf_model_initializer dictionary of the following format: + { + "features": { + "bias": 0.0, + "binary": { + # (feature name : feature weight) pairs + "feature_name_1": 0.0, + ... + "feature_nameN": 0.0 + }, + "discretized": { + # (feature name : index aligned lists of bin_boundaries and weights + "feature_name_1": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + ... 
+ "feature_name_K": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + } } - } } - } - ''' - tf_model_initializer = { - "features": {} - } - - features = self._model_features_parser.parse(lolly_model_reader) - tf_model_initializer["features"]["bias"] = features["bias"] - self._set_discretized_features(features["discretized"], tf_model_initializer) - - self._dedup_binary_features(features["binary"], features["discretized"]) - tf_model_initializer["features"]["binary"] = features["binary"] - - return tf_model_initializer - - def _set_discretized_features(self, discretized_features, tf_model_initializer): - if len(discretized_features) == 0: - return - - num_bins = max([len(bins) for bins in discretized_features.values()]) - - bin_boundaries_and_weights = {} - for feature_name in discretized_features: - bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights( - discretized_features[feature_name], num_bins) - - tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights - - def _dedup_binary_features(self, binary_features, discretized_features): - [binary_features.pop(feature_name) for feature_name in discretized_features] - - def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins): - bin_boundary_weight_pairs = [] - - for bucket in discretized_feature_buckets: - bin_boundary_weight_pairs.append([bucket[0], bucket[2]]) - - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. - for bin_boundary_weight_pair in bin_boundary_weight_pairs: - if bin_boundary_weight_pair[0] < float("inf"): - bin_boundary_weight_pair[0] *= -1 - - while len(bin_boundary_weight_pairs) < num_bins: - bin_boundary_weight_pairs.append([float("inf"), float(0)]) - - bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0]) - - bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) - - return { - "bin_boundaries": bin_boundaries, - "weights": weights - } + """ + tf_model_initializer = {"features": {}} + + features = self._model_features_parser.parse(lolly_model_reader) + tf_model_initializer["features"]["bias"] = features["bias"] + self._set_discretized_features(features["discretized"], tf_model_initializer) + + self._dedup_binary_features(features["binary"], features["discretized"]) + tf_model_initializer["features"]["binary"] = features["binary"] + + return tf_model_initializer + + def _set_discretized_features( + self, discretized_features: dict, tf_model_initializer: dict + ) -> None: + if len(discretized_features) == 0: + return + + num_bins = max([len(bins) for bins in discretized_features.values()]) + + bin_boundaries_and_weights = {} + for feature_name in discretized_features: + bin_boundaries_and_weights[ + feature_name + ] = self._extract_bin_boundaries_and_weights( + discretized_features[feature_name], num_bins + ) + + tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights + + def _dedup_binary_features( + self, binary_features: dict, discretized_features: dict + ) -> None: + [binary_features.pop(feature_name) for feature_name in discretized_features] + + def _extract_bin_boundaries_and_weights( + self, + discretized_feature_buckets: List[List[float]], + num_bins: int, + ) -> Dict[str, Tuple[float]]: + bin_boundary_weight_pairs = [ + [bucket[0], bucket[2]] for bucket in 
discretized_feature_buckets + ] + + # The default DBv2 HashingDiscretizer bin membership interval is (a, b] + # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) + # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. + for bin_boundary_weight_pair in bin_boundary_weight_pairs: + if bin_boundary_weight_pair[0] < float("inf"): + bin_boundary_weight_pair[0] *= -1 + + while len(bin_boundary_weight_pairs) < num_bins: + bin_boundary_weight_pairs.append([float("inf"), float(0)]) + + bin_boundary_weight_pairs.sort( + key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0] + ) + + bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) + + return {"bin_boundaries": bin_boundaries, "weights": weights} diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py index 6919914f8..21bca238d 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py @@ -1,120 +1,147 @@ # checkstyle: noqa +from typing import Dict, List, Optional + import tensorflow.compat.v1 as tf -from collections import OrderedDict + +import twml + from .constants import EB_SCORE_IDX from .lolly.data_helpers import get_lolly_scores -import twml -def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - This function was copied from twml/metrics.py with the following adjustments: - - Override example weights with the ones set in graph_output. - - Tile labels in order to support per engagement metrics for both TF and Lolly scores. - - Add lolly_tf_score_MSE metric. - Note: All custom lines have a comment that starts with 'Added' - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) - metrics.remove('pr_curve') - - def get_eval_metric_ops(graph_output, labels, weights): +def get_multi_binary_class_metric_fn( + metrics: Dict[str, float], + classes: Optional[List[str]] = None, + class_dim: int = 1, +) -> callable: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + This function was copied from twml/metrics.py with the following adjustments: + - Override example weights with the ones set in graph_output. + - Tile labels in order to support per engagement metrics for both TF and Lolly scores. + - Add lolly_tf_score_MSE metric. + Note: All custom lines have a comment that starts with 'Added' """ - - # Added to support the example weights overriding. - weights = graph_output["weights"] - # Added to support per engagement metrics for both TF and Lolly scores. - labels = tf.tile(labels, [1, 2]) - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if not hard_preds: - hard_preds = tf.greater_equal(preds, threshold) - - shape = labels.get_shape() - - # basic sanity check: multi_metric dimension must exist - assert len(shape) > class_dim, "Dimension specified by class_dim does not exist." 
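Aside: _extract_bin_boundaries_and_weights above is the crux of the Lolly-to-TF conversion, so a worked example is useful. The sketch below reproduces its logic on made-up bucket data (each bucket is a [boundary, _, weight] triple, matching how bucket[0] and bucket[2] are read in the method); it is illustrative only.

def invert_and_pad_bins(discretized_feature_buckets, num_bins):
    # Keep (boundary, weight) pairs; bucket[1] is unused, as in the original.
    pairs = [[bucket[0], bucket[2]] for bucket in discretized_feature_buckets]

    # Negating every finite boundary turns membership in (a, b] into
    # membership in [-b, -a), i.e. the [a, b) convention; the feature values
    # are negated to match at serving time (see get_feature_values in
    # train.py further below).
    for pair in pairs:
        if pair[0] < float("inf"):
            pair[0] *= -1

    # Pad so every feature has the same bin count, then restore sort order.
    while len(pairs) < num_bins:
        pairs.append([float("inf"), 0.0])
    pairs.sort(key=lambda pair: pair[0])

    bin_boundaries, weights = zip(*pairs)
    return {"bin_boundaries": bin_boundaries, "weights": weights}

print(invert_and_pad_bins([[1.0, 0, 0.25], [5.0, 0, 0.75]], num_bins=3))
# {'bin_boundaries': (-5.0, -1.0, inf), 'weights': (0.75, 0.25, 0.0)}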
- - num_labels = shape[class_dim] - # If we are doing multi-class / multi-label metric, the number of classes / labels must - # be know at graph construction time. This dimension cannot have size None. - assert num_labels is not None, "The multi-metric dimension cannot be None." - assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - metric_factory, requires_threshold = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) + # pylint: disable=invalid-name,dict-keys-not-iterating + if metrics is None: + # remove expensive metrics by default for faster eval + metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) + metrics.remove("pr_curve") + + def get_eval_metric_ops( + graph_output: Dict[str, tf.Tensor], + labels: tf.Tensor, + weights: tf.Tensor, + ) -> dict: + """ + Args: + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples. + + Returns: + dict of metric name to tuple of (value_op, update_op). + """ + + # Added to support the example weights overriding. + weights = graph_output["weights"] + # Added to support per engagement metrics for both TF and Lolly scores. + labels = tf.tile(labels, [1, 2]) + + eval_metric_ops = dict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + if not hard_preds: + hard_preds = tf.greater_equal(preds, threshold) + + shape = labels.get_shape() + + # basic sanity check: multi_metric dimension must exist + assert ( + len(shape) > class_dim + ), "Dimension specified by class_dim does not exist." + + num_labels = shape[class_dim] + # If we are doing multi-class / multi-label metric, the number of classes / labels must + # be know at graph construction time. This dimension cannot have size None. + assert num_labels is not None, "The multi-metric dimension cannot be None." 
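For orientation before the per-class loop below: the labels are tiled so that every engagement is evaluated twice, once against the TF logits and once against the Lolly scores that train.py concatenates onto the output, and tf.gather slices out one class column at a time. A small illustrative sketch (shapes and values are made up):

import tensorflow.compat.v1 as tf

labels = tf.constant([[1.0, 0.0, 1.0],
                      [0.0, 1.0, 0.0]])  # [batch=2, num_labels=3]

# tf.tile(labels, [1, 2]) doubles the label columns, giving shape [2, 6],
# so the same per-engagement metrics run on both halves of the output.
tiled = tf.tile(labels, [1, 2])

# tf.gather with indices=[i] keeps a [batch, 1] column per class, the shape
# the binary metric factories expect.
class_labels = tf.gather(tiled, indices=[1], axis=1)  # [[0.0], [1.0]]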
+ assert ( + classes is None or len(classes) == num_labels + ), "Number of classes must match the number of labels" + + weights_shape = weights.get_shape() if weights is not None else None + if weights_shape is None: + num_weights = None + elif len(weights_shape) > 1: + num_weights = weights_shape[class_dim] else: - raise ValueError('Cannot find the metric named ' + metric_name) - - # Added to compare TF and Lolly scores. - eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels) - - return eval_metric_ops - - return get_eval_metric_ops - - -def get_mse(predictions, labels): - lolly_scores = get_lolly_scores(labels) - tf_scores = predictions[:, EB_SCORE_IDX] - squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) - - value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") - update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") - - return value_op, update_op + num_weights = 1 + + for i in range(num_labels): + # add metrics to eval_metric_ops dict + for metric_name in metrics: + metric_name = metric_name.lower() # metric name are case insensitive. + + class_metric_name = ( + metric_name + "_" + (classes[i] if classes is not None else str(i)) + ) + + if class_metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + class_labels = tf.gather(labels, indices=[i], axis=class_dim) + class_preds = tf.gather(preds, indices=[i], axis=class_dim) + class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) + + if num_weights is None: + class_weights = None + elif num_weights == num_labels: + class_weights = tf.gather(weights, indices=[i], axis=class_dim) + elif num_weights == 1: + class_weights = weights + else: + raise ValueError( + "num_weights (%d) and num_labels (%d) do not match" + % (num_weights, num_labels) + ) + + ( + metric_factory, + requires_threshold, + ) = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + if metric_factory: + value_op, update_op = metric_factory( + labels=class_labels, + predictions=( + class_hard_preds if requires_threshold else class_preds + ), + weights=class_weights, + name=class_metric_name, + ) + eval_metric_ops[class_metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + # Added to compare TF and Lolly scores. 
+ eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels) + return eval_metric_ops + + return get_eval_metric_ops + + +def get_mse(predictions: tf.Tensor, labels: tf.Tensor) -> tf.Tensor: + lolly_scores = get_lolly_scores(labels) + tf_scores = predictions[:, EB_SCORE_IDX] + squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) + + value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") + update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") + + return value_op, update_op diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py index 82c31bde0..619f9306c 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py @@ -1,62 +1,62 @@ -from .hashing_utils import make_feature_id +from typing import Any, Dict -from twml.contrib.layers.hashing_discretizer import HashingDiscretizer import numpy as np +from twml.contrib.layers.hashing_discretizer import HashingDiscretizer -class TFModelDiscretizerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :param tf_model_initializer: dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... - "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - } - } - } - :return: a HashingDiscretizer instance. - ''' - discretized_features = tf_model_initializer["features"]["discretized"] - - max_bins = 0 - - feature_ids = [] - bin_vals = [] - for feature_name in discretized_features: - bin_boundaries = discretized_features[feature_name]["bin_boundaries"] - feature_id = make_feature_id(feature_name, self.num_bits) - feature_ids.append(feature_id) - np_bin_boundaries = [np.float(bin_boundary) for bin_boundary in bin_boundaries] - bin_vals.append(np_bin_boundaries) - - max_bins = max(max_bins, len(np_bin_boundaries)) +from .hashing_utils import make_feature_id - feature_ids_np = np.array(feature_ids) - bin_vals_np = np.array(bin_vals).flatten() - return HashingDiscretizer( - feature_ids=feature_ids_np, - bin_vals=bin_vals_np, - n_bin=max_bins, - out_bits=self.num_bits - ) +class TFModelDiscretizerBuilder(object): + def __init__(self, num_bits: int): + self.num_bits = num_bits + + def build(self, tf_model_initializer: Dict[str, Any]) -> HashingDiscretizer: + """ + :param tf_model_initializer: dictionary of the following format: + { + "features": { + "bias": 0.0, + "binary": { + # (feature name : feature weight) pairs + "feature_name_1": 0.0, + ... + "feature_nameN": 0.0 + }, + "discretized": { + # (feature name : index aligned lists of bin_boundaries and weights + "feature_name_1": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + ... + "feature_name_K": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + } + } + } + :return: a HashingDiscretizer instance. 
+ """ + discretized_features = tf_model_initializer["features"]["discretized"] + max_bins = 0 + feature_ids = [] + bin_vals = [] + for feature_name in discretized_features: + bin_boundaries = discretized_features[feature_name]["bin_boundaries"] + feature_id = make_feature_id(feature_name, self.num_bits) + feature_ids.append(feature_id) + np_bin_boundaries = [ + np.float(bin_boundary) for bin_boundary in bin_boundaries + ] + bin_vals.append(np_bin_boundaries) + max_bins = max(max_bins, len(np_bin_boundaries)) + feature_ids_np = np.array(feature_ids) + bin_vals_np = np.array(bin_vals).flatten() + return HashingDiscretizer( + feature_ids=feature_ids_np, + bin_vals=bin_vals_np, + n_bin=max_bins, + out_bits=self.num_bits, + ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py index 2c57f8d63..acb668587 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py @@ -1,29 +1,33 @@ +import numpy as np from twitter.deepbird.io.util import _get_feature_id -import numpy as np + +def numpy_hashing_uniform(the_id: int, bin_idx: int, output_bits: int) -> int: + """ + integer_multiplicative_hashing + This is a reimplementation, for testing purposes, of the + c++ version found in hashing_discretizer_impl.cpp + """ + + hashing_constant = 2654435761 + N = 32 + with np.errstate(over="ignore"): + the_id *= hashing_constant + the_id += bin_idx + the_id *= hashing_constant + the_id >>= N - output_bits + the_id &= (1 << output_bits) - 1 + return the_id -def numpy_hashing_uniform(the_id, bin_idx, output_bits): - """ - integer_multiplicative_hashing - This is a reimplementation, for testing purposes, of the - c++ version found in hashing_discretizer_impl.cpp - """ - hashing_constant = 2654435761 - N = 32 - with np.errstate(over='ignore'): - the_id *= hashing_constant - the_id += bin_idx - the_id *= hashing_constant - the_id >>= N - output_bits - the_id &= (1 << output_bits) - 1 - return the_id +def make_feature_id(name: str, num_bits: int) -> np.int64: + """Returns a feature id for the given feature name.""" + feature_id = _get_feature_id(name) + return np.int64(limit_bits(feature_id, num_bits)) -def make_feature_id(name, num_bits): - feature_id = _get_feature_id(name) - return np.int64(limit_bits(feature_id, num_bits)) +def limit_bits(value: int, num_bits: int) -> int: + """Limits the number of bits in the given value.""" -def limit_bits(value, num_bits): - return value & ((2 ** num_bits) - 1) + return value & ((1< Tuple[Callable, Callable]: + """ + :return: (bias_initializer, weight_initializer) + """ + + initial_weights = np.zeros((1 << self.num_bits, 1)) + features = tf_model_initializer["features"] + + self._set_binary_feature_weights(initial_weights, features["binary"]) + self._set_discretized_feature_weights(initial_weights, features["discretized"]) + + return tf.constant_initializer( + features["bias"] + ), twml.contrib.initializers.PartitionConstant(initial_weights) + + def _set_binary_feature_weights( + self, + initial_weights: np.ndarray, + binary_features: Dict[str, float], + ) -> None: + """set weights for binary features""" + + for feature_name, weight in binary_features.items(): + feature_id = make_feature_id(feature_name, self.num_bits) + initial_weights[feature_id][0] = weight + + def 
_set_discretized_feature_weights( + self, + initial_weights: np.ndarray, + discretized_features: Dict[str, Dict[str, Any]], + ) -> None: + """set weights for discretized features""" + + for feature_name, discretized_feature in discretized_features.items(): + feature_id = make_feature_id(feature_name, self.num_bits) + for bin_idx, weight in enumerate(discretized_feature["weights"]): + final_bucket_id = numpy_hashing_uniform( + feature_id, bin_idx, self.num_bits + ) + initial_weights[final_bucket_id][0] = weight diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py index 6ef181f5f..f5cc3deab 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py @@ -1,212 +1,275 @@ # checkstyle: noqa +from datetime import datetime +from typing import Any, Dict, Optional + import tensorflow.compat.v1 as tf -from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops import tensorflow_hub as hub - -from datetime import datetime from tensorflow.compat.v1 import logging +from tensorflow.python.estimator.export.export import ( + build_raw_serving_input_receiver_fn, +) +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops from twitter.deepbird.projects.timelines.configs import all_configs + +import twml +from twml.contrib.calibrators.common_calibrators import ( + build_percentile_discretizer_graph, + calibrate_discretizer_and_export, +) from twml.trainers import DataRecordTrainer -from twml.contrib.calibrators.common_calibrators import build_percentile_discretizer_graph -from twml.contrib.calibrators.common_calibrators import calibrate_discretizer_and_export -from .metrics import get_multi_binary_class_metric_fn -from .constants import TARGET_LABEL_IDX, PREDICTED_CLASSES + +from .constants import PREDICTED_CLASSES, TARGET_LABEL_IDX from .example_weights import add_weight_arguments, make_weights_tensor from .lolly.data_helpers import get_lolly_logits -from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder from .lolly.reader import LollyModelReader +from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder +from .metrics import get_multi_binary_class_metric_fn from .tf_model.discretizer_builder import TFModelDiscretizerBuilder from .tf_model.weights_initializer_builder import TFModelWeightsInitializerBuilder -import twml -def get_feature_values(features_values, params): - if params.lolly_model_tsv: - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries. - # - # Thus, invert the feature values, so that HashingDiscretizer can to find the correct bucket. 
- return tf.multiply(features_values, -1.0) - else: +def get_feature_values( + features_values: tf.Tensor, params: tf.contrib.training.HParams +) -> tf.Tensor: + if params.lolly_model_tsv: + # The default DBv2 HashingDiscretizer bin membership interval is (a, b] + # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) + # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries. + # Thus, invert the feature values, so that HashingDiscretizer can to find the correct bucket. + return tf.multiply(features_values, -1.0) return features_values -def build_graph(features, label, mode, params, config=None): - weights = None - if "weights" in features: - weights = make_weights_tensor(features["weights"], label, params) - num_bits = params.input_size_bits +def build_graph( + features: Dict[str, tf.Tensor], + label: tf.Tensor, + mode: str, + params: tf.contrib.training.HParams, + config: Optional[tf.estimator.RunConfig] = None, +) -> Dict[str, Any]: + weights = None + if "weights" in features: + weights = make_weights_tensor(features["weights"], label, params) - if mode == "infer": - indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) - dense_shape = tf.stack([features["input_sparse_tensor_shape"][0], 1 << num_bits]) - sparse_tf = tf.SparseTensor( - indices=indices, - values=get_feature_values(features["input_sparse_tensor_values"], params), - dense_shape=dense_shape - ) - else: - features["values"] = get_feature_values(features["values"], params) - sparse_tf = twml.util.convert_to_sparse(features, num_bits) - - if params.lolly_model_tsv: - tf_model_initializer = TFModelInitializerBuilder().build(LollyModelReader(params.lolly_model_tsv)) - bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder(num_bits).build(tf_model_initializer) - discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) - else: - discretizer = hub.Module(params.discretizer_save_dir) - bias_initializer, weight_initializer = None, None - - input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") - - logits = twml.layers.full_sparse( - inputs=input_sparse, - output_size=1, - bias_initializer=bias_initializer, - weight_initializer=weight_initializer, - use_sparse_grads=(mode == "train"), - use_binary_values=True, - name="full_sparse_1" - ) - - loss = None - - if mode != "infer": - lolly_activations = get_lolly_logits(label) - - if opt.print_data_examples: - logits = print_data_example(logits, lolly_activations, features) - - if params.replicate_lolly: - loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) + num_bits = params.input_size_bits + + if mode == "infer": + indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) + dense_shape = tf.stack( + [features["input_sparse_tensor_shape"][0], 1 << num_bits] + ) + sparse_tf = tf.SparseTensor( + indices=indices, + values=get_feature_values(features["input_sparse_tensor_values"], params), + dense_shape=dense_shape, + ) else: - batch_size = tf.shape(label)[0] - target_label = tf.reshape(tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1)) - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_label, logits=logits) - loss = twml.util.weighted_average(loss, weights) - - num_labels = tf.shape(label)[1] - eb_scores = tf.tile(lolly_activations, [1, num_labels]) - logits = tf.tile(logits, [1, num_labels]) - logits = tf.concat([logits, eb_scores], axis=1) - - output = tf.nn.sigmoid(logits) - - 
return {"output": output, "loss": loss, "weights": weights} - -def print_data_example(logits, lolly_activations, features): - return tf.Print( - logits, - [logits, lolly_activations, tf.reshape(features['keys'], (1, -1)), tf.reshape(tf.multiply(features['values'], -1.0), (1, -1))], - message="DATA EXAMPLE = ", - summarize=10000 - ) - -def earlybird_output_fn(graph_output): - export_outputs = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: - tf.estimator.export.PredictOutput( - {"prediction": tf.identity(graph_output["output"], name="output_scores")} - ) - } - return export_outputs + features["values"] = get_feature_values(features["values"], params) + sparse_tf = twml.util.convert_to_sparse(features, num_bits) -if __name__ == "__main__": - parser = DataRecordTrainer.add_parser_arguments() - - parser = twml.contrib.calibrators.add_discretizer_arguments(parser) - - parser.add_argument("--label", type=str, help="label for the engagement") - parser.add_argument("--model.use_existing_discretizer", action="store_true", - dest="model_use_existing_discretizer", - help="Load a pre-trained calibration or train a new one") - parser.add_argument("--input_size_bits", type=int) - parser.add_argument("--export_module_name", type=str, default="base_mlp", dest="export_module_name") - parser.add_argument("--feature_config", type=str) - parser.add_argument("--replicate_lolly", type=bool, default=False, dest="replicate_lolly", - help="Train a regression model with MSE loss and the logged Earlybird score as a label") - parser.add_argument("--lolly_model_tsv", type=str, required=False, dest="lolly_model_tsv", - help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" - "No discretizer gets trained or loaded if set.") - parser.add_argument("--print_data_examples", type=bool, default=False, dest="print_data_examples", - help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'") - add_weight_arguments(parser) - - opt = parser.parse_args() - - feature_config_module = all_configs.select_feature_config(opt.feature_config) - - feature_config = feature_config_module.get_feature_config(data_spec_path=opt.data_spec, label=opt.label) - - parse_fn = twml.parsers.get_sparse_parse_fn( - feature_config, - keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes")) - - if not opt.lolly_model_tsv: - if opt.model_use_existing_discretizer: - logging.info("Skipping discretizer calibration [model.use_existing_discretizer=True]") - logging.info(f"Using calibration at {opt.discretizer_save_dir}") + if params.lolly_model_tsv: + tf_model_initializer = TFModelInitializerBuilder().build( + LollyModelReader(params.lolly_model_tsv) + ) + bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder( + num_bits + ).build(tf_model_initializer) + discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) else: - logging.info("Calibrating new discretizer [model.use_existing_discretizer=False]") - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( - opt.discretizer_num_bins, - opt.discretizer_output_size_bits - ) - calibrate_discretizer_and_export(name="recap_earlybird_hashing_discretizer", - params=opt, - calibrator=calibrator, - build_graph_fn=build_percentile_discretizer_graph, - feature_config=feature_config) - - trainer = DataRecordTrainer( - name="earlybird", - params=opt, - build_graph_fn=build_graph, - save_dir=opt.save_dir, - feature_config=feature_config, - 
metric_fn=get_multi_binary_class_metric_fn( - metrics=["roc_auc"], - classes=PREDICTED_CLASSES - ), - warm_start_from=None - ) - - train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) - eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) - - logging.info("Training and Evaluation ...") - trainingStartTime = datetime.now() - trainer.train_and_evaluate(train_input_fn=train_input_fn, eval_input_fn=eval_input_fn) - trainingEndTime = datetime.now() - logging.info("Training and Evaluation time: " + str(trainingEndTime - trainingStartTime)) - - if trainer._estimator.config.is_chief: - serving_input_in_earlybird = { - "input_sparse_tensor_indices": array_ops.placeholder( - name="input_sparse_tensor_indices", - shape=[None, 2], - dtype=dtypes.int64), - "input_sparse_tensor_values": array_ops.placeholder( - name="input_sparse_tensor_values", - shape=[None], - dtype=dtypes.float32), - "input_sparse_tensor_shape": array_ops.placeholder( - name="input_sparse_tensor_shape", - shape=[2], - dtype=dtypes.int64) + discretizer = hub.Module(params.discretizer_save_dir) + bias_initializer, weight_initializer = None, None + + input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") + + logits = twml.layers.full_sparse( + inputs=input_sparse, + output_size=1, + bias_initializer=bias_initializer, + weight_initializer=weight_initializer, + use_sparse_grads=(mode == "train"), + use_binary_values=True, + name="full_sparse_1", + ) + + loss = None + + if mode != "infer": + lolly_activations = get_lolly_logits(label) + + if opt.print_data_examples: + logits = print_data_example(logits, lolly_activations, features) + + if params.replicate_lolly: + loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) + else: + batch_size = tf.shape(label)[0] + target_label = tf.reshape( + tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1) + ) + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=target_label, logits=logits + ) + loss = twml.util.weighted_average(loss, weights) + + num_labels = tf.shape(label)[1] + eb_scores = tf.tile(lolly_activations, [1, num_labels]) + logits = tf.tile(logits, [1, num_labels]) + logits = tf.concat([logits, eb_scores], axis=1) + + output = tf.nn.sigmoid(logits) + + return {"output": output, "loss": loss, "weights": weights} + + +def print_data_example( + logits: tf.Tensor, + lolly_activations: tf.Tensor, + features: Dict[str, tf.Tensor], +) -> tf.Tensor: + return tf.Print( + logits, + [ + logits, + lolly_activations, + tf.reshape(features["keys"], (1, -1)), + tf.reshape(tf.multiply(features["values"], -1.0), (1, -1)), + ], + message="DATA EXAMPLE = ", + summarize=10000, + ) + + +def earlybird_output_fn(graph_output: Dict[str, Any]) -> Dict[str, Any]: + export_outputs = { + tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput( + {"prediction": tf.identity(graph_output["output"], name="output_scores")} + ) } - serving_input_receiver_fn = build_raw_serving_input_receiver_fn(serving_input_in_earlybird) - twml.contrib.export.export_fn.export_all_models( - trainer=trainer, - export_dir=opt.export_dir, - parse_fn=parse_fn, - serving_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=earlybird_output_fn, - feature_spec=feature_config.get_feature_spec() + return export_outputs + + +if __name__ == "__main__": + parser = DataRecordTrainer.add_parser_arguments() + + parser = twml.contrib.calibrators.add_discretizer_arguments(parser) + + parser.add_argument("--label", 
type=str, help="label for the engagement") + parser.add_argument( + "--model.use_existing_discretizer", + action="store_true", + dest="model_use_existing_discretizer", + help="Load a pre-trained calibration or train a new one", + ) + parser.add_argument("--input_size_bits", type=int) + parser.add_argument( + "--export_module_name", type=str, default="base_mlp", dest="export_module_name" + ) + parser.add_argument("--feature_config", type=str) + parser.add_argument( + "--replicate_lolly", + type=bool, + default=False, + dest="replicate_lolly", + help="Train a regression model with MSE loss and the logged Earlybird score as a label", + ) + parser.add_argument( + "--lolly_model_tsv", + type=str, + required=False, + dest="lolly_model_tsv", + help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" + "No discretizer gets trained or loaded if set.", + ) + parser.add_argument( + "--print_data_examples", + type=bool, + default=False, + dest="print_data_examples", + help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'", ) - logging.info("The export model path is: " + opt.export_dir) + add_weight_arguments(parser) + + opt = parser.parse_args() + + feature_config_module = all_configs.select_feature_config(opt.feature_config) + + feature_config = feature_config_module.get_feature_config( + data_spec_path=opt.data_spec, label=opt.label + ) + + parse_fn = twml.parsers.get_sparse_parse_fn( + feature_config, + keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes"), + ) + + if not opt.lolly_model_tsv: + if opt.model_use_existing_discretizer: + logging.info( + "Skipping discretizer calibration [model.use_existing_discretizer=True]" + ) + logging.info(f"Using calibration at {opt.discretizer_save_dir}") + else: + logging.info( + "Calibrating new discretizer [model.use_existing_discretizer=False]" + ) + calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( + opt.discretizer_num_bins, opt.discretizer_output_size_bits + ) + calibrate_discretizer_and_export( + name="recap_earlybird_hashing_discretizer", + params=opt, + calibrator=calibrator, + build_graph_fn=build_percentile_discretizer_graph, + feature_config=feature_config, + ) + + trainer = DataRecordTrainer( + name="earlybird", + params=opt, + build_graph_fn=build_graph, + save_dir=opt.save_dir, + feature_config=feature_config, + metric_fn=get_multi_binary_class_metric_fn( + metrics=["roc_auc"], classes=PREDICTED_CLASSES + ), + warm_start_from=None, + ) + + train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) + eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) + + logging.info("Training and Evaluation ...") + trainingStartTime = datetime.now() + trainer.train_and_evaluate( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn + ) + trainingEndTime = datetime.now() + logging.info( + "Training and Evaluation time: " + str(trainingEndTime - trainingStartTime) + ) + + if trainer._estimator.config.is_chief: + serving_input_in_earlybird = { + "input_sparse_tensor_indices": array_ops.placeholder( + name="input_sparse_tensor_indices", shape=[None, 2], dtype=dtypes.int64 + ), + "input_sparse_tensor_values": array_ops.placeholder( + name="input_sparse_tensor_values", shape=[None], dtype=dtypes.float32 + ), + "input_sparse_tensor_shape": array_ops.placeholder( + name="input_sparse_tensor_shape", shape=[2], dtype=dtypes.int64 + ), + } + serving_input_receiver_fn = build_raw_serving_input_receiver_fn( + serving_input_in_earlybird + 
) + twml.contrib.export.export_fn.export_all_models( + trainer=trainer, + export_dir=opt.export_dir, + parse_fn=parse_fn, + serving_input_receiver_fn=serving_input_receiver_fn, + export_output_fn=earlybird_output_fn, + feature_spec=feature_config.get_feature_spec(), + ) + logging.info("The export model path is: " + opt.export_dir) diff --git a/trust_and_safety_models/abusive/abusive_model.py b/trust_and_safety_models/abusive/abusive_model.py index 06fff4ed2..5cc7d5086 100644 --- a/trust_and_safety_models/abusive/abusive_model.py +++ b/trust_and_safety_models/abusive/abusive_model.py @@ -1,48 +1,57 @@ import tensorflow as tf -physical_devices = tf.config.list_physical_devices('GPU') +physical_devices = tf.config.list_physical_devices("GPU") for device in physical_devices: tf.config.experimental.set_memory_growth(device, True) -from twitter.hmli.nimbus.modeling.model_config import FeatureType, EncodingType, Feature, Model, LogType -from twitter.hmli.nimbus.modeling.feature_loader import BigQueryFeatureLoader -from twitter.cuad.representation.models.text_encoder import TextEncoder -from twitter.cuad.representation.models.optimization import create_optimizer -from twitter.hmli.nimbus.modeling.feature_encoder import FeatureEncoder - import numpy as np import pandas as pd import utils +from twitter.cuad.representation.models.optimization import create_optimizer +from twitter.cuad.representation.models.text_encoder import TextEncoder +from twitter.hmli.nimbus.modeling.feature_encoder import FeatureEncoder +from twitter.hmli.nimbus.modeling.feature_loader import BigQueryFeatureLoader +from twitter.hmli.nimbus.modeling.model_config import ( + EncodingType, + Feature, + FeatureType, + LogType, + Model, +) -cat_names = [ -... -] +cat_names = [...] -category_features = [Feature(name=cat_name, ftype=FeatureType.CONTINUOUS) for cat_name in cat_names] +category_features = [ + Feature(name=cat_name, ftype=FeatureType.CONTINUOUS) for cat_name in cat_names +] features = [ - Feature(name="tweet_text_with_media_annotations", ftype=FeatureType.STRING, encoding=EncodingType.BERT), - Feature(name="precision_nsfw", ftype=FeatureType.CONTINUOUS), - Feature(name="has_media", ftype=FeatureType.BINARY), - Feature(name="num_media", ftype=FeatureType.DISCRETE) + Feature( + name="tweet_text_with_media_annotations", + ftype=FeatureType.STRING, + encoding=EncodingType.BERT, + ), + Feature(name="precision_nsfw", ftype=FeatureType.CONTINUOUS), + Feature(name="has_media", ftype=FeatureType.BINARY), + Feature(name="num_media", ftype=FeatureType.DISCRETE), ] + category_features ptos_prototype = Model( - name='ptos_prototype', - export_path="...", - features=features, + name="ptos_prototype", + export_path="...", + features=features, ) print(ptos_prototype) cq_loader = BigQueryFeatureLoader(gcp_project=COMPUTE_PROJECT) labels = [ - "has_non_punitive_action", - "has_punitive_action", - "has_punitive_action_contains_self_harm", - "has_punitive_action_encourage_self_harm", - "has_punitive_action_episodic", - "has_punitive_action_episodic_hateful_conduct", - "has_punitive_action_other_abuse_policy", - "has_punitive_action_without_self_harm" + "has_non_punitive_action", + "has_punitive_action", + "has_punitive_action_contains_self_harm", + "has_punitive_action_encourage_self_harm", + "has_punitive_action_episodic", + "has_punitive_action_episodic_hateful_conduct", + "has_punitive_action_other_abuse_policy", + "has_punitive_action_without_self_harm", ] train_query = f""" @@ -64,112 +73,128 @@ 
print(train.describe(model=ptos_prototype)) params = { - 'max_seq_lengths': 128, - 'batch_size': 196, - 'lr': 1e-5, - 'optimizer_type': 'adamw', - 'warmup_steps': 0, - 'cls_dropout_rate': 0.1, - 'epochs': 30, - 'steps_per_epoch': 5000, - 'model_type': 'twitter_multilingual_bert_base_cased_mlm', - 'mixed_precision': True, + "max_seq_lengths": 128, + "batch_size": 196, + "lr": 1e-5, + "optimizer_type": "adamw", + "warmup_steps": 0, + "cls_dropout_rate": 0.1, + "epochs": 30, + "steps_per_epoch": 5000, + "model_type": "twitter_multilingual_bert_base_cased_mlm", + "mixed_precision": True, } params + def parse_labeled_data(row_dict): - label = [row_dict.pop(l) for l in labels] - return row_dict, label + label = [row_dict.pop(l) for l in labels] + return row_dict, label -mirrored_strategy = tf.distribute.MirroredStrategy() -BATCH_SIZE = params['batch_size'] * mirrored_strategy.num_replicas_in_sync -train_ds = train.to_tf_dataset().map(parse_labeled_data).shuffle(BATCH_SIZE*100).batch(BATCH_SIZE).repeat() +mirrored_strategy = tf.distribute.MirroredStrategy() +BATCH_SIZE = params["batch_size"] * mirrored_strategy.num_replicas_in_sync + +train_ds = ( + train.to_tf_dataset() + .map(parse_labeled_data) + .shuffle(BATCH_SIZE * 100) + .batch(BATCH_SIZE) + .repeat() +) val_ds = val.to_tf_dataset().map(parse_labeled_data).batch(BATCH_SIZE) for record in train_ds: - tf.print(record) - break + tf.print(record) + break + def get_positive_weights(): - """Computes positive weights used for class imbalance from training data.""" - label_weights_df = utils.get_label_weights( - "tos-data-media-full", - project_id="twttr-abusive-interact-prod", - dataset_id="tos_policy" - ) - pos_weight_tensor = tf.cast( - label_weights_df.sort_values(by='label').positive_class_weight, - dtype=tf.float32 - ) - return pos_weight_tensor + """Computes positive weights used for class imbalance from training data.""" + label_weights_df = utils.get_label_weights( + "tos-data-media-full", + project_id="twttr-abusive-interact-prod", + dataset_id="tos_policy", + ) + pos_weight_tensor = tf.cast( + label_weights_df.sort_values(by="label").positive_class_weight, dtype=tf.float32 + ) + return pos_weight_tensor + pos_weight_tensor = get_positive_weights() print(pos_weight_tensor) + class TextEncoderPooledOutput(TextEncoder): - def call(self, x): - return super().call([x])["pooled_output"] + def call(self, x): + return super().call([x])["pooled_output"] + + def get_config(self): + return super().get_config() - def get_config(self): - return super().get_config() with mirrored_strategy.scope(): - text_encoder_pooled_output = TextEncoderPooledOutput( - params['max_seq_lengths'], - model_type=params['model_type'], - trainable=True - ) - - fe = FeatureEncoder(train) - inputs, preprocessing_head = fe.build_model_head(model=ptos_prototype, text_encoder=text_encoder_pooled_output) - - cls_dropout = tf.keras.layers.Dropout(params['cls_dropout_rate'], name="cls_dropout") - outputs = cls_dropout(preprocessing_head) - outputs = tf.keras.layers.Dense(8, name="output", dtype="float32")(outputs) - - model = tf.keras.Model( - inputs=inputs, - outputs=outputs - ) - pr_auc = tf.keras.metrics.AUC(curve="PR", num_thresholds=1000, multi_label=True, from_logits=True) - - custom_loss = lambda y_true, y_pred: utils.multilabel_weighted_loss(y_true, y_pred, weights=pos_weight_tensor) - optimizer = create_optimizer( - init_lr=params["lr"], - num_train_steps=(params["epochs"] * params["steps_per_epoch"]), - num_warmup_steps=params["warmup_steps"], - 
optimizer_type=params["optimizer_type"], - ) - if params.get("mixed_precision"): - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) - - model.compile( - optimizer=optimizer, - loss=custom_loss, - metrics=[pr_auc] - ) + text_encoder_pooled_output = TextEncoderPooledOutput( + params["max_seq_lengths"], model_type=params["model_type"], trainable=True + ) + + fe = FeatureEncoder(train) + inputs, preprocessing_head = fe.build_model_head( + model=ptos_prototype, text_encoder=text_encoder_pooled_output + ) + + cls_dropout = tf.keras.layers.Dropout( + params["cls_dropout_rate"], name="cls_dropout" + ) + outputs = cls_dropout(preprocessing_head) + outputs = tf.keras.layers.Dense(8, name="output", dtype="float32")(outputs) + + model = tf.keras.Model(inputs=inputs, outputs=outputs) + pr_auc = tf.keras.metrics.AUC( + curve="PR", num_thresholds=1000, multi_label=True, from_logits=True + ) + + custom_loss = lambda y_true, y_pred: utils.multilabel_weighted_loss( + y_true, y_pred, weights=pos_weight_tensor + ) + optimizer = create_optimizer( + init_lr=params["lr"], + num_train_steps=(params["epochs"] * params["steps_per_epoch"]), + num_warmup_steps=params["warmup_steps"], + optimizer_type=params["optimizer_type"], + ) + if params.get("mixed_precision"): + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimizer + ) + + model.compile(optimizer=optimizer, loss=custom_loss, metrics=[pr_auc]) model.weights model.summary() pr_auc.name import getpass + import wandb from wandb.keras import WandbCallback + try: - wandb_key = ... - wandb.login(...) - run = wandb.init(project='ptos_with_media', - group='new-split-trains', - notes='tweet text with only (num_media, precision_nsfw). on full train set, new split.', - entity='absv', - config=params, - name='tweet-text-w-nsfw-1.1', - sync_tensorboard=True) + wandb_key = ... + wandb.login(...) + run = wandb.init( + project="ptos_with_media", + group="new-split-trains", + notes="tweet text with only (num_media, precision_nsfw). on full train set, new split.", + entity="absv", + config=params, + name="tweet-text-w-nsfw-1.1", + sync_tensorboard=True, + ) except FileNotFoundError: - print('Wandb key not found') - run = wandb.init(mode='disabled') + print("Wandb key not found") + run = wandb.init(mode="disabled") import datetime import os @@ -179,27 +204,34 @@ def get_config(self): print("Saving model checkpoints here: ", checkpoint_path) cp_callback = tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(checkpoint_path, "model.{epoch:04d}.tf"), - verbose=1, - monitor=f'val_{pr_auc.name}', - mode='max', - save_freq='epoch', - save_best_only=True + filepath=os.path.join(checkpoint_path, "model.{epoch:04d}.tf"), + verbose=1, + monitor=f"val_{pr_auc.name}", + mode="max", + save_freq="epoch", + save_best_only=True, ) -early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=7, - monitor=f"val_{pr_auc.name}", - mode="max") +early_stopping_callback = tf.keras.callbacks.EarlyStopping( + patience=7, monitor=f"val_{pr_auc.name}", mode="max" +) -model.fit(train_ds, epochs=params["epochs"], validation_data=val_ds, callbacks=[cp_callback, early_stopping_callback], - steps_per_epoch=params["steps_per_epoch"], - verbose=2) +model.fit( + train_ds, + epochs=params["epochs"], + validation_data=val_ds, + callbacks=[cp_callback, early_stopping_callback], + steps_per_epoch=params["steps_per_epoch"], + verbose=2, +) import tensorflow_hub as hub gs_model_path = ... 
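The custom_loss above delegates to utils.multilabel_weighted_loss, an internal helper whose source is not part of this diff. A minimal sketch of one plausible implementation, assuming it applies the per-label positive-class weights to sigmoid cross-entropy on raw logits; the real helper may differ:

import tensorflow as tf

def multilabel_weighted_loss_sketch(y_true, y_pred, weights):
    # Hypothetical stand-in for utils.multilabel_weighted_loss. weights is a
    # [num_labels] tensor of positive-class weights such as pos_weight_tensor
    # above; y_pred holds raw logits, consistent with the from_logits=True
    # setting of the PR-AUC metric.
    per_label = tf.nn.weighted_cross_entropy_with_logits(
        labels=tf.cast(y_true, tf.float32), logits=y_pred, pos_weight=weights
    )
    return tf.reduce_mean(per_label)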
reloaded_keras_layer = hub.KerasLayer(gs_model_path) -inputs = tf.keras.layers.Input(name="tweet__core__tweet__text", shape=(1,), dtype=tf.string) +inputs = tf.keras.layers.Input( + name="tweet__core__tweet__text", shape=(1,), dtype=tf.string +) output = reloaded_keras_layer(inputs) v7_model = tf.keras.models.Model(inputs=inputs, outputs=output) pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc") @@ -210,7 +242,7 @@ def get_config(self): candidate_model = model with mirrored_strategy.scope(): - candidate_eval = candidate_model.evaluate(val_ds) + candidate_eval = candidate_model.evaluate(val_ds) test_query = f""" SELECT @@ -229,48 +261,64 @@ def get_config(self): test_only_media = test.filter(lambda x, y: tf.equal(x["has_media"], True)) test_only_nsfw = test.filter(lambda x, y: tf.greater_equal(x["precision_nsfw"], 0.95)) test_no_media = test.filter(lambda x, y: tf.equal(x["has_media"], False)) -test_media_not_nsfw = test.filter(lambda x, y: tf.logical_and(tf.equal(x["has_media"], True), tf.less(x["precision_nsfw"], 0.95))) +test_media_not_nsfw = test.filter( + lambda x, y: tf.logical_and( + tf.equal(x["has_media"], True), tf.less(x["precision_nsfw"], 0.95) + ) +) for d in [test, test_only_media, test_only_nsfw, test_no_media, test_media_not_nsfw]: - print(d.reduce(0, lambda x, _: x + 1).numpy()) + print(d.reduce(0, lambda x, _: x + 1).numpy()) -from notebook_eval_utils import SparseMultilabelEvaluator, EvalConfig from dataclasses import asdict +from notebook_eval_utils import EvalConfig, SparseMultilabelEvaluator + + def display_metrics(probs, targets, labels=labels): - eval_config = EvalConfig(prediction_threshold=0.5, precision_k=0.9) - for eval_mode, y_mask in [("implicit", np.ones(targets.shape))]: - print("Evaluation mode", eval_mode) - metrics = SparseMultilabelEvaluator.evaluate( - targets, np.array(probs), y_mask, classes=labels, eval_config=eval_config - ) - metrics_df = pd.DataFrame.from_dict(asdict(metrics)["per_topic_metrics"]).transpose() - metrics_df["pos_to_neg"] = metrics_df["num_pos_samples"] / (metrics_df["num_neg_samples"] + 1) - display(metrics_df.median()) - display(metrics_df) - return metrics_df + eval_config = EvalConfig(prediction_threshold=0.5, precision_k=0.9) + for eval_mode, y_mask in [("implicit", np.ones(targets.shape))]: + print("Evaluation mode", eval_mode) + metrics = SparseMultilabelEvaluator.evaluate( + targets, np.array(probs), y_mask, classes=labels, eval_config=eval_config + ) + metrics_df = pd.DataFrame.from_dict( + asdict(metrics)["per_topic_metrics"] + ).transpose() + metrics_df["pos_to_neg"] = metrics_df["num_pos_samples"] / ( + metrics_df["num_neg_samples"] + 1 + ) + display(metrics_df.median()) + display(metrics_df) + return metrics_df def eval_model(model, df): - with mirrored_strategy.scope(): - targets = np.stack(list(df.map(lambda x, y: y).as_numpy_iterator()), axis=0) - df = df.padded_batch(BATCH_SIZE) - preds = model.predict(df) - return display_metrics(preds, targets) - -subsets = {"test": test, - "test_only_media": test_only_media, - "test_only_nsfw": test_only_nsfw, - "test_no_media": test_no_media, - "test_media_not_nsfw": test_media_not_nsfw} + with mirrored_strategy.scope(): + targets = np.stack(list(df.map(lambda x, y: y).as_numpy_iterator()), axis=0) + df = df.padded_batch(BATCH_SIZE) + preds = model.predict(df) + return display_metrics(preds, targets) + + +subsets = { + "test": test, + "test_only_media": test_only_media, + "test_only_nsfw": test_only_nsfw, + "test_no_media": test_no_media, + "test_media_not_nsfw": 
test_media_not_nsfw, +} metrics = {} for name, df in subsets.items(): - metrics[name] = eval_model(candidate_model, df) + metrics[name] = eval_model(candidate_model, df) [(name, m.pr_auc) for name, m in metrics.items()] -for name, x in [(name, m.pr_auc.to_string(index=False).strip().split("\n")) for name, m in metrics.items()]: - print(name) - for y in x: - print(y.strip(), end="\t") - print(".") +for name, x in [ + (name, m.pr_auc.to_string(index=False).strip().split("\n")) + for name, m in metrics.items() +]: + print(name) + for y in x: + print(y.strip(), end="\t") + print(".") for d in [test, test_only_media, test_only_nsfw, test_no_media, test_media_not_nsfw]: - print(d.reduce(0, lambda x, _: x + 1).numpy()) \ No newline at end of file + print(d.reduce(0, lambda x, _: x + 1).numpy()) diff --git a/trust_and_safety_models/nsfw/nsfw_media.py b/trust_and_safety_models/nsfw/nsfw_media.py index b5dfebb65..4975b4b32 100644 --- a/trust_and_safety_models/nsfw/nsfw_media.py +++ b/trust_and_safety_models/nsfw/nsfw_media.py @@ -1,51 +1,55 @@ -import kerastuner as kt +import glob import math +import os +import random + +import kerastuner as kt import numpy as np import pandas as pd -import random import sklearn.metrics import tensorflow as tf -import os -import glob - -from tqdm import tqdm +from google.cloud import storage from matplotlib import pyplot as plt -from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense -from google.cloud import storage +from tensorflow.keras.models import Sequential +from tqdm import tqdm -physical_devices = tf.config.list_physical_devices('GPU') +physical_devices = tf.config.list_physical_devices("GPU") physical_devices -tf.config.set_visible_devices([tf.config.PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')], 'GPU') -tf.config.get_visible_devices('GPU') +tf.config.set_visible_devices( + [tf.config.PhysicalDevice(name="/physical_device:GPU:1", device_type="GPU")], "GPU" +) +tf.config.get_visible_devices("GPU") + def decode_fn_embedding(example_proto): - - feature_description = { - "embedding": tf.io.FixedLenFeature([256], dtype=tf.float32), - "labels": tf.io.FixedLenFeature([], dtype=tf.int64), - } - - example = tf.io.parse_single_example( - example_proto, - feature_description - ) - - return example - -def preprocess_embedding_example(example_dict, positive_label=1, features_as_dict=False): - labels = example_dict["labels"] - label = tf.math.reduce_any(labels == positive_label) - label = tf.cast(label, tf.int32) - embedding = example_dict["embedding"] - - if features_as_dict: - features = {"embedding": embedding} - else: - features = embedding - - return features, label + feature_description = { + "embedding": tf.io.FixedLenFeature([256], dtype=tf.float32), + "labels": tf.io.FixedLenFeature([], dtype=tf.int64), + } + + example = tf.io.parse_single_example(example_proto, feature_description) + + return example + + +def preprocess_embedding_example( + example_dict, positive_label=1, features_as_dict=False +): + labels = example_dict["labels"] + label = tf.math.reduce_any(labels == positive_label) + label = tf.cast(label, tf.int32) + embedding = example_dict["embedding"] + + if features_as_dict: + features = {"embedding": embedding} + else: + features = embedding + + return features, label + + input_root = ... sens_prev_input_root = ... 
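The hunk below wires up an optional rejection-resampling step behind do_resample. One subtlety worth calling out: the transform emits (class, example) pairs, which is why the pipeline strips the class key again with .map(lambda _, b: (b)). A self-contained sketch on synthetic data:

import tensorflow as tf

# Roughly 10% positives, rebalanced toward 50/50 by dropping examples from
# the over-represented class (data here is synthetic).
ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([1000, 4]),
     tf.cast(tf.random.uniform([1000]) < 0.1, tf.int32))
)
resample = tf.data.experimental.rejection_resample(
    class_func=lambda features, label: label,
    target_dist=[0.5, 0.5],
    seed=0,
)
# apply() yields (class, (features, label)); drop the class key again.
balanced = ds.apply(resample).map(lambda _, example: example)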
@@ -58,161 +62,189 @@ def preprocess_embedding_example(example_dict, positive_label=1, features_as_dic validation_batch_size = 256 do_resample = False + + def class_func(features, label): - return label + return label + resample_fn = tf.data.experimental.rejection_resample( - class_func, target_dist = [0.5, 0.5], seed=0 + class_func, target_dist=[0.5, 0.5], seed=0 ) train_glob = f"{input_root}/train/tfrecord/*.tfrecord" train_files = tf.io.gfile.glob(train_glob) if use_sens_prev_data: - train_sens_prev_glob = f"{sens_prev_input_root}/train/tfrecord/*.tfrecord" - train_sens_prev_files = tf.io.gfile.glob(train_sens_prev_glob) - train_files = train_files + train_sens_prev_files - + train_sens_prev_glob = f"{sens_prev_input_root}/train/tfrecord/*.tfrecord" + train_sens_prev_files = tf.io.gfile.glob(train_sens_prev_glob) + train_files = train_files + train_sens_prev_files + random.shuffle(train_files) if not len(train_files): - raise ValueError(f"Did not find any train files matching {train_glob}") + raise ValueError(f"Did not find any train files matching {train_glob}") test_glob = f"{input_root}/test/tfrecord/*.tfrecord" -test_files = tf.io.gfile.glob(test_glob) +test_files = tf.io.gfile.glob(test_glob) if not len(test_files): - raise ValueError(f"Did not find any eval files matching {test_glob}") - + raise ValueError(f"Did not find any eval files matching {test_glob}") + test_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) -test_ds = test_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=test_batch_size) - +test_ds = test_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) +).batch(batch_size=test_batch_size) + if use_sens_prev_data: - test_sens_prev_glob = f"{sens_prev_input_root}/test/tfrecord/*.tfrecord" - test_sens_prev_files = tf.io.gfile.glob(test_sens_prev_glob) - - if not len(test_sens_prev_files): - raise ValueError(f"Did not find any eval files matching {test_sens_prev_glob}") - - test_sens_prev_ds = tf.data.TFRecordDataset(test_sens_prev_files).map(decode_fn_embedding) - test_sens_prev_ds = test_sens_prev_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=test_batch_size) + test_sens_prev_glob = f"{sens_prev_input_root}/test/tfrecord/*.tfrecord" + test_sens_prev_files = tf.io.gfile.glob(test_sens_prev_glob) + + if not len(test_sens_prev_files): + raise ValueError(f"Did not find any eval files matching {test_sens_prev_glob}") + + test_sens_prev_ds = tf.data.TFRecordDataset(test_sens_prev_files).map( + decode_fn_embedding + ) + test_sens_prev_ds = test_sens_prev_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=test_batch_size) train_ds = tf.data.TFRecordDataset(train_files).map(decode_fn_embedding) -train_ds = train_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)) +train_ds = train_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) +) if do_resample: - train_ds = train_ds.apply(resample_fn).map(lambda _,b:(b)) + train_ds = train_ds.apply(resample_fn).map(lambda _, b: (b)) train_ds = train_ds.batch(batch_size=256).shuffle(buffer_size=10) train_ds = train_ds.repeat() - - -if has_validation_data: - eval_glob = f"{input_root}/validation/tfrecord/*.tfrecord" - eval_files = tf.io.gfile.glob(eval_glob) - - if use_sens_prev_data: - eval_sens_prev_glob = f"{sens_prev_input_root}/validation/tfrecord/*.tfrecord" - eval_sens_prev_files 
= tf.io.gfile.glob(eval_sens_prev_glob) - eval_files = eval_files + eval_sens_prev_files - - - if not len(eval_files): - raise ValueError(f"Did not find any eval files matching {eval_glob}") - - eval_ds = tf.data.TFRecordDataset(eval_files).map(decode_fn_embedding) - eval_ds = eval_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=validation_batch_size) + + +if has_validation_data: + eval_glob = f"{input_root}/validation/tfrecord/*.tfrecord" + eval_files = tf.io.gfile.glob(eval_glob) + + if use_sens_prev_data: + eval_sens_prev_glob = f"{sens_prev_input_root}/validation/tfrecord/*.tfrecord" + eval_sens_prev_files = tf.io.gfile.glob(eval_sens_prev_glob) + eval_files = eval_files + eval_sens_prev_files + + if not len(eval_files): + raise ValueError(f"Did not find any eval files matching {eval_glob}") + + eval_ds = tf.data.TFRecordDataset(eval_files).map(decode_fn_embedding) + eval_ds = eval_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=validation_batch_size) else: - - eval_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) - eval_ds = eval_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=validation_batch_size) + eval_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) + eval_ds = eval_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=validation_batch_size) check_ds = tf.data.TFRecordDataset(train_files).map(decode_fn_embedding) cnt = 0 pos_cnt = 0 for example in tqdm(check_ds): - label = example['labels'] - if label == 1: - pos_cnt += 1 - cnt += 1 -print(f'{cnt} train entries with {pos_cnt} positive') + label = example["labels"] + if label == 1: + pos_cnt += 1 + cnt += 1 +print(f"{cnt} train entries with {pos_cnt} positive") metrics = [] metrics.append( - tf.keras.metrics.PrecisionAtRecall( - recall=0.9, num_thresholds=200, class_id=None, name=None, dtype=None - ) + tf.keras.metrics.PrecisionAtRecall( + recall=0.9, num_thresholds=200, class_id=None, name=None, dtype=None + ) ) metrics.append( - tf.keras.metrics.AUC( - num_thresholds=200, - curve="PR", - ) + tf.keras.metrics.AUC( + num_thresholds=200, + curve="PR", + ) ) + + def build_model(hp): - model = Sequential() + model = Sequential() + + optimizer = tf.keras.optimizers.Adam( + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-08, + amsgrad=False, + name="Adam", + ) + + activation = hp.Choice("activation", ["tanh", "gelu"]) + kernel_initializer = hp.Choice( + "kernel_initializer", ["he_uniform", "glorot_uniform"] + ) + for i in range(hp.Int("num_layers", 1, 2)): + model.add(tf.keras.layers.BatchNormalization()) + + units = hp.Int("units", min_value=128, max_value=256, step=128) + + if i == 0: + model.add( + Dense( + units=units, + activation=activation, + kernel_initializer=kernel_initializer, + input_shape=(None, 256), + ) + ) + else: + model.add( + Dense( + units=units, + activation=activation, + kernel_initializer=kernel_initializer, + ) + ) + + model.add(Dense(1, activation="sigmoid", kernel_initializer=kernel_initializer)) + model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics) + + return model - optimizer = tf.keras.optimizers.Adam( - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-08, - amsgrad=False, - name="Adam", - ) - - activation=hp.Choice("activation", ["tanh", "gelu"]) - kernel_initializer=hp.Choice("kernel_initializer", ["he_uniform", 
"glorot_uniform"]) - for i in range(hp.Int("num_layers", 1, 2)): - model.add(tf.keras.layers.BatchNormalization()) - - units=hp.Int("units", min_value=128, max_value=256, step=128) - - if i == 0: - model.add( - Dense( - units=units, - activation=activation, - kernel_initializer=kernel_initializer, - input_shape=(None, 256) - ) - ) - else: - model.add( - Dense( - units=units, - activation=activation, - kernel_initializer=kernel_initializer, - ) - ) - - model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer)) - model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics) - - return model tuner = kt.tuners.BayesianOptimization( - build_model, - objective=kt.Objective('val_loss', direction="min"), - max_trials=30, - directory='tuner_dir', - project_name='with_twitter_clip') + build_model, + objective=kt.Objective("val_loss", direction="min"), + max_trials=30, + directory="tuner_dir", + project_name="with_twitter_clip", +) -callbacks = [tf.keras.callbacks.EarlyStopping( - monitor='val_loss', min_delta=0, patience=5, verbose=0, - mode='auto', baseline=None, restore_best_weights=True -)] +callbacks = [ + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", + min_delta=0, + patience=5, + verbose=0, + mode="auto", + baseline=None, + restore_best_weights=True, + ) +] steps_per_epoch = 400 -tuner.search(train_ds, - epochs=100, - batch_size=256, - steps_per_epoch=steps_per_epoch, - verbose=2, - validation_data=eval_ds, - callbacks=callbacks) +tuner.search( + train_ds, + epochs=100, + batch_size=256, + steps_per_epoch=steps_per_epoch, + verbose=2, + validation_data=eval_ds, + callbacks=callbacks, +) tuner.results_summary() models = tuner.get_best_models(num_models=2) @@ -230,109 +262,126 @@ def build_model(hp): epsilon=1e-08, amsgrad=False, name="Adam", - ) -best_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics) +) +best_model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics) best_model.summary() -callbacks = [tf.keras.callbacks.EarlyStopping( - monitor='val_loss', min_delta=0, patience=10, verbose=0, - mode='auto', baseline=None, restore_best_weights=True -)] -history = best_model.fit(train_ds, epochs=100, validation_data=eval_ds, steps_per_epoch=steps_per_epoch, callbacks=callbacks) +callbacks = [ + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", + min_delta=0, + patience=10, + verbose=0, + mode="auto", + baseline=None, + restore_best_weights=True, + ) +] +history = best_model.fit( + train_ds, + epochs=100, + validation_data=eval_ds, + steps_per_epoch=steps_per_epoch, + callbacks=callbacks, +) -model_name = 'twitter_hypertuned' -model_path = f'models/nsfw_Keras_with_CLIP_{model_name}' +model_name = "twitter_hypertuned" +model_path = f"models/nsfw_Keras_with_CLIP_{model_name}" tf.keras.models.save_model(best_model, model_path) + def copy_local_directory_to_gcs(local_path, bucket, gcs_path): """Recursively copy a directory of files to GCS. local_path should be a directory and not have a trailing slash. 
""" assert os.path.isdir(local_path) - for local_file in glob.glob(local_path + '/**'): + for local_file in glob.glob(local_path + "/**"): if not os.path.isfile(local_file): dir_name = os.path.basename(os.path.normpath(local_file)) copy_local_directory_to_gcs(local_file, bucket, f"{gcs_path}/{dir_name}") else: - remote_path = os.path.join(gcs_path, local_file[1 + len(local_path) :]) - blob = bucket.blob(remote_path) - blob.upload_from_filename(local_file) + remote_path = os.path.join(gcs_path, local_file[1 + len(local_path) :]) + blob = bucket.blob(remote_path) + blob.upload_from_filename(local_file) + client = storage.Client(project=...) bucket = client.get_bucket(...) copy_local_directory_to_gcs(model_path, bucket, model_path) -copy_local_directory_to_gcs('tuner_dir', bucket, 'tuner_dir') +copy_local_directory_to_gcs("tuner_dir", bucket, "tuner_dir") loaded_model = tf.keras.models.load_model(model_path) print(history.history.keys()) -plt.figure(figsize = (20, 5)) +plt.figure(figsize=(20, 5)) plt.subplot(1, 3, 1) -plt.plot(history.history['auc']) -plt.plot(history.history['val_auc']) -plt.title('model auc') -plt.ylabel('auc') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["auc"]) +plt.plot(history.history["val_auc"]) +plt.title("model auc") +plt.ylabel("auc") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") plt.subplot(1, 3, 2) -plt.plot(history.history['loss']) -plt.plot(history.history['val_loss']) -plt.title('model loss') -plt.ylabel('loss') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["loss"]) +plt.plot(history.history["val_loss"]) +plt.title("model loss") +plt.ylabel("loss") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") plt.subplot(1, 3, 3) -plt.plot(history.history['precision_at_recall']) -plt.plot(history.history['val_precision_at_recall']) -plt.title('model precision at 0.9 recall') -plt.ylabel('precision_at_recall') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["precision_at_recall"]) +plt.plot(history.history["val_precision_at_recall"]) +plt.title("model precision at 0.9 recall") +plt.ylabel("precision_at_recall") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") -plt.savefig('history_with_twitter_clip.pdf') +plt.savefig("history_with_twitter_clip.pdf") test_labels = [] test_preds = [] for batch_features, batch_labels in tqdm(test_ds): - test_preds.extend(loaded_model.predict_proba(batch_features)) - test_labels.extend(batch_labels.numpy()) - + test_preds.extend(loaded_model.predict_proba(batch_features)) + test_labels.extend(batch_labels.numpy()) + test_sens_prev_labels = [] test_sens_prev_preds = [] for batch_features, batch_labels in tqdm(test_sens_prev_ds): - test_sens_prev_preds.extend(loaded_model.predict_proba(batch_features)) - test_sens_prev_labels.extend(batch_labels.numpy()) - + test_sens_prev_preds.extend(loaded_model.predict_proba(batch_features)) + test_sens_prev_labels.extend(batch_labels.numpy()) + n_test_pos = 0 n_test_neg = 0 n_test = 0 for label in test_labels: - n_test +=1 - if label == 1: - n_test_pos +=1 - else: - n_test_neg +=1 + n_test += 1 + if label == 1: + n_test_pos += 1 + else: + n_test_neg += 1 -print(f'n_test = {n_test}, n_pos = {n_test_pos}, n_neg = {n_test_neg}') +print(f"n_test = {n_test}, n_pos = {n_test_pos}, n_neg = {n_test_neg}") n_test_sens_prev_pos = 0 n_test_sens_prev_neg = 0 n_test_sens_prev = 0 for label in 
test_sens_prev_labels: - n_test_sens_prev +=1 - if label == 1: - n_test_sens_prev_pos +=1 - else: - n_test_sens_prev_neg +=1 + n_test_sens_prev += 1 + if label == 1: + n_test_sens_prev_pos += 1 + else: + n_test_sens_prev_neg += 1 -print(f'n_test_sens_prev = {n_test_sens_prev}, n_pos_sens_prev = {n_test_sens_prev_pos}, n_neg = {n_test_sens_prev_neg}') +print( + f"n_test_sens_prev = {n_test_sens_prev}, n_pos_sens_prev = {n_test_sens_prev_pos}, n_neg = {n_test_sens_prev_neg}" +) test_weights = np.ones(np.asarray(test_preds).shape) @@ -340,9 +389,7 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): test_preds = np.asarray(test_preds) test_weights = np.asarray(test_weights) -pr = sklearn.metrics.precision_recall_curve( - test_labels, - test_preds) +pr = sklearn.metrics.precision_recall_curve(test_labels, test_preds) auc = sklearn.metrics.auc(pr[1], pr[0]) plt.plot(pr[1], pr[0]) @@ -355,25 +402,26 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): test_sens_prev_weights = np.asarray(test_sens_prev_weights) pr_sens_prev = sklearn.metrics.precision_recall_curve( - test_sens_prev_labels, - test_sens_prev_preds) + test_sens_prev_labels, test_sens_prev_preds +) auc_sens_prev = sklearn.metrics.auc(pr_sens_prev[1], pr_sens_prev[0]) plt.plot(pr_sens_prev[1], pr_sens_prev[0]) plt.title("nsfw (sens prev test set)") df = pd.DataFrame( - { - "label": test_labels.squeeze(), - "preds_keras": np.asarray(test_preds).flatten(), - }) + { + "label": test_labels.squeeze(), + "preds_keras": np.asarray(test_preds).flatten(), + } +) plt.figure(figsize=(15, 10)) df["preds_keras"].hist() plt.title("Keras predictions", size=20) -plt.xlabel('score') +plt.xlabel("score") plt.ylabel("freq") -plt.figure(figsize = (20, 5)) +plt.figure(figsize=(20, 5)) plt.subplot(1, 3, 1) plt.plot(pr[2], pr[0][0:-1]) @@ -393,15 +441,19 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): plt.xlabel("recall") plt.ylabel("precision") -plt.savefig('with_twitter_clip.pdf') +plt.savefig("with_twitter_clip.pdf") + def get_point_for_recall(recall_value, recall, precision): - idx = np.argmin(np.abs(recall - recall_value)) - return (recall[idx], precision[idx]) + idx = np.argmin(np.abs(recall - recall_value)) + return (recall[idx], precision[idx]) + def get_point_for_precision(precision_value, recall, precision): - idx = np.argmin(np.abs(precision - precision_value)) - return (recall[idx], precision[idx]) + idx = np.argmin(np.abs(precision - precision_value)) + return (recall[idx], precision[idx]) + + precision, recall, thresholds = pr auc_precision_recall = sklearn.metrics.auc(recall, precision) @@ -416,23 +468,23 @@ def get_point_for_precision(precision_value, recall, precision): ptAt50 = get_point_for_recall(0.5, recall, precision) print(ptAt50) -plt.plot( [ptAt50[0],ptAt50[0]], [0,ptAt50[1]], 'r') -plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], 'r') +plt.plot([ptAt50[0], ptAt50[0]], [0, ptAt50[1]], "r") +plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], "r") ptAt90 = get_point_for_recall(0.9, recall, precision) print(ptAt90) -plt.plot( [ptAt90[0],ptAt90[0]], [0,ptAt90[1]], 'b') -plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], 'b') +plt.plot([ptAt90[0], ptAt90[0]], [0, ptAt90[1]], "b") +plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], "b") ptAt50fmt = "%.4f" % ptAt50[1] ptAt90fmt = "%.4f" % ptAt90[1] aucFmt = "%.4f" % auc_precision_recall plt.title( - f"Keras (nsfw MU test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...}} ({...} pos), N_test={n_test} ({n_test_pos} pos)", - 
size=20 + f"Keras (nsfw MU test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test} ({n_test_pos} pos)", + size=20, ) plt.subplots_adjust(top=0.72) -plt.savefig('recall_precision_nsfw_Keras_with_twitter_CLIP_MU_test.pdf') +plt.savefig("recall_precision_nsfw_Keras_with_twitter_CLIP_MU_test.pdf") precision, recall, thresholds = pr_sens_prev @@ -447,20 +499,20 @@ def get_point_for_precision(precision_value, recall, precision): ptAt50 = get_point_for_recall(0.5, recall, precision) print(ptAt50) -plt.plot( [ptAt50[0],ptAt50[0]], [0,ptAt50[1]], 'r') -plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], 'r') +plt.plot([ptAt50[0], ptAt50[0]], [0, ptAt50[1]], "r") +plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], "r") ptAt90 = get_point_for_recall(0.9, recall, precision) print(ptAt90) -plt.plot( [ptAt90[0],ptAt90[0]], [0,ptAt90[1]], 'b') -plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], 'b') +plt.plot([ptAt90[0], ptAt90[0]], [0, ptAt90[1]], "b") +plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], "b") ptAt50fmt = "%.4f" % ptAt50[1] ptAt90fmt = "%.4f" % ptAt90[1] aucFmt = "%.4f" % auc_precision_recall plt.title( - f"Keras (nsfw sens prev test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test_sens_prev} ({n_test_sens_prev_pos} pos)", - size=20 + f"Keras (nsfw sens prev test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test_sens_prev} ({n_test_sens_prev_pos} pos)", + size=20, ) plt.subplots_adjust(top=0.72) -plt.savefig('recall_precision_nsfw_Keras_with_twitter_CLIP_sens_prev_test.pdf') \ No newline at end of file +plt.savefig("recall_precision_nsfw_Keras_with_twitter_CLIP_sens_prev_test.pdf") diff --git a/trust_and_safety_models/nsfw/nsfw_text.py b/trust_and_safety_models/nsfw/nsfw_text.py index 980fc8fd4..0d7735371 100644 --- a/trust_and_safety_models/nsfw/nsfw_text.py +++ b/trust_and_safety_models/nsfw/nsfw_text.py @@ -1,41 +1,47 @@ +import os +import re from datetime import datetime from functools import reduce -import os + +import matplotlib.pyplot as plt import pandas as pd -import re -from sklearn.metrics import average_precision_score, classification_report, precision_recall_curve, PrecisionRecallDisplay -from sklearn.model_selection import train_test_split import tensorflow as tf -import matplotlib.pyplot as plt -import re - +from sklearn.metrics import ( + PrecisionRecallDisplay, + average_precision_score, + classification_report, + precision_recall_curve, +) +from sklearn.model_selection import train_test_split from twitter.cuad.representation.models.optimization import create_optimizer from twitter.cuad.representation.models.text_encoder import TextEncoder -pd.set_option('display.max_colwidth', None) -pd.set_option('display.expand_frame_repr', False) +pd.set_option("display.max_colwidth", None) +pd.set_option("display.expand_frame_repr", False) print(tf.__version__) print(tf.config.list_physical_devices()) -log_path = os.path.join('pnsfwtweettext_model_runs', datetime.now().strftime('%Y-%m-%d_%H.%M.%S')) +log_path = os.path.join( + "pnsfwtweettext_model_runs", datetime.now().strftime("%Y-%m-%d_%H.%M.%S") +) -tweet_text_feature = 'text' +tweet_text_feature = "text" params = { - 'batch_size': 32, - 'max_seq_lengths': 256, - 'model_type': 'twitter_bert_base_en_uncased_augmented_mlm', - 'trainable_text_encoder': True, - 'lr': 5e-5, - 'epochs': 10, + "batch_size": 32, + "max_seq_lengths": 256, + "model_type": 
"twitter_bert_base_en_uncased_augmented_mlm", + "trainable_text_encoder": True, + "lr": 5e-5, + "epochs": 10, } REGEX_PATTERNS = [ - r'^RT @[A-Za-z0-9_]+: ', + r"^RT @[A-Za-z0-9_]+: ", r"@[A-Za-z0-9_]+", - r'https:\/\/t\.co\/[A-Za-z0-9]{10}', - r'@\?\?\?\?\?', + r"https:\/\/t\.co\/[A-Za-z0-9]{10}", + r"@\?\?\?\?\?", ] EMOJI_PATTERN = re.compile( @@ -52,34 +58,40 @@ "\U0001FA70-\U0001FAFF" "\U00002702-\U000027B0" "])" - ) +) + def clean_tweet(text): for pattern in REGEX_PATTERNS: - text = re.sub(pattern, '', text) + text = re.sub(pattern, "", text) + + text = re.sub(EMOJI_PATTERN, r" \1 ", text) + + text = re.sub(r"\n", " ", text) - text = re.sub(EMOJI_PATTERN, r' \1 ', text) - - text = re.sub(r'\n', ' ', text) - return text.strip().lower() -df['processed_text'] = df['text'].astype(str).map(clean_tweet) +df["processed_text"] = df["text"].astype(str).map(clean_tweet) df.sample(10) -X_train, X_val, y_train, y_val = train_test_split(df[['processed_text']], df['is_nsfw'], test_size=0.1, random_state=1) +X_train, X_val, y_train, y_val = train_test_split( + df[["processed_text"]], df["is_nsfw"], test_size=0.1, random_state=1 +) + def df_to_ds(X, y, shuffle=False): - ds = tf.data.Dataset.from_tensor_slices(( - X.values, - tf.one_hot(tf.cast(y.values, tf.int32), depth=2, axis=-1) - )) - - if shuffle: - ds = ds.shuffle(1000, seed=1, reshuffle_each_iteration=True) - - return ds.map(lambda text, label: ({ tweet_text_feature: text }, label)).batch(params['batch_size']) + ds = tf.data.Dataset.from_tensor_slices( + (X.values, tf.one_hot(tf.cast(y.values, tf.int32), depth=2, axis=-1)) + ) + + if shuffle: + ds = ds.shuffle(1000, seed=1, reshuffle_each_iteration=True) + + return ds.map(lambda text, label: ({tweet_text_feature: text}, label)).batch( + params["batch_size"] + ) + ds_train = df_to_ds(X_train, y_train, shuffle=True) ds_val = df_to_ds(X_val, y_val) @@ -87,51 +99,47 @@ def df_to_ds(X, y, shuffle=False): inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name=tweet_text_feature) encoder = TextEncoder( - max_seq_lengths=params['max_seq_lengths'], - model_type=params['model_type'], - trainable=params['trainable_text_encoder'], - local_preprocessor_path='demo-preprocessor' + max_seq_lengths=params["max_seq_lengths"], + model_type=params["model_type"], + trainable=params["trainable_text_encoder"], + local_preprocessor_path="demo-preprocessor", ) embedding = encoder([inputs])["pooled_output"] -predictions = tf.keras.layers.Dense(2, activation='softmax')(embedding) +predictions = tf.keras.layers.Dense(2, activation="softmax")(embedding) model = tf.keras.models.Model(inputs=inputs, outputs=predictions) model.summary() optimizer = create_optimizer( - params['lr'], - params['epochs'] * len(ds_train), - 0, - weight_decay_rate=0.01, - optimizer_type='adamw' + params["lr"], + params["epochs"] * len(ds_train), + 0, + weight_decay_rate=0.01, + optimizer_type="adamw", ) bce = tf.keras.losses.BinaryCrossentropy(from_logits=False) -pr_auc = tf.keras.metrics.AUC(curve='PR', num_thresholds=1000, from_logits=False) +pr_auc = tf.keras.metrics.AUC(curve="PR", num_thresholds=1000, from_logits=False) model.compile(optimizer=optimizer, loss=bce, metrics=[pr_auc]) callbacks = [ - tf.keras.callbacks.EarlyStopping( - monitor='val_loss', - mode='min', - patience=1, - restore_best_weights=True - ), - tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(log_path, 'checkpoints', '{epoch:02d}'), - save_freq='epoch' - ), - tf.keras.callbacks.TensorBoard( - log_dir=os.path.join(log_path, 'scalars'), - 
update_freq='batch', - write_graph=False - ) + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", mode="min", patience=1, restore_best_weights=True + ), + tf.keras.callbacks.ModelCheckpoint( + filepath=os.path.join(log_path, "checkpoints", "{epoch:02d}"), save_freq="epoch" + ), + tf.keras.callbacks.TensorBoard( + log_dir=os.path.join(log_path, "scalars"), + update_freq="batch", + write_graph=False, + ), ] history = model.fit( - ds_train, - epochs=params['epochs'], - callbacks=callbacks, - validation_data=ds_val, - steps_per_epoch=len(ds_train) + ds_train, + epochs=params["epochs"], + callbacks=callbacks, + validation_data=ds_val, + steps_per_epoch=len(ds_train), ) model.predict(["xxx 🍑"]) diff --git a/trust_and_safety_models/toxicity/data/data_preprocessing.py b/trust_and_safety_models/toxicity/data/data_preprocessing.py index f7da608f6..7d2ece32e 100644 --- a/trust_and_safety_models/toxicity/data/data_preprocessing.py +++ b/trust_and_safety_models/toxicity/data/data_preprocessing.py @@ -1,118 +1,130 @@ -from abc import ABC import re - -from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35 +from abc import ABC import numpy as np - +import pandas as pd +from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35 TOXIC_35_set = set(TOXIC_35) -url_group = r"(\bhttps?:\/\/\S+)" -mention_group = r"(\B@\S+)" -urls_mentions_re = re.compile(url_group + r"|" + mention_group, re.IGNORECASE) -url_re = re.compile(url_group, re.IGNORECASE) -mention_re = re.compile(mention_group, re.IGNORECASE) -newline_re = re.compile(r"\n+", re.IGNORECASE) -and_re = re.compile(r"&\s?amp\s?;", re.IGNORECASE) +URL_GROUP = r"(\bhttps?:\/\/\S+)" +MENTION_GROUP = r"(\B@\S+)" +URLS_MENTIONS_RE = re.compile(URL_GROUP + r"|" + MENTION_GROUP, re.IGNORECASE) +URL_RE = re.compile(URL_GROUP, re.IGNORECASE) +MENTION_RE = re.compile(MENTION_GROUP, re.IGNORECASE) +NEWLINE_RE = re.compile(r"\n+", re.IGNORECASE) +AND_RE = re.compile(r"&\s?amp\s?;", re.IGNORECASE) class DataframeCleaner(ABC): - def __init__(self): - pass - - def _clean(self, df): - return df + def __init__(self): + pass - def _systematic_preprocessing(self, df): - df.reset_index(inplace=True, drop=True) - if "media_url" in df.columns: - print(".... removing tweets with media") - df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0) - else: - print("WARNING you are not removing tweets with media to train a BERT model.") + def _clean(self, df: pd.DataFrame) -> pd.DataFrame: + return df - print(".... deleting duplicates") - df.drop_duplicates("text", inplace=True, keep="last") - print(f"Got {df.shape[0]} after cleaning") - - return df.reset_index(inplace=False, drop=True) - - def _postprocess(self, df, *args, **kwargs): - return df + def _systematic_preprocessing(self, df: pd.DataFrame) -> pd.DataFrame: + df.reset_index(inplace=True, drop=True) + if "media_url" in df.columns: + print(".... removing tweets with media") + df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0) + else: + print( + "WARNING you are not removing tweets with media to train a BERT model." + ) - def __call__(self, df, *args, **kwargs): - print(f"Got {df.shape[0]} before cleaning") + print(".... 
deleting duplicates") + df.drop_duplicates("text", inplace=True, keep="last") + print(f"Got {df.shape[0]} after cleaning") - df["raw_text"] = df.text - df = self._clean(df) + return df.reset_index(inplace=False, drop=True) - df = self._systematic_preprocessing(df) + def _postprocess(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame: + return df - return self._postprocess(df, *args, **kwargs) + def __call__(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame: + print(f"Got {df.shape[0]} before cleaning") + df["raw_text"] = df.text + df = self._clean(df) + df = self._systematic_preprocessing(df) + return self._postprocess(df, *args, **kwargs) -def mapping_func(el): - if el.aggregated_content in TOXIC_35_set: - return 2 - if el.label == 1: - return 1 - return 0 +def mapping_func(el: pd.Series) -> int: + if el.aggregated_content in TOXIC_35_set: + return 2 + if el.label == 1: + return 1 + return 0 class DefaultENNoPreprocessor(DataframeCleaner): - def _postprocess(self, df, *args, **kwargs): - if "toxic_count" in df.columns and "non_toxic_count" in df.columns: - df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count) - df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0) - - if "label_column" in kwargs and kwargs["label_column"] != "label": - if kwargs["label_column"] == "aggregated_content": - print("Replacing v3 label by v3.5 label.") - if "num_classes" in kwargs and kwargs["num_classes"] < 3: - df["label"] = np.where(df.aggregated_content.isin(TOXIC_35_set), 1, 0) - elif "num_classes" in kwargs and kwargs["num_classes"] == 3: - print("Making it a 3-class pb") - df["label"] = df.apply(mapping_func, axis=1) - else: - raise NotImplementedError - elif kwargs['label_column'] in df.columns: - df['label'] = df[kwargs['label_column']] - if kwargs['class_weight'] is not None: - df["class_weight"] = np.where(df['label'] == 1, 1-kwargs['class_weight'], - kwargs['class_weight']) - else: - raise NotImplementedError - - if "filter_low_agreements" in kwargs and kwargs["filter_low_agreements"] == True: - df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True) - raise NotImplementedError - - return df + def _postprocess(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame: + if "toxic_count" in df.columns and "non_toxic_count" in df.columns: + df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count) + df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0) + + if "label_column" in kwargs and kwargs["label_column"] != "label": + if kwargs["label_column"] == "aggregated_content": + print("Replacing v3 label by v3.5 label.") + if "num_classes" in kwargs and kwargs["num_classes"] < 3: + df["label"] = np.where( + df.aggregated_content.isin(TOXIC_35_set), 1, 0 + ) + elif "num_classes" in kwargs and kwargs["num_classes"] == 3: + print("Making it a 3-class pb") + df["label"] = df.apply(mapping_func, axis=1) + else: + raise NotImplementedError + elif kwargs["label_column"] in df.columns: + df["label"] = df[kwargs["label_column"]] + if kwargs["class_weight"] is not None: + df["class_weight"] = np.where( + df["label"] == 1, + 1 - kwargs["class_weight"], + kwargs["class_weight"], + ) + else: + raise NotImplementedError + + if ( + "filter_low_agreements" in kwargs + and kwargs["filter_low_agreements"] == True + ): + df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True) + raise NotImplementedError + + return df class DefaultENPreprocessor(DefaultENNoPreprocessor): - def _clean(self, adhoc_df): - print( - ".... 
removing \\n and replacing @mentions and URLs by placeholders. " - "Emoji filtering is not done." - ) - adhoc_df["text"] = [url_re.sub("URL", tweet) for tweet in adhoc_df.raw_text.values] - adhoc_df["text"] = [mention_re.sub("MENTION", tweet) for tweet in adhoc_df.text.values] - adhoc_df["text"] = [ - newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values - ] - adhoc_df["text"] = [and_re.sub("&", tweet) for tweet in adhoc_df.text.values] - - return adhoc_df + def _clean(self, adhoc_df: pd.DataFrame) -> pd.DataFrame: + print( + "... removing \\n and replacing @mentions and URLs by placeholders. " + "Emoji filtering is not done." + ) + adhoc_df["text"] = [ + URL_RE.sub("URL", tweet) for tweet in adhoc_df.raw_text.values + ] + adhoc_df["text"] = [ + MENTION_RE.sub("MENTION", tweet) for tweet in adhoc_df.text.values + ] + adhoc_df["text"] = [ + NEWLINE_RE.sub(" ", tweet).lstrip(" ").rstrip(" ") + for tweet in adhoc_df.text.values + ] + adhoc_df["text"] = [AND_RE.sub("&", tweet) for tweet in adhoc_df.text.values] + return adhoc_df class Defaulti18nPreprocessor(DataframeCleaner): - def _clean(self, adhoc_df): - print(".... removing @mentions, \\n and URLs. Emoji filtering is not done.") - adhoc_df["text"] = [urls_mentions_re.sub("", tweet) for tweet in adhoc_df.raw_text.values] - adhoc_df["text"] = [ - newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values - ] - - return adhoc_df + def _clean(self, adhoc_df): + print("... removing @mentions, \\n and URLs. Emoji filtering is not done.") + adhoc_df["text"] = [ + URLS_MENTIONS_RE.sub("", tweet) for tweet in adhoc_df.raw_text.values + ] + adhoc_df["text"] = [ + NEWLINE_RE.sub(" ", tweet).lstrip(" ").rstrip(" ") + for tweet in adhoc_df.text.values + ] + return adhoc_df diff --git a/trust_and_safety_models/toxicity/data/dataframe_loader.py b/trust_and_safety_models/toxicity/data/dataframe_loader.py index f3855d6b5..b9b1613ff 100644 --- a/trust_and_safety_models/toxicity/data/dataframe_loader.py +++ b/trust_and_safety_models/toxicity/data/dataframe_loader.py @@ -1,348 +1,359 @@ +import pickle from abc import ABC, abstractmethod from datetime import date from importlib import import_module -import pickle +from typing import Optional, Tuple +import numpy as np +import pandas as pd from toxicity_ml_pipeline.settings.default_settings_tox import ( - CLIENT, - EXISTING_TASK_VERSIONS, - GCS_ADDRESS, - TRAINING_DATA_LOCATION, -) + CLIENT, EXISTING_TASK_VERSIONS, GCS_ADDRESS, TRAINING_DATA_LOCATION) from toxicity_ml_pipeline.utils.helpers import execute_command, execute_query -from toxicity_ml_pipeline.utils.queries import ( - FULL_QUERY, - FULL_QUERY_W_TWEET_TYPES, - PARSER_UDF, - QUERY_SETTINGS, -) - -import numpy as np -import pandas +from toxicity_ml_pipeline.utils.queries import (FULL_QUERY, + FULL_QUERY_W_TWEET_TYPES, + PARSER_UDF, QUERY_SETTINGS) class DataframeLoader(ABC): + def __init__(self, project: str): + self.project = project - def __init__(self, project): - self.project = project - - @abstractmethod - def produce_query(self): - pass + @abstractmethod + def produce_query(self): + pass - @abstractmethod - def load_data(self, test=False): - pass + @abstractmethod + def load_data(self, test=False): + pass class ENLoader(DataframeLoader): - def __init__(self, project, setting_file): - super(ENLoader, self).__init__(project=project) - self.date_begin = setting_file.DATE_BEGIN - self.date_end = setting_file.DATE_END - TASK_VERSION = setting_file.TASK_VERSION - if TASK_VERSION not in 
EXISTING_TASK_VERSIONS: - raise ValueError - self.task_version = TASK_VERSION - self.query_settings = dict(QUERY_SETTINGS) - self.full_query = FULL_QUERY - - def produce_query(self, date_begin, date_end, task_version=None, **keys): - task_version = self.task_version if task_version is None else task_version - - if task_version in keys["table"]: - table_name = keys["table"][task_version] - print(f"Loading {table_name}") - - main_query = keys["main"].format( - table=table_name, - parser_udf=PARSER_UDF[task_version], - date_begin=date_begin, - date_end=date_end, - ) - - return self.full_query.format( - main_table_query=main_query, date_begin=date_begin, date_end=date_end - ) - return "" - - def _reload(self, test, file_keyword): - query = f"SELECT * from `{TRAINING_DATA_LOCATION.format(project=self.project)}_{file_keyword}`" - - if test: - query += " ORDER BY RAND() LIMIT 1000" - try: - df = execute_query(client=CLIENT, query=query) - except Exception: - print( - "Loading from BQ failed, trying to load from GCS. " - "NB: use this option only for intermediate files, which will be deleted at the end of " - "the project." - ) - copy_cmd = f"gsutil cp {GCS_ADDRESS.format(project=self.project)}/training_data/{file_keyword}.pkl ." - execute_command(copy_cmd) - try: - with open(f"{file_keyword}.pkl", "rb") as file: - df = pickle.load(file) - except Exception: - return None - - if test: - df = df.sample(frac=1) - return df.iloc[:1000] - - return df - - def load_data(self, test=False, **kwargs): - if "reload" in kwargs and kwargs["reload"]: - df = self._reload(test, kwargs["reload"]) - if df is not None and df.shape[0] > 0: + def __init__(self, project: str, setting_file): + super(ENLoader, self).__init__(project=project) + self.date_begin = setting_file.DATE_BEGIN + self.date_end = setting_file.DATE_END + TASK_VERSION = setting_file.TASK_VERSION + if TASK_VERSION not in EXISTING_TASK_VERSIONS: + raise ValueError + self.task_version = TASK_VERSION + self.query_settings = dict(QUERY_SETTINGS) + self.full_query = FULL_QUERY + + def produce_query( + self, date_begin: str, date_end: str, task_version: float = None, **keys + ) -> str: + task_version = self.task_version if task_version is None else task_version + + if task_version in keys["table"]: + table_name = keys["table"][task_version] + print(f"Loading {table_name}") + + main_query = keys["main"].format( + table=table_name, + parser_udf=PARSER_UDF[task_version], + date_begin=date_begin, + date_end=date_end, + ) + + return self.full_query.format( + main_table_query=main_query, date_begin=date_begin, date_end=date_end + ) + return "" + + def _reload(self, test: bool, file_keyword: str) -> pd.DataFrame: + query = f"SELECT * from `{TRAINING_DATA_LOCATION.format(project=self.project)}_{file_keyword}`" + if test: + query += " ORDER BY RAND() LIMIT 1000" + try: + df = execute_query(client=CLIENT, query=query) + except Exception: + print( + "Loading from BQ failed, trying to load from GCS. " + "NB: use this option only for intermediate files, which will be deleted at the end of " + "the project." + ) + copy_cmd = f"gsutil cp {GCS_ADDRESS.format(project=self.project)}/training_data/{file_keyword}.pkl ." 
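+            # fall back to the pickled intermediate file on GCS when the
+            # BigQuery read above fails; the copy is loaded locally below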
+ execute_command(copy_cmd) + try: + with open(f"{file_keyword}.pkl", "rb") as file: + df = pickle.load(file) + except Exception: + return None + + if test: + df = df.sample(frac=1) + return df.iloc[:1000] + return df + + def load_data(self, test: bool = False, **kwargs) -> Optional[pd.DataFrame]: + if "reload" in kwargs and kwargs["reload"]: + df = self._reload(test, kwargs["reload"]) + if df is not None and df.shape[0] > 0: + return df + + df = None + query_settings = self.query_settings + if test: + query_settings = {"fairness": self.query_settings["fairness"]} + query_settings["fairness"]["main"] += " LIMIT 500" + for table, query_info in query_settings.items(): + curr_query = self.produce_query( + date_begin=self.date_begin, date_end=self.date_end, **query_info + ) + if curr_query == "": + continue + curr_df = execute_query(client=CLIENT, query=curr_query) + curr_df["origin"] = table + df = curr_df if df is None else pd.concat((df, curr_df)) + df["loading_date"] = date.today() + df["date"] = pd.to_datetime(df.date) return df - df = None - query_settings = self.query_settings - if test: - query_settings = {"fairness": self.query_settings["fairness"]} - query_settings["fairness"]["main"] += " LIMIT 500" - - for table, query_info in query_settings.items(): - curr_query = self.produce_query( - date_begin=self.date_begin, date_end=self.date_end, **query_info - ) - if curr_query == "": - continue - curr_df = execute_query(client=CLIENT, query=curr_query) - curr_df["origin"] = table - df = curr_df if df is None else pandas.concat((df, curr_df)) - - df["loading_date"] = date.today() - df["date"] = pandas.to_datetime(df.date) - return df - - def load_precision_set( - self, begin_date="...", end_date="...", with_tweet_types=False, task_version=3.5 - ): - if with_tweet_types: - self.full_query = FULL_QUERY_W_TWEET_TYPES - - query_settings = self.query_settings - curr_query = self.produce_query( - date_begin=begin_date, - date_end=end_date, - task_version=task_version, - **query_settings["precision"], - ) - curr_df = execute_query(client=CLIENT, query=curr_query) - - curr_df.rename(columns={"media_url": "media_presence"}, inplace=True) - return curr_df + def load_precision_set( + self, + begin_date: str = "...", + end_date: str = "...", + with_tweet_types: bool = False, + task_version: float = 3.5, + ) -> pd.DataFrame: + if with_tweet_types: + self.full_query = FULL_QUERY_W_TWEET_TYPES + + query_settings = self.query_settings + curr_query = self.produce_query( + date_begin=begin_date, + date_end=end_date, + task_version=task_version, + **query_settings["precision"], + ) + curr_df = execute_query(client=CLIENT, query=curr_query) + + curr_df.rename(columns={"media_url": "media_presence"}, inplace=True) + return curr_df class ENLoaderWithSampling(ENLoader): + keywords = { + "politics": [...], + "insults": [...], + "race": [...], + } + n = ... + N = ... 
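+    # `keywords` maps each topic group to a (redacted) keyword list used by
+    # sample_keywords(); `n` is the overall sampling budget and `N` the part
+    # reserved for the second, keyword- or origin-based set in sample()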
+ + def __init__(self, project: str): + self.raw_loader = ENLoader(project=project) + if project == ...: + self.project = project + else: + raise ValueError + + def sample_with_weights(self, df: pd.DataFrame, n: int) -> pd.DataFrame: + w = df["label"].value_counts(normalize=True)[1] + dist = np.full((df.shape[0],), w) + sampled_df = df.sample(n=n, weights=dist, replace=False) + return sampled_df + + def sample_keywords( + self, df: pd.DataFrame, N: int, group: str + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + print("\nmatching", group, "keywords...") + keyword_list = self.keywords[group] + match_df = df.loc[ + df.text.str.lower().str.contains("|".join(keyword_list), regex=True) + ] + + print("sampling N/3 from", group) + if match_df.shape[0] <= N / 3: + print( + "WARNING: Sampling only", + match_df.shape[0], + "instead of", + N / 3, + "examples from race focused tweets due to insufficient data", + ) + sample_df = match_df + + else: + print( + "sampling", + group, + "at", + round(match_df["label"].value_counts(normalize=True)[1], 3), + "% action rate", + ) + sample_df = self.sample_with_weights(match_df, int(N / 3)) + print(sample_df.shape) + print(sample_df.label.value_counts(normalize=True)) + + print( + "\nshape of df before dropping sampled rows after", + group, + "matching..", + df.shape[0], + ) + df = df.loc[df.index.difference(sample_df.index),] + print( + "\nshape of df after dropping sampled rows after", + group, + "matching..", + df.shape[0], + ) + return df, sample_df + + def sample_first_set_helper( + self, train_df: pd.DataFrame, first_set: pd.DataFrame, new_n: int + ) -> pd.DataFrame: + if first_set == "prev": + fset = train_df.loc[ + train_df["origin"].isin(["prevalence", "causal prevalence"]) + ] + print( + f"sampling prev at {fset['label'].value_counts(normalize=True)[1]:.3f}% action rate" + ) + else: + fset = train_df + + n_fset = self.sample_with_weights(fset, new_n) + print("len of sampled first set", n_fset.shape[0]) + print(n_fset.label.value_counts(normalize=True)) + + return n_fset + + def sample( + self, + df: pd.DataFrame, + first_set: pd.DataFrame, + second_set: pd.DataFrame, + keyword_sampling: bool, + n: int, + N: int, + ) -> pd.DataFrame: + train_df = df[df.origin != "precision"] + val_test_df = df[df.origin == "precision"] + + print("\nsampling first set of data") + new_n = n - N if second_set is not None else n + n_fset = self.sample_first_set_helper(train_df, first_set, new_n) + + print("\nsampling second set of data") + train_df = train_df.loc[train_df.index.difference(n_fset.index),] + + if second_set is None: + print("no second set sampling being done") + df = n_fset.append(val_test_df) + return df + + if second_set == "prev": + sset = train_df.loc[ + train_df["origin"].isin(["prevalence", "causal prevalence"]) + ] + elif second_set == "fdr": + sset = train_df.loc[train_df["origin"] == "fdr"] + else: + sset = train_df + + if keyword_sampling == True: + print("sampling based off of keywords defined...") + print("second set is", second_set, "with length", sset.shape[0]) + sset, n_politics = self.sample_keywords(sset, N, "politics") + sset, n_insults = self.sample_keywords(sset, N, "insults") + sset, n_race = self.sample_keywords(sset, N, "race") + n_sset = n_politics.append([n_insults, n_race]) + print("len of sampled second set", n_sset.shape[0]) + else: + print( + "No keyword sampling. 
Instead random sampling from", + second_set, + "at", + round(sset["label"].value_counts(normalize=True)[1], 3), + "% action rate", + ) + n_sset = self.sample_with_weights(sset, N) + print("len of sampled second set", n_sset.shape[0]) + print(n_sset.label.value_counts(normalize=True)) + + df = n_fset.append([n_sset, val_test_df]) + df = df.sample(frac=1).reset_index(drop=True) + return df + + def load_data( + self, + first_set: str = "prev", + second_set: str = None, + keyword_sampling: bool = False, + test: bool = False, + **kwargs, + ) -> pd.DataFrame: + n = kwargs.get("n", self.n) + N = kwargs.get("N", self.N) - keywords = { - "politics": [ -... - ], - "insults": [ -... - ], - "race": [ -... - ], - } - n = ... - N = ... - - def __init__(self, project): - self.raw_loader = ENLoader(project=project) - if project == ...: - self.project = project - else: - raise ValueError - - def sample_with_weights(self, df, n): - w = df["label"].value_counts(normalize=True)[1] - dist = np.full((df.shape[0],), w) - sampled_df = df.sample(n=n, weights=dist, replace=False) - return sampled_df - - def sample_keywords(self, df, N, group): - print("\nmatching", group, "keywords...") - - keyword_list = self.keywords[group] - match_df = df.loc[df.text.str.lower().str.contains("|".join(keyword_list), regex=True)] - - print("sampling N/3 from", group) - if match_df.shape[0] <= N / 3: - print( - "WARNING: Sampling only", - match_df.shape[0], - "instead of", - N / 3, - "examples from race focused tweets due to insufficient data", - ) - sample_df = match_df - - else: - print( - "sampling", - group, - "at", - round(match_df["label"].value_counts(normalize=True)[1], 3), - "% action rate", - ) - sample_df = self.sample_with_weights(match_df, int(N / 3)) - print(sample_df.shape) - print(sample_df.label.value_counts(normalize=True)) - - print("\nshape of df before dropping sampled rows after", group, "matching..", df.shape[0]) - df = df.loc[ - df.index.difference(sample_df.index), - ] - print("\nshape of df after dropping sampled rows after", group, "matching..", df.shape[0]) - - return df, sample_df - - def sample_first_set_helper(self, train_df, first_set, new_n): - if first_set == "prev": - fset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])] - print( - "sampling prev at", round(fset["label"].value_counts(normalize=True)[1], 3), "% action rate" - ) - else: - fset = train_df - - n_fset = self.sample_with_weights(fset, new_n) - print("len of sampled first set", n_fset.shape[0]) - print(n_fset.label.value_counts(normalize=True)) - - return n_fset - - def sample(self, df, first_set, second_set, keyword_sampling, n, N): - train_df = df[df.origin != "precision"] - val_test_df = df[df.origin == "precision"] - - print("\nsampling first set of data") - new_n = n - N if second_set is not None else n - n_fset = self.sample_first_set_helper(train_df, first_set, new_n) - - print("\nsampling second set of data") - train_df = train_df.loc[ - train_df.index.difference(n_fset.index), - ] - - if second_set is None: - print("no second set sampling being done") - df = n_fset.append(val_test_df) - return df - - if second_set == "prev": - sset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])] - - elif second_set == "fdr": - sset = train_df.loc[train_df["origin"] == "fdr"] - - else: - sset = train_df - - if keyword_sampling == True: - print("sampling based off of keywords defined...") - print("second set is", second_set, "with length", sset.shape[0]) - - sset, n_politics = 
self.sample_keywords(sset, N, "politics") - sset, n_insults = self.sample_keywords(sset, N, "insults") - sset, n_race = self.sample_keywords(sset, N, "race") - - n_sset = n_politics.append([n_insults, n_race]) - print("len of sampled second set", n_sset.shape[0]) - - else: - print( - "No keyword sampling. Instead random sampling from", - second_set, - "at", - round(sset["label"].value_counts(normalize=True)[1], 3), - "% action rate", - ) - n_sset = self.sample_with_weights(sset, N) - print("len of sampled second set", n_sset.shape[0]) - print(n_sset.label.value_counts(normalize=True)) - - df = n_fset.append([n_sset, val_test_df]) - df = df.sample(frac=1).reset_index(drop=True) - - return df - - def load_data( - self, first_set="prev", second_set=None, keyword_sampling=False, test=False, **kwargs - ): - n = kwargs.get("n", self.n) - N = kwargs.get("N", self.N) - - df = self.raw_loader.load_data(test=test, **kwargs) - return self.sample(df, first_set, second_set, keyword_sampling, n, N) + df = self.raw_loader.load_data(test=test, **kwargs) + return self.sample(df, first_set, second_set, keyword_sampling, n, N) class I18nLoader(DataframeLoader): - def __init__(self): - super().__init__(project=...) - from archive.settings.... import ACCEPTED_LANGUAGES, QUERY_SETTINGS - - self.accepted_languages = ACCEPTED_LANGUAGES - self.query_settings = dict(QUERY_SETTINGS) - - def produce_query(self, language, query, dataset, table, lang): - query = query.format(dataset=dataset, table=table) - add_query = f"AND reviewed.{lang}='{language}'" - query += add_query - - return query - - def query_keys(self, language, task=2, size="50"): - if task == 2: - if language == "ar": - self.query_settings["adhoc_v2"]["table"] = "..." - elif language == "tr": - self.query_settings["adhoc_v2"]["table"] = "..." - elif language == "es": - self.query_settings["adhoc_v2"]["table"] = f"..." - else: - self.query_settings["adhoc_v2"]["table"] = "..." - - return self.query_settings["adhoc_v2"] - - if task == 3: - return self.query_settings["adhoc_v3"] - - raise ValueError(f"There are no other tasks than 2 or 3. {task} does not exist.") - - def load_data(self, language, test=False, task=2): - if language not in self.accepted_languages: - raise ValueError( - f"Language not in the data {language}. Accepted values are " f"{self.accepted_languages}" - ) - - print(".... adhoc data") - key_dict = self.query_keys(language=language, task=task) - query_adhoc = self.produce_query(language=language, **key_dict) - if test: - query_adhoc += " LIMIT 500" - adhoc_df = execute_query(CLIENT, query_adhoc) - - if not (test or language == "tr" or task == 3): - if language == "es": - print(".... additional adhoc data") - key_dict = self.query_keys(language=language, size="100") - query_adhoc = self.produce_query(language=language, **key_dict) - adhoc_df = pandas.concat( - (adhoc_df, execute_query(CLIENT, query_adhoc)), axis=0, ignore_index=True + def __init__(self): + super().__init__(project=...) + from archive.settings.... import ACCEPTED_LANGUAGES, QUERY_SETTINGS + + self.accepted_languages = ACCEPTED_LANGUAGES + self.query_settings = dict(QUERY_SETTINGS) + + def produce_query(self, language: str, query: str, dataset: str, table: str, lang: str) -> str: + query = query.format(dataset=dataset, table=table) + add_query = f"AND reviewed.{lang}='{language}'" + query += add_query + return query + + def query_keys(self, language: str, task: int=2, size: str="50"): + if task == 2: + if language == "ar": + self.query_settings["adhoc_v2"]["table"] = "..." 
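+            # table names are redacted; each supported language resolves to
+            # its own BigQuery table for the v2 adhoc annotations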
+ elif language == "tr": + self.query_settings["adhoc_v2"]["table"] = "..." + elif language == "es": + self.query_settings["adhoc_v2"]["table"] = f"..." + else: + self.query_settings["adhoc_v2"]["table"] = "..." + return self.query_settings["adhoc_v2"] + if task == 3: + return self.query_settings["adhoc_v3"] + raise ValueError( + f"There are no other tasks than 2 or 3. {task} does not exist." ) - print(".... prevalence data") - query_prev = self.produce_query(language=language, **self.query_settings["prevalence_v2"]) - prev_df = execute_query(CLIENT, query_prev) - prev_df["description"] = "Prevalence" - adhoc_df = pandas.concat((adhoc_df, prev_df), axis=0, ignore_index=True) + def load_data(self, language: str, test: bool=False, task: int=2): + if language not in self.accepted_languages: + raise ValueError( + f"Language not in the data {language}. Accepted values are " + f"{self.accepted_languages}" + ) - return self.clean(adhoc_df) + print(".... adhoc data") + key_dict = self.query_keys(language=language, task=task) + query_adhoc = self.produce_query(language=language, **key_dict) + if test: + query_adhoc += " LIMIT 500" + adhoc_df = execute_query(CLIENT, query_adhoc) + + if not (test or language == "tr" or task == 3): + if language == "es": + print(".... additional adhoc data") + key_dict = self.query_keys(language=language, size="100") + query_adhoc = self.produce_query(language=language, **key_dict) + adhoc_df = pd.concat( + (adhoc_df, execute_query(CLIENT, query_adhoc)), + axis=0, + ignore_index=True, + ) + + print(".... prevalence data") + query_prev = self.produce_query( + language=language, **self.query_settings["prevalence_v2"] + ) + prev_df = execute_query(CLIENT, query_prev) + prev_df["description"] = "Prevalence" + adhoc_df = pd.concat((adhoc_df, prev_df), axis=0, ignore_index=True) + + return self.clean(adhoc_df) diff --git a/trust_and_safety_models/toxicity/data/mb_generator.py b/trust_and_safety_models/toxicity/data/mb_generator.py index 58a89f8c5..350a6ebdd 100644 --- a/trust_and_safety_models/toxicity/data/mb_generator.py +++ b/trust_and_safety_models/toxicity/data/mb_generator.py @@ -1,284 +1,325 @@ -from importlib import import_module import os +from importlib import import_module +from typing import Tuple, Union +import numpy as np +import pandas as pd +import tensorflow as tf +from sklearn.model_selection import StratifiedKFold from toxicity_ml_pipeline.settings.default_settings_tox import ( - INNER_CV, - LOCAL_DIR, - MAX_SEQ_LENGTH, - NUM_PREFETCH, - NUM_WORKERS, - OUTER_CV, - TARGET_POS_PER_EPOCH, + INNER_CV, + LOCAL_DIR, + MAX_SEQ_LENGTH, + NUM_PREFETCH, + NUM_WORKERS, + OUTER_CV, + TARGET_POS_PER_EPOCH, ) from toxicity_ml_pipeline.utils.helpers import execute_command -import numpy as np -import pandas -from sklearn.model_selection import StratifiedKFold -import tensorflow as tf - - try: - from transformers import AutoTokenizer, DataCollatorWithPadding + from transformers import AutoTokenizer, DataCollatorWithPadding except ModuleNotFoundError: - print("...") + print("...") else: - from datasets import Dataset + from datasets import Dataset class BalancedMiniBatchLoader(object): - def __init__( - self, - fold, - mb_size, - seed, - perc_training_tox, - scope="TOX", - project=..., - dual_head=None, - n_outer_splits=None, - n_inner_splits=None, - sample_weights=None, - huggingface=False, - ): - if 0 >= perc_training_tox or perc_training_tox > 0.5: - raise ValueError("Perc_training_tox should be in ]0; 0.5]") - - self.perc_training_tox = perc_training_tox - if not 
n_outer_splits: - n_outer_splits = OUTER_CV - if isinstance(n_outer_splits, int): - self.n_outer_splits = n_outer_splits - self.get_outer_fold = self._get_outer_cv_fold - if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold: - raise ValueError(f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [.") - - elif n_outer_splits == "time": - self.get_outer_fold = self._get_time_fold - if fold != "time": - raise ValueError( - "To avoid repeating the same run many times, the external fold" - "should be time when test data is split according to dates." - ) - try: - setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings") - except ModuleNotFoundError: - raise ValueError(f"You need to define a setting file for your project {project}.") - self.test_begin_date = setting_file.TEST_BEGIN_DATE - self.test_end_date = setting_file.TEST_END_DATE - - else: - raise ValueError( - f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}" - ) - - self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV - - self.seed = seed - self.mb_size = mb_size - self.fold = fold - - self.sample_weights = sample_weights - self.dual_head = dual_head - self.huggingface = huggingface - if self.huggingface: - self._load_tokenizer() - - def _load_tokenizer(self): - print("Making a local copy of Bertweet-base model") - local_model_dir = os.path.join(LOCAL_DIR, "models") - cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}" - execute_command(cmd) - - self.tokenizer = AutoTokenizer.from_pretrained( - os.path.join(local_model_dir, "bertweet-base"), normalization=True - ) - - def tokenize_function(self, el): - return self.tokenizer( - el["text"], - max_length=MAX_SEQ_LENGTH, - padding="max_length", - truncation=True, - add_special_tokens=True, - return_token_type_ids=False, - return_attention_mask=False, - ) - - def _get_stratified_kfold(self, n_splits): - return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed) - - def _get_time_fold(self, df): - test_begin_date = pandas.to_datetime(self.test_begin_date).date() - test_end_date = pandas.to_datetime(self.test_end_date).date() - print(f"Test is going from {test_begin_date} to {test_end_date}.") - test_data = df.query("@test_begin_date <= date <= @test_end_date") - - query = "date < @test_begin_date" - other_set = df.query(query) - return other_set, test_data - - def _get_outer_cv_fold(self, df): - labels = df.int_label - stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits) - - k = 0 - for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels): - if k == self.fold: - break - k += 1 - - train_data = df.iloc[train_index].copy() - test_data = df.iloc[test_index].copy() - - return train_data, test_data - - def get_steps_per_epoch(self, nb_pos_examples): - return int(max(TARGET_POS_PER_EPOCH, nb_pos_examples) / self.mb_size / self.perc_training_tox) - - def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True): - huggingface_ds = Dataset.from_pandas(group).map(self.tokenize_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, return_tensors="tf") - tensorflow_ds = huggingface_ds.to_tf_dataset( - columns=["input_ids"], - label_cols=["labels"], - shuffle=shuffle, - batch_size=self.mb_size if mb_size is None else mb_size, - collate_fn=data_collator, - ) - - if shuffle: - return tensorflow_ds.repeat() - return tensorflow_ds - - def 
make_pure_tensorflow_ds(self, df, nb_samples): - buffer_size = nb_samples * 2 - - if self.sample_weights is not None: - if self.sample_weights not in df.columns: - raise ValueError - ds = tf.data.Dataset.from_tensor_slices( - (df.text.values, df.label.values, df[self.sample_weights].values) - ) - elif self.dual_head: - label_d = {f'{e}_output': df[f'{e}_label'].values for e in self.dual_head} - label_d['content_output'] = tf.keras.utils.to_categorical(label_d['content_output'], num_classes=3) - ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d)) - - else: - ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values)) - ds = ds.shuffle(buffer_size, seed=self.seed, reshuffle_each_iteration=True).repeat() - return ds - - def get_balanced_dataset(self, training_data, size_limit=None, return_as_batch=True): - training_data = training_data.sample(frac=1, random_state=self.seed) - nb_samples = training_data.shape[0] if not size_limit else size_limit - - num_classes = training_data.int_label.nunique() - toxic_class = training_data.int_label.max() - if size_limit: - training_data = training_data[: size_limit * num_classes] - - print( - ".... {} examples, incl. {:.2f}% tox in train, {} classes".format( - nb_samples, - 100 * training_data[training_data.int_label == toxic_class].shape[0] / nb_samples, - num_classes, - ) - ) - label_groups = training_data.groupby("int_label") - if self.huggingface: - label_datasets = { - label: self.make_huggingface_tensorflow_ds(group) for label, group in label_groups - } - - else: - label_datasets = { - label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2) - for label, group in label_groups - } - - datasets = [label_datasets[0], label_datasets[1]] - weights = [1 - self.perc_training_tox, self.perc_training_tox] - if num_classes == 3: - datasets.append(label_datasets[2]) - weights = [1 - self.perc_training_tox, self.perc_training_tox / 2, self.perc_training_tox / 2] - elif num_classes != 2: - raise ValueError("Currently it should not be possible to get other than 2 or 3 classes") - resampled_ds = tf.data.experimental.sample_from_datasets(datasets, weights, seed=self.seed) - - if return_as_batch and not self.huggingface: - return resampled_ds.batch( - self.mb_size, drop_remainder=True, num_parallel_calls=NUM_WORKERS, deterministic=True - ).prefetch(NUM_PREFETCH) - - return resampled_ds - - @staticmethod - def _compute_int_labels(full_df): - if full_df.label.dtype == int: - full_df["int_label"] = full_df.label - - elif "int_label" not in full_df.columns: - if full_df.label.max() > 1: - raise ValueError("Binarizing labels that should not be.") - full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0) - - return full_df - - def __call__(self, full_df, *args, **kwargs): - full_df = self._compute_int_labels(full_df) - - train_data, test_data = self.get_outer_fold(df=full_df) - - stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits) - for train_index, val_index in stratifier.split( - np.zeros(train_data.shape[0]), train_data.int_label + def __init__( + self, + fold: int, + mb_size: int, + seed: int, + perc_training_tox: float, + scope: str = "TOX", + project=..., + dual_head=None, + n_outer_splits: Union[str, int] = None, + n_inner_splits: int = None, + sample_weights=None, + huggingface: bool = False, ): - curr_train_data = train_data.iloc[train_index] + if 0 >= perc_training_tox or perc_training_tox > 0.5: + raise ValueError("Perc_training_tox should be in ]0; 0.5]") + + self.perc_training_tox = 
perc_training_tox + if not n_outer_splits: + n_outer_splits = OUTER_CV + if isinstance(n_outer_splits, int): + self.n_outer_splits = n_outer_splits + self.get_outer_fold = self._get_outer_cv_fold + if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold: + raise ValueError( + f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [." + ) + + elif n_outer_splits == "time": + self.get_outer_fold = self._get_time_fold + if fold != "time": + raise ValueError( + "To avoid repeating the same run many times, the external fold" + "should be time when test data is split according to dates." + ) + try: + setting_file = import_module( + f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings" + ) + except ModuleNotFoundError: + raise ValueError( + f"You need to define a setting file for your project {project}." + ) + self.test_begin_date = setting_file.TEST_BEGIN_DATE + self.test_end_date = setting_file.TEST_END_DATE + + else: + raise ValueError( + f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}" + ) + + self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV + self.seed = seed + self.mb_size = mb_size + self.fold = fold + self.sample_weights = sample_weights + self.dual_head = dual_head + self.huggingface = huggingface + if self.huggingface: + self._load_tokenizer() + + def _load_tokenizer(self): + print("Making a local copy of Bertweet-base model") + local_model_dir = os.path.join(LOCAL_DIR, "models") + cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}" + execute_command(cmd) + + self.tokenizer = AutoTokenizer.from_pretrained( + os.path.join(local_model_dir, "bertweet-base"), normalization=True + ) + + def tokenize_function(self, el: dict) -> dict: + return self.tokenizer( + el["text"], + max_length=MAX_SEQ_LENGTH, + padding="max_length", + truncation=True, + add_special_tokens=True, + return_token_type_ids=False, + return_attention_mask=False, + ) + + def _get_stratified_kfold(self, n_splits: int) -> StratifiedKFold: + return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed) + + def _get_time_fold(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + test_begin_date = pd.to_datetime(self.test_begin_date).date() + test_end_date = pd.to_datetime(self.test_end_date).date() + print(f"Test is going from {test_begin_date} to {test_end_date}.") + test_data = df.query("@test_begin_date <= date <= @test_end_date") + + query = "date < @test_begin_date" + other_set = df.query(query) + return other_set, test_data + + def _get_outer_cv_fold(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + labels = df.int_label + stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits) + + k = 0 + for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels): + if k == self.fold: + break + k += 1 + train_data = df.iloc[train_index].copy() + test_data = df.iloc[test_index].copy() + return train_data, test_data + + def get_steps_per_epoch(self, nb_pos_examples: int) -> int: + return int( + max(TARGET_POS_PER_EPOCH, nb_pos_examples) + / self.mb_size + / self.perc_training_tox + ) - mini_batches = self.get_balanced_dataset(curr_train_data) + def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle: bool = True): + huggingface_ds = Dataset.from_pandas(group).map( + self.tokenize_function, batched=True + ) + data_collator = DataCollatorWithPadding( + tokenizer=self.tokenizer, return_tensors="tf" + ) + tensorflow_ds = 
huggingface_ds.to_tf_dataset( + columns=["input_ids"], + label_cols=["labels"], + shuffle=shuffle, + batch_size=self.mb_size if mb_size is None else mb_size, + collate_fn=data_collator, + ) + + if shuffle: + return tensorflow_ds.repeat() + return tensorflow_ds + + def make_pure_tensorflow_ds( + self, df: pd.DataFrame, nb_samples: int + ) -> tf.data.Dataset: + buffer_size = nb_samples * 2 + + if self.sample_weights is not None: + if self.sample_weights not in df.columns: + raise ValueError( + f"Sample-weight column {self.sample_weights} is not in the dataframe." + ) + ds = tf.data.Dataset.from_tensor_slices( + (df.text.values, df.label.values, df[self.sample_weights].values) + ) + elif self.dual_head: + label_d = {f"{e}_output": df[f"{e}_label"].values for e in self.dual_head} + label_d["content_output"] = tf.keras.utils.to_categorical( + label_d["content_output"], num_classes=3 + ) + ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d)) + + else: + ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values)) + ds = ds.shuffle( + buffer_size, seed=self.seed, reshuffle_each_iteration=True + ).repeat() + return ds + + def get_balanced_dataset( + self, + training_data: pd.DataFrame, + size_limit: int = None, + return_as_batch: bool = True, + ) -> tf.data.Dataset: + training_data = training_data.sample(frac=1, random_state=self.seed) + nb_samples = training_data.shape[0] if not size_limit else size_limit + + num_classes = training_data.int_label.nunique() + toxic_class = training_data.int_label.max() + if size_limit: + training_data = training_data[: size_limit * num_classes] + + percent_tox = ( + 100 + * training_data[training_data.int_label == toxic_class].shape[0] + / nb_samples + ) + print( + f"... {nb_samples} examples, incl. {percent_tox:.2f}% tox in train, {num_classes} classes" + ) + label_groups = training_data.groupby("int_label") + if self.huggingface: + label_datasets = { + label: self.make_huggingface_tensorflow_ds(group) + for label, group in label_groups + } + + else: + label_datasets = { + label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2) + for label, group in label_groups + } + + datasets = [label_datasets[0], label_datasets[1]] + weights = [1 - self.perc_training_tox, self.perc_training_tox] + if num_classes == 3: + datasets.append(label_datasets[2]) + weights = [ + 1 - self.perc_training_tox, + self.perc_training_tox / 2, + self.perc_training_tox / 2, + ] + elif num_classes != 2: + raise ValueError( + f"Only 2 or 3 classes are supported; got {num_classes}." + ) + resampled_ds = tf.data.experimental.sample_from_datasets( + datasets, weights, seed=self.seed + ) + + if return_as_batch and not self.huggingface: + return resampled_ds.batch( + self.mb_size, + drop_remainder=True, + num_parallel_calls=NUM_WORKERS, + deterministic=True, + ).prefetch(NUM_PREFETCH) - steps_per_epoch = self.get_steps_per_epoch( - nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0] - ) + return resampled_ds - val_data = train_data.iloc[val_index].copy() + @staticmethod + def _compute_int_labels(full_df: pd.DataFrame) -> pd.DataFrame: + if full_df.label.dtype == int: + full_df["int_label"] = full_df.label + elif "int_label" not in full_df.columns: + if full_df.label.max() > 1: + raise ValueError("Cannot binarize labels with values above 1.") + full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0) - yield mini_batches, steps_per_epoch, val_data, test_data + return full_df - def simple_cv_load(self, full_df): + def __call__(self, full_df: pd.DataFrame, 
*args, **kwargs): + full_df = self._compute_int_labels(full_df) - train_data, test_data = self.get_outer_fold(df=full_df) - if test_data.shape[0] == 0: - test_data = train_data.iloc[:500] + train_data, test_data = self.get_outer_fold(df=full_df) - mini_batches = self.get_balanced_dataset(train_data) - steps_per_epoch = self.get_steps_per_epoch( - nb_pos_examples=train_data[train_data.int_label != 0].shape[0] - ) + stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits) + for train_index, val_index in stratifier.split( + np.zeros(train_data.shape[0]), train_data.int_label + ): + curr_train_data = train_data.iloc[train_index] + + mini_batches = self.get_balanced_dataset(curr_train_data) + + steps_per_epoch = self.get_steps_per_epoch( + nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0] + ) + + val_data = train_data.iloc[val_index].copy() + + yield mini_batches, steps_per_epoch, val_data, test_data + + def simple_cv_load( + self, full_df: pd.DataFrame + ) -> Tuple[tf.data.Dataset, pd.DataFrame, int]: + full_df = self._compute_int_labels(full_df) + + train_data, test_data = self.get_outer_fold(df=full_df) + if test_data.shape[0] == 0: + test_data = train_data.iloc[:500] + + mini_batches = self.get_balanced_dataset(train_data) + steps_per_epoch = self.get_steps_per_epoch( + nb_pos_examples=train_data[train_data.int_label != 0].shape[0] + ) - return mini_batches, test_data, steps_per_epoch + return mini_batches, test_data, steps_per_epoch - def no_cv_load(self, full_df): - full_df = self._compute_int_labels(full_df) + def no_cv_load( + self, full_df: pd.DataFrame + ) -> Tuple[tf.data.Dataset, pd.DataFrame, int]: + full_df = self._compute_int_labels(full_df) - val_test = full_df[full_df.origin == "precision"].copy(deep=True) - val_data, test_data = self.get_outer_fold(df=val_test) + val_test = full_df[full_df.origin == "precision"].copy(deep=True) + val_data, test_data = self.get_outer_fold(df=val_test) - train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0) - if test_data.shape[0] == 0: - test_data = train_data.iloc[:500] + train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0) + if test_data.shape[0] == 0: + test_data = train_data.iloc[:500] - mini_batches = self.get_balanced_dataset(train_data) - if train_data.int_label.nunique() == 1: - raise ValueError('Should be at least two labels') + mini_batches = self.get_balanced_dataset(train_data) + if train_data.int_label.nunique() == 1: + raise ValueError("Should be at least two labels") - num_examples = train_data[train_data.int_label == 1].shape[0] - if train_data.int_label.nunique() > 2: - second_most_frequent_label = train_data.loc[train_data.int_label != 0, 'int_label'].mode().values[0] - num_examples = train_data[train_data.int_label == second_most_frequent_label].shape[0] * 2 - steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples) + num_examples = train_data[train_data.int_label == 1].shape[0] + if train_data.int_label.nunique() > 2: + second_most_frequent_label = ( + train_data.loc[train_data.int_label != 0, "int_label"].mode().values[0] + ) + num_examples = ( + train_data[train_data.int_label == second_most_frequent_label].shape[0] + * 2 + ) + steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples) - return mini_batches, steps_per_epoch, val_data, test_data + return mini_batches, steps_per_epoch, val_data, test_data diff --git a/trust_and_safety_models/toxicity/load_model.py 
b/trust_and_safety_models/toxicity/load_model.py index 7b271066f..b052c34ce 100644 --- a/trust_and_safety_models/toxicity/load_model.py +++ b/trust_and_safety_models/toxicity/load_model.py @@ -1,227 +1,253 @@ import os from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR, MAX_SEQ_LENGTH + try: - from toxicity_ml_pipeline.optim.losses import MaskedBCE + from toxicity_ml_pipeline.optim.losses import MaskedBCE except ImportError: - print('No MaskedBCE loss') -from toxicity_ml_pipeline.utils.helpers import execute_command - + print("No MaskedBCE loss") import tensorflow as tf - +from toxicity_ml_pipeline.utils.helpers import execute_command try: - from twitter.cuad.representation.models.text_encoder import TextEncoder + from twitter.cuad.representation.models.text_encoder import TextEncoder except ModuleNotFoundError: - print("No TextEncoder package") + print("No TextEncoder package") try: - from transformers import TFAutoModelForSequenceClassification + from transformers import TFAutoModelForSequenceClassification except ModuleNotFoundError: - print("No HuggingFace package") + print("No HuggingFace package") LOCAL_MODEL_DIR = os.path.join(LOCAL_DIR, "models") -def reload_model_weights(weights_dir, language, **kwargs): - optimizer = tf.keras.optimizers.Adam(0.01) - model_type = ( - "twitter_bert_base_en_uncased_mlm" - if language == "en" - else "twitter_multilingual_bert_base_cased_mlm" - ) - model = load(optimizer=optimizer, seed=42, model_type=model_type, **kwargs) - model.load_weights(weights_dir) - - return model - - -def _locally_copy_models(model_type): - if model_type == "twitter_multilingual_bert_base_cased_mlm": - preprocessor = "bert_multi_cased_preprocess_3" - elif model_type == "twitter_bert_base_en_uncased_mlm": - preprocessor = "bert_en_uncased_preprocess_3" - else: - raise NotImplementedError - - copy_cmd = """mkdir {local_dir} -gsutil cp -r ... 
-gsutil cp -r ...""" - execute_command( - copy_cmd.format(model_type=model_type, preprocessor=preprocessor, local_dir=LOCAL_MODEL_DIR) - ) - - return preprocessor - - -def load_encoder(model_type, trainable): - try: - model = TextEncoder( - max_seq_lengths=MAX_SEQ_LENGTH, - model_type=model_type, - cluster="gcp", - trainable=trainable, - enable_dynamic_shapes=True, +def reload_model_weights(weights_dir, language: str, **kwargs): + optimizer = tf.keras.optimizers.Adam(0.01) + model_type = ( + "twitter_bert_base_en_uncased_mlm" + if language == "en" + else "twitter_multilingual_bert_base_cased_mlm" ) - except (OSError, tf.errors.AbortedError) as e: - print(e) - preprocessor = _locally_copy_models(model_type) - - model = TextEncoder( - max_seq_lengths=MAX_SEQ_LENGTH, - local_model_path=f"models/{model_type}", - local_preprocessor_path=f"models/{preprocessor}", - cluster="gcp", - trainable=trainable, - enable_dynamic_shapes=True, - ) - - return model - + model = load(optimizer=optimizer, seed=42, model_type=model_type, **kwargs) + model.load_weights(weights_dir) -def get_loss(loss_name, from_logits, **kwargs): - loss_name = loss_name.lower() - if loss_name == "bce": - print("Binary CE loss") - return tf.keras.losses.BinaryCrossentropy(from_logits=from_logits) + return model - if loss_name == "cce": - print("Categorical cross-entropy loss") - return tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits) - if loss_name == "scce": - print("Sparse categorical cross-entropy loss") - return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits) +def _locally_copy_models(model_type: str): + if model_type == "twitter_multilingual_bert_base_cased_mlm": + preprocessor = "bert_multi_cased_preprocess_3" + elif model_type == "twitter_bert_base_en_uncased_mlm": + preprocessor = "bert_en_uncased_preprocess_3" + else: + raise NotImplementedError - if loss_name == "focal_bce": - gamma = kwargs.get("gamma", 2) - print("Focal binary CE loss", gamma) - return tf.keras.losses.BinaryFocalCrossentropy(gamma=gamma, from_logits=from_logits) + copy_cmd = "mkdir {local_dir}\ngsutil cp -r ...\ngsutil cp -r ..." 
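+ # the two gsutil source paths are elided ("...") in this file; at run time + # the command mirrors the {model_type} encoder and {preprocessor} assets + # into {local_dir} (LOCAL_MODEL_DIR) so load_encoder can fall back to a local copy.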
+ execute_command( + copy_cmd.format( + model_type=model_type, preprocessor=preprocessor, local_dir=LOCAL_MODEL_DIR + ) + ) - if loss_name == 'masked_bce': - multitask = kwargs.get("multitask", False) - if from_logits or multitask: - raise NotImplementedError - print(f'Masked Binary Cross Entropy') - return MaskedBCE() + return preprocessor + + + def load_encoder(model_type: str, trainable: bool): + try: + model = TextEncoder( + max_seq_lengths=MAX_SEQ_LENGTH, + model_type=model_type, + cluster="gcp", + trainable=trainable, + enable_dynamic_shapes=True, + ) + except (OSError, tf.errors.AbortedError) as e: + print(e) + preprocessor = _locally_copy_models(model_type) + + model = TextEncoder( + max_seq_lengths=MAX_SEQ_LENGTH, + local_model_path=f"models/{model_type}", + local_preprocessor_path=f"models/{preprocessor}", + cluster="gcp", + trainable=trainable, + enable_dynamic_shapes=True, + ) + + return model + + + def get_loss(loss_name: str, from_logits: bool, **kwargs): + loss_name = loss_name.lower() + if loss_name == "bce": + print("Binary CE loss") + return tf.keras.losses.BinaryCrossentropy(from_logits=from_logits) + + if loss_name == "cce": + print("Categorical cross-entropy loss") + return tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits) + + if loss_name == "scce": + print("Sparse categorical cross-entropy loss") + return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits) + + if loss_name == "focal_bce": + gamma = kwargs.get("gamma", 2) + print("Focal binary CE loss", gamma) + return tf.keras.losses.BinaryFocalCrossentropy( + gamma=gamma, from_logits=from_logits + ) + + if loss_name == "masked_bce": + multitask = kwargs.get("multitask", False) + if from_logits or multitask: + raise NotImplementedError + print("Masked Binary Cross Entropy") + return MaskedBCE() + + if loss_name == "inv_kl_loss": + raise NotImplementedError + + raise ValueError( + f"This loss name is not valid: {loss_name}. Accepted loss names: BCE, masked BCE, CCE, sCCE, " + f"Focal_BCE, inv_KL_loss" + ) - if loss_name == "inv_kl_loss": - raise NotImplementedError - raise ValueError( - f"This loss name is not valid: {loss_name}. 
Accepted loss names: BCE, masked BCE, CCE, sCCE, " - f"Focal_BCE, inv_KL_loss" - ) +def _add_additional_embedding_layer(doc_embedding, glorot, seed: int): + doc_embedding = tf.keras.layers.Dense( + 768, activation="tanh", kernel_initializer=glorot + )(doc_embedding) + doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) + return doc_embedding -def _add_additional_embedding_layer(doc_embedding, glorot, seed): - doc_embedding = tf.keras.layers.Dense(768, activation="tanh", kernel_initializer=glorot)(doc_embedding) - doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) - return doc_embedding def _get_bias(**kwargs): - smart_bias_value = kwargs.get('smart_bias_value', 0) - print('Smart bias init to ', smart_bias_value) - output_bias = tf.keras.initializers.Constant(smart_bias_value) - return output_bias + smart_bias_value = kwargs.get("smart_bias_value", 0) + print("Smart bias init to ", smart_bias_value) + output_bias = tf.keras.initializers.Constant(smart_bias_value) + return output_bias def load_inhouse_bert(model_type, trainable, seed, **kwargs): - inputs = tf.keras.layers.Input(shape=(), dtype=tf.string) - encoder = load_encoder(model_type=model_type, trainable=trainable) - doc_embedding = encoder([inputs])["pooled_output"] - doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) - - glorot = tf.keras.initializers.glorot_uniform(seed=seed) - if kwargs.get("additional_layer", False): - doc_embedding = _add_additional_embedding_layer(doc_embedding, glorot, seed) - - if kwargs.get('content_num_classes', None): - probs = get_last_layer(glorot=glorot, last_layer_name='target_output', **kwargs)(doc_embedding) - second_probs = get_last_layer(num_classes=kwargs['content_num_classes'], - last_layer_name='content_output', - glorot=glorot)(doc_embedding) - probs = [probs, second_probs] - else: - probs = get_last_layer(glorot=glorot, **kwargs)(doc_embedding) - model = tf.keras.models.Model(inputs=inputs, outputs=probs) - - return model, False + inputs = tf.keras.layers.Input(shape=(), dtype=tf.string) + encoder = load_encoder(model_type=model_type, trainable=trainable) + doc_embedding = encoder([inputs])["pooled_output"] + doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) + + glorot = tf.keras.initializers.glorot_uniform(seed=seed) + if kwargs.get("additional_layer", False): + doc_embedding = _add_additional_embedding_layer(doc_embedding, glorot, seed) + + if kwargs.get("content_num_classes", None): + probs = get_last_layer( + glorot=glorot, last_layer_name="target_output", **kwargs + )(doc_embedding) + second_probs = get_last_layer( + num_classes=kwargs["content_num_classes"], + last_layer_name="content_output", + glorot=glorot, + )(doc_embedding) + probs = [probs, second_probs] + else: + probs = get_last_layer(glorot=glorot, **kwargs)(doc_embedding) + model = tf.keras.models.Model(inputs=inputs, outputs=probs) + + return model, False -def get_last_layer(**kwargs): - output_bias = _get_bias(**kwargs) - if 'glorot' in kwargs: - glorot = kwargs['glorot'] - else: - glorot = tf.keras.initializers.glorot_uniform(seed=kwargs['seed']) - layer_name = kwargs.get('last_layer_name', 'dense_1') - - if kwargs.get('num_classes', 1) > 1: - last_layer = tf.keras.layers.Dense( - kwargs["num_classes"], activation="softmax", kernel_initializer=glorot, - bias_initializer=output_bias, name=layer_name - ) - elif kwargs.get('num_raters', 1) > 1: - if kwargs.get('multitask', False): - raise NotImplementedError - last_layer = 
tf.keras.layers.Dense( - kwargs['num_raters'], activation="sigmoid", kernel_initializer=glorot, - bias_initializer=output_bias, name='probs') - - else: - last_layer = tf.keras.layers.Dense( - 1, activation="sigmoid", kernel_initializer=glorot, - bias_initializer=output_bias, name=layer_name - ) +def get_last_layer(**kwargs): + output_bias = _get_bias(**kwargs) + if "glorot" in kwargs: + glorot = kwargs["glorot"] + else: + glorot = tf.keras.initializers.glorot_uniform(seed=kwargs["seed"]) + layer_name = kwargs.get("last_layer_name", "dense_1") + + if kwargs.get("num_classes", 1) > 1: + last_layer = tf.keras.layers.Dense( + kwargs["num_classes"], + activation="softmax", + kernel_initializer=glorot, + bias_initializer=output_bias, + name=layer_name, + ) + + elif kwargs.get("num_raters", 1) > 1: + if kwargs.get("multitask", False): + raise NotImplementedError + last_layer = tf.keras.layers.Dense( + kwargs["num_raters"], + activation="sigmoid", + kernel_initializer=glorot, + bias_initializer=output_bias, + name="probs", + ) + + else: + last_layer = tf.keras.layers.Dense( + 1, + activation="sigmoid", + kernel_initializer=glorot, + bias_initializer=output_bias, + name=layer_name, + ) + + return last_layer - return last_layer def load_bertweet(**kwargs): - bert = TFAutoModelForSequenceClassification.from_pretrained( - os.path.join(LOCAL_MODEL_DIR, "bertweet-base"), - num_labels=1, - classifier_dropout=0.1, - hidden_size=768, - ) - if "num_classes" in kwargs and kwargs["num_classes"] > 2: - raise NotImplementedError + bert = TFAutoModelForSequenceClassification.from_pretrained( + os.path.join(LOCAL_MODEL_DIR, "bertweet-base"), + num_labels=1, + classifier_dropout=0.1, + hidden_size=768, + ) + if "num_classes" in kwargs and kwargs["num_classes"] > 2: + raise NotImplementedError - return bert, True + return bert, True def load( - optimizer, - seed, - model_type="twitter_multilingual_bert_base_cased_mlm", - loss_name="BCE", - trainable=True, - **kwargs, + optimizer: str, + seed: int, + model_type: str = "twitter_multilingual_bert_base_cased_mlm", + loss_name: str = "BCE", + trainable: bool = True, + **kwargs, ): - if model_type == "bertweet-base": - model, from_logits = load_bertweet() - else: - model, from_logits = load_inhouse_bert(model_type, trainable, seed, **kwargs) - - pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc", from_logits=from_logits) - roc_auc = tf.keras.metrics.AUC(curve="ROC", name="roc_auc", from_logits=from_logits) - - loss = get_loss(loss_name, from_logits, **kwargs) - if kwargs.get('content_num_classes', None): - second_loss = get_loss(loss_name=kwargs['content_loss_name'], from_logits=from_logits) - loss_weights = {'content_output': kwargs['content_loss_weight'], 'target_output': 1} - model.compile( - optimizer=optimizer, - loss={'content_output': second_loss, 'target_output': loss}, - loss_weights=loss_weights, - metrics=[pr_auc, roc_auc], - ) - - else: - model.compile( - optimizer=optimizer, - loss=loss, - metrics=[pr_auc, roc_auc], - ) - print(model.summary(), "logits: ", from_logits) - - return model \ No newline at end of file + if model_type == "bertweet-base": + model, from_logits = load_bertweet() + else: + model, from_logits = load_inhouse_bert(model_type, trainable, seed, **kwargs) + + pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc", from_logits=from_logits) + roc_auc = tf.keras.metrics.AUC(curve="ROC", name="roc_auc", from_logits=from_logits) + + loss = get_loss(loss_name, from_logits, **kwargs) + if kwargs.get("content_num_classes", None): + 
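# dual-head models are compiled with one loss per output head: the content + # head is weighted by content_loss_weight, the target head by 1. +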
second_loss = get_loss( + loss_name=kwargs["content_loss_name"], from_logits=from_logits + ) + loss_weights = { + "content_output": kwargs["content_loss_weight"], + "target_output": 1, + } + model.compile( + optimizer=optimizer, + loss={"content_output": second_loss, "target_output": loss}, + loss_weights=loss_weights, + metrics=[pr_auc, roc_auc], + ) + + else: + model.compile( + optimizer=optimizer, + loss=loss, + metrics=[pr_auc, roc_auc], + ) + print(model.summary(), "logits: ", from_logits) + + return model diff --git a/trust_and_safety_models/toxicity/optim/callbacks.py b/trust_and_safety_models/toxicity/optim/callbacks.py index bbf8d7c97..bced640df 100644 --- a/trust_and_safety_models/toxicity/optim/callbacks.py +++ b/trust_and_safety_models/toxicity/optim/callbacks.py @@ -1,220 +1,246 @@ -from collections import defaultdict import os +from collections import defaultdict +from typing import List -from toxicity_ml_pipeline.settings.default_settings_tox import REMOTE_LOGDIR -from toxicity_ml_pipeline.settings.default_settings_abs import LABEL_NAMES -from toxicity_ml_pipeline.utils.absv_utils import parse_labeled_data -from toxicity_ml_pipeline.utils.helpers import compute_precision_fixed_recall, execute_command - -from sklearn.metrics import average_precision_score, roc_auc_score +import numpy as np import tensorflow as tf import wandb +from sklearn.metrics import average_precision_score, roc_auc_score +from toxicity_ml_pipeline.settings.default_settings_abs import LABEL_NAMES +from toxicity_ml_pipeline.settings.default_settings_tox import REMOTE_LOGDIR +from toxicity_ml_pipeline.utils.absv_utils import parse_labeled_data +from toxicity_ml_pipeline.utils.helpers import ( + compute_precision_fixed_recall, + execute_command, +) class NothingCallback(tf.keras.callbacks.Callback): - def on_epoch_begin(self, epoch, logs=None): - print("ici, ", epoch) + def on_epoch_begin(self, epoch, logs=None): + print("epoch begin ", epoch) - def on_epoch_end(self, epoch, logs=None): - print("fin ", epoch) + def on_epoch_end(self, epoch, logs=None): + print("epoch end ", epoch) - def on_train_batch_end(self, batch, logs=None): - print("fin de batch ", batch) + def on_train_batch_end(self, batch, logs=None): + print("end of batch ", batch) class ControlledStoppingCheckpointCallback(tf.keras.callbacks.ModelCheckpoint): - def __init__(self, stopping_epoch, *args, **kwargs): - super().__init__(*args, **kwargs) - self.stopping_epoch = stopping_epoch + def __init__(self, stopping_epoch, *args, **kwargs): + super().__init__(*args, **kwargs) + self.stopping_epoch = stopping_epoch - def on_epoch_end(self, epoch, logs=None): - super().on_epoch_end(epoch, logs) - if epoch == self.stopping_epoch: - self.model.stop_training = True + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs) + if epoch == self.stopping_epoch: + self.model.stop_training = True class SyncingTensorBoard(tf.keras.callbacks.TensorBoard): - def __init__(self, remote_logdir=None, *args, **kwargs): - super().__init__(*args, **kwargs) - self.remote_logdir = remote_logdir if remote_logdir is not None else REMOTE_LOGDIR + def __init__(self, remote_logdir=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.remote_logdir = ( + remote_logdir if remote_logdir is not None else REMOTE_LOGDIR + ) - def on_epoch_end(self, epoch, logs=None): - super().on_epoch_end(epoch, logs=logs) - self.synchronize() + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs=logs) + self.synchronize() - def synchronize(self): - 
base_dir = os.path.dirname(self.log_dir) - cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}" - execute_command(cmd) + def synchronize(self): + base_dir = os.path.dirname(self.log_dir) + cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}" + execute_command(cmd) class GradientLoggingTensorBoard(SyncingTensorBoard): - def __init__(self, loader, val_data, freq, *args, **kwargs): - super().__init__(*args, **kwargs) - val_dataset = loader.get_balanced_dataset( - training_data=val_data, size_limit=50, return_as_batch=False - ) - data_args = list(val_dataset.batch(32).take(1))[0] - self.x_batch, self.y_batch = data_args[0], data_args[1] - self.freq = freq - self.counter = 0 - - def _log_gradients(self): - writer = self._train_writer - - with writer.as_default(): - with tf.GradientTape() as tape: - y_pred = self.model(self.x_batch) - loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred) - gradient_norm = tf.linalg.global_norm(tape.gradient(loss, self.model.trainable_weights)) - - tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter) - writer.flush() - - def on_train_batch_end(self, batch, logs=None): - super().on_batch_end(batch, logs=logs) - self.counter += 1 - if batch % self.freq == 0: - self._log_gradients() + def __init__(self, loader, val_data, freq, *args, **kwargs): + super().__init__(*args, **kwargs) + val_dataset = loader.get_balanced_dataset( + training_data=val_data, size_limit=50, return_as_batch=False + ) + data_args = list(val_dataset.batch(32).take(1))[0] + self.x_batch, self.y_batch = data_args[0], data_args[1] + self.freq = freq + self.counter = 0 + + def _log_gradients(self): + writer = self._train_writer + + with writer.as_default(): + with tf.GradientTape() as tape: + y_pred = self.model(self.x_batch) + loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred) + gradient_norm = tf.linalg.global_norm( + tape.gradient(loss, self.model.trainable_weights) + ) + + tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter) + writer.flush() + + def on_train_batch_end(self, batch, logs=None): + super().on_batch_end(batch, logs=logs) + self.counter += 1 + if batch % self.freq == 0: + self._log_gradients() class AdditionalResultLogger(tf.keras.callbacks.Callback): - def __init__( - self, - data, - set_, - fixed_recall=0.85, - from_logits=False, - dataset_transform_func=None, - batch_size=64, - dual_head=None, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.set_ = set_ - if data is None: - return None - - self.single_head = True - try: - self.labels = data.int_label.values - except AttributeError: - self.labels = data.to_dataframe()[LABEL_NAMES].values.astype('int') - self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size) - self.label_names = LABEL_NAMES - else: - self.label_names = [''] - if dual_head: - self.label_names = [f'{e}_label' for e in dual_head] - self.labels = {f'{e}_output': data[f'{e}_label'].values for e in dual_head} - self.single_head = False - if dataset_transform_func is None: - self.data = data.text.values - else: - self.data = dataset_transform_func(data, mb_size=batch_size, shuffle=False) - - finally: - if len(self.label_names) == 1: - self.metric_kw = {} - else: - self.metric_kw = {'average': None} - - self.counter = 0 - self.best_metrics = defaultdict(float) - self.from_logits = from_logits - print(f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}") - - if 1 < fixed_recall <= 100: - fixed_recall = fixed_recall / 
100 - elif not (0 < fixed_recall <= 100): - raise ValueError("Threshold should be between 0 and 1, or 0 and 100") - self.fixed_recall = fixed_recall - self.batch_size = batch_size - - def compute_precision_fixed_recall(self, labels, preds): - result, _ = compute_precision_fixed_recall(labels=labels, preds=preds, - fixed_recall=self.fixed_recall) - - return result - - def on_epoch_end(self, epoch, logs=None): - self.additional_evaluations(step=epoch, eval_time="epoch") - - def on_train_batch_end(self, batch, logs=None): - self.counter += 1 - if self.counter % 2000 == 0: - self.additional_evaluations(step=self.counter, eval_time="batch") - - def _binary_evaluations(self, preds, label_name=None, class_index=None): - mask = None - curr_labels = self.labels - if label_name is not None: - curr_labels = self.labels[label_name] - if class_index is not None: - curr_labels = (curr_labels == class_index).astype(int) - - if -1 in curr_labels: - mask = curr_labels != -1 - curr_labels = curr_labels[mask] - preds = preds[mask] - - return { - f"precision_recall{self.fixed_recall}": self.compute_precision_fixed_recall( - labels=curr_labels, preds=preds - ), - "pr_auc": average_precision_score(y_true=curr_labels, y_score=preds), - "roc_auc": roc_auc_score(y_true=curr_labels, y_score=preds), - } - - - def _multiclass_evaluations(self, preds): - pr_auc_l = average_precision_score(y_true=self.labels, y_score=preds, **self.metric_kw) - roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw) - metrics = {} - for i, label in enumerate(self.label_names): - metrics[f'pr_auc_{label}'] = pr_auc_l[i] - metrics[f'roc_auc_{label}'] = roc_auc_l[i] - - return metrics - - def additional_evaluations(self, step, eval_time): - print("Evaluating ", self.set_, eval_time, step) - - preds = self.model.predict(x=self.data, batch_size=self.batch_size) - if self.from_logits: - preds = tf.keras.activations.sigmoid(preds.logits).numpy() - - if self.single_head: - if len(self.label_names) == 1: - metrics = self._binary_evaluations(preds) - else: - metrics = self._multiclass_evaluations(preds) - else: - if preds[0].shape[1] == 1: - binary_preds = preds[0] - multic_preds = preds[1] - else: - binary_preds = preds[1] - multic_preds = preds[0] - - binary_metrics = self._binary_evaluations(binary_preds, label_name='target_output') - metrics = {f'{k}_target': v for k, v in binary_metrics.items()} - num_classes = multic_preds.shape[1] - for class_ in range(num_classes): - binary_metrics = self._binary_evaluations(multic_preds[:, class_], label_name='content_output', class_index=class_) - metrics.update({f'{k}_content_{class_}': v for k, v in binary_metrics.items()}) - - for k, v in metrics.items(): - self.best_metrics[f"max_{k}"] = max(v, self.best_metrics[f"max_{k}"]) - - self.log_metrics(metrics, step=step, eval_time=eval_time) - - def log_metrics(self, metrics_d, step, eval_time): - commit = False if self.set_ == "validation" else True - to_report = {self.set_: {**metrics_d, **self.best_metrics}} - - if eval_time == "epoch": - to_report["epoch"] = step - - wandb.log(to_report, commit=commit) + def __init__( + self, + data: tf.data.Dataset, + set_: str, + fixed_recall: float = 0.85, + from_logits: bool = False, + dataset_transform_func: callable = None, + batch_size: int = 64, + dual_head: List[str] = None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.set_ = set_ + if data is None: + return None + + self.single_head = True + try: + self.labels = data.int_label.values + except AttributeError: + 
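# the data object has no int_label column (ABS-style datasets): evaluate + # on its LABEL_NAMES columns via to_dataframe() instead. +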
self.labels = data.to_dataframe()[LABEL_NAMES].values.astype("int") + self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size) + self.label_names = LABEL_NAMES + else: + self.label_names = [""] + if dual_head: + self.label_names = [f"{e}_label" for e in dual_head] + self.labels = { + f"{e}_output": data[f"{e}_label"].values for e in dual_head + } + self.single_head = False + if dataset_transform_func is None: + self.data = data.text.values + else: + self.data = dataset_transform_func( + data, mb_size=batch_size, shuffle=False + ) + + finally: + if len(self.label_names) == 1: + self.metric_kw = {} + else: + self.metric_kw = {"average": None} + + self.counter = 0 + self.best_metrics = defaultdict(float) + self.from_logits = from_logits + print( + f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}" + ) + + if 1 < fixed_recall <= 100: + fixed_recall = fixed_recall / 100 + elif not (0 < fixed_recall <= 100): + raise ValueError("Threshold should be between 0 and 1, or 0 and 100") + self.fixed_recall = fixed_recall + self.batch_size = batch_size + + def compute_precision_fixed_recall(self, labels: np.ndarray, preds: np.ndarray): + result, _ = compute_precision_fixed_recall( + labels=labels, preds=preds, fixed_recall=self.fixed_recall + ) + + return result + + def on_epoch_end(self, epoch, logs=None): + self.additional_evaluations(step=epoch, eval_time="epoch") + + def on_train_batch_end(self, batch, logs=None): + self.counter += 1 + if self.counter % 2000 == 0: + self.additional_evaluations(step=self.counter, eval_time="batch") + + def _binary_evaluations( + self, preds: np.ndarray, label_name=None, class_index: int = None + ): + mask = None + curr_labels = self.labels + if label_name is not None: + curr_labels = self.labels[label_name] + if class_index is not None: + curr_labels = (curr_labels == class_index).astype(int) + + if -1 in curr_labels: + mask = curr_labels != -1 + curr_labels = curr_labels[mask] + preds = preds[mask] + + return { + f"precision_recall{self.fixed_recall}": self.compute_precision_fixed_recall( + labels=curr_labels, preds=preds + ), + "pr_auc": average_precision_score(y_true=curr_labels, y_score=preds), + "roc_auc": roc_auc_score(y_true=curr_labels, y_score=preds), + } + + def _multiclass_evaluations(self, preds: np.ndarray): + pr_auc_l = average_precision_score( + y_true=self.labels, y_score=preds, **self.metric_kw + ) + roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw) + metrics = {} + for i, label in enumerate(self.label_names): + metrics[f"pr_auc_{label}"] = pr_auc_l[i] + metrics[f"roc_auc_{label}"] = roc_auc_l[i] + + return metrics + + def additional_evaluations(self, step: int, eval_time: str): + print("Evaluating ", self.set_, eval_time, step) + + preds = self.model.predict(x=self.data, batch_size=self.batch_size) + if self.from_logits: + preds = tf.keras.activations.sigmoid(preds.logits).numpy() + + if self.single_head: + if len(self.label_names) == 1: + metrics = self._binary_evaluations(preds) + else: + metrics = self._multiclass_evaluations(preds) + else: + if preds[0].shape[1] == 1: + binary_preds = preds[0] + multic_preds = preds[1] + else: + binary_preds = preds[1] + multic_preds = preds[0] + + binary_metrics = self._binary_evaluations( + binary_preds, label_name="target_output" + ) + metrics = {f"{k}_target": v for k, v in binary_metrics.items()} + num_classes = multic_preds.shape[1] + for class_ in range(num_classes): + binary_metrics = self._binary_evaluations( + 
multic_preds[:, class_], + label_name="content_output", + class_index=class_, + ) + metrics.update( + {f"{k}_content_{class_}": v for k, v in binary_metrics.items()} + ) + + for k, v in metrics.items(): + self.best_metrics[f"max_{k}"] = max(v, self.best_metrics[f"max_{k}"]) + + self.log_metrics(metrics, step=step, eval_time=eval_time) + + def log_metrics(self, metrics_d: dict, step: int, eval_time: str): + commit = False if self.set_ == "validation" else True + to_report = {self.set_: {**metrics_d, **self.best_metrics}} + + if eval_time == "epoch": + to_report["epoch"] = step + + wandb.log(to_report, commit=commit) diff --git a/trust_and_safety_models/toxicity/optim/losses.py b/trust_and_safety_models/toxicity/optim/losses.py index 273c6676e..ede9a3c0c 100644 --- a/trust_and_safety_models/toxicity/optim/losses.py +++ b/trust_and_safety_models/toxicity/optim/losses.py @@ -1,56 +1,59 @@ import tensorflow as tf -from keras.utils import tf_utils -from keras.utils import losses_utils from keras import backend +from keras.utils import losses_utils, tf_utils -def inv_kl_divergence(y_true, y_pred): - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - y_true = backend.clip(y_true, backend.epsilon(), 1) - y_pred = backend.clip(y_pred, backend.epsilon(), 1) - return tf.reduce_sum(y_pred * tf.math.log(y_pred / y_true), axis=-1) -def masked_bce(y_true, y_pred): - y_true = tf.cast(y_true, dtype=tf.float32) - mask = y_true != -1 - - return tf.keras.metrics.binary_crossentropy(tf.boolean_mask(y_true, mask), - tf.boolean_mask(y_pred, mask)) +def inv_kl_divergence(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + y_true = backend.clip(y_true, backend.epsilon(), 1) + y_pred = backend.clip(y_pred, backend.epsilon(), 1) + return tf.reduce_sum(y_pred * tf.math.log(y_pred / y_true), axis=-1) + + +def masked_bce(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + y_true = tf.cast(y_true, dtype=tf.float32) + mask = y_true != -1 + + return tf.keras.metrics.binary_crossentropy( + tf.boolean_mask(y_true, mask), tf.boolean_mask(y_pred, mask) + ) class LossFunctionWrapper(tf.keras.losses.Loss): - def __init__(self, - fn, - reduction=losses_utils.ReductionV2.AUTO, - name=None, - **kwargs): - super().__init__(reduction=reduction, name=name) - self.fn = fn - self._fn_kwargs = kwargs - - def call(self, y_true, y_pred): - if tf.is_tensor(y_pred) and tf.is_tensor(y_true): - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true) - - ag_fn = tf.__internal__.autograph.tf_convert(self.fn, tf.__internal__.autograph.control_status_ctx()) - return ag_fn(y_true, y_pred, **self._fn_kwargs) - - def get_config(self): - config = {} - for k, v in self._fn_kwargs.items(): - config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def __init__( + self, fn, reduction=losses_utils.ReductionV2.AUTO, name=None, **kwargs + ): + super().__init__(reduction=reduction, name=name) + self.fn = fn + self._fn_kwargs = kwargs + + def call(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + if tf.is_tensor(y_pred) and tf.is_tensor(y_true): + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true) + + ag_fn = tf.__internal__.autograph.tf_convert( + self.fn, tf.__internal__.autograph.control_status_ctx() + ) + return ag_fn(y_true, y_pred, **self._fn_kwargs) + + def 
get_config(self) -> dict: + config = {} + for k, v in self._fn_kwargs.items(): + config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + class InvKLD(LossFunctionWrapper): - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='inv_kl_divergence'): - super().__init__(inv_kl_divergence, name=name, reduction=reduction) + def __init__( + self, reduction=losses_utils.ReductionV2.AUTO, name: str = "inv_kl_divergence" + ): + super().__init__(inv_kl_divergence, name=name, reduction=reduction) class MaskedBCE(LossFunctionWrapper): - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='masked_bce'): - super().__init__(masked_bce, name=name, reduction=reduction) + def __init__( + self, reduction=losses_utils.ReductionV2.AUTO, name: str = "masked_bce" + ): + super().__init__(masked_bce, name=name, reduction=reduction) diff --git a/trust_and_safety_models/toxicity/optim/schedulers.py b/trust_and_safety_models/toxicity/optim/schedulers.py index 59f6c9afa..4a3d5091e 100644 --- a/trust_and_safety_models/toxicity/optim/schedulers.py +++ b/trust_and_safety_models/toxicity/optim/schedulers.py @@ -4,41 +4,41 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): - def __init__( - self, - initial_learning_rate: float, - decay_schedule_fn: Callable, - warmup_steps: int, - power: float = 1.0, - name: str = "", - ): - super().__init__() - self.initial_learning_rate = initial_learning_rate - self.warmup_steps = warmup_steps - self.power = power - self.decay_schedule_fn = decay_schedule_fn - self.name = name + def __init__( + self, + initial_learning_rate: float, + decay_schedule_fn: Callable, + warmup_steps: int, + power: float = 1.0, + name: str = "", + ): + super().__init__() + self.initial_learning_rate = initial_learning_rate + self.warmup_steps = warmup_steps + self.power = power + self.decay_schedule_fn = decay_schedule_fn + self.name = name - def __call__(self, step): - with tf.name_scope(self.name or "WarmUp") as name: - global_step_float = tf.cast(step, tf.float32) - warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) - warmup_percent_done = global_step_float / warmup_steps_float - warmup_learning_rate = self.initial_learning_rate * tf.math.pow( - warmup_percent_done, self.power - ) - return tf.cond( - global_step_float < warmup_steps_float, - lambda: warmup_learning_rate, - lambda: self.decay_schedule_fn(step - self.warmup_steps), - name=name, - ) + def __call__(self, step): + with tf.name_scope(self.name or "WarmUp") as name: + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) + warmup_percent_done = global_step_float / warmup_steps_float + warmup_learning_rate = self.initial_learning_rate * tf.math.pow( + warmup_percent_done, self.power + ) + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: self.decay_schedule_fn(step - self.warmup_steps), + name=name, + ) - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_schedule_fn": self.decay_schedule_fn, - "warmup_steps": self.warmup_steps, - "power": self.power, - "name": self.name, - } + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_schedule_fn": self.decay_schedule_fn, + "warmup_steps": self.warmup_steps, + "power": self.power, + "name": self.name, + } diff --git 
a/trust_and_safety_models/toxicity/rescoring.py b/trust_and_safety_models/toxicity/rescoring.py index 71d95ed76..0fe1d71b3 100644 --- a/trust_and_safety_models/toxicity/rescoring.py +++ b/trust_and_safety_models/toxicity/rescoring.py @@ -1,54 +1,72 @@ +import numpy as np +import pandas as pd +import tensorflow as tf from toxicity_ml_pipeline.load_model import reload_model_weights from toxicity_ml_pipeline.utils.helpers import load_inference_func, upload_model -import numpy as np -import tensorflow as tf +def score( + language: str, + df: pd.DataFrame, + gcs_model_path: str, + batch_size: int = 64, + text_col: str = "text", + kw: str = "", + **kwargs, +): + if language != "en": + raise NotImplementedError( + "Data preprocessing not implemented here, needs to be added for i18n models" + ) + model_folder = upload_model(full_gcs_model_path=gcs_model_path) + try: + inference_func = load_inference_func(model_folder) + except OSError: + model = reload_model_weights(model_folder, language, **kwargs) + preds = model.predict(x=df[text_col], batch_size=batch_size) + if type(preds) != list: + if len(preds.shape) > 1 and preds.shape[1] > 1: + if "num_classes" in kwargs and kwargs["num_classes"] > 1: + raise NotImplementedError + preds = np.mean(preds, 1) -def score(language, df, gcs_model_path, batch_size=64, text_col="text", kw="", **kwargs): - if language != "en": - raise NotImplementedError( - "Data preprocessing not implemented here, needs to be added for i18n models" - ) - model_folder = upload_model(full_gcs_model_path=gcs_model_path) - try: - inference_func = load_inference_func(model_folder) - except OSError: - model = reload_model_weights(model_folder, language, **kwargs) - preds = model.predict(x=df[text_col], batch_size=batch_size) - if type(preds) != list: - if len(preds.shape)> 1 and preds.shape[1] > 1: - if 'num_classes' in kwargs and kwargs['num_classes'] > 1: - raise NotImplementedError - preds = np.mean(preds, 1) - - df[f"prediction_{kw}"] = preds - else: - if len(preds) > 2: - raise NotImplementedError - for preds_arr in preds: - if preds_arr.shape[1] == 1: - df[f"prediction_{kw}_target"] = preds_arr + df[f"prediction_{kw}"] = preds else: - for ind in range(preds_arr.shape[1]): - df[f"prediction_{kw}_content_{ind}"] = preds_arr[:, ind] + if len(preds) > 2: + raise NotImplementedError + for preds_arr in preds: + if preds_arr.shape[1] == 1: + df[f"prediction_{kw}_target"] = preds_arr + else: + for ind in range(preds_arr.shape[1]): + df[f"prediction_{kw}_content_{ind}"] = preds_arr[:, ind] - return df - else: - return _get_score(inference_func, df, kw=kw, batch_size=batch_size, text_col=text_col) + return df + else: + return _get_score( + inference_func, df, kw=kw, batch_size=batch_size, text_col=text_col + ) -def _get_score(inference_func, df, text_col="text", kw="", batch_size=64): - score_col = f"prediction_{kw}" - beginning = 0 - end = df.shape[0] - predictions = np.zeros(shape=end, dtype=float) +def _get_score( + inference_func: tf.function, + df: pd.DataFrame, + text_col: str = "text", + kw: str = "", + batch_size: int = 64, +) -> pd.DataFrame: + score_col = f"prediction_{kw}" + beginning = 0 + end = df.shape[0] + predictions = np.zeros(shape=end, dtype=float) - while beginning < end: - mb = df[text_col].values[beginning : beginning + batch_size] - res = inference_func(input_1=tf.constant(mb)) - predictions[beginning : beginning + batch_size] = list(res.values())[0].numpy()[:, 0] - beginning += batch_size + while beginning < end: + mb = df[text_col].values[beginning : beginning + 
batch_size] + res = inference_func(input_1=tf.constant(mb)) + predictions[beginning : beginning + batch_size] = list(res.values())[0].numpy()[ + :, 0 + ] + beginning += batch_size - df[score_col] = predictions - return df + df[score_col] = predictions + return df diff --git a/trust_and_safety_models/toxicity/settings/default_settings_tox.py b/trust_and_safety_models/toxicity/settings/default_settings_tox.py index 0968b9adc..dbae08a50 100644 --- a/trust_and_safety_models/toxicity/settings/default_settings_tox.py +++ b/trust_and_safety_models/toxicity/settings/default_settings_tox.py @@ -1,20 +1,19 @@ import os - TEAM_PROJECT = "twttr-toxicity-prod" try: - from google.cloud import bigquery + from google.cloud import bigquery except (ModuleNotFoundError, ImportError): - print("No Google packages") - CLIENT = None + print("No Google packages") + CLIENT = None else: - from google.auth.exceptions import DefaultCredentialsError + from google.auth.exceptions import DefaultCredentialsError - try: - CLIENT = bigquery.Client(project=TEAM_PROJECT) - except DefaultCredentialsError as e: - CLIENT = None - print("Issue at logging time", e) + try: + CLIENT = bigquery.Client(project=TEAM_PROJECT) + except DefaultCredentialsError as e: + CLIENT = None + print("Issue at logging time", e) TRAINING_DATA_LOCATION = f"..." GCS_ADDRESS = "..." diff --git a/trust_and_safety_models/toxicity/train.py b/trust_and_safety_models/toxicity/train.py index de450ee7b..26c4bf7a1 100644 --- a/trust_and_safety_models/toxicity/train.py +++ b/trust_and_safety_models/toxicity/train.py @@ -1,401 +1,444 @@ +import os from datetime import datetime from importlib import import_module -import os +import numpy as np +import pandas as pd +import tensorflow as tf from toxicity_ml_pipeline.data.data_preprocessing import ( - DefaultENNoPreprocessor, - DefaultENPreprocessor, + DefaultENNoPreprocessor, + DefaultENPreprocessor, ) from toxicity_ml_pipeline.data.dataframe_loader import ENLoader, ENLoaderWithSampling from toxicity_ml_pipeline.data.mb_generator import BalancedMiniBatchLoader -from toxicity_ml_pipeline.load_model import load, get_last_layer +from toxicity_ml_pipeline.load_model import get_last_layer, load from toxicity_ml_pipeline.optim.callbacks import ( - AdditionalResultLogger, - ControlledStoppingCheckpointCallback, - GradientLoggingTensorBoard, - SyncingTensorBoard, + AdditionalResultLogger, + ControlledStoppingCheckpointCallback, + GradientLoggingTensorBoard, + SyncingTensorBoard, ) from toxicity_ml_pipeline.optim.schedulers import WarmUp from toxicity_ml_pipeline.settings.default_settings_abs import GCS_ADDRESS as ABS_GCS +from toxicity_ml_pipeline.settings.default_settings_tox import GCS_ADDRESS as TOX_GCS from toxicity_ml_pipeline.settings.default_settings_tox import ( - GCS_ADDRESS as TOX_GCS, - MODEL_DIR, - RANDOM_SEED, - REMOTE_LOGDIR, - WARM_UP_PERC, + MODEL_DIR, + RANDOM_SEED, + REMOTE_LOGDIR, + WARM_UP_PERC, ) from toxicity_ml_pipeline.utils.helpers import check_gpu, set_seeds, upload_model -import numpy as np -import tensorflow as tf - - try: - from tensorflow_addons.optimizers import AdamW + from tensorflow_addons.optimizers import AdamW except ModuleNotFoundError: - print("No TFA") + print("No TFA") class Trainer(object): - OPTIMIZERS = ["Adam", "AdamW"] - - def __init__( - self, - optimizer_name, - weight_decay, - learning_rate, - mb_size, - train_epochs, - content_loss_weight=1, - language="en", - scope='TOX', - project=..., - experiment_id="default", - gradient_clipping=None, - fold="time", - seed=RANDOM_SEED, 
- log_gradients=False, - kw="", - stopping_epoch=None, - test=False, - ): - self.seed = seed - self.weight_decay = weight_decay - self.learning_rate = learning_rate - self.mb_size = mb_size - self.train_epochs = train_epochs - self.gradient_clipping = gradient_clipping - - if optimizer_name not in self.OPTIMIZERS: - raise ValueError( - f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}." - ) - self.optimizer_name = optimizer_name - self.log_gradients = log_gradients - self.test = test - self.fold = fold - self.stopping_epoch = stopping_epoch - self.language = language - if scope == 'TOX': - GCS_ADDRESS = TOX_GCS.format(project=project) - elif scope == 'ABS': - GCS_ADDRESS = ABS_GCS - else: - raise ValueError - GCS_ADDRESS = GCS_ADDRESS.format(project=project) - try: - self.setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings") - except ModuleNotFoundError: - raise ValueError(f"You need to define a setting file for your project {project}.") - experiment_settings = self.setting_file.experiment_settings - - self.project = project - self.remote_logdir = REMOTE_LOGDIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) - self.model_dir = MODEL_DIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) - - if experiment_id not in experiment_settings: - raise ValueError("This is not an experiment id as defined in the settings file.") - - for var, default_value in experiment_settings["default"].items(): - override_val = experiment_settings[experiment_id].get(var, default_value) - print("Setting ", var, override_val) - self.__setattr__(var, override_val) - - self.content_loss_weight = content_loss_weight if self.dual_head else None - - self.mb_loader = BalancedMiniBatchLoader( - fold=self.fold, - seed=self.seed, - perc_training_tox=self.perc_training_tox, - mb_size=self.mb_size, - n_outer_splits="time", - scope=scope, - project=project, - dual_head=self.dual_head, - sample_weights=self.sample_weights, - huggingface=("bertweet" in self.model_type), - ) - self._init_dirnames(kw=kw, experiment_id=experiment_id) - print("------- Checking there is a GPU") - check_gpu() - - def _init_dirnames(self, kw, experiment_id): - kw = "test" if self.test else kw - hyper_param_kw = "" - if self.optimizer_name == "AdamW": - hyper_param_kw += f"{self.weight_decay}_" - if self.gradient_clipping: - hyper_param_kw += f"{self.gradient_clipping}_" - if self.content_loss_weight: - hyper_param_kw += f"{self.content_loss_weight}_" - experiment_name = ( - f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_" - f"{self.optimizer_name}_" - f"{self.learning_rate}_" - f"{hyper_param_kw}" - f"{self.mb_size}_" - f"{self.perc_training_tox}_" - f"{self.train_epochs}_seed{self.seed}" - ) - print("------- Experiment name: ", experiment_name) - self.logdir = ( - f"..." - if self.test - else f"..." 
- ) - self.checkpoint_path = f"{self.model_dir}/{experiment_name}" - - @staticmethod - def _additional_writers(logdir, metric_name): - return tf.summary.create_file_writer(os.path.join(logdir, metric_name)) - - def get_callbacks(self, fold, val_data, test_data): - fold_logdir = self.logdir + f"_fold{fold}" - fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}" - - tb_args = { - "log_dir": fold_logdir, - "histogram_freq": 0, - "update_freq": 500, - "embeddings_freq": 0, - "remote_logdir": f"{self.remote_logdir}_{self.language}" - if not self.test - else f"{self.remote_logdir}_test", - } - tensorboard_callback = ( - GradientLoggingTensorBoard(loader=self.mb_loader, val_data=val_data, freq=10, **tb_args) - if self.log_gradients - else SyncingTensorBoard(**tb_args) - ) - - callbacks = [tensorboard_callback] - if "bertweet" in self.model_type: - from_logits = True - dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds - else: - from_logits = False - dataset_transform_func = None - - fixed_recall = 0.85 if not self.dual_head else 0.5 - val_callback = AdditionalResultLogger( - data=val_data, - set_="validation", - from_logits=from_logits, - dataset_transform_func=dataset_transform_func, - dual_head=self.dual_head, - fixed_recall=fixed_recall - ) - if val_callback is not None: - callbacks.append(val_callback) - - test_callback = AdditionalResultLogger( - data=test_data, - set_="test", - from_logits=from_logits, - dataset_transform_func=dataset_transform_func, - dual_head=self.dual_head, - fixed_recall=fixed_recall - ) - callbacks.append(test_callback) - - checkpoint_args = { - "filepath": fold_checkpoint_path, - "verbose": 0, - "monitor": "val_pr_auc", - "save_weights_only": True, - "mode": "max", - "save_freq": "epoch", - } - if self.stopping_epoch: - checkpoint_callback = ControlledStoppingCheckpointCallback( - **checkpoint_args, - stopping_epoch=self.stopping_epoch, - save_best_only=False, - ) - callbacks.append(checkpoint_callback) - - return callbacks - - def get_lr_schedule(self, steps_per_epoch): - total_num_steps = steps_per_epoch * self.train_epochs - - warm_up_perc = WARM_UP_PERC if self.learning_rate >= 1e-3 else 0 - warm_up_steps = int(total_num_steps * warm_up_perc) - if self.linear_lr_decay: - learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( - self.learning_rate, - total_num_steps - warm_up_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False, - ) - else: - print('Constant learning rate') - learning_rate_fn = self.learning_rate - - if warm_up_perc > 0: - print(f".... using warm-up for {warm_up_steps} steps") - warm_up_schedule = WarmUp( - initial_learning_rate=self.learning_rate, - decay_schedule_fn=learning_rate_fn, - warmup_steps=warm_up_steps, - ) - return warm_up_schedule - return learning_rate_fn - - def get_optimizer(self, schedule): - optim_args = { - "learning_rate": schedule, - "beta_1": 0.9, - "beta_2": 0.999, - "epsilon": 1e-6, - "amsgrad": False, - } - if self.gradient_clipping: - optim_args["global_clipnorm"] = self.gradient_clipping - - print(f".... 
{self.optimizer_name} w global clipnorm {self.gradient_clipping}") - if self.optimizer_name == "Adam": - return tf.keras.optimizers.Adam(**optim_args) - - if self.optimizer_name == "AdamW": - optim_args["weight_decay"] = self.weight_decay - return AdamW(**optim_args) - raise NotImplementedError - - def get_training_actors(self, steps_per_epoch, val_data, test_data, fold): - callbacks = self.get_callbacks(fold=fold, val_data=val_data, test_data=test_data) - schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch) - - optimizer = self.get_optimizer(schedule) - - return optimizer, callbacks - - def load_data(self): - if self.project == 435 or self.project == 211: - if self.dataset_type is None: - data_loader = ENLoader(project=self.project, setting_file=self.setting_file) - dataset_type_args = {} - else: - data_loader = ENLoaderWithSampling(project=self.project, setting_file=self.setting_file) - dataset_type_args = self.dataset_type - - df = data_loader.load_data( - language=self.language, test=self.test, reload=self.dataset_reload, **dataset_type_args - ) - - return df - - def preprocess(self, df): - if self.project == 435 or self.project == 211: - if self.preprocessing is None: - data_prepro = DefaultENNoPreprocessor() - elif self.preprocessing == "default": - data_prepro = DefaultENPreprocessor() - else: + OPTIMIZERS = ["Adam", "AdamW"] + + def __init__( + self, + optimizer_name: str, + weight_decay: float, + learning_rate: float, + mb_size: int, + train_epochs: int, + content_loss_weight: float = 1.0, + language: str = "en", + scope: str = "TOX", + project=..., + experiment_id: str = "default", + gradient_clipping: float = None, + fold: str = "time", + seed: int = RANDOM_SEED, + log_gradients: bool = False, + kw: str = "", + stopping_epoch: int = None, + test: bool = False, + ): + self.seed = seed + self.weight_decay = weight_decay + self.learning_rate = learning_rate + self.mb_size = mb_size + self.train_epochs = train_epochs + self.gradient_clipping = gradient_clipping + + if optimizer_name not in self.OPTIMIZERS: + raise ValueError( + f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}." + ) + self.optimizer_name = optimizer_name + self.log_gradients = log_gradients + self.test = test + self.fold = fold + self.stopping_epoch = stopping_epoch + self.language = language + if scope == "TOX": + GCS_ADDRESS = TOX_GCS.format(project=project) + elif scope == "ABS": + GCS_ADDRESS = ABS_GCS + else: + raise ValueError + GCS_ADDRESS = GCS_ADDRESS.format(project=project) + try: + self.setting_file = import_module( + f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings" + ) + except ModuleNotFoundError: + raise ValueError( + f"You need to define a setting file for your project {project}." + ) + experiment_settings = self.setting_file.experiment_settings + + self.project = project + self.remote_logdir = REMOTE_LOGDIR.format( + GCS_ADDRESS=GCS_ADDRESS, project=project + ) + self.model_dir = MODEL_DIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) + + if experiment_id not in experiment_settings: + raise ValueError( + "This is not an experiment id as defined in the settings file." 
+ ) + + for var, default_value in experiment_settings["default"].items(): + override_val = experiment_settings[experiment_id].get(var, default_value) + print("Setting ", var, override_val) + self.__setattr__(var, override_val) + + self.content_loss_weight = content_loss_weight if self.dual_head else None + + self.mb_loader = BalancedMiniBatchLoader( + fold=self.fold, + seed=self.seed, + perc_training_tox=self.perc_training_tox, + mb_size=self.mb_size, + n_outer_splits="time", + scope=scope, + project=project, + dual_head=self.dual_head, + sample_weights=self.sample_weights, + huggingface=("bertweet" in self.model_type), + ) + self._init_dirnames(kw=kw, experiment_id=experiment_id) + print("------- Checking there is a GPU") + check_gpu() + + def _init_dirnames(self, kw: str, experiment_id: str): + kw = "test" if self.test else kw + hyper_param_kw = "" + if self.optimizer_name == "AdamW": + hyper_param_kw += f"{self.weight_decay}_" + if self.gradient_clipping: + hyper_param_kw += f"{self.gradient_clipping}_" + if self.content_loss_weight: + hyper_param_kw += f"{self.content_loss_weight}_" + experiment_name = ( + f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_" + f"{self.optimizer_name}_" + f"{self.learning_rate}_" + f"{hyper_param_kw}" + f"{self.mb_size}_" + f"{self.perc_training_tox}_" + f"{self.train_epochs}_seed{self.seed}" + ) + print("------- Experiment name: ", experiment_name) + self.logdir = f"..." if self.test else f"..." + self.checkpoint_path = f"{self.model_dir}/{experiment_name}" + + @staticmethod + def _additional_writers(logdir: str, metric_name: str) -> tf.summary.SummaryWriter: + return tf.summary.create_file_writer(os.path.join(logdir, metric_name)) + + def get_callbacks(self, fold, val_data, test_data): + fold_logdir = self.logdir + f"_fold{fold}" + fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}" + + tb_args = { + "log_dir": fold_logdir, + "histogram_freq": 0, + "update_freq": 500, + "embeddings_freq": 0, + "remote_logdir": f"{self.remote_logdir}_{self.language}" + if not self.test + else f"{self.remote_logdir}_test", + } + tensorboard_callback = ( + GradientLoggingTensorBoard( + loader=self.mb_loader, val_data=val_data, freq=10, **tb_args + ) + if self.log_gradients + else SyncingTensorBoard(**tb_args) + ) + + callbacks = [tensorboard_callback] + if "bertweet" in self.model_type: + from_logits = True + dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds + else: + from_logits = False + dataset_transform_func = None + + fixed_recall = 0.85 if not self.dual_head else 0.5 + val_callback = AdditionalResultLogger( + data=val_data, + set_="validation", + from_logits=from_logits, + dataset_transform_func=dataset_transform_func, + dual_head=self.dual_head, + fixed_recall=fixed_recall, + ) + if val_callback is not None: + callbacks.append(val_callback) + + test_callback = AdditionalResultLogger( + data=test_data, + set_="test", + from_logits=from_logits, + dataset_transform_func=dataset_transform_func, + dual_head=self.dual_head, + fixed_recall=fixed_recall, + ) + callbacks.append(test_callback) + + checkpoint_args = { + "filepath": fold_checkpoint_path, + "verbose": 0, + "monitor": "val_pr_auc", + "save_weights_only": True, + "mode": "max", + "save_freq": "epoch", + } + if self.stopping_epoch: + checkpoint_callback = ControlledStoppingCheckpointCallback( + **checkpoint_args, + stopping_epoch=self.stopping_epoch, + save_best_only=False, + ) + callbacks.append(checkpoint_callback) + + return 
callbacks + + def get_lr_schedule(self, steps_per_epoch): + total_num_steps = steps_per_epoch * self.train_epochs + + warm_up_perc = WARM_UP_PERC if self.learning_rate >= 1e-3 else 0 + warm_up_steps = int(total_num_steps * warm_up_perc) + if self.linear_lr_decay: + learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( + self.learning_rate, + total_num_steps - warm_up_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False, + ) + else: + print("Constant learning rate") + learning_rate_fn = self.learning_rate + + if warm_up_perc > 0: + print(f".... using warm-up for {warm_up_steps} steps") + warm_up_schedule = WarmUp( + initial_learning_rate=self.learning_rate, + decay_schedule_fn=learning_rate_fn, + warmup_steps=warm_up_steps, + ) + return warm_up_schedule + return learning_rate_fn + + def get_optimizer( + self, schedule: tf.keras.optimizers.schedules.LearningRateSchedule + ): + optim_args = { + "learning_rate": schedule, + "beta_1": 0.9, + "beta_2": 0.999, + "epsilon": 1e-6, + "amsgrad": False, + } + if self.gradient_clipping: + optim_args["global_clipnorm"] = self.gradient_clipping + + print(f".... {self.optimizer_name} w global clipnorm {self.gradient_clipping}") + if self.optimizer_name == "Adam": + return tf.keras.optimizers.Adam(**optim_args) + + if self.optimizer_name == "AdamW": + optim_args["weight_decay"] = self.weight_decay + return AdamW(**optim_args) raise NotImplementedError - return data_prepro( - df=df, - label_column=self.label_column, - class_weight=self.perc_training_tox if self.sample_weights == 'class_weight' else None, - filter_low_agreements=self.filter_low_agreements, - num_classes=self.num_classes, - ) - - def load_model(self, optimizer): - smart_bias_value = ( - np.log(self.perc_training_tox / (1 - self.perc_training_tox)) if self.smart_bias_init else 0 - ) - model = load( - optimizer, - seed=self.seed, - trainable=self.trainable, - model_type=self.model_type, - loss_name=self.loss_name, - num_classes=self.num_classes, - additional_layer=self.additional_layer, - smart_bias_value=smart_bias_value, - content_num_classes=self.content_num_classes, - content_loss_name=self.content_loss_name, - content_loss_weight=self.content_loss_weight - ) - - if self.model_reload is not False: - model_folder = upload_model(full_gcs_model_path=os.path.join(self.model_dir, self.model_reload)) - model.load_weights(model_folder) - if self.scratch_last_layer: - print('Putting the last layer back to scratch') - model.layers[-1] = get_last_layer(seed=self.seed, - num_classes=self.num_classes, - smart_bias_value=smart_bias_value) - - return model - - def _train_single_fold(self, mb_generator, test_data, steps_per_epoch, fold, val_data=None): - steps_per_epoch = 100 if self.test else steps_per_epoch - - optimizer, callbacks = self.get_training_actors( - steps_per_epoch=steps_per_epoch, val_data=val_data, test_data=test_data, fold=fold - ) - print("Loading model") - model = self.load_model(optimizer) - print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training") - training_args = { - "epochs": self.train_epochs, - "steps_per_epoch": steps_per_epoch, - "batch_size": self.mb_size, - "callbacks": callbacks, - "verbose": 2, - } - - model.fit(mb_generator, **training_args) - return - - def train_full_model(self): - print("Setting up random seed.") - set_seeds(self.seed) - - print(f"Loading {self.language} data") - df = self.load_data() - df = self.preprocess(df=df) - - print("Going to train on everything but the test dataset") - mini_batches, test_data, steps_per_epoch 
= self.mb_loader.simple_cv_load(df) - - self._train_single_fold( - mb_generator=mini_batches, test_data=test_data, steps_per_epoch=steps_per_epoch, fold="full" - ) - - def train(self): - print("Setting up random seed.") - set_seeds(self.seed) - - print(f"Loading {self.language} data") - df = self.load_data() - df = self.preprocess(df=df) - - print("Loading MB generator") - i = 0 - if self.project == 435 or self.project == 211: - mb_generator, steps_per_epoch, val_data, test_data = self.mb_loader.no_cv_load(full_df=df) - self._train_single_fold( - mb_generator=mb_generator, - val_data=val_data, - test_data=test_data, - steps_per_epoch=steps_per_epoch, - fold=i, - ) - else: - raise ValueError("Sure you want to do multiple fold training") - for mb_generator, steps_per_epoch, val_data, test_data in self.mb_loader(full_df=df): + def get_training_actors(self, steps_per_epoch, val_data, test_data, fold): + callbacks = self.get_callbacks( + fold=fold, val_data=val_data, test_data=test_data + ) + schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch) + + optimizer = self.get_optimizer(schedule) + + return optimizer, callbacks + + def load_data(self) -> pd.DataFrame: + if self.project == 435 or self.project == 211: + if self.dataset_type is None: + data_loader = ENLoader( + project=self.project, setting_file=self.setting_file + ) + dataset_type_args = {} + else: + data_loader = ENLoaderWithSampling( + project=self.project, setting_file=self.setting_file + ) + dataset_type_args = self.dataset_type + + df = data_loader.load_data( + language=self.language, + test=self.test, + reload=self.dataset_reload, + **dataset_type_args, + ) + + return df + + def preprocess(self, df: pd.DataFrame): + if self.project == 435 or self.project == 211: + if self.preprocessing is None: + data_prepro = DefaultENNoPreprocessor() + elif self.preprocessing == "default": + data_prepro = DefaultENPreprocessor() + else: + raise NotImplementedError + + return data_prepro( + df=df, + label_column=self.label_column, + class_weight=self.perc_training_tox + if self.sample_weights == "class_weight" + else None, + filter_low_agreements=self.filter_low_agreements, + num_classes=self.num_classes, + ) + + def load_model(self, optimizer: tf.keras.optimizers.Optimizer): + smart_bias_value = ( + np.log(self.perc_training_tox / (1 - self.perc_training_tox)) + if self.smart_bias_init + else 0 + ) + model = load( + optimizer, + seed=self.seed, + trainable=self.trainable, + model_type=self.model_type, + loss_name=self.loss_name, + num_classes=self.num_classes, + additional_layer=self.additional_layer, + smart_bias_value=smart_bias_value, + content_num_classes=self.content_num_classes, + content_loss_name=self.content_loss_name, + content_loss_weight=self.content_loss_weight, + ) + + if self.model_reload is not False: + model_folder = upload_model( + full_gcs_model_path=os.path.join(self.model_dir, self.model_reload) + ) + model.load_weights(model_folder) + if self.scratch_last_layer: + print("Putting the last layer back to scratch") + model.layers[-1] = get_last_layer( + seed=self.seed, + num_classes=self.num_classes, + smart_bias_value=smart_bias_value, + ) + + return model + + def _train_single_fold( + self, + mb_generator: tf.data.Dataset, + test_data: pd.DataFrame, + steps_per_epoch: int, + fold: int, + val_data: pd.DataFrame = None, + ): + steps_per_epoch = 100 if self.test else steps_per_epoch + + optimizer, callbacks = self.get_training_actors( + steps_per_epoch=steps_per_epoch, + val_data=val_data, + test_data=test_data, + 
fold=fold,
+        )
+        print("Loading model")
+        model = self.load_model(optimizer)
+        print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training")
+        training_args = {
+            "epochs": self.train_epochs,
+            "steps_per_epoch": steps_per_epoch,
+            "batch_size": self.mb_size,
+            "callbacks": callbacks,
+            "verbose": 2,
+        }
+
+        model.fit(mb_generator, **training_args)
+
+    def train_full_model(self):
+        print("Setting up random seed.")
+        set_seeds(self.seed)
+
+        print(f"Loading {self.language} data")
+        df = self.load_data()
+        df = self.preprocess(df=df)
+
+        print("Going to train on everything but the test dataset")
+        mini_batches, test_data, steps_per_epoch = self.mb_loader.simple_cv_load(df)
+
         self._train_single_fold(
-      mb_generator=mb_generator,
-      val_data=val_data,
-      test_data=test_data,
-      steps_per_epoch=steps_per_epoch,
-      fold=i,
+            mb_generator=mini_batches,
+            test_data=test_data,
+            steps_per_epoch=steps_per_epoch,
+            fold="full",
         )
-    i += 1
-    if i == 3:
-      break
+
+    def train(self):
+        print("Setting up random seed.")
+        set_seeds(self.seed)
+        print(f"Loading {self.language} data")
+        df = self.load_data()
+        df = self.preprocess(df=df)
+        print("Loading MB generator")
+        i = 0
+
+        if self.project == 435 or self.project == 211:
+            (
+                mb_generator,
+                steps_per_epoch,
+                val_data,
+                test_data,
+            ) = self.mb_loader.no_cv_load(full_df=df)
+            self._train_single_fold(
+                mb_generator=mb_generator,
+                val_data=val_data,
+                test_data=test_data,
+                steps_per_epoch=steps_per_epoch,
+                fold=i,
+            )
+        else:
+            raise ValueError("Multiple-fold training is not supported for this project.")
+
+        # NOTE: for projects 435/211, execution falls through to this capped CV
+        # loop after the single-fold run above, mirroring the pre-format control flow.
+        for mb_generator, steps_per_epoch, val_data, test_data in self.mb_loader(
+            full_df=df
+        ):
+            self._train_single_fold(
+                mb_generator=mb_generator,
+                val_data=val_data,
+                test_data=test_data,
+                steps_per_epoch=steps_per_epoch,
+                fold=i,
+            )
+            i += 1
+            if i == 3:
+                break
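For orientation, the reformatted trainer above is driven roughly as in the
following sketch. This is hypothetical usage, not part of the patch: the class
name `ToxicityTrainer`, the import path, and all argument values are
assumptions made for illustration.

    # Hypothetical sketch -- class name, module path and values are assumed.
    from toxicity_ml_pipeline.train import ToxicityTrainer

    trainer = ToxicityTrainer(
        optimizer_name="AdamW",  # must be in OPTIMIZERS, else __init__ raises
        weight_decay=0.01,
        learning_rate=1e-5,
        mb_size=32,
        train_epochs=10,
        project=435,             # 435/211 take the no_cv_load path in train()
    )
    trainer.train()              # single fold, then the capped 3-fold CV loop
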
diff --git a/trust_and_safety_models/toxicity/utils/helpers.py b/trust_and_safety_models/toxicity/utils/helpers.py
index c21d7eb1c..8924978bb 100644
--- a/trust_and_safety_models/toxicity/utils/helpers.py
+++ b/trust_and_safety_models/toxicity/utils/helpers.py
@@ -3,97 +3,101 @@
 import random as python_random
 import subprocess

-from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR
-
 import numpy as np
 from sklearn.metrics import precision_recall_curve
-
+from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR

 try:
-  import tensorflow as tf
+    import tensorflow as tf
 except ModuleNotFoundError:
-  pass
-
+    pass

-def upload_model(full_gcs_model_path):
-  folder_name = full_gcs_model_path
-  if folder_name[:5] != "gs://":
-    folder_name = "gs://" + folder_name
-  dirname = os.path.dirname(folder_name)
-  epoch = os.path.basename(folder_name)

+def upload_model(full_gcs_model_path: str):
+    folder_name = full_gcs_model_path
+    if folder_name[:5] != "gs://":
+        folder_name = "gs://" + folder_name
+
+    dirname = os.path.dirname(folder_name)
+    epoch = os.path.basename(folder_name)
+
+    model_dir = os.path.join(LOCAL_DIR, "models")
+    cmd = f"mkdir {model_dir}"
+    try:
+        execute_command(cmd)
+    except subprocess.CalledProcessError:
+        pass
+    model_dir = os.path.join(model_dir, os.path.basename(dirname))
+    cmd = f"mkdir {model_dir}"
+    try:
+        execute_command(cmd)
+    except subprocess.CalledProcessError:
+        pass
+
+    try:
+        _ = int(epoch)
+    except ValueError:
+        cmd = f"gsutil rsync -r '{folder_name}' {model_dir}"
+        weights_dir = model_dir
+
+    else:
+        cmd = f"gsutil cp '{dirname}/checkpoint' {model_dir}/"
+        execute_command(cmd)
+        cmd = f"gsutil cp '{os.path.join(dirname, epoch)}*' {model_dir}/"
+        weights_dir = f"{model_dir}/{epoch}"

-  model_dir = os.path.join(LOCAL_DIR, "models")
-  cmd = f"mkdir {model_dir}"
-  try:
-    execute_command(cmd)
-  except subprocess.CalledProcessError:
-    pass
-  model_dir = os.path.join(model_dir, os.path.basename(dirname))
-  cmd = f"mkdir {model_dir}"
-  try: execute_command(cmd)
-  except subprocess.CalledProcessError:
-    pass
-
-  try:
-    _ = int(epoch)
-  except ValueError:
-    cmd = f"gsutil rsync -r '{folder_name}' {model_dir}"
-    weights_dir = model_dir
+    execute_command(cmd)
+    return weights_dir

-  else:
-    cmd = f"gsutil cp '{dirname}/checkpoint' {model_dir}/"
-    execute_command(cmd)
-    cmd = f"gsutil cp '{os.path.join(dirname, epoch)}*' {model_dir}/"
-    weights_dir = f"{model_dir}/{epoch}"
-    execute_command(cmd)
-  return weights_dir

+def compute_precision_fixed_recall(
+    labels: np.ndarray, preds: np.ndarray, fixed_recall: float
+):
+    precision_values, recall_values, thresholds = precision_recall_curve(
+        y_true=labels, probas_pred=preds
+    )
+    index_recall = bisect.bisect_left(-recall_values, -1 * fixed_recall)
+    result = precision_values[index_recall - 1]
+    print(f"Precision at {recall_values[index_recall-1]} recall: {result}")

-def compute_precision_fixed_recall(labels, preds, fixed_recall):
-  precision_values, recall_values, thresholds = precision_recall_curve(y_true=labels, probas_pred=preds)
-  index_recall = bisect.bisect_left(-recall_values, -1 * fixed_recall)
-  result = precision_values[index_recall - 1]
-  print(f"Precision at {recall_values[index_recall-1]} recall: {result}")
+    return result, thresholds[index_recall - 1]

-  return result, thresholds[index_recall - 1]

-def load_inference_func(model_folder):
-  model = tf.saved_model.load(model_folder, ["serve"])
-  inference_func = model.signatures["serving_default"]
-  return inference_func
+def load_inference_func(model_folder: str):
+    model = tf.saved_model.load(model_folder, ["serve"])
+    inference_func = model.signatures["serving_default"]
+    return inference_func


 def execute_query(client, query):
-  job = client.query(query)
-  df = job.result().to_dataframe()
-  return df
+    job = client.query(query)
+    df = job.result().to_dataframe()
+    return df


-def execute_command(cmd, print_=True):
-  s = subprocess.run(cmd, shell=True, capture_output=print_, check=True)
-  if print_:
-    print(s.stderr.decode("utf-8"))
-    print(s.stdout.decode("utf-8"))
+def execute_command(cmd: str, print_: bool = True):
+    s = subprocess.run(cmd, shell=True, capture_output=print_, check=True)
+    if print_:
+        print(s.stderr.decode("utf-8"))
+        print(s.stdout.decode("utf-8"))


 def check_gpu():
-  try:
-    execute_command("nvidia-smi")
-  except subprocess.CalledProcessError:
-    print("There is no GPU when there should be one.")
-    raise AttributeError
-
-  l = tf.config.list_physical_devices("GPU")
-  if len(l) == 0:
-    raise ModuleNotFoundError("Tensorflow has not found the GPU. Check your installation")
-  print(l)
-
-
-def set_seeds(seed):
-  np.random.seed(seed)
-
-  python_random.seed(seed)
-
-  tf.random.set_seed(seed)
+    try:
+        execute_command("nvidia-smi")
+    except subprocess.CalledProcessError:
+        raise AttributeError("There is no GPU when there should be one.")
+
+    l = tf.config.list_physical_devices("GPU")
+    if len(l) == 0:
+        raise ModuleNotFoundError(
+            "Tensorflow has not found the GPU. 
Check your installation" + ) + print(l) + + +def set_seeds(seed: int): + np.random.seed(seed) + python_random.seed(seed) + tf.random.set_seed(seed) diff --git a/twml/libtwml/setup.py b/twml/libtwml/setup.py index 2dcfa105d..ebd76e577 100644 --- a/twml/libtwml/setup.py +++ b/twml/libtwml/setup.py @@ -1,12 +1,12 @@ """ libtwml setup.py module """ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( - name='libtwml', - version='2.0', - description="Tensorflow C++ ops for twml", - packages=find_packages(), - data_files=[('', ['libtwml_tf.so'])], + name="libtwml", + version="2.0", + description="Tensorflow C++ ops for twml", + packages=find_packages(), + data_files=[("", ["libtwml_tf.so"])], ) diff --git a/twml/libtwml/src/ops/scripts/get_inc.py b/twml/libtwml/src/ops/scripts/get_inc.py index c50edfa90..df92dea44 100644 --- a/twml/libtwml/src/ops/scripts/get_inc.py +++ b/twml/libtwml/src/ops/scripts/get_inc.py @@ -2,4 +2,4 @@ import tensorflow.compat.v1 as tf -print(tf.sysconfig.get_include(), end='') +print(tf.sysconfig.get_include(), end="") diff --git a/twml/libtwml/src/ops/scripts/get_lib.py b/twml/libtwml/src/ops/scripts/get_lib.py index 7150c48b7..c212e27d0 100644 --- a/twml/libtwml/src/ops/scripts/get_lib.py +++ b/twml/libtwml/src/ops/scripts/get_lib.py @@ -2,4 +2,4 @@ import tensorflow.compat.v1 as tf -print(tf.sysconfig.get_lib(), end='') +print(tf.sysconfig.get_lib(), end="") diff --git a/twml/setup.py b/twml/setup.py index 7e4003bae..bdd548874 100644 --- a/twml/setup.py +++ b/twml/setup.py @@ -2,28 +2,27 @@ from setuptools import find_packages, setup - THIS_DIR = os.path.dirname(os.path.realpath(__file__)) -TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, 'twml/tests/data') +TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, "twml/tests/data") data_files = [] for parent, children, files in os.walk(TWML_TEST_DATA_DIR): - data_files += [os.path.join(parent, f) for f in files] + data_files += [os.path.join(parent, f) for f in files] setup( - name='twml', - version='2.0', - description="Tensorflow wrapper for twml", - packages=find_packages(exclude=["build"]), - install_requires=[ - 'thriftpy2', - 'numpy', - 'pyyaml', - 'future', - 'scikit-learn', - 'scipy' - ], - package_data={ - 'twml': data_files, - }, + name="twml", + version="2.0", + description="Tensorflow wrapper for twml", + packages=find_packages(exclude=["build"]), + install_requires=[ + "thriftpy2", + "numpy", + "pyyaml", + "future", + "scikit-learn", + "scipy", + ], + package_data={ + "twml": data_files, + }, ) diff --git a/twml/twml/__init__.py b/twml/twml/__init__.py index 0c96df68b..0abfbed35 100644 --- a/twml/twml/__init__.py +++ b/twml/twml/__init__.py @@ -2,60 +2,67 @@ import os +import tensorflow.compat.v1 as tf # noqa: F402 + # Import from twitter.deepbird from twitter.deepbird.logging.log_level import set_logging_level # noqa: F401 from twitter.deepbird.sparse import SparseTensor # noqa: F401 from twitter.deepbird.sparse import sparse_dense_matmul # noqa: F401 -from .util import dynamic_partition, feature_id, limit_bits, limit_sparse_tensor_size # noqa: F401 -from .util import write_file, fixed_length_tensor, setup_tf_logging_formatter # noqa: F401 -from .array import Array # noqa: F401 +from . import constants # noqa: F401 +from . import errors # noqa: F401 +from . import layers # noqa: F401 +from . import lookup # noqa: F401 +from . import readers # noqa: F401 +from . import summary # noqa: F401 +from . 
import tensorboard # noqa: F401 -# Module to parse feature patterns and match them from data_spec.json -from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 +# Custom argparser for Trainer +from .argument_parser import * # noqa: T400 +from .array import Array # noqa: F401 +from .block_format_writer import * # noqa: T400 # Data record streaming, reading, writing, and parsing. from .dataset import * # noqa: T400 -from .readers import * # noqa: T400 -from .block_format_writer import * # noqa: T400 # Graph output functions from .export_output_fns import * # noqa: T400 -# Input parsers -from .parsers import * # noqa: T400 - -# Input functions -from .input_fns import * # noqa: T400 +# Module to parse feature patterns and match them from data_spec.json +from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 # Feature filter functions from .filters import * # noqa: T400 -# Custom argparser for Trainer -from .argument_parser import * # noqa: T400 +# Input functions +from .input_fns import * # noqa: T400 -from . import constants # noqa: F401 -from . import errors # noqa: F401 -from . import layers # noqa: F401 -from . import lookup # noqa: F401 -from . import readers # noqa: F401 -from . import summary # noqa: F401 -from . import tensorboard # noqa: F401 +# Input parsers +from .parsers import * # noqa: T400 +from .readers import * # noqa: T400 +from .util import feature_id # noqa: F401 +from .util import ( + dynamic_partition, + fixed_length_tensor, + limit_bits, + limit_sparse_tensor_size, + setup_tf_logging_formatter, + write_file, +) -import tensorflow.compat.v1 as tf # noqa: F402 tf.disable_eager_execution() # TODO: Figure out a better way to deal with this. -if 'OMP_NUM_THREADS' not in os.environ and 'MKL_NUM_THREADS' not in os.environ: - os.environ["OMP_NUM_THREADS"] = '1' +if "OMP_NUM_THREADS" not in os.environ and "MKL_NUM_THREADS" not in os.environ: + os.environ["OMP_NUM_THREADS"] = "1" # Import all custom C++ ops -from libtwml import add1, partition_sparse_tensor, CLIB # noqa: F401 +from libtwml import CLIB, add1, partition_sparse_tensor # noqa: F401 # Configure logging levels to info for various frameworks -set_logging_level('INFO') +set_logging_level("INFO") from . import contrib # noqa: F401 from . import hooks # noqa: F401 -from . import trainers # noqa: F401 from . import metrics # noqa: F401 +from . import trainers # noqa: F401 diff --git a/twml/twml/argument_parser.py b/twml/twml/argument_parser.py index c771eebdf..29ab45c86 100644 --- a/twml/twml/argument_parser.py +++ b/twml/twml/argument_parser.py @@ -3,559 +3,804 @@ Command-line argument parsing for the Trainer. """ import argparse +import tempfile from argparse import ArgumentError from operator import attrgetter -import tempfile +from typing import List -import twml import tensorflow.compat.v1 as tf +import twml SERIAL = "serial" TREE = "tree" LOG_LEVELS = { - "debug": tf.logging.DEBUG, - "info": tf.logging.INFO, - "warn": tf.logging.WARN, - "error": tf.logging.ERROR} + "debug": tf.logging.DEBUG, + "info": tf.logging.INFO, + "warn": tf.logging.WARN, + "error": tf.logging.ERROR, +} class SortingHelpFormatter(argparse.HelpFormatter): - """ - Used to sort args alphabetically in the help message. 
- """ - - def add_arguments(self, actions): - actions = sorted(actions, key=attrgetter('option_strings')) - super(SortingHelpFormatter, self).add_arguments(actions) - - -def _set_log_level(level=None): - """Sets the tensorflow log level to the input level.""" - if level is None: - return None - level = level.lower() - if level not in LOG_LEVELS.keys(): - raise ValueError(f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}.") - tf.logging.set_verbosity(LOG_LEVELS[level]) - tf.logging.info(f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}") - return level - - -def get_trainer_parser(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - - parser.add_argument( - "--save_dir", type=str, default=tempfile.mkdtemp(), - help="Path to the training result directory." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR ") - parser.add_argument( - "--export_dir", type=str, default=None, - help="Path to the directory to export a SavedModel for prediction servers.") - parser.add_argument( - "--log_aggregation_app_id", type=str, default=None, - help="specify app_id for log aggregation. disabled by default.") - parser.add_argument( - "--train.batch_size", "--train_batch_size", type=int, default=32, - dest='train_batch_size', - help="number of samples per training batch") - parser.add_argument( - "--eval.batch_size", "--eval_batch_size", type=int, default=32, - dest='eval_batch_size', - help="number of samples per cross-validation batch. Defaults to train_batch_size") - parser.add_argument( - "--train.learning_rate", "--learning_rate", type=float, default=0.002, - dest='learning_rate', - help="learning rate. Scales the gradient update.") - parser.add_argument( - "--train.steps", "--train_steps", type=int, default=-1, - dest='train_steps', - help="number of training batches before running evaluation." - "Defaults to -1 (runs through entire dataset). " - "Only used for Trainer.[train,learn]. " - "For Trainer.train_and_evaluate, use train.max_steps instead. ") - parser.add_argument( - "--eval.steps", "--eval_steps", type=int, default=-1, - dest="eval_steps", - help="number of steps per evaluation. Each batch is a step." - "Defaults to -1 (runs through entire dataset). ") - parser.add_argument( - "--eval.period", "--eval_period", type=int, default=600, - dest="eval_period", - help="Trainer.train_and_evaluate waits for this long after each evaluation. " - "Defaults to 600 seconds (evaluate every ten minutes). " - "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " - "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. 
" - "eval.period is time between successive evals.") - parser.add_argument( - "--eval.delay", "--eval_delay", type=int, default=120, - dest="eval_delay", - help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" - "Defaults to 120 seconds (evaluate after first 2 minutes of training). " - "eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--train.max_steps", "--train_max_steps", type=int, default=None, - dest="train_max_steps", - help="Stop training after this many global steps. Each training batch is its own step." - "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." - "If set to a non-positive value, loop forever. Usually useful with early stopping.") - parser.add_argument( - "--train.log_metrics", dest="train_log_metrics", action="store_true", default=False, - help="Set this to true to see metrics during training. " - "WARNING: metrics during training does not represent model performance. " - "WARNING: use for debugging only as this slows down training.") - parser.add_argument( - "--train.early_stop_patience", "--early_stop_patience", type=int, default=-1, - dest="early_stop_patience", - help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." - "Defaults to -1 (no early-stopping)." - "NOTE: This can not be enabled when --distributed is also set.") - parser.add_argument( - "--train.early_stop_tolerance", "--early_stop_tolerance", type=float, default=0, - dest="early_stop_tolerance", - help="a non-negative tolerance for comparing early_stop_metric." - "e.g. when maximizing the condition is current_metric > best_metric + tolerance." - "Defaults to 0.") - parser.add_argument( - "--train.dataset_shards", "--train_dataset_shards", - dest="train_dataset_shards", - type=int, default=None, - help="An int value that indicates the number of partitions (shards) for the dataset. This is" - " useful for codistillation and other techniques that require each worker to train on disjoint" - " partitions of the dataset.") - parser.add_argument( - "--train.dataset_shard_index", "--train_dataset_shard_index", - dest="train_dataset_shard_index", - type=int, default=None, - help="An int value (starting at zero) that indicates which partition (shard) of the dataset" - " to use if --train.dataset_shards is set.") - parser.add_argument( - "--continue_from_checkpoint", dest="continue_from_checkpoint", action="store_true", - help="DEPRECATED. This option is currently a no-op." - " Continuing from the provided checkpoint is now the default." - " Use --overwrite_save_dir if you would like to override it instead" - " and restart training from scratch.") - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--data_threads", "--num_threads", type=int, default=2, - dest="num_threads", - help="Number of threads to use for loading the dataset. " - "num_threads is deprecated and to be removed in future versions. 
Use data_threads.") - parser.add_argument( - "--max_duration", "--max_duration", type=float, default=None, - dest="max_duration", - help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.") - parser.add_argument( - "--num_workers", type=int, default=None, - help="Number of workers to use when training in hogwild manner on a single node.") - parser.add_argument( - "--distributed", dest="distributed", action="store_true", - help="Pass this flag to use train_and_evaluate to train in a distributed fashion" - "NOTE: You can not use early stopping when --distributed is enabled" - ) - parser.add_argument( - "--distributed_training_cleanup", - dest="distributed_training_cleanup", - action="store_true", - help="Set if using distributed training on GKE to stop TwitterSetDeployment" - "from continuing training upon restarts (will be deprecated once we migrate off" - "TwitterSetDeployment for distributed training on GKE)." - ) - parser.add_argument( - "--disable_auto_ps_shutdown", default=False, action="store_true", - help="Disable the functionality of automatically shutting down parameter server after " - "distributed training complete (either succeed or failed)." - ) - parser.add_argument( - "--disable_tensorboard", default=False, action="store_true", - help="Do not start the TensorBoard server." - ) - parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.") - parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." - ) - parser.add_argument( - "--stats_port", type=int, default=None, - help="Port to listen on for stats endpoints" - ) - parser.add_argument( - "--experiment_tracking_path", - dest="experiment_tracking_path", - type=str, default=None, - help="The tracking path of this experiment. Format: \ - user_name:project_name:experiment_name:run_name. The path is used to track and display \ - a record of this experiment on ML Dashboard. Note: this embedded experiment tracking is \ - disabled when the deprecated Model Repo TrackRun is used in your model config. ") - parser.add_argument( - "--disable_experiment_tracking", - dest="disable_experiment_tracking", - action="store_true", - help="Whether experiment tracking should be disabled.") - parser.add_argument( - "--config.save_checkpoints_secs", "--save_checkpoints_secs", type=int, default=600, - dest='save_checkpoints_secs', - help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " - "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.") - parser.add_argument( - "--config.keep_checkpoint_max", "--keep_checkpoint_max", type=int, default=20, - dest='keep_checkpoint_max', - help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " - "Specifies how many checkpoints to keep. Defaults to 20.") - parser.add_argument( - "--config.tf_random_seed", "--tf_random_seed", type=int, default=None, - dest='tf_random_seed', - help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " - "Specifies the seed to use. 
Defaults to None.") - parser.add_argument( - "--optimizer", type=str, default='SGD', - help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.") - parser.add_argument( - "--gradient_noise_scale", type=float, default=None, - help="adds 0-mean normal noise scaled by this value. Defaults to None.") - parser.add_argument( - "--clip_gradients", type=float, default=None, - help="If specified, a global clipping is applied to prevent " - "the norm of the gradient to exceed this value. Defaults to None.") - parser.add_argument( - "--dgc.density", "--dgc_density", type=float, default=0.1, - dest="dgc_density", - help="Specifies gradient density level when using deep gradient compression optimizer." - "E.g., default value being 0.1 means that only top 10%% most significant rows " - "(based on absolute value sums) are kept." - ) - parser.add_argument( - "--dgc.density_decay", "--dgc_density_decay", type=bool, default=True, - dest="dgc_density_decay", - help="Specifies whether to (exponentially) decay the gradient density level when" - " doing gradient compression. If set 'False', the 'density_decay_steps', " - "'density_decay_rate' and 'min_density' arguments will be ignored." - ) - parser.add_argument( - "--dgc.density_decay_steps", "--dgc_density_decay_steps", type=int, default=10000, - dest="dgc_density_decay_steps", - help="Specifies the step interval to perform density decay." - ) - parser.add_argument( - "--dgc.density_decay_rate", "--dgc_density_decay_rate", type=float, default=0.5, - dest="dgc_density_decay_rate", - help="Specifies the decay rate when perfoming density decay." - ) - parser.add_argument( - "--dgc.min_density", "--dgc_min_density", type=float, default=0.1, - dest="dgc_min_density", - help="Specifies the minimum density level when perfoming density decay." - ) - parser.add_argument( - "--dgc.accumulation", "--dgc_accumulation", type=bool, default=False, - dest="dgc_accumulation", - help="Specifies whether to accumulate small gradients when using deep gradient compression " - "optimizer." - ) - parser.add_argument( - "--show_optimizer_summaries", dest="show_optimizer_summaries", action="store_true", - help="When specified, displays gradients and learning rate in tensorboard." - "Turning it on has 10-20%% performance hit. Enable for debugging only") - - parser.add_argument( - "--num_mkl_threads", dest="num_mkl_threads", default=1, type=int, - help="Specifies how many threads to use for MKL" - "inter_op_ parallelism_threds is set to TWML_NUM_CPUS / num_mkl_threads." - "intra_op_parallelism_threads is set to num_mkl_threads.") - - parser.add_argument("--verbosity", type=_set_log_level, choices=LOG_LEVELS.keys(), default=None, - help="Sets log level to a given verbosity.") - - parser.add_argument( - "--feature_importance.algorithm", dest="feature_importance_algorithm", - type=str, default=TREE, choices=[SERIAL, TREE], - help=""" - There are two algorithms that the module supports, `serial` and `tree`. - The `serial` algorithm computes feature importances for each feature, and - the `tree` algorithm groups features by feature name prefix, computes feature - importances for groups of features, and then only 'zooms-in' on a group when the - importance is greater than the `--feature_importance.sensitivity` value. The `tree` algorithm - will usually run faster, but for relatively unimportant features it will only compute an - upper bound rather than an exact importance value. 
We suggest that users generally stick - to the `tree` algorithm, unless if they have a very small number of features or - near-random model performance. - """) - - parser.add_argument( - "--feature_importance.sensitivity", dest="feature_importance_sensitivity", type=float, default=0.03, - help=""" - The maximum amount that permuting a feature group can cause the model performance (determined - by `feature_importance.metric`) to drop before the algorithm decides to not expand the feature - group. This is only used for the `tree` algorithm. - """) - - parser.add_argument( - "--feature_importance.dont_build_tree", dest="dont_build_tree", action="store_true", default=False, - help=""" - If True, don't build the feature trie for the tree algorithm and only use the extra_groups - """) - - parser.add_argument( - "--feature_importance.split_feature_group_on_period", dest="split_feature_group_on_period", action="store_true", default=False, - help="If true, split feature groups by the period rather than the optimal prefix. Only used for the TREE algorithm") - - parser.add_argument( - "--feature_importance.example_count", dest="feature_importance_example_count", type=int, default=10000, - help=""" - The number of examples used to compute feature importance. - Larger values yield more reliable results, but also take longer to compute. - These records are loaded into memory. This number is agnostic to batch size. - """) - - parser.add_argument( - "--feature_importance.data_dir", dest="feature_importance_data_dir", type=str, default=None, - help="Path to the dataset used to compute feature importance." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR " - "Defaults to eval_data_dir") - - parser.add_argument( - "--feature_importance.metric", dest="feature_importance_metric", type=str, default="roc_auc", - help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.") - - parser.add_argument( - "--feature_importance.is_metric_larger_the_better", dest="feature_importance_is_metric_larger_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)") - - parser.add_argument( - "--feature_importance.is_metric_smaller_the_better", dest="feature_importance_is_metric_smaller_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. LOSS)") - - subparsers = parser.add_subparsers(help='Learning Rate Decay Functions. Can only pass 1.' - 'Should be specified after all the optional arguments' - 'and followed by its specific args' - 'e.g. --learning_rate 0.01 inverse_learning_rate_decay_fn' - ' --decay_rate 0.0004 --min_learning_rate 0.001', - dest='learning_rate_decay') - - # Create the parser for the "exponential_learning_rate_decay_fn" - parser_exponential = subparsers.add_parser('exponential_learning_rate_decay', - help='Exponential learning rate decay. 
' - 'Exponential decay implements:' - 'decayed_learning_rate = learning_rate * ' - 'exponential_decay_rate ^ ' - '(global_step / decay_steps') - parser_exponential.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'exponential' learning_rate_decay.") - parser_exponential.add_argument( - "--exponential_decay_rate", type=float, default=None, - help="Required for 'exponential' learning_rate_decay. Must be positive. ") - - # Create the parser for the "polynomial_learning_rate_decay_fn" - parser_polynomial = subparsers.add_parser('polynomial_learning_rate_decay', - help='Polynomial learning rate decay. ' - 'Polynomial decay implements: ' - 'global_step = min(global_step, decay_steps)' - 'decayed_learning_rate = ' - '(learning_rate - end_learning_rate) * ' - '(1 - global_step / decay_steps) ^ ' - '(polynomial_power) + end_learning_rate' - 'So for linear decay you can use a ' - 'polynomial_power=1 (the default)') - parser_polynomial.add_argument( - "--end_learning_rate", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay (ignored otherwise).") - parser_polynomial.add_argument( - "--polynomial_power", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay." - "The power of the polynomial. Defaults to linear, 1.0.") - parser_polynomial.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'polynomial' learning_rate_decay. ") - - # Create the parser for the "piecewise_constant_learning_rate_decay_fn" - parser_piecewise_constant = subparsers.add_parser('piecewise_constant_learning_rate_decay', - help='Piecewise Constant ' - 'learning rate decay. ' - 'For piecewise_constant, ' - 'consider this example: ' - 'We want to use a learning rate ' - 'that is 1.0 for' - 'the first 100000 steps,' - '0.5 for steps 100001 to 110000, ' - 'and 0.1 for any additional steps. ' - 'To do so, specify ' - '--piecewise_constant_boundaries=100000,110000' - '--piecewise_constant_values=1.0,0.5,0.1') - parser_piecewise_constant.add_argument( - "--piecewise_constant_values", - action=parse_comma_separated_list(element_type=float), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated floats or ints that specifies the values " - "for the intervals defined by boundaries. It should have one more " - "element than boundaries.") - parser_piecewise_constant.add_argument( - "--piecewise_constant_boundaries", - action=parse_comma_separated_list(element_type=int), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated integers, with strictly increasing entries.") - - # Create the parser for the "inverse_learning_rate_decay_fn" - parser_inverse = subparsers.add_parser('inverse_learning_rate_decay', - help='Inverse Leaning rate decay. ' - 'Inverse implements:' - 'decayed_lr = max(lr /(1 + decay_rate * ' - 'floor(global_step /decay_step)),' - ' min_learning_rate)' - 'When decay_step=1 this mimics the behaviour' - 'of the default learning rate decay' - 'of DeepBird v1.') - - parser_inverse.add_argument( - "--decay_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay. 
Rate in which we decay the learning rate.") - parser_inverse.add_argument( - "--min_learning_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay.Minimum possible learning_rate.") - parser_inverse.add_argument( - "--decay_steps", type=float, default=1, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_learning_rate_decay_fn" - parser_cosine = subparsers.add_parser('cosine_learning_rate_decay', - help='Cosine Leaning rate decay. ' - 'Cosine implements:' - 'decayed_lr = 0.5 * (1 + cos(pi *\ - global_step / decay_steps)) * lr' - ) - - parser_cosine.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number.\ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine.add_argument( - "--decay_steps", type=float, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_restart_learning_rate_decay_fn" - parser_cosine_restart = subparsers.add_parser('cosine_restarts_learning_rate_decay', - help='Applies cosine decay with restarts \ - to the learning rate' - 'See [Loshchilov & Hutter, ICLR2016],\ - SGDR: Stochastic' - 'Gradient Descent with Warm Restarts.' - 'https://arxiv.org/abs/1608.03983' - ) - parser_cosine_restart.add_argument( - "--first_decay_steps", type=float, - help="Required for 'cosine_restart' learning_rate_decay.") - parser_cosine_restart.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number. \ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine_restart.add_argument( - "--t_mul", type=float, default=2, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the number of iterations in the i-th period") - parser_cosine_restart.add_argument( - "--m_mul", type=float, default=1, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the initial learning rate of the i-th period.") - - # Create dummy parser for None, which is the default. - parser_default = subparsers.add_parser( - 'no_learning_rate_decay', - help='No learning rate decay') # noqa: F841 - - parser.set_default_subparser('no_learning_rate_decay') - - return parser + """ + Used to sort args alphabetically in the help message. + """ + def add_arguments(self, actions: argparse.Action) -> None: + actions = sorted(actions, key=attrgetter("option_strings")) + super(SortingHelpFormatter, self).add_arguments(actions) -class DefaultSubcommandArgParse(argparse.ArgumentParser): - """ - Subclass of argparse.ArgumentParser that sets default parser - """ - _DEFAULT_SUBPARSER = None - def set_default_subparser(self, name): +def _set_log_level(level: str = None) -> str: + """Sets the tensorflow log level to the input level.""" + if level is None: + return None + level = level.lower() + if level not in LOG_LEVELS.keys(): + raise ValueError( + f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}." + ) + tf.logging.set_verbosity(LOG_LEVELS[level]) + tf.logging.info( + f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}" + ) + return level + + +def get_trainer_parser() -> argparse.ArgumentParser: """ - sets the default subparser + Add common commandline args to parse for the Trainer class. + Typically, the user calls this function and then parses cmd-line arguments + into an argparse.Namespace object which is then passed to the Trainer constructor + via the params argument. 
+ + See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ + for a list and description of all cmd-line arguments. + + Args: + learning_rate_decay: Defaults to False. When True, parses learning rate decay arguments. + + Returns: + argparse.ArgumentParser instance with some useful args already added. """ - self._DEFAULT_SUBPARSER = name + # define the parser + parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) + + parser.add_argument( + "--save_dir", + type=str, + default=tempfile.mkdtemp(), + help="Path to the training result directory." + "supports local filesystem path and hdfs://default/ which requires " + "setting HDFS configuration via env variable HADOOP_CONF_DIR ", + ) + parser.add_argument( + "--export_dir", + type=str, + default=None, + help="Path to the directory to export a SavedModel for prediction servers.", + ) + parser.add_argument( + "--log_aggregation_app_id", + type=str, + default=None, + help="specify app_id for log aggregation. disabled by default.", + ) + parser.add_argument( + "--train.batch_size", + "--train_batch_size", + type=int, + default=32, + dest="train_batch_size", + help="number of samples per training batch", + ) + parser.add_argument( + "--eval.batch_size", + "--eval_batch_size", + type=int, + default=32, + dest="eval_batch_size", + help="number of samples per cross-validation batch. Defaults to train_batch_size", + ) + parser.add_argument( + "--train.learning_rate", + "--learning_rate", + type=float, + default=0.002, + dest="learning_rate", + help="learning rate. Scales the gradient update.", + ) + parser.add_argument( + "--train.steps", + "--train_steps", + type=int, + default=-1, + dest="train_steps", + help="number of training batches before running evaluation." + "Defaults to -1 (runs through entire dataset). " + "Only used for Trainer.[train,learn]. " + "For Trainer.train_and_evaluate, use train.max_steps instead. ", + ) + parser.add_argument( + "--eval.steps", + "--eval_steps", + type=int, + default=-1, + dest="eval_steps", + help="number of steps per evaluation. Each batch is a step." + "Defaults to -1 (runs through entire dataset). ", + ) + parser.add_argument( + "--eval.period", + "--eval_period", + type=int, + default=600, + dest="eval_period", + help="Trainer.train_and_evaluate waits for this long after each evaluation. " + "Defaults to 600 seconds (evaluate every ten minutes). " + "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " + "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. " + "eval.period is time between successive evals.", + ) + parser.add_argument( + "--eval.delay", + "--eval_delay", + type=int, + default=120, + dest="eval_delay", + help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" + "Defaults to 120 seconds (evaluate after first 2 minutes of training). " + "eval.delay is time to wait before doing first eval. " + "eval.period is time between successive evals.", + ) + parser.add_argument( + "--train.max_steps", + "--train_max_steps", + type=int, + default=None, + dest="train_max_steps", + help="Stop training after this many global steps. Each training batch is its own step." + "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." + "If set to a non-positive value, loop forever. 
Usually useful with early stopping.", + ) + parser.add_argument( + "--train.log_metrics", + dest="train_log_metrics", + action="store_true", + default=False, + help="Set this to true to see metrics during training. " + "WARNING: metrics during training does not represent model performance. " + "WARNING: use for debugging only as this slows down training.", + ) + parser.add_argument( + "--train.early_stop_patience", + "--early_stop_patience", + type=int, + default=-1, + dest="early_stop_patience", + help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." + "Defaults to -1 (no early-stopping)." + "NOTE: This can not be enabled when --distributed is also set.", + ) + parser.add_argument( + "--train.early_stop_tolerance", + "--early_stop_tolerance", + type=float, + default=0, + dest="early_stop_tolerance", + help="a non-negative tolerance for comparing early_stop_metric." + "e.g. when maximizing the condition is current_metric > best_metric + tolerance." + "Defaults to 0.", + ) + parser.add_argument( + "--train.dataset_shards", + "--train_dataset_shards", + dest="train_dataset_shards", + type=int, + default=None, + help="An int value that indicates the number of partitions (shards) for the dataset. This is" + " useful for codistillation and other techniques that require each worker to train on disjoint" + " partitions of the dataset.", + ) + parser.add_argument( + "--train.dataset_shard_index", + "--train_dataset_shard_index", + dest="train_dataset_shard_index", + type=int, + default=None, + help="An int value (starting at zero) that indicates which partition (shard) of the dataset" + " to use if --train.dataset_shards is set.", + ) + parser.add_argument( + "--continue_from_checkpoint", + dest="continue_from_checkpoint", + action="store_true", + help="DEPRECATED. This option is currently a no-op." + " Continuing from the provided checkpoint is now the default." + " Use --overwrite_save_dir if you would like to override it instead" + " and restart training from scratch.", + ) + parser.add_argument( + "--overwrite_save_dir", + dest="overwrite_save_dir", + action="store_true", + help="Delete the contents of the current save_dir if it exists", + ) + parser.add_argument( + "--data_threads", + "--num_threads", + type=int, + default=2, + dest="num_threads", + help="Number of threads to use for loading the dataset. " + "num_threads is deprecated and to be removed in future versions. 
Use data_threads.", + ) + parser.add_argument( + "--max_duration", + "--max_duration", + type=float, + default=None, + dest="max_duration", + help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.", + ) + parser.add_argument( + "--num_workers", + type=int, + default=None, + help="Number of workers to use when training in hogwild manner on a single node.", + ) + parser.add_argument( + "--distributed", + dest="distributed", + action="store_true", + help="Pass this flag to use train_and_evaluate to train in a distributed fashion" + "NOTE: You can not use early stopping when --distributed is enabled", + ) + parser.add_argument( + "--distributed_training_cleanup", + dest="distributed_training_cleanup", + action="store_true", + help="Set if using distributed training on GKE to stop TwitterSetDeployment" + "from continuing training upon restarts (will be deprecated once we migrate off" + "TwitterSetDeployment for distributed training on GKE).", + ) + parser.add_argument( + "--disable_auto_ps_shutdown", + default=False, + action="store_true", + help="Disable the functionality of automatically shutting down parameter server after " + "distributed training complete (either succeed or failed).", + ) + parser.add_argument( + "--disable_tensorboard", + default=False, + action="store_true", + help="Do not start the TensorBoard server.", + ) + parser.add_argument( + "--tensorboard_port", + type=int, + default=None, + help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.", + ) + parser.add_argument( + "--health_port", + type=int, + default=None, + help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." + "Not user-facing as it is set automatically by the twml_cli.", + ) + parser.add_argument( + "--stats_port", + type=int, + default=None, + help="Port to listen on for stats endpoints", + ) + parser.add_argument( + "--experiment_tracking_path", + dest="experiment_tracking_path", + type=str, + default=None, + help="The tracking path of this experiment. Format: \ + user_name:project_name:experiment_name:run_name. The path is used to track and display \ + a record of this experiment on ML Dashboard. Note: this embedded experiment tracking is \ + disabled when the deprecated Model Repo TrackRun is used in your model config. ", + ) + parser.add_argument( + "--disable_experiment_tracking", + dest="disable_experiment_tracking", + action="store_true", + help="Whether experiment tracking should be disabled.", + ) + parser.add_argument( + "--config.save_checkpoints_secs", + "--save_checkpoints_secs", + type=int, + default=600, + dest="save_checkpoints_secs", + help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " + "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.", + ) + parser.add_argument( + "--config.keep_checkpoint_max", + "--keep_checkpoint_max", + type=int, + default=20, + dest="keep_checkpoint_max", + help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " + "Specifies how many checkpoints to keep. Defaults to 20.", + ) + parser.add_argument( + "--config.tf_random_seed", + "--tf_random_seed", + type=int, + default=None, + dest="tf_random_seed", + help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " + "Specifies the seed to use. 
Defaults to None.", + ) + parser.add_argument( + "--optimizer", + type=str, + default="SGD", + help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.", + ) + parser.add_argument( + "--gradient_noise_scale", + type=float, + default=None, + help="adds 0-mean normal noise scaled by this value. Defaults to None.", + ) + parser.add_argument( + "--clip_gradients", + type=float, + default=None, + help="If specified, a global clipping is applied to prevent " + "the norm of the gradient to exceed this value. Defaults to None.", + ) + parser.add_argument( + "--dgc.density", + "--dgc_density", + type=float, + default=0.1, + dest="dgc_density", + help="Specifies gradient density level when using deep gradient compression optimizer." + "E.g., default value being 0.1 means that only top 10%% most significant rows " + "(based on absolute value sums) are kept.", + ) + parser.add_argument( + "--dgc.density_decay", + "--dgc_density_decay", + type=bool, + default=True, + dest="dgc_density_decay", + help="Specifies whether to (exponentially) decay the gradient density level when" + " doing gradient compression. If set 'False', the 'density_decay_steps', " + "'density_decay_rate' and 'min_density' arguments will be ignored.", + ) + parser.add_argument( + "--dgc.density_decay_steps", + "--dgc_density_decay_steps", + type=int, + default=10000, + dest="dgc_density_decay_steps", + help="Specifies the step interval to perform density decay.", + ) + parser.add_argument( + "--dgc.density_decay_rate", + "--dgc_density_decay_rate", + type=float, + default=0.5, + dest="dgc_density_decay_rate", + help="Specifies the decay rate when perfoming density decay.", + ) + parser.add_argument( + "--dgc.min_density", + "--dgc_min_density", + type=float, + default=0.1, + dest="dgc_min_density", + help="Specifies the minimum density level when perfoming density decay.", + ) + parser.add_argument( + "--dgc.accumulation", + "--dgc_accumulation", + type=bool, + default=False, + dest="dgc_accumulation", + help="Specifies whether to accumulate small gradients when using deep gradient compression " + "optimizer.", + ) + parser.add_argument( + "--show_optimizer_summaries", + dest="show_optimizer_summaries", + action="store_true", + help="When specified, displays gradients and learning rate in tensorboard." + "Turning it on has 10-20%% performance hit. Enable for debugging only", + ) + + parser.add_argument( + "--num_mkl_threads", + dest="num_mkl_threads", + default=1, + type=int, + help="Specifies how many threads to use for MKL" + "inter_op_ parallelism_threds is set to TWML_NUM_CPUS / num_mkl_threads." + "intra_op_parallelism_threads is set to num_mkl_threads.", + ) - def _parse_known_args(self, arg_strings, *args, **kwargs): + parser.add_argument( + "--verbosity", + type=_set_log_level, + choices=LOG_LEVELS.keys(), + default=None, + help="Sets log level to a given verbosity.", + ) + + parser.add_argument( + "--feature_importance.algorithm", + dest="feature_importance_algorithm", + type=str, + default=TREE, + choices=[SERIAL, TREE], + help=""" + There are two algorithms that the module supports, `serial` and `tree`. + The `serial` algorithm computes feature importance for each feature, and + the `tree` algorithm groups features by feature name prefix, computes feature + importance for groups of features, and then only 'zooms-in' on a group when the + importance is greater than the `--feature_importance.sensitivity` value. 
The `tree` algorithm
+ will usually run faster, but for relatively unimportant features it will only compute an
+ upper bound rather than an exact importance value. We suggest that users generally stick
+ to the `tree` algorithm, unless they have a very small number of features or
+ near-random model performance.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.sensitivity",
+ dest="feature_importance_sensitivity",
+ type=float,
+ default=0.03,
+ help="""
+ The maximum amount that permuting a feature group can cause the model performance (determined
+ by `feature_importance.metric`) to drop before the algorithm decides to not expand the feature
+ group. This is only used for the `tree` algorithm.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.dont_build_tree",
+ dest="dont_build_tree",
+ action="store_true",
+ default=False,
+ help="""
+ If True, don't build the feature trie for the tree algorithm and only use the extra_groups.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.split_feature_group_on_period",
+ dest="split_feature_group_on_period",
+ action="store_true",
+ default=False,
+ help="If true, split feature groups by the period rather than the optimal prefix. Only used for the TREE algorithm.",
+ )
+
+ parser.add_argument(
+ "--feature_importance.example_count",
+ dest="feature_importance_example_count",
+ type=int,
+ default=10000,
+ help="""
+ The number of examples used to compute feature importance.
+ Larger values yield more reliable results, but also take longer to compute.
+ These records are loaded into memory. This number is agnostic to batch size.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.data_dir",
+ dest="feature_importance_data_dir",
+ type=str,
+ default=None,
+ help="Path to the dataset used to compute feature importance. "
+ "Supports local filesystem paths and hdfs://default/, which requires "
+ "setting HDFS configuration via the env variable HADOOP_CONF_DIR. "
+ "Defaults to eval_data_dir.",
+ )
+
+ parser.add_argument(
+ "--feature_importance.metric",
+ dest="feature_importance_metric",
+ type=str,
+ default="roc_auc",
+ help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.",
+ )
+
+ parser.add_argument(
+ "--feature_importance.is_metric_larger_the_better",
+ dest="feature_importance_is_metric_larger_the_better",
+ action="store_true",
+ default=False,
+ help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)",
+ )
+
+ parser.add_argument(
+ "--feature_importance.is_metric_smaller_the_better",
+ dest="feature_importance_is_metric_smaller_the_better",
+ action="store_true",
+ default=False,
+ help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. LOSS)",
+ )
+
+ subparsers = parser.add_subparsers(
+ help="Learning Rate Decay Functions. Only one can be passed. "
+ "It should be specified after all the optional arguments "
+ "and followed by its specific args, "
+ "e.g. --learning_rate 0.01 inverse_learning_rate_decay"
+ " --decay_rate 0.0004 --min_learning_rate 0.001",
+ dest="learning_rate_decay",
+ )
+
+ # Create the parser for the "exponential_learning_rate_decay_fn"
+ parser_exponential = subparsers.add_parser(
+ "exponential_learning_rate_decay",
+ help="Exponential learning rate decay. 
" + "Exponential decay implements:" + "decayed_learning_rate = learning_rate * " + "exponential_decay_rate ^ " + "(global_step / decay_steps", + ) + parser_exponential.add_argument( + "--decay_steps", + type=float, + default=None, + help="Required for 'exponential' learning_rate_decay.", + ) + parser_exponential.add_argument( + "--exponential_decay_rate", + type=float, + default=None, + help="Required for 'exponential' learning_rate_decay. Must be positive. ", + ) + + # Create the parser for the "polynomial_learning_rate_decay_fn" + parser_polynomial = subparsers.add_parser( + "polynomial_learning_rate_decay", + help="Polynomial learning rate decay. " + "Polynomial decay implements: " + "global_step = min(global_step, decay_steps)" + "decayed_learning_rate = " + "(learning_rate - end_learning_rate) * " + "(1 - global_step / decay_steps) ^ " + "(polynomial_power) + end_learning_rate" + "So for linear decay you can use a " + "polynomial_power=1 (the default)", + ) + parser_polynomial.add_argument( + "--end_learning_rate", + type=float, + default=0.0001, + help="Required for 'polynomial' learning_rate_decay (ignored otherwise).", + ) + parser_polynomial.add_argument( + "--polynomial_power", + type=float, + default=0.0001, + help="Required for 'polynomial' learning_rate_decay." + "The power of the polynomial. Defaults to linear, 1.0.", + ) + parser_polynomial.add_argument( + "--decay_steps", + type=float, + default=None, + help="Required for 'polynomial' learning_rate_decay. ", + ) + + # Create the parser for the "piecewise_constant_learning_rate_decay_fn" + parser_piecewise_constant = subparsers.add_parser( + "piecewise_constant_learning_rate_decay", + help="Piecewise Constant " + "learning rate decay. " + "For piecewise_constant, " + "consider this example: " + "We want to use a learning rate " + "that is 1.0 for" + "the first 100000 steps," + "0.5 for steps 100001 to 110000, " + "and 0.1 for any additional steps. " + "To do so, specify " + "--piecewise_constant_boundaries=100000,110000" + "--piecewise_constant_values=1.0,0.5,0.1", + ) + parser_piecewise_constant.add_argument( + "--piecewise_constant_values", + action=parse_comma_separated_list(element_type=float), + default=None, + help="Required for 'piecewise_constant_values' learning_rate_decay. " + "A list of comma seperated floats or ints that specifies the values " + "for the intervals defined by boundaries. It should have one more " + "element than boundaries.", + ) + parser_piecewise_constant.add_argument( + "--piecewise_constant_boundaries", + action=parse_comma_separated_list(element_type=int), + default=None, + help="Required for 'piecewise_constant_values' learning_rate_decay. " + "A list of comma seperated integers, with strictly increasing entries.", + ) + + # Create the parser for the "inverse_learning_rate_decay_fn" + parser_inverse = subparsers.add_parser( + "inverse_learning_rate_decay", + help="Inverse Leaning rate decay. " + "Inverse implements:" + "decayed_lr = max(lr /(1 + decay_rate * " + "floor(global_step /decay_step))," + " min_learning_rate)" + "When decay_step=1 this mimics the behaviour" + "of the default learning rate decay" + "of DeepBird v1.", + ) + + parser_inverse.add_argument( + "--decay_rate", + type=float, + default=None, + help="Required for 'inverse' learning_rate_decay. 
Rate at which we decay the learning rate.",
+ )
+ parser_inverse.add_argument(
+ "--min_learning_rate",
+ type=float,
+ default=None,
+ help="Required for 'inverse' learning_rate_decay. Minimum possible learning_rate.",
+ )
+ parser_inverse.add_argument(
+ "--decay_steps",
+ type=float,
+ default=1,
+ help="Required for 'inverse' learning_rate_decay.",
+ )
+
+ # Create the parser for the "cosine_learning_rate_decay_fn"
+ parser_cosine = subparsers.add_parser(
+ "cosine_learning_rate_decay",
+ help="Cosine learning rate decay. "
+ "Cosine implements: "
+ "decayed_lr = 0.5 * (1 + cos(pi *\
+ global_step / decay_steps)) * lr",
+ )
+
+ parser_cosine.add_argument(
+ "--alpha",
+ type=float,
+ default=0,
+ help="A scalar float32 or float64 Tensor or a Python number.\
+ Minimum learning rate value as a fraction of learning_rate.",
+ )
+ parser_cosine.add_argument(
+ "--decay_steps", type=float, help="Required for 'cosine' learning_rate_decay."
+ )
+
+ # Create the parser for the "cosine_restart_learning_rate_decay_fn"
+ parser_cosine_restart = subparsers.add_parser(
+ "cosine_restarts_learning_rate_decay",
+ help="Applies cosine decay with restarts \
+ to the learning rate. "
+ "See [Loshchilov & Hutter, ICLR2016],\
+ SGDR: Stochastic "
+ "Gradient Descent with Warm Restarts. "
+ "https://arxiv.org/abs/1608.03983",
+ )
+ parser_cosine_restart.add_argument(
+ "--first_decay_steps",
+ type=float,
+ help="Required for 'cosine_restarts' learning_rate_decay.",
+ )
+ parser_cosine_restart.add_argument(
+ "--alpha",
+ type=float,
+ default=0,
+ help="A scalar float32 or float64 Tensor or a Python number. \
+ Minimum learning rate value as a fraction of learning_rate.",
+ )
+ parser_cosine_restart.add_argument(
+ "--t_mul",
+ type=float,
+ default=2,
+ help="A scalar float32 or float64 Tensor or a Python number. \
+ Used to derive the number of iterations in the i-th period.",
+ )
+ parser_cosine_restart.add_argument(
+ "--m_mul",
+ type=float,
+ default=1,
+ help="A scalar float32 or float64 Tensor or a Python number. \
+ Used to derive the initial learning rate of the i-th period.",
+ )
+
+ # Create dummy parser for None, which is the default. 
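(For orientation, a minimal, self-contained sketch of how these decay subcommands are invoked; the parser here is an illustrative stand-alone example, not the real twml entry point.)

import argparse

demo = argparse.ArgumentParser()
demo.add_argument("--learning_rate", type=float, default=0.01)
decay = demo.add_subparsers(dest="learning_rate_decay")
inverse = decay.add_parser("inverse_learning_rate_decay")
inverse.add_argument("--decay_rate", type=float, default=None)
inverse.add_argument("--min_learning_rate", type=float, default=None)

# Decay-specific flags must follow the subcommand name, as the subparsers help above explains:
args = demo.parse_args(
    ["--learning_rate", "0.01", "inverse_learning_rate_decay",
     "--decay_rate", "0.0004", "--min_learning_rate", "0.001"]
)
assert args.learning_rate_decay == "inverse_learning_rate_decay"
assert args.min_learning_rate == 0.001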
+ parser_default = subparsers.add_parser(
+ "no_learning_rate_decay", help="No learning rate decay"
+ ) # noqa: F841
+
+ parser.set_default_subparser("no_learning_rate_decay")
+
+ return parser
+
+
+class DefaultSubcommandArgParse(argparse.ArgumentParser):
 """
- Overwrites _parse_known_args
+ Subclass of argparse.ArgumentParser that supports a default subparser
 """
- in_args = set(arg_strings)
- d_sp = self._DEFAULT_SUBPARSER
- if d_sp is not None and not {'-h', '--help'}.intersection(in_args):
- for x_val in self._subparsers._actions:
- subparser_found = (
- isinstance(x_val, argparse._SubParsersAction) and
- in_args.intersection(x_val._name_parser_map.keys())
+
+ _DEFAULT_SUBPARSER = None
+
+ def set_default_subparser(self, name: str) -> None:
+ """
+ Sets the default subparser.
+ """
+ self._DEFAULT_SUBPARSER = name
+
+ def _parse_known_args(
+ self, arg_strings: List[str], *args, **kwargs
+ ):
+ """
+ Overwrites _parse_known_args
+ """
+ in_args = set(arg_strings)
+ d_sp = self._DEFAULT_SUBPARSER
+ if d_sp is not None and not {"-h", "--help"}.intersection(in_args):
+ for x_val in self._subparsers._actions:
+ subparser_found = isinstance(
+ x_val, argparse._SubParsersAction
+ ) and in_args.intersection(x_val._name_parser_map.keys())
+ if subparser_found:
+ break
+ else:
+ # insert default in first position, this implies no
+ # global options without a sub_parsers specified
+ arg_strings = arg_strings + [d_sp]
+ return super(DefaultSubcommandArgParse, self)._parse_known_args(
+ arg_strings, *args, **kwargs
 )
- if subparser_found:
- break
- else:
- # insert default in first position, this implies no
- # global options without a sub_parsers specified
- arg_strings = arg_strings + [d_sp]
- return super(DefaultSubcommandArgParse, self)._parse_known_args(
- arg_strings, *args, **kwargs
- )
-
- def _check_value(self, action, value):
- try:
- super(DefaultSubcommandArgParse, self)._check_value(
- action, value
- )
- except ArgumentError as error:
- error.message += ("\nERROR: Deepbird is trying to interpret \"{}\" as a value of {}. If this is not what you expected, "
- "then most likely one of the following two things are happening: Either one of your cli arguments are not recognized, "
- "probably {} or whichever argument you are passing {} as a value to OR you are passing in an argument after "
- "the `learning_rate_decay` argument.\n").format(value, action.dest, value, value)
- raise error
-
-
-def parse_comma_separated_list(element_type=str):
- """
- Generates an argparse.Action that converts a string representing a comma separated list to a
- list and converts each element to a specified type.
- """
-
- # pylint: disable-msg=too-few-public-methods
- class _ParseCommaSeparatedList(argparse.Action):
+
+ def _check_value(self, action: argparse.Action, value: str) -> None:
+ try:
+ super(DefaultSubcommandArgParse, self)._check_value(action, value)
+ except ArgumentError as error:
+ error.message += (
+ '\nERROR: Deepbird is trying to interpret "{}" as a value of {}. 
If this is not what you expected, '
+ "then most likely one of the following two things is happening: either one of your cli arguments is not recognized, "
+ "probably {} or whichever argument you are passing {} as a value to, OR you are passing in an argument after "
+ "the `learning_rate_decay` argument.\n"
+ ).format(value, action.dest, value, value)
+ raise error
+
+
+def parse_comma_separated_list(element_type=str) -> argparse.Action:
 """
- Converts a string representing a comma separated list to a list and converts each element to a
- specified type.
+ Generates an argparse.Action that converts a string representing a comma separated list to a
+ list and converts each element to a specified type.
 """
- def __call__(self, parser, namespace, values, option_string=None):
- if values is not None:
- values = [element_type(v) for v in values.split(',')]
- setattr(namespace, self.dest, values)
-
- return _ParseCommaSeparatedList
+ # pylint: disable-msg=too-few-public-methods
+ class _ParseCommaSeparatedList(argparse.Action):
+ """
+ Converts a string representing a comma separated list to a list and converts each element to a
+ specified type.
+ """
+
+ def __call__(
+ self,
+ parser: argparse.ArgumentParser,
+ namespace: argparse.Namespace,
+ values: str,
+ option_string: str = None,
+ ) -> None: # pylint: disable=unused-argument
+ if values is not None:
+ values = [element_type(v) for v in values.split(",")]
+ setattr(namespace, self.dest, values)
+
+ return _ParseCommaSeparatedList
diff --git a/twml/twml/array.py b/twml/twml/array.py
index a8524a06d..3b7adea93 100644
--- a/twml/twml/array.py
+++ b/twml/twml/array.py
@@ -2,100 +2,102 @@
 import ctypes as ct
+import numpy as np
 from absl import logging
 from libtwml import CLIB
-import numpy as np
-
 _NP_TO_TWML_TYPE = {
- 'float32': ct.c_int(1),
- 'float64': ct.c_int(2),
- 'int32': ct.c_int(3),
- 'int64': ct.c_int(4),
- 'int8': ct.c_int(5),
- 'uint8': ct.c_int(6),
+ "float32": ct.c_int(1),
+ "float64": ct.c_int(2),
+ "int32": ct.c_int(3),
+ "int64": ct.c_int(4),
+ "int8": ct.c_int(5),
+ "uint8": ct.c_int(6),
 }
 class Array(object):
- """
- Wrapper class to allow numpy arrays to work with twml functions.
- """
-
- def __init__(self, array):
- """
- Wraps numpy array and creates a handle that can be passed to C functions from libtwml.
-
- array: Numpy array
- """
- if not isinstance(array, np.ndarray):
- raise TypeError("Input must be a numpy array")
-
- try:
- ttype = _NP_TO_TWML_TYPE[array.dtype.name]
- except KeyError as err:
- logging.error("Unsupported numpy type")
- raise err
-
- handle = ct.c_void_p(0)
- ndim = ct.c_int(array.ndim)
- dims = array.ctypes.get_shape()
- isize = array.dtype.itemsize
-
- strides_t = ct.c_size_t * array.ndim
- strides = strides_t(*[n // isize for n in array.strides])
-
- err = CLIB.twml_tensor_create(ct.pointer(handle),
- array.ctypes.get_as_parameter(),
- ndim, dims, strides, ttype)
-
- if err != 1000:
- raise RuntimeError("Error from libtwml")
-
- # Store the numpy array to ensure it isn't deleted before self
- self._array = array
-
- self._handle = handle
-
- self._type = ttype
-
- @property
- def handle(self):
- """
- Return the twml handle
- """
- return self._handle
-
- @property
- def shape(self):
 """
- Return the shape
+ Wrapper class to allow numpy arrays to work with twml functions. 
""" - return self._array.shape - @property - def ndim(self): - """ - Return the shape - """ - return self._array.ndim - - @property - def array(self): - """ - Return the numpy array - """ - return self._array - - @property - def dtype(self): - """ - Return numpy dtype - """ - return self._array.dtype - - def __del__(self): - """ - Delete the handle - """ - CLIB.twml_tensor_delete(self._handle) + def __init__(self, array: np.ndarray): + """ + Wraps numpy array and creates a handle that can be passed to C functions from libtwml. + + array: Numpy array + """ + if not isinstance(array, np.ndarray): + raise TypeError("Input must be a numpy array") + + try: + ttype = _NP_TO_TWML_TYPE[array.dtype.name] + except KeyError as err: + logging.error("Unsupported numpy type") + raise err + + handle = ct.c_void_p(0) + ndim = ct.c_int(array.ndim) + dims = array.ctypes.get_shape() + isize = array.dtype.itemsize + + strides_t = ct.c_size_t * array.ndim + strides = strides_t(*[n // isize for n in array.strides]) + + err = CLIB.twml_tensor_create( + ct.pointer(handle), + array.ctypes.get_as_parameter(), + ndim, + dims, + strides, + ttype, + ) + + if err != 1000: + raise RuntimeError("Error from libtwml") + + # Store the numpy array to ensure it isn't deleted before self + self._array = array + self._handle = handle + self._type = ttype + + @property + def handle(self) -> ct.c_void_p: + """ + Return the twml handle + """ + return self._handle + + @property + def shape(self) -> tuple: + """ + Return the shape + """ + return self._array.shape + + @property + def ndim(self) -> int: + """ + Return the shape + """ + return self._array.ndim + + @property + def array(self) -> np.ndarray: + """ + Return the numpy array + """ + return self._array + + @property + def dtype(self) -> np.dtype: + """ + Return numpy dtype + """ + return self._array.dtype + + def __del__(self) -> None: + """ + Delete the handle + """ + CLIB.twml_tensor_delete(self._handle) diff --git a/twml/twml/block_format_writer.py b/twml/twml/block_format_writer.py index 9c4a9b6a8..8132b456c 100644 --- a/twml/twml/block_format_writer.py +++ b/twml/twml/block_format_writer.py @@ -5,61 +5,61 @@ class BlockFormatWriter(object): - """ - Class to write block format file. - """ + """ + Class to write block format file. 
+ """ - def __init__(self, file_name, records_per_block=100): - file_name = file_name - if not isinstance(file_name, str): - raise ValueError("file_name has to be of type str") + def __init__(self, file_name: str, records_per_block: int = 100): + file_name = file_name + if not isinstance(file_name, str): + raise ValueError("file_name has to be of type str") - self.file_name = ct.c_char_p(file_name.encode()) - self.records_per_block = ct.c_int(int(records_per_block)) - handle = ct.c_void_p(0) - err = CLIB.block_format_writer_create(ct.pointer(handle), - self.file_name, - self.records_per_block) - self._handle = None - # 1000 means TWML_ERR_NONE - if err != 1000: - raise RuntimeError("Error from libtwml") - self._handle = handle + self.file_name = ct.c_char_p(file_name.encode()) + self.records_per_block = ct.c_int(int(records_per_block)) + handle = ct.c_void_p(0) + err = CLIB.block_format_writer_create( + ct.pointer(handle), self.file_name, self.records_per_block + ) + self._handle = None + # 1000 means TWML_ERR_NONE + if err != 1000: + raise RuntimeError("Error from libtwml") + self._handle = handle - @property - def handle(self): - """ - Return the handle - """ - return self._handle + @property + def handle(self) -> ct.c_void_p: + """ + Return the handle + """ + return self._handle - def write(self, class_name, record): - """ - Write a record. + def write(self, class_name: str, record: bytes) -> None: + """ + Write a record. - Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. - """ - if not isinstance(class_name, str): - raise ValueError("class_name has to be of type str") + Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. + """ + if not isinstance(class_name, str): + raise ValueError("class_name has to be of type str") - record_len = len(record) - class_name = ct.c_char_p(class_name.encode()) - record = ct.c_char_p(record) - err = CLIB.block_format_write(self._handle, class_name, record, record_len) - if err != 1000: - raise RuntimeError("Error from libtwml") + record_len = len(record) + class_name = ct.c_char_p(class_name.encode()) + record = ct.c_char_p(record) + err = CLIB.block_format_write(self._handle, class_name, record, record_len) + if err != 1000: + raise RuntimeError("Error from libtwml") - def flush(self): - """ - Flush records in buffer to outputfile. - """ - err = CLIB.block_format_flush(self._handle) - if err != 1000: - raise RuntimeError("Error from libtwml") + def flush(self) -> None: + """ + Flush records in buffer to outputfile. 
+ """ + err = CLIB.block_format_flush(self._handle) + if err != 1000: + raise RuntimeError("Error from libtwml") - def __del__(self): - """ - Delete the handle - """ - if self._handle: - CLIB.block_format_writer_delete(self._handle) + def __del__(self) -> None: + """ + Delete the handle + """ + if self._handle: + CLIB.block_format_writer_delete(self._handle) diff --git a/twml/twml/constants.py b/twml/twml/constants.py index c6c726eed..8d71c4210 100644 --- a/twml/twml/constants.py +++ b/twml/twml/constants.py @@ -1,11 +1,11 @@ # These should coincide with 'enum class DecodeMode' values in HashedDataRecordReader.h +from twitter.deepbird.io.legacy.constants import DECODE_MODES # noqa: F401 +from twitter.deepbird.io.legacy.constants import DEFAULT_DECODE_MODE # noqa: F401 +from twitter.deepbird.io.legacy.constants import DEFAULT_ZOOKEEPER_HOST # noqa: F401 +from twitter.deepbird.io.legacy.constants import HASH_FNAME_AND_VALNAME # noqa: F401 +from twitter.deepbird.io.legacy.constants import HASH_VALNAME # noqa: F401 +from twitter.deepbird.io.legacy.constants import HashingDiscretizerOptions # noqa: F401 from twitter.deepbird.io.legacy.constants import ( - DECODE_MODES, # noqa: F401 - DEFAULT_DECODE_MODE, # noqa: F401 - HASH_FNAME_AND_VALNAME, # noqa: F401 - HASH_VALNAME, # noqa: F401 - HashingDiscretizerOptions, # noqa: F401 - DEFAULT_ZOOKEEPER_BASE_ZNODE, # noqa: F401 - DEFAULT_ZOOKEEPER_HOST, # noqa: F401 -) + DEFAULT_ZOOKEEPER_BASE_ZNODE, +) # noqa: F401 diff --git a/twml/twml/contrib/__init__.py b/twml/twml/contrib/__init__.py index 1a5e8efe4..2860971b6 100644 --- a/twml/twml/contrib/__init__.py +++ b/twml/twml/contrib/__init__.py @@ -1,21 +1,21 @@ # pylint: disable=wildcard-import """ experimental and contributed modules """ -from . import layers # noqa: F401 -from . import feature_importances # noqa: F401 -from . import calibrators # noqa: F401 -from . import readers # noqa: F401 -from . import utils # noqa: F401 -from . import build_graphs_fns # noqa: F401 -from . import feature_config # noqa: F401 -from . import parsers # noqa: F401 -from . import initializers # noqa: F401 -from . import export # noqa: F401 -from . import feature_config_parsers # noqa: F401 - # These imports do not work with TF 2.x and are not needed either. # If you are using TF 2.x, use the modular targets under src/python/twitter/deepbird. import tensorflow -from . import trainers # noqa: F401 -from . import metrics # noqa: F401 + +from . import build_graphs_fns # noqa: F401 +from . import calibrators # noqa: F401 +from . import export # noqa: F401 +from . import feature_config # noqa: F401 +from . import feature_config_parsers # noqa: F401 +from . import feature_importances # noqa: F401 from . import hooks # noqa: F401 +from . import initializers # noqa: F401 +from . import layers # noqa: F401 +from . import metrics # noqa: F401 +from . import parsers # noqa: F401 +from . import readers # noqa: F401 +from . import trainers # noqa: F401 +from . import utils # noqa: F401 diff --git a/twml/twml/contrib/build_graphs_fns.py b/twml/twml/contrib/build_graphs_fns.py index 829f61512..108d9d4d1 100644 --- a/twml/twml/contrib/build_graphs_fns.py +++ b/twml/twml/contrib/build_graphs_fns.py @@ -1,32 +1,35 @@ # pylint: disable=unused-argument, missing-docstring -''' +""" Common build graphs that can be reused -''' +""" import tensorflow.compat.v1 as tf -def get_saved_modules_graph(input_graph_fn): - """ - Get common graph for stitching different saved modules for export. 
- This graph is used to save checkpoints; and then export the modules - as a unity. - Args: +def get_saved_modules_graph( + input_graph_fn: callable, params: dict, features: dict, mode: str = "train" +) -> dict: + """ + Get common graph for stitching different saved modules for export. + This graph is used to save checkpoints; and then export the modules + as a unity. + Args: features: - model features + model features params: - model params + model params input_graph_fn: - main logic for the stitching - Returns: - build_graph - """ - def build_graph(features, label, mode, params, config=None): + main logic for the stitching + mode: + the mode of the graph + Returns: + output of input_graph_fn + """ + output = input_graph_fn(features, params) # If mode is train, we just need to assign a dummy loss # and update the train op. This is done to save the graph to save_dir. - if mode == 'train': - loss = tf.constant(1) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss} + if mode == "train": + loss = tf.constant(1) + train_op = tf.assign_add(tf.train.get_global_step(), 1) + return {"train_op": train_op, "loss": loss} return output - return build_graph diff --git a/twml/twml/contrib/calibrators/__init__.py b/twml/twml/contrib/calibrators/__init__.py index 02181ed12..0f17fdf55 100644 --- a/twml/twml/contrib/calibrators/__init__.py +++ b/twml/twml/contrib/calibrators/__init__.py @@ -9,10 +9,13 @@ Ultimately, the ``Calibrator`` should produce an initialized layer via its ``to_layer()`` method. """ -from .common_calibrators import calibrate_discretizer_and_export, add_discretizer_arguments # noqa: F401 from .calibrator import Calibrator # noqa: F401 -from .mdl import MDLCalibrator # noqa: F401 +from .common_calibrators import add_discretizer_arguments # noqa: F401 +from .common_calibrators import calibrate_discretizer_and_export +from .hashed_percentile_discretizer import ( + HashedPercentileDiscretizerCalibrator, +) # noqa: F401 +from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 from .isotonic import IsotonicCalibrator # noqa: F401 +from .mdl import MDLCalibrator # noqa: F401 from .percentile_discretizer import PercentileDiscretizerCalibrator # noqa: F401 -from .hashed_percentile_discretizer import HashedPercentileDiscretizerCalibrator # noqa: F401 -from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 \ No newline at end of file diff --git a/twml/twml/contrib/calibrators/calibrator.py b/twml/twml/contrib/calibrators/calibrator.py index 7408412e0..aba4615e2 100644 --- a/twml/twml/contrib/calibrators/calibrator.py +++ b/twml/twml/contrib/calibrators/calibrator.py @@ -1,5 +1,5 @@ # pylint: disable=missing-docstring, unused-argument -''' Contains the base classes for CalibrationFeature and Calibrator ''' +""" Contains the base classes for CalibrationFeature and Calibrator """ from collections import defaultdict @@ -7,151 +7,155 @@ import numpy as np import tensorflow.compat.v1 as tf import tensorflow_hub as hub + import twml import twml.util class CalibrationFeature(object): - ''' - Accumulates values and weights for individual features. - Typically, each unique feature defined in the accumulated SparseTensor or Tensor - would have its own CalibrationFeature instance. - ''' - - def __init__(self, feature_id): - ''' Constructs a CalibrationFeature - - Arguments: - feature_id: - number identifying the feature. 
- ''' - self.feature_id = feature_id - self._calibrated = False - self._features_dict = defaultdict(list) - - def add_values(self, new_features): - ''' - Extends lists to contain the values in this batch - ''' - for key in new_features: - self._features_dict[key].append(new_features[key]) - - def _concat_arrays(self): - ''' - This class calls this function after you have added all the values. - It creates a dictionary with the concatanated arrays - ''' - self._features_dict.update((k, np.concatenate(v)) for k, v in self._features_dict.items()) - - def calibrate(self, *args, **kwargs): - raise NotImplementedError + """ + Accumulates values and weights for individual features. + Typically, each unique feature defined in the accumulated SparseTensor or Tensor + would have its own CalibrationFeature instance. + """ + + def __init__(self, feature_id: int): + """Constructs a CalibrationFeature + + Args: + feature_id: + number identifying the feature. + """ + self.feature_id = feature_id + self._calibrated = False + self._features_dict = defaultdict(list) + + def add_values(self, new_features: dict): + """Extends lists to contain the values in this batch""" + for key in new_features: + self._features_dict[key].append(new_features[key]) + + def _concat_arrays(self): + """ + This class calls this function after you have added all the values. + It creates a dictionary with the concatenated arrays + """ + for k, v in self._features_dict.items(): + self._features_dict[k] = np.concatenate(v) + + def calibrate(self, *args, **kwargs): + raise NotImplementedError class Calibrator(object): - ''' - Accumulates features and their respective values for Calibration - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()`` and; - 2. calibrate by calling ``calibrate()``; - 3. convert to a twml.layers layer by calling ``to_layer()``. - - Note you can only use one calibrator per Trainer. - ''' - - def __init__(self, calibrator_name=None, **kwargs): - ''' - Arguments: - calibrator_name. - Default: if set to None it will be the same as the class name. - Please be reminded that if in the model there are many calibrators - of the same type the calibrator_name should be changed to avoid confusion. - ''' - self._calibrated = False - if calibrator_name is None: - calibrator_name = twml.util.to_snake_case(self.__class__.__name__) - self._calibrator_name = calibrator_name - self._kwargs = kwargs - - @property - def is_calibrated(self): - return self._calibrated - - @property - def name(self): - return self._calibrator_name - - def accumulate(self, *args, **kwargs): - '''Accumulates features and their respective values for Calibration.''' - raise NotImplementedError - - def calibrate(self): - '''Calibrates after the accumulation has ended.''' - self._calibrated = True - - def to_layer(self, name=None): - ''' - Returns a twml.layers.Layer instance with the result of calibrator. - - Arguments: - name: - name-scope of the layer - ''' - raise NotImplementedError - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - - Returns: - dictionary of Layer constructor arguments to initialize the - layer Variables. Typically, this should contain enough information - to initialize empty layer Variables of the correct size, which will then - be filled with the right data using init_map. 
- ''' - raise NotImplementedError - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory. Default (string): "default". - name: - name for the calibrator. - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def write_summary(self, writer, sess=None): """ - This method is called by save() to write tensorboard summaries to disk. - See MDLCalibrator.write_summary for an example. - By default, the method does nothing. It can be overloaded by child-classes. - - Arguments: - writer: - `tf.summary.FilteWriter - `_ - instance. - The ``writer`` is used to add summaries to event files for inclusion in tensorboard. - sess (optional): - `tf.Session `_ - instance. The ``sess`` is used to produces summaries for the writer. + Accumulates features and their respective values for Calibration + The steps for calibration are typically as follows: + + 1. accumulate feature values from batches by calling ``accumulate()`` and; + 2. calibrate by calling ``calibrate()``; + 3. convert to a twml.layers layer by calling ``to_layer()``. + + Note you can only use one calibrator per Trainer. """ + + def __init__(self, calibrator_name: str = None, **kwargs): + """ + Args: + calibrator_name (str): + Default: if set to None it will be the same as the class name. + Please be reminded that if in the model there are many calibrators + of the same type the calibrator_name should be changed to avoid confusion. + """ + self._calibrated = False + if calibrator_name is None: + calibrator_name = twml.util.to_snake_case(self.__class__.__name__) + self._calibrator_name = calibrator_name + self._kwargs = kwargs + + @property + def is_calibrated(self) -> bool: + return self._calibrated + + @property + def name(self) -> str: + return self._calibrator_name + + def accumulate(self, *args, **kwargs): + """Accumulates features and their respective values for Calibration.""" + raise NotImplementedError + + def calibrate(self): + """Calibrates after the accumulation has ended.""" + self._calibrated = True + + def to_layer(self, name: str = None): + """ + Returns a twml.layers.Layer instance with the result of calibrator. + + Args: + name (str): + name-scope of the layer + """ + raise NotImplementedError + + def get_layer_args(self): + """ + Returns layer arguments required to implement multi-phase training. + + Returns: + dictionary of Layer constructor arguments to initialize the + layer Variables. Typically, this should contain enough information + to initialize empty layer Variables of the correct size, which will then + be filled with the right data using init_map. 
+ """ + raise NotImplementedError + + def save( + self, save_dir: str, name: str = "default", verbose: bool = False + ): # pylint: disable=unused-argument + """Save the calibrator into the given save_directory. + Args: + save_dir (str): + name of the saving directory. + name (str): + name for the calibrator. Default (string): "default". + """ + if not self._calibrated: + raise RuntimeError( + "Expecting prior call to calibrate().Cannot save() prior to calibrate()" + ) + + # This module allows for the calibrator to save be saved as part of + # Tensorflow Hub (this will allow it to be used in further steps) + def calibrator_module(): + # Note that this is usually expecting a sparse_placeholder + inputs = tf.sparse_placeholder(tf.float32) + calibrator_layer = self.to_layer() + output = calibrator_layer(inputs) + # creates the signature to the calibrator module + hub.add_signature(inputs=inputs, outputs=output, name=name) + + # exports the module to the save_dir + spec = hub.create_module_spec(calibrator_module) + with tf.Graph().as_default(): + module = hub.Module(spec) + with tf.Session() as session: + module.export(save_dir, session) + + def write_summary(self, writer: tf.summary.FileWriter, sess=None): + """ + This method is called by save() to write tensorboard summaries to disk. + See MDLCalibrator.write_summary for an example. + By default, the method does nothing. It can be overloaded by child-classes. + + Args: + writer: + `tf.summary.FileWriter + `_ + instance. + The ``writer`` is used to add summaries to event files for inclusion in tensorboard. + sess (optional): + `tf.Session `_ + instance. The ``sess`` is used to produces summaries for the writer. + """ diff --git a/twml/twml/contrib/calibrators/common_calibrators.py b/twml/twml/contrib/calibrators/common_calibrators.py index 5301901e4..f554fceb5 100644 --- a/twml/twml/contrib/calibrators/common_calibrators.py +++ b/twml/twml/contrib/calibrators/common_calibrators.py @@ -9,699 +9,952 @@ # TODO: many of these functions aren't common at all. # For example, Discretizer functions should be moved to PercentileDiscretizer. +import argparse import copy import os import time +from typing import Callable -from absl import logging import tensorflow.compat.v1 as tf import tensorflow_hub as hub +from absl import logging + import twml from twml.argument_parser import SortingHelpFormatter +from twml.contrib.calibrators.isotonic import IsotonicCalibrator from twml.input_fns import data_record_input_fn +from twml.twml.feature_config import FeatureConfig +from twml.twml.trainers.trainer import Trainer from twml.util import list_files_by_datetime, sanitize_hdfs_path -from twml.contrib.calibrators.isotonic import IsotonicCalibrator -def calibrator_arguments(parser): - """ - Calibrator Parameters to add to relevant parameters to the DataRecordTrainerParser. - Otherwise, if alone in a file, it just creates its own default parser. 
- Arguments: - parser: - Parser with the options to the model - """ - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", - help="Path to save or load calibrator calibration") - parser.add_argument("--calibrator_batch_size", type=int, default=128, - dest="calibrator_batch_size", - help="calibrator batch size") - parser.add_argument("--calibrator_parts_downsampling_rate", type=float, default=1, - dest="calibrator_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--calibrator_max_steps", type=int, default=None, - dest="calibrator_max_steps", - help="Max Steps taken by calibrator to accumulate samples") - parser.add_argument("--calibrator_num_bins", type=int, default=22, - dest="calibrator_num_bins", - help="Num bins of calibrator") - parser.add_argument("--isotonic_calibrator", dest='isotonic_calibrator', action='store_true', - help="Isotonic Calibrator present") - parser.add_argument("--calibrator_keep_rate", type=float, default=1.0, - dest="calibrator_keep_rate", - help="Keep rate") - return parser +def calibrator_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + """ + Calibrator Parameters to add to relevant parameters to the DataRecordTrainerParser. + Otherwise, if alone in a file, it just creates its own default parser. + + Args: + parser: + Parser with the options to the model + """ + parser.add_argument( + "--calibrator.save_dir", + type=str, + dest="calibrator_save_dir", + help="Path to save or load calibrator calibration", + ) + parser.add_argument( + "--calibrator_batch_size", + type=int, + default=128, + dest="calibrator_batch_size", + help="calibrator batch size", + ) + parser.add_argument( + "--calibrator_parts_downsampling_rate", + type=float, + default=1, + dest="calibrator_parts_downsampling_rate", + help="Parts downsampling rate", + ) + parser.add_argument( + "--calibrator_max_steps", + type=int, + default=None, + dest="calibrator_max_steps", + help="Max Steps taken by calibrator to accumulate samples", + ) + parser.add_argument( + "--calibrator_num_bins", + type=int, + default=22, + dest="calibrator_num_bins", + help="Num bins of calibrator", + ) + parser.add_argument( + "--isotonic_calibrator", + dest="isotonic_calibrator", + action="store_true", + help="Isotonic Calibrator present", + ) + parser.add_argument( + "--calibrator_keep_rate", + type=float, + default=1.0, + dest="calibrator_keep_rate", + help="Keep rate", + ) + return parser def _generate_files_by_datetime(params): + files = list_files_by_datetime( + base_path=sanitize_hdfs_path(params.train_data_dir), + start_datetime=params.train_start_datetime, + end_datetime=params.train_end_datetime, + datetime_prefix_format=params.datetime_format, + extension="lzo", + parallelism=1, + hour_resolution=params.hour_resolution, + sort=True, + ) + + return files + + +def get_calibrate_input_fn(parse_fn: callable, params: argparse.Namespace) -> callable: + """ + Default input function used for the calibrator. 
+ Args:
+ parse_fn:
+ Parse function used to decode the input data
+ params:
+ Parameters
+ Returns:
+ input_fn
+ """
+
+ return lambda: data_record_input_fn(
+ files=_generate_files_by_datetime(params),
+ batch_size=params.calibrator_batch_size,
+ parse_fn=parse_fn,
+ num_threads=1,
+ repeat=False,
+ keep_rate=params.calibrator_keep_rate,
+ parts_downsampling_rate=params.calibrator_parts_downsampling_rate,
+ shards=None,
+ shard_index=None,
+ shuffle=True,
+ shuffle_files=True,
+ interleave=True,
+ )
- files = list_files_by_datetime(
- base_path=sanitize_hdfs_path(params.train_data_dir),
- start_datetime=params.train_start_datetime,
- end_datetime=params.train_end_datetime,
- datetime_prefix_format=params.datetime_format,
- extension="lzo",
- parallelism=1,
- hour_resolution=params.hour_resolution,
- sort=True)
-
- return files
-
-
-def get_calibrate_input_fn(parse_fn, params):
- """
- Default input function used for the calibrator.
- Arguments:
- parse_fn:
- Parse_fn
- params:
- Parameters
- Returns:
- input_fn
- """
-
- return lambda: data_record_input_fn(
- files=_generate_files_by_datetime(params),
- batch_size=params.calibrator_batch_size,
- parse_fn=parse_fn,
- num_threads=1,
- repeat=False,
- keep_rate=params.calibrator_keep_rate,
- parts_downsampling_rate=params.calibrator_parts_downsampling_rate,
- shards=None,
- shard_index=None,
- shuffle=True,
- shuffle_files=True,
- interleave=True)
-
-
-def get_discretize_input_fn(parse_fn, params):
- """
- Default input function used for the calibrator.
- Arguments:
- parse_fn:
- Parse_fn
- params:
- Parameters
- Returns:
- input_fn
- """
-
- return lambda: data_record_input_fn(
- files=_generate_files_by_datetime(params),
- batch_size=params.discretizer_batch_size,
- parse_fn=parse_fn,
- num_threads=1,
- repeat=False,
- keep_rate=params.discretizer_keep_rate,
- parts_downsampling_rate=params.discretizer_parts_downsampling_rate,
- shards=None,
- shard_index=None,
- shuffle=True,
- shuffle_files=True,
- interleave=True)
-
-
-def discretizer_arguments(parser=None):
- """
- Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser.
- Otherwise, if alone in a file, it just creates its own default parser.
- Arguments:
- parser:
- Parser with the options to the model. Defaults to None
- """
-
- if parser is None:
- parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter)
+
+def get_discretize_input_fn(parse_fn: callable, params: argparse.Namespace) -> callable:
+ """
+ Default input function used for the discretizer.
+ Args:
+ parse_fn:
+ Parse function used to decode the input data
+ params:
+ Parameters
+ Returns:
+ input_fn
+ """
+
+ return lambda: data_record_input_fn(
+ files=_generate_files_by_datetime(params),
+ batch_size=params.discretizer_batch_size,
+ parse_fn=parse_fn,
+ num_threads=1,
+ repeat=False,
+ keep_rate=params.discretizer_keep_rate,
+ parts_downsampling_rate=params.discretizer_parts_downsampling_rate,
+ shards=None,
+ shard_index=None,
+ shuffle=True,
+ shuffle_files=True,
+ interleave=True,
+ )
+
+
+def discretizer_arguments(parser: argparse.ArgumentParser = None):
+ """
+ Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser.
+ Otherwise, if alone in a file, it just creates its own default parser.
+
+ Args:
+ parser:
+ Parser with the options to the model. 
Defaults to None + """ + + if parser is None: + parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) + parser.add_argument( + "--overwrite_save_dir", + dest="overwrite_save_dir", + action="store_true", + help="Delete the contents of the current save_dir if it exists", + ) + parser.add_argument( + "--train.data_dir", + "--train_data_dir", + type=str, + default=None, + dest="train_data_dir", + help="Path to the training data directory." + "Supports local and HDFS (hdfs://default/ ) paths.", + ) + parser.add_argument( + "--train.start_date", + "--train_start_datetime", + type=str, + default=None, + dest="train_start_datetime", + help="Starting date for training inside the train data dir." + "The start datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--train.end_date", + "--train_end_datetime", + type=str, + default=None, + dest="train_end_datetime", + help="Ending date for training inside the train data dir." + "The end datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--datetime_format", + type=str, + default="%Y/%m/%d", + help="Date format for training and evaluation datasets." + "Has to be a format that is understood by python datetime." + "e.g. %Y/%m/%d for 2019/01/15." + "Used only if {train/eval}.{start/end}_date are provided.", + ) + parser.add_argument( + "--hour_resolution", + type=int, + default=None, + help="Specify the hourly resolution of the stored data.", + ) + parser.add_argument( + "--tensorboard_port", + type=int, + default=None, + help="Port for tensorboard to run on.", + ) + parser.add_argument( + "--stats_port", + type=int, + default=None, + help="Port for stats server to run on.", + ) + parser.add_argument( + "--health_port", + type=int, + default=None, + help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." + "Not user-facing as it is set automatically by the twml_cli.", + ) + parser.add_argument( + "--data_spec", + type=str, + default=None, + help="Path to data specification JSON file. This file is used to decode DataRecords", + ) parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") + "--discretizer.save_dir", + type=str, + dest="discretizer_save_dir", + help="Path to save or load discretizer calibration", + ) parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local and HDFS (hdfs://default/ ) paths.") + "--discretizer_batch_size", + type=int, + default=128, + dest="discretizer_batch_size", + help="Discretizer batch size", + ) parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") + "--discretizer_keep_rate", + type=float, + default=0.0008, + dest="discretizer_keep_rate", + help="Keep rate", + ) parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 
2019/01/15") + "--discretizer_parts_downsampling_rate", + type=float, + default=0.2, + dest="discretizer_parts_downsampling_rate", + help="Parts downsampling rate", + ) parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %Y/%m/%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") + "--discretizer_max_steps", + type=int, + default=None, + dest="discretizer_max_steps", + help="Max Steps taken by discretizer to accumulate samples", + ) + return parser + + +def calibrate( + trainer: Trainer, + params: argparse.Namespace, + build_graph: callable, + input_fn: callable, + debug: bool = False, +): + """ + Calibrate Isotonic Calibration + Args: + trainer: + Trainer + params: + Parameters + build_graph: + Build Graph used to be the input to the calibrator + input_fn: + Input Function specified by the user + debug: + Defaults to False. Returns the calibrator + """ + + if trainer._estimator.config.is_chief: + # overwrite the current save_dir + if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): + logging.info( + "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" + % params.calibrator_save_dir + ) + tf.io.gfile.rmtree(params.calibrator_save_dir) + + calibrator = IsotonicCalibrator(params.calibrator_num_bins) + + # chief trains discretizer + logging.info("Chief training calibrator") + + # Accumulate the features for each calibrator + features, labels = input_fn() + if "weights" not in features: + raise ValueError("Weights need to be returned as part of the parse_fn") + weights = features.pop("weights") + + preds = build_graph( + features=features, label=None, mode="infer", params=params, config=None + ) + init = tf.global_variables_initializer() + table_init = tf.tables_initializer() + with tf.Session() as sess: + sess.run(init) + sess.run(table_init) + count = 0 + max_steps = params.calibrator_max_steps or -1 + while max_steps <= 0 or count <= max_steps: + try: + weights_vals, labels_vals, preds_vals = sess.run( + [weights, labels, preds["output"]] + ) + calibrator.accumulate( + preds_vals, labels_vals, weights_vals.flatten() + ) + except tf.errors.OutOfRangeError: + break + count += 1 + + calibrator.calibrate() + calibrator.save(params.calibrator_save_dir) + trainer.estimator._params.isotonic_calibrator = True + + if debug: + return calibrator + + else: + calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + calibrator_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) + time.sleep(60) + + +def discretize( + params: argparse.Namespace, + feature_config: dict, + input_fn: callable, + debug: bool = False, +): + """ + Discretizes continuous features + + Args: + params (argparse.Namespace): + Parameters + feature_config (dict): + Feature Config + input_fn (callable): + Input Function specified by the user + debug (bool): + Defaults to False. 
Returns the calibrator + """ + + if ( + os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" + or "num_workers" not in params + or params.num_workers is None + ): + # overwrite the current save_dir + if params.overwrite_save_dir and tf.io.gfile.exists( + params.discretizer_save_dir + ): + logging.info( + "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" + % params.discretizer_save_dir + ) + tf.io.gfile.rmtree(params.discretizer_save_dir) + + config_map = feature_config() + discretize_dict = config_map["discretize_config"] + + # chief trains discretizer + logging.info("Chief training discretizer") + + batch = input_fn() + # Accumulate the features for each calibrator + with tf.Session() as sess: + count = 0 + max_steps = params.discretizer_max_steps or -1 + while max_steps <= 0 or count <= max_steps: + try: + inputs = sess.run(batch) + for name, clbrt in discretize_dict.items(): + clbrt.accumulate_features(inputs[0], name) + except tf.errors.OutOfRangeError: + break + count += 1 + + # This module allows for the calibrator to save be saved as part of + # Tensorflow Hub (this will allow it to be used in further steps) + def calibrator_module(): + # Note that this is usually expecting a sparse_placeholder + for name, clbrt in discretize_dict.items(): + clbrt.calibrate() + clbrt.add_hub_signatures(name) + + # exports the module to the save_dir + spec = hub.create_module_spec(calibrator_module) + with tf.Graph().as_default(): + module = hub.Module(spec) + with tf.Session() as session: + module.export(params.discretizer_save_dir, session) + + for name, clbrt in discretize_dict.items(): + clbrt.write_summary_json(params.discretizer_save_dir, name) + + if debug: + return discretize_dict + + else: + # wait for the file to be removed (if necessary) + # should be removed after an actual fix applied + time.sleep(60) + discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + discretizer_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) + time.sleep(60) + + +def add_discretizer_arguments(parser): + """ + Add discretizer-specific command-line arguments to a Trainer parser. + + Args: + parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser + + Returns: + argparse.ArgumentParser instance with discretizer-specific arguments added + """ + parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") + "--discretizer.save_dir", + type=str, + dest="discretizer_save_dir", + help="Path to save or load discretizer calibration", + ) parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on.") + "--discretizer.batch_size", + type=int, + default=128, + dest="discretizer_batch_size", + help="Discretizer batch size", + ) + parser.add_argument( + "--discretizer.keep_rate", + type=float, + default=0.0008, + dest="discretizer_keep_rate", + help="Keep rate", + ) parser.add_argument( - "--stats_port", type=int, default=None, - help="Port for stats server to run on.") + "--discretizer.parts_downsampling_rate", + type=float, + default=0.2, + dest="discretizer_parts_downsampling_rate", + help="Parts downsampling rate", + ) parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." 
- "Not user-facing as it is set automatically by the twml_cli." + "--discretizer.num_bins", + type=int, + default=20, + dest="discretizer_num_bins", + help="Number of bins per feature", ) parser.add_argument( - "--data_spec", type=str, default=None, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer_batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer_keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer_parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer_max_steps", type=int, default=None, - dest="discretizer_max_steps", - help="Max Steps taken by discretizer to accumulate samples") - return parser - - -def calibrate(trainer, params, build_graph, input_fn, debug=False): - """ - Calibrate Isotonic Calibration - Arguments: - trainer: - Trainer - params: - Parameters - build_graph: - Build Graph used to be the input to the calibrator - input_fn: - Input Function specified by the user - debug: - Defaults to False. Returns the calibrator - """ - - if trainer._estimator.config.is_chief: - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.calibrator_save_dir) - tf.io.gfile.rmtree(params.calibrator_save_dir) - - calibrator = IsotonicCalibrator(params.calibrator_num_bins) - - # chief trains discretizer - logging.info("Chief training calibrator") - - # Accumulate the features for each calibrator - features, labels = input_fn() - if 'weights' not in features: - raise ValueError("Weights need to be returned as part of the parse_fn") - weights = features.pop('weights') - - preds = build_graph(features=features, label=None, mode='infer', params=params, config=None) - init = tf.global_variables_initializer() - table_init = tf.tables_initializer() - with tf.Session() as sess: - sess.run(init) - sess.run(table_init) - count = 0 - max_steps = params.calibrator_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - weights_vals, labels_vals, preds_vals = sess.run([weights, labels, preds['output']]) - calibrator.accumulate(preds_vals, labels_vals, weights_vals.flatten()) - except tf.errors.OutOfRangeError: - break - count += 1 - - calibrator.calibrate() - calibrator.save(params.calibrator_save_dir) - trainer.estimator._params.isotonic_calibrator = True - - if debug: - return calibrator - - else: - calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(calibrator_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) - time.sleep(60) - - -def discretize(params, feature_config, input_fn, debug=False): - """ - Discretizes continuous features - Arguments: - params: - Parameters - input_fn: - Input Function specified by the user - debug: - Defaults to False. 
Returns the calibrator - """ - - if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or - params.num_workers is None): - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.discretizer_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.discretizer_save_dir) - tf.io.gfile.rmtree(params.discretizer_save_dir) - - config_map = feature_config() - discretize_dict = config_map['discretize_config'] - - # chief trains discretizer - logging.info("Chief training discretizer") - - batch = input_fn() - # Accumulate the features for each calibrator - with tf.Session() as sess: - count = 0 - max_steps = params.discretizer_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - inputs = sess.run(batch) - for name, clbrt in discretize_dict.items(): - clbrt.accumulate_features(inputs[0], name) - except tf.errors.OutOfRangeError: - break - count += 1 - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - for name, clbrt in discretize_dict.items(): - clbrt.calibrate() - clbrt.add_hub_signatures(name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(params.discretizer_save_dir, session) - - for name, clbrt in discretize_dict.items(): - clbrt.write_summary_json(params.discretizer_save_dir, name) - - if debug: - return discretize_dict - - else: - # wait for the file to be removed (if necessary) - # should be removed after an actual fix applied - time.sleep(60) - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) + "--discretizer.output_size_bits", + type=int, + default=22, + dest="discretizer_output_size_bits", + help="Number of bits allocated to the output size", + ) + return parser -def add_discretizer_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. 
- - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer.batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer.keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer.parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer.num_bins", type=int, default=20, - dest="discretizer_num_bins", - help="Number of bins per feature") - parser.add_argument("--discretizer.output_size_bits", type=int, default=22, - dest="discretizer_output_size_bits", - help="Number of bits allocated to the output size") - return parser - - -def add_isotonic_calibrator_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - parser.add_argument("--calibrator.num_bins", type=int, - default=25000, dest="calibrator_num_bins", - help="number of bins for isotonic calibration") - parser.add_argument("--calibrator.parts_downsampling_rate", type=float, default=0.1, - dest="calibrator_parts_downsampling_rate", help="Parts downsampling rate") - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", help="Path to save or load calibrator output") - parser.add_argument("--calibrator.load_tensorflow_module", type=str, default=None, - dest="calibrator_load_tensorflow_module", - help="Location from where to load a pretrained graph from. \ - Typically, this is where the MLP graph is saved") - parser.add_argument("--calibrator.export_mlp_module_name", type=str, default='tf_hub_mlp', - help="Name for loaded hub signature", - dest="export_mlp_module_name") - parser.add_argument("--calibrator.export_isotonic_module_name", - type=str, default="tf_hub_isotonic", - dest="calibrator_export_module_name", - help="export module name") - parser.add_argument("--calibrator.final_evaluation_steps", type=int, - dest="calibrator_final_evaluation_steps", default=None, - help="number of steps for final evaluation") - parser.add_argument("--calibrator.train_steps", type=int, default=-1, - dest="calibrator_train_steps", - help="number of steps for calibration") - parser.add_argument("--calibrator.batch_size", type=int, default=1024, - dest="calibrator_batch_size", - help="Calibrator batch size") - parser.add_argument("--calibrator.is_calibrating", action='store_true', - dest="is_calibrating", - help="Dummy argument to allow running in chief worker") - return parser - - -def calibrate_calibrator_and_export(name, calibrator, build_graph_fn, params, feature_config, - run_eval=True, input_fn=None, metric_fn=None, - export_task_type_overrider=None): - """ - Pre-set `isotonic calibrator` calibrator. - Args: - name: - scope name used for the calibrator - calibrator: - calibrator that will be calibrated and exported. 
- build_graph_fn: - build graph function for the calibrator - params: - params passed to the calibrator - feature_config: - feature config which will be passed to the trainer - export_task_type_overrider: - the task type for exporting the calibrator - if specified, this will override the default export task type in trainer.hub_export(..) - """ - - # create calibrator params - params_c = copy.deepcopy(params) - params_c.data_threads = 1 - params_c.num_workers = 1 - params_c.continue_from_checkpoint = True - params_c.overwrite_save_dir = False - params_c.stats_port = None - - # Automatically load from the saved Tensorflow Hub module if not specified. - if params_c.calibrator_load_tensorflow_module is None: - path_saved_tensorflow_model = os.path.join(params.save_dir, params.export_mlp_module_name) - params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model - - if "calibrator_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.calibrator_parts_downsampling_rate - if "calibrator_save_dir" in params_c: - params_c.save_dir = params_c.calibrator_save_dir - if "calibrator_batch_size" in params_c: - params_c.train_batch_size = params_c.calibrator_batch_size - params_c.eval_batch_size = params_c.calibrator_batch_size - # TODO: Deprecate this option. It is not actually used. Calibrator - # simply iterates until the end of input_fn. - if "calibrator_train_steps" in params_c: - params_c.train_steps = params_c.calibrator_train_steps - - if metric_fn is None: - metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) - - # Common Trainer which will also be used by all workers - trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - feature_config=feature_config, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, - metric_fn=metric_fn - ) - - if trainer._estimator.config.is_chief: - - # Chief trains calibrator - logging.info("Chief training calibrator") - - # Disregard hogwild config - os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") - os.environ["TWML_HOGWILD_PORTS"] = "" - - hooks = None - if params_c.calibrator_train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] - - def parse_fn(input_x): - fc_parse_fn = feature_config.get_parse_fn() - features, labels = fc_parse_fn(input_x) - features['labels'] = labels - return features, labels - - if input_fn is None: - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - - # Calibrate stage - trainer.estimator._params.mode = 'calibrate' - trainer.calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=params_c.calibrator_train_steps, - hooks=hooks) - - # Save Checkpoint - # We need to train for 1 step, to save the graph to checkpoint. - # This is done just by the chief. 
-    # We need to set the mode to evaluate to save the graph that will be consumed
-    # In the final evaluation
-    trainer.estimator._params.mode = 'evaluate'
-    trainer.train(input_fn=input_fn, steps=1)
-
-    # Restore hogwild setup
-    if os_twml_hogwild_ports is not None:
-      os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports
-  else:
-    # Workers wait for calibration to be ready
-    final_calibrator_path = os.path.join(params_c.calibrator_save_dir,
-                                         params_c.calibrator_export_module_name)
-
-    final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path)
-
-    while not tf.io.gfile.exists(final_calibrator_path + os.path.sep + "tfhub_module.pb"):
-      logging.info("Worker waiting for calibration at %s" % final_calibrator_path)
-      time.sleep(60)
-
-  # Evaluate stage
-  if run_eval:
-    trainer.estimator._params.mode = 'evaluate'
-    # This will allow the Evaluate method to be run in Hogwild
-    # trainer.estimator._params.continue_from_checkpoint = True
-    trainer.evaluate(name='test', input_fn=input_fn, steps=params_c.calibrator_final_evaluation_steps)
-
-  trainer.hub_export(name=params_c.calibrator_export_module_name,
-                     export_task_type_overrider=export_task_type_overrider,
-                     serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn())
-
-  return trainer
-
-
-def calibrate_discretizer_and_export(name, calibrator, build_graph_fn, params, feature_config):
-  """
-  Pre-set percentile discretizer calibrator.
-  Args:
-    name:
-      scope name used for the calibrator
-    calibrator:
-      calibrator that will be calibrated and exported.
-    build_graph_fn:
-      build graph function for the calibrator
-    params:
-      params passed to the calibrator
-    feature_config:
-      feature config or input_fn which will be passed to the trainer.
-  """
-
-  if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or
-      params.num_workers is None):
-
-    # chief trains discretizer
-    logging.info("Chief training discretizer")
-
-    # disregard hogwild config
-    os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS")
-    os.environ["TWML_HOGWILD_PORTS"] = ""
-
-    # create discretizer params
+def add_isotonic_calibrator_arguments(
+    parser: argparse.ArgumentParser,
+) -> argparse.ArgumentParser:
+    """
+    Add isotonic-calibrator-specific command-line arguments to a Trainer parser.
+
+    Args:
+      parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser
+
+    Returns:
+      argparse.ArgumentParser instance with isotonic-calibrator-specific arguments added
+    """
+    parser.add_argument(
+        "--calibrator.num_bins",
+        type=int,
+        default=25000,
+        dest="calibrator_num_bins",
+        help="number of bins for isotonic calibration",
+    )
+    parser.add_argument(
+        "--calibrator.parts_downsampling_rate",
+        type=float,
+        default=0.1,
+        dest="calibrator_parts_downsampling_rate",
+        help="Parts downsampling rate",
+    )
+    parser.add_argument(
+        "--calibrator.save_dir",
+        type=str,
+        dest="calibrator_save_dir",
+        help="Path to save or load calibrator output",
+    )
+    parser.add_argument(
+        "--calibrator.load_tensorflow_module",
+        type=str,
+        default=None,
+        dest="calibrator_load_tensorflow_module",
+        help="Location from which to load a pretrained graph. 
Typically, this is where the MLP graph is saved", + ) + parser.add_argument( + "--calibrator.export_mlp_module_name", + type=str, + default="tf_hub_mlp", + help="Name for loaded hub signature", + dest="export_mlp_module_name", + ) + parser.add_argument( + "--calibrator.export_isotonic_module_name", + type=str, + default="tf_hub_isotonic", + dest="calibrator_export_module_name", + help="export module name", + ) + parser.add_argument( + "--calibrator.final_evaluation_steps", + type=int, + dest="calibrator_final_evaluation_steps", + default=None, + help="number of steps for final evaluation", + ) + parser.add_argument( + "--calibrator.train_steps", + type=int, + default=-1, + dest="calibrator_train_steps", + help="number of steps for calibration", + ) + parser.add_argument( + "--calibrator.batch_size", + type=int, + default=1024, + dest="calibrator_batch_size", + help="Calibrator batch size", + ) + parser.add_argument( + "--calibrator.is_calibrating", + action="store_true", + dest="is_calibrating", + help="Dummy argument to allow running in chief worker", + ) + return parser + + +def calibrate_calibrator_and_export( + name: str, + calibrator: tf.estimator.Estimator, + build_graph_fn: Callable, + params: tf.contrib.training.HParams, + feature_config: FeatureConfig, + run_eval: bool = True, + input_fn: Callable = None, + metric_fn: Callable = None, + export_task_type_overrider: str = None, +): + """ + Pre-set `isotonic calibrator` calibrator. + Args: + name (str): + scope name used for the calibrator + calibrator (tf.estimator.Estimator): + calibrator that will be calibrated and exported. + build_graph_fn (Callable): + build graph function for the calibrator + params (tf.contrib.training.HParams): + params passed to the calibrator + feature_config (FeatureConfig): + feature config which will be passed to the trainer + run_eval (bool): + whether to run evaluation after calibration. Default is True. + input_fn (Callable): + input function for the calibrator. If not specified, the default input function will be used. + metric_fn (Callable): + metric function for the calibrator. If not specified, the default metric function will be used. + export_task_type_overrider: + the task type for exporting the calibrator + if specified, this will override the default export task type in trainer.hub_export(..) + """ + + # create calibrator params params_c = copy.deepcopy(params) params_c.data_threads = 1 - params_c.train_steps = -1 - params_c.train_max_steps = None - params_c.eval_steps = -1 params_c.num_workers = 1 - params_c.tensorboard_port = None + params_c.continue_from_checkpoint = True + params_c.overwrite_save_dir = False params_c.stats_port = None - if "discretizer_batch_size" in params_c: - params_c.train_batch_size = params_c.discretizer_batch_size - params_c.eval_batch_size = params_c.discretizer_batch_size - if "discretizer_keep_rate" in params_c: - params_c.train_keep_rate = params_c.discretizer_keep_rate - if "discretizer_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.discretizer_parts_downsampling_rate - if "discretizer_save_dir" in params_c: - params_c.save_dir = params_c.discretizer_save_dir - - # train discretizer + # Automatically load from the saved Tensorflow Hub module if not specified. 
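+    # For example (illustrative values only, not from the original code): with
+    # save_dir="/tmp/my_model" and export_mlp_module_name="tf_hub_mlp", the
+    # join below resolves to "/tmp/my_model/tf_hub_mlp".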
+ if params_c.calibrator_load_tensorflow_module is None: + path_saved_tensorflow_model = os.path.join( + params.save_dir, params.export_mlp_module_name + ) + params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model + + if "calibrator_parts_downsampling_rate" in params_c: + params_c.train_parts_downsampling_rate = ( + params_c.calibrator_parts_downsampling_rate + ) + + if "calibrator_save_dir" in params_c: + params_c.save_dir = params_c.calibrator_save_dir + + if "calibrator_batch_size" in params_c: + params_c.train_batch_size = params_c.calibrator_batch_size + params_c.eval_batch_size = params_c.calibrator_batch_size + + # TODO: Deprecate this option. It is not actually used. Calibrator simply iterates until the end of input_fn. + if "calibrator_train_steps" in params_c: + params_c.train_steps = params_c.calibrator_train_steps + + if metric_fn is None: + metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) + + # Common Trainer which will also be used by all workers trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, + name=name, + params=params_c, + feature_config=feature_config, + build_graph_fn=build_graph_fn, + save_dir=params_c.save_dir, + metric_fn=metric_fn, + ) + + if trainer._estimator.config.is_chief: + # Chief trains calibrator + logging.info("Chief training calibrator") + + # Disregard hogwild config + os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") + os.environ["TWML_HOGWILD_PORTS"] = "" + + hooks = None + if params_c.calibrator_train_steps > 0: + hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] + + def parse_fn(input_x): + fc_parse_fn = feature_config.get_parse_fn() + features, labels = fc_parse_fn(input_x) + features["labels"] = labels + return features, labels + + if input_fn is None: + input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) + + # Calibrate stage + trainer.estimator._params.mode = "calibrate" + trainer.calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=params_c.calibrator_train_steps, + hooks=hooks, + ) + + # Save Checkpoint + # We need to train for 1 step, to save the graph to checkpoint. + # This is done just by the chief. 
+ # We need to set the mode to evaluate to save the graph that will be consumed + # In the final evaluation + trainer.estimator._params.mode = "evaluate" + trainer.train(input_fn=input_fn, steps=1) + + # Restore hogwild setup + if os_twml_hogwild_ports is not None: + os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports + else: + # Workers wait for calibration to be ready + final_calibrator_path = os.path.join( + params_c.calibrator_save_dir, params_c.calibrator_export_module_name + ) + + final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path) + + while not tf.io.gfile.exists( + final_calibrator_path + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % final_calibrator_path) + time.sleep(60) + + # Evaluate stage + if run_eval: + trainer.estimator._params.mode = "evaluate" + # This will allow the Evaluate method to be run in Hogwild + # trainer.estimator._params.continue_from_checkpoint = True + trainer.evaluate( + name="test", + input_fn=input_fn, + steps=params_c.calibrator_final_evaluation_steps, + ) + + trainer.hub_export( + name=params_c.calibrator_export_module_name, + export_task_type_overrider=export_task_type_overrider, + serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn(), ) - if isinstance(feature_config, twml.feature_config.FeatureConfig): - parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - elif callable(feature_config): - input_fn = feature_config + return trainer + + +def calibrate_discretizer_and_export( + name: str, + calibrator: twml.calibrators.Calibrator, + build_graph_fn: callable, + params: twml.params.TrainParams, + feature_config: FeatureConfig, +): + """ + Pre-set percentile discretizer calibrator. + Args: + name (str): + scope name used for the calibrator + calibrator (twml.calibrators.Calibrator): + calibrator that will be calibrated and exported. + build_graph_fn (function): + build graph function for the calibrator + params (twml.params.TrainParams): + params passed to the calibrator + feature_config (twml.feature_config.FeatureConfig): + feature config or input_fn which will be passed to the trainer. 
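+
+    Example (illustrative sketch; ``my_calibrator``, ``my_params`` and
+    ``my_feature_config`` are caller-supplied placeholders, not values from
+    this module):
+
+        calibrate_discretizer_and_export(
+            name="percentile_discretizer",
+            calibrator=my_calibrator,  # e.g. an MDLCalibrator instance
+            build_graph_fn=build_percentile_discretizer_graph,
+            params=my_params,
+            feature_config=my_feature_config,  # FeatureConfig or an input_fn callable
+        )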
+ """ + + if ( + os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" + or "num_workers" not in params + or params.num_workers is None + ): + # chief trains discretizer + logging.info("Chief training discretizer") + + # disregard hogwild config + os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") + os.environ["TWML_HOGWILD_PORTS"] = "" + + # create discretizer params + params_c = copy.deepcopy(params) + params_c.data_threads = 1 + params_c.train_steps = -1 + params_c.train_max_steps = None + params_c.eval_steps = -1 + params_c.num_workers = 1 + params_c.tensorboard_port = None + params_c.stats_port = None + + if "discretizer_batch_size" in params_c: + params_c.train_batch_size = params_c.discretizer_batch_size + params_c.eval_batch_size = params_c.discretizer_batch_size + if "discretizer_keep_rate" in params_c: + params_c.train_keep_rate = params_c.discretizer_keep_rate + if "discretizer_parts_downsampling_rate" in params_c: + params_c.train_parts_downsampling_rate = ( + params_c.discretizer_parts_downsampling_rate + ) + if "discretizer_save_dir" in params_c: + params_c.save_dir = params_c.discretizer_save_dir + + # train discretizer + trainer = twml.trainers.DataRecordTrainer( + name=name, + params=params_c, + build_graph_fn=build_graph_fn, + save_dir=params_c.save_dir, + ) + + if isinstance(feature_config, twml.feature_config.FeatureConfig): + parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) + input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) + elif callable(feature_config): + input_fn = feature_config + else: + got_type = type(feature_config).__name__ + raise ValueError( + "Expecting feature_config to be FeatureConfig or function got %s" + % got_type + ) + + hooks = None + if params_c.train_steps > 0: + hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] + + trainer.calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=params_c.train_steps, + hooks=hooks, + ) + # restore hogwild setup + if os_twml_hogwild_ports is not None: + os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports else: - got_type = type(feature_config).__name__ - raise ValueError( - "Expecting feature_config to be FeatureConfig or function got %s" % got_type) - - hooks = None - if params_c.train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] - - trainer.calibrate(calibrator=calibrator, input_fn=input_fn, - steps=params_c.train_steps, hooks=hooks) - # restore hogwild setup - if os_twml_hogwild_ports is not None: - os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports - else: - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) + discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + discretizer_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) + time.sleep(60) def build_percentile_discretizer_graph(features, label, mode, params, config=None): - """ - Pre-set Percentile Discretizer Build Graph - Follows the same signature as build_graph - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], 
[1])) - if isinstance(sparse_tf, tf.SparseTensor): - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - elif isinstance(sparse_tf, twml.SparseTensor): - indices = sparse_tf.indices - ids = sparse_tf.ids - - # Return weights, feature_ids, feature_values - weights = tf.gather(params=weights, indices=ids) - feature_ids = indices - feature_values = sparse_tf.values - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'feature_ids': feature_ids, 'feature_values': feature_values, 'weights': weights} + """ + Pre-set Percentile Discretizer Build Graph + Follows the same signature as build_graph + """ + sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) + weights = tf.reshape(features["weights"], tf.reshape(features["batch_size"], [1])) + if isinstance(sparse_tf, tf.SparseTensor): + indices = sparse_tf.indices[:, 1] + ids = sparse_tf.indices[:, 0] + elif isinstance(sparse_tf, twml.SparseTensor): + indices = sparse_tf.indices + ids = sparse_tf.ids + + # Return weights, feature_ids, feature_values + weights = tf.gather(params=weights, indices=ids) + feature_ids = indices + feature_values = sparse_tf.values + # Update train_op and assign dummy_loss + train_op = tf.assign_add(tf.train.get_global_step(), 1) + loss = tf.constant(1) + if mode == "train": + return {"train_op": train_op, "loss": loss} + return { + "feature_ids": feature_ids, + "feature_values": feature_values, + "weights": weights, + } def isotonic_module(mode, params): - """ - Common Isotonic Calibrator module for Hub Export - """ - inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") - mlp = hub.Module(params.calibrator_load_tensorflow_module) - logits = mlp(inputs, signature=params.export_mlp_module_name) - isotonic_calibrator = hub.Module(params.save_dir) - output = isotonic_calibrator(logits, signature="isotonic_calibrator") - hub.add_signature(inputs={"sparse_input": inputs}, - outputs={"default": output}, - name=params.calibrator_export_module_name) - - -def build_isotonic_graph_from_inputs(inputs, features, label, mode, params, config=None, isotonic_fn=None): - """ - Helper function to build_isotonic_graph - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - """ - if params.mode == 'calibrate': + """ + Common Isotonic Calibrator module for Hub Export + """ + inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") mlp = hub.Module(params.calibrator_load_tensorflow_module) logits = mlp(inputs, signature=params.export_mlp_module_name) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'predictions': logits, 'targets': features['labels'], 'weights': weights} - else: - if isotonic_fn is None: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_module, mode=mode, params=params) + isotonic_calibrator = hub.Module(params.save_dir) + output = isotonic_calibrator(logits, signature="isotonic_calibrator") + hub.add_signature( + inputs={"sparse_input": inputs}, + outputs={"default": output}, + name=params.calibrator_export_module_name, + ) + + +def build_isotonic_graph_from_inputs( + inputs, features, label, mode, params, config=None, isotonic_fn=None +): + """ + 
Helper function to build_isotonic_graph + Pre-set Isotonic Calibrator Build Graph + Follows the same signature as build_graph + """ + if params.mode == "calibrate": + mlp = hub.Module(params.calibrator_load_tensorflow_module) + logits = mlp(inputs, signature=params.export_mlp_module_name) + weights = tf.reshape( + features["weights"], tf.reshape(features["batch_size"], [1]) + ) + # Update train_op and assign dummy_loss + train_op = tf.assign_add(tf.train.get_global_step(), 1) + loss = tf.constant(1) + if mode == "train": + return {"train_op": train_op, "loss": loss} + return { + "predictions": logits, + "targets": features["labels"], + "weights": weights, + } else: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_fn, mode=mode, params=params) - output_hub = hub.Module(isotonic_spec, - name=params.calibrator_export_module_name) - hub.register_module_for_export(output_hub, params.calibrator_export_module_name) - output = output_hub(inputs, signature=params.calibrator_export_module_name) - output = tf.clip_by_value(output, 0, 1) - loss = tf.reduce_sum(tf.stop_gradient(output)) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss, 'output': output} - - -def build_isotonic_graph(features, label, mode, params, config=None, export_discretizer=True): - """ - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - This assumes that MLP already contains all modules (include percentile - discretizer); if export_discretizer is set - then it does not export the MDL phase. - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - if export_discretizer: - return build_isotonic_graph_from_inputs(sparse_tf, features, label, mode, params, config) - discretizer = hub.Module(params.discretizer_path) - - if params.discretizer_signature is None: - discretizer_signature = "percentile_discretizer_calibrator" - else: - discretizer_signature = params.discretizer_signature - input_sparse = discretizer(sparse_tf, signature=discretizer_signature) - return build_isotonic_graph_from_inputs(input_sparse, features, label, mode, params, config) + if isotonic_fn is None: + isotonic_spec = twml.util.create_module_spec( + mlp_fn=isotonic_module, mode=mode, params=params + ) + else: + isotonic_spec = twml.util.create_module_spec( + mlp_fn=isotonic_fn, mode=mode, params=params + ) + output_hub = hub.Module( + isotonic_spec, name=params.calibrator_export_module_name + ) + hub.register_module_for_export(output_hub, params.calibrator_export_module_name) + output = output_hub(inputs, signature=params.calibrator_export_module_name) + output = tf.clip_by_value(output, 0, 1) + loss = tf.reduce_sum(tf.stop_gradient(output)) + train_op = tf.assign_add(tf.train.get_global_step(), 1) + return {"train_op": train_op, "loss": loss, "output": output} + + +def build_isotonic_graph( + features, label, mode, params, config=None, export_discretizer=True +): + """ + Pre-set Isotonic Calibrator Build Graph + Follows the same signature as build_graph + This assumes that MLP already contains all modules (include percentile + discretizer); if export_discretizer is set + then it does not export the MDL phase. 
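+
+    Example (illustrative sketch; this function can be passed as the
+    ``build_graph_fn`` of ``calibrate_calibrator_and_export``, with
+    ``my_calibrator``, ``my_params`` and ``my_feature_config`` supplied by the
+    caller):
+
+        calibrate_calibrator_and_export(
+            name="isotonic",
+            calibrator=my_calibrator,
+            build_graph_fn=build_isotonic_graph,
+            params=my_params,
+            feature_config=my_feature_config,
+        )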
+    """
+    sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits)
+    if export_discretizer:
+        return build_isotonic_graph_from_inputs(
+            sparse_tf, features, label, mode, params, config
+        )
+    discretizer = hub.Module(params.discretizer_path)
+
+    if params.discretizer_signature is None:
+        discretizer_signature = "percentile_discretizer_calibrator"
+    else:
+        discretizer_signature = params.discretizer_signature
+    input_sparse = discretizer(sparse_tf, signature=discretizer_signature)
+    return build_isotonic_graph_from_inputs(
+        input_sparse, features, label, mode, params, config
+    )
diff --git a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py
index e14f62303..09a70d94b 100644
--- a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py
+++ b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py
@@ -1,22 +1,34 @@
 # pylint: disable=arguments-differ,no-member,too-many-statements
-''' Contains HashedPercentileDiscretizerCalibrator used for calibration '''
-from .percentile_discretizer import PercentileDiscretizerCalibrator
+""" Contains HashedPercentileDiscretizerCalibrator used for calibration """
+import numpy as np
 
 import twml
 
+from .percentile_discretizer import PercentileDiscretizerCalibrator
+
 
 class HashedPercentileDiscretizerCalibrator(PercentileDiscretizerCalibrator):
-  ''' Accumulates features and their respective values for HashedPercentileDiscretizer calibration.
-  This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's
-  `to_layer` method returns a HashedPercentileDiscretizer instead.
-  '''
+    """Accumulates features and their respective values for HashedPercentileDiscretizer calibration.
+    This calibrator performs the same actions as PercentileDiscretizerCalibrator but its
+    `to_layer` method returns a HashedPercentileDiscretizer instead.
+    """
 
-  def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values,
-                                feature_offsets, name):
-    return twml.contrib.layers.HashedPercentileDiscretizer(
-      n_feature=n_feature, n_bin=self._n_bin,
-      name=name, out_bits=self._out_bits,
-      hash_keys=hash_map_keys, hash_values=hash_map_values,
-      bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(),
-      feature_offsets=feature_offsets
-    )
+    def _create_discretizer_layer(
+        self,
+        n_feature: int,
+        hash_map_keys: np.ndarray,
+        hash_map_values: np.ndarray,
+        feature_offsets: np.ndarray,
+        name: str,
+    ):
+        return twml.contrib.layers.HashedPercentileDiscretizer(
+            n_feature=n_feature,
+            n_bin=self._n_bin,
+            name=name,
+            out_bits=self._out_bits,
+            hash_keys=hash_map_keys,
+            hash_values=hash_map_values,
+            bin_ids=self._bin_ids.flatten(),
+            bin_values=self._bin_vals.flatten(),
+            feature_offsets=feature_offsets,
+        )
diff --git a/twml/twml/contrib/calibrators/hashing_discretizer.py b/twml/twml/contrib/calibrators/hashing_discretizer.py
index 965ced934..6f1bfb11b 100644
--- a/twml/twml/contrib/calibrators/hashing_discretizer.py
+++ b/twml/twml/contrib/calibrators/hashing_discretizer.py
@@ -1,35 +1,42 @@
 # pylint: disable=arguments-differ,no-member,too-many-statements
-''' Contains HashedPercentileDiscretizerCalibrator used for calibration '''
-from .percentile_discretizer import PercentileDiscretizerCalibrator
-
+""" Contains HashingDiscretizerCalibrator used for calibration """
 import numpy as np
+
 import twml
 
+from .percentile_discretizer import PercentileDiscretizerCalibrator
+
 
 class HashingDiscretizerCalibrator(PercentileDiscretizerCalibrator):
-  ''' Accumulates features and their respective values for HashingDiscretizer calibration.
-  This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's
-  `to_layer` method returns a HashingDiscretizer instead.
-  '''
+    """Accumulates features and their respective values for HashingDiscretizer calibration.
+    This calibrator performs the same actions as PercentileDiscretizerCalibrator but its
+    `to_layer` method returns a HashingDiscretizer instead.
+    """
 
-  def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values,
-                                feature_offsets, name):
-    # Need to sort hash_map_keys according to hash_map_values
-    # just in case they're not in order of being put in the dict
-    # hash_map_values is already 0 through len(hash_map_values)-1
-    hash_map_keys = hash_map_keys.flatten()
-    # why is this float32 in PercentileDiscretizerCalibrator.to_layer ????
-    # need int for indexing
-    hash_map_values = hash_map_values.flatten().astype(np.int32)
-    feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64)
-    for idx in range(len(hash_map_keys)):
-      feature_ids[hash_map_values[idx]] = hash_map_keys[idx]
+    def _create_discretizer_layer(
+        self,
+        n_feature: int,
+        hash_map_keys: np.ndarray,
+        hash_map_values: np.ndarray,
+        feature_offsets: np.ndarray,
+        name: str,
+    ) -> twml.contrib.layers.HashingDiscretizer:
+        # Need to sort hash_map_keys according to hash_map_values
+        # just in case they're not in order of being put in the dict
+        # hash_map_values is already 0 through len(hash_map_values)-1
+        hash_map_keys = hash_map_keys.flatten()
+        # why is this float32 in PercentileDiscretizerCalibrator.to_layer ????
+ # need int for indexing + hash_map_values = hash_map_values.flatten().astype(np.int32) + feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64) + for idx in range(len(hash_map_keys)): + feature_ids[hash_map_values[idx]] = hash_map_keys[idx] - return twml.contrib.layers.HashingDiscretizer( - feature_ids=feature_ids, - bin_vals=self._bin_vals.flatten(), - n_bin=self._n_bin + 1, # (self._n_bin + 1) bin_vals for each feature_id - out_bits=self._out_bits, - cost_per_unit=500, - name=name - ) + return twml.contrib.layers.HashingDiscretizer( + feature_ids=feature_ids, + bin_vals=self._bin_vals.flatten(), + n_bin=self._n_bin + 1, # (self._n_bin + 1) bin_vals for each feature_id + out_bits=self._out_bits, + cost_per_unit=500, + name=name, + ) diff --git a/twml/twml/contrib/calibrators/isotonic.py b/twml/twml/contrib/calibrators/isotonic.py index d03a75ff8..d8a325613 100644 --- a/twml/twml/contrib/calibrators/isotonic.py +++ b/twml/twml/contrib/calibrators/isotonic.py @@ -1,317 +1,370 @@ # pylint: disable=arguments-differ, unused-argument -''' Contains Isotonic Calibration''' +""" Contains Isotonic Calibration""" -from .calibrator import CalibrationFeature, Calibrator +from typing import Dict, Optional, Tuple -from absl import logging import numpy as np -from sklearn.isotonic import isotonic_regression import tensorflow.compat.v1 as tf import tensorflow_hub as hub +from absl import logging +from sklearn.isotonic import isotonic_regression + import twml import twml.layers +from .calibrator import CalibrationFeature, Calibrator DEFAULT_SAMPLE_WEIGHT = 1 -def sort_values(inputs, target, weight, ascending=True): - ''' - Sorts arrays based on the first array. - - Arguments: - inputs: - 1D array which will dictate the order which the remainder 2 arrays will be sorted - target: - 1D array - weight: - 1D array - ascending: - Boolean. If set to True (the default), sorts values in ascending order. - - Returns: - sorted inputs: - 1D array sorted by the order of `ascending` - sorted targets: - 1D array - sorted weight: - 1D array - ''' - # assert that the length of inputs and target are the same - if len(inputs) != len(target): - raise ValueError('Expecting inputs and target sizes to match') - # assert that the length of inputs and weight are the same - if len(inputs) != len(weight): - raise ValueError('Expecting inputs and weight sizes to match') - inds = inputs.argsort() - if not ascending: - inds = inds[::-1] - return inputs[inds], target[inds], weight[inds] - - -class IsotonicFeature(CalibrationFeature): - ''' - IsotonicFeature adds values, weights and targets to each feature and then runs - isotonic regression by calling `sklearn.isotonic.isotonic_regression - `_ - ''' - - def _get_bin_boundaries(self, n_samples, bins, similar_bins): +def sort_values( + inputs: np.ndarray, + target: np.ndarray, + weight: np.ndarray, + ascending: bool = True, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ - Calculates the sample indices that define bin boundaries - - Arguments: - n_samples: - (int) number of samples - bins: - (int) number of bins. Needs to be smaller or equal than n_samples. - similar_bins: - (bool) If True, samples will be distributed in bins of equal size (up to one sample). - If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. - Note that equal_bins=False can create a last bins with a very large number of samples. + Sorts arrays based on the first array. 
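+
+    Example (illustrative doctest-style sketch with toy arrays):
+
+        >>> inputs = np.array([3.0, 1.0, 2.0])
+        >>> target = np.array([30.0, 10.0, 20.0])
+        >>> weight = np.array([0.3, 0.1, 0.2])
+        >>> sort_values(inputs, target, weight)
+        (array([1., 2., 3.]), array([10., 20., 30.]), array([0.1, 0.2, 0.3]))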
+
+    Args:
+      inputs:
+        1D array which will dictate the order in which the remaining two arrays will be sorted
+      target:
+        1D array
+      weight:
+        1D array
+      ascending:
+        Boolean. If set to True (the default), sorts values in ascending order.
 
     Returns:
-      (list[int]) List of sample indices defining bin boundaries
+      sorted inputs:
+        1D array sorted by the order of `ascending`
+      sorted targets:
+        1D array
+      sorted weight:
+        1D array
     """
+    # assert that the length of inputs and target are the same
+    if len(inputs) != len(target):
+        raise ValueError("Expecting inputs and target sizes to match")
+    # assert that the length of inputs and weight are the same
+    if len(inputs) != len(weight):
+        raise ValueError("Expecting inputs and weight sizes to match")
 
-    if bins > n_samples:
-      raise ValueError(
-        "The number of bins needs to be less than or equal to the number of samples. "
-        "Currently bins={0} and n_samples={1}.".format(bins, n_samples)
-      )
-
-    step = n_samples // bins
+    inds = inputs.argsort()
+    if not ascending:
+        inds = inds[::-1]
+    return inputs[inds], target[inds], weight[inds]
 
-    if similar_bins:
-      # dtype=int will floor the linspace
-      bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int)
-    else:
-      bin_boundaries = range(0, step * bins, step)
-    bin_boundaries = np.append(bin_boundaries, n_samples)
-
-    return bin_boundaries
-
-  def calibrate(self, bins, similar_bins=False, debug=False):
-    '''Calibrates the IsotonicFeature into calibrated weights and bias.
-
-    1. Sorts the values of the feature class, based on the order of values
-    2. Performs isotonic regression using sklearn.isotonic.isotonic_regression
-    3. Performs the binning of the samples, in order to obtain the final weight and bias
-    which will be used for inference
-
-    Note that this method can only be called once.
-
-    Arguments:
-      bins:
-        number of bins.
-      similar_bins:
-        If True, samples will be distributed in bins of equal size (up to one sample).
-        If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples.
-        Note that equal_bins=False can create a last bins with a very large number of samples.
-      debug:
-        Defaults to False. If debug is set to true, output other parameters useful for debugging.
+class IsotonicFeature(CalibrationFeature):
+    """
+    IsotonicFeature adds values, weights and targets to each feature and then runs
+    isotonic regression by calling `sklearn.isotonic.isotonic_regression
+    `_
+    """
 
-    Returns:
-      [calibrated weight, calibrated bias]
-    '''
-    if self._calibrated:
-      raise RuntimeError("Can only calibrate once")
-    # parse through the dict to obtain the targets, weights and values
-    self._concat_arrays()
-    feature_targets = self._features_dict['targets']
-    feature_values = self._features_dict['values']
-    feature_weights = self._features_dict['weights']
-    srtd_feature_values, srtd_feature_targets, srtd_feature_weights = sort_values(
-      inputs=feature_values,
-      target=feature_targets,
-      weight=feature_weights
-    )
-    calibrated_feature_values = isotonic_regression(
-      srtd_feature_targets, sample_weight=srtd_feature_weights)
-    # create the final outputs for the prediction of each class
-    bpreds = []
-    btargets = []
-    bweights = []
-    rpreds = []
-
-    # Create bin boundaries
-    bin_boundaries = self._get_bin_boundaries(
-      len(calibrated_feature_values), bins, similar_bins=similar_bins)
-
-    for sidx, eidx in zip(bin_boundaries, bin_boundaries[1:]):
-      # separate each one of the arrays based on their respective bins
-      lpreds = srtd_feature_values[int(sidx):int(eidx)]
-      lrpreds = calibrated_feature_values[int(sidx):int(eidx)]
-      ltargets = srtd_feature_targets[int(sidx):int(eidx)]
-      lweights = srtd_feature_weights[int(sidx):int(eidx)]
-
-      # calculate the outputs (including the bpreds and rpreds)
-      bpreds.append(np.sum(lpreds * lweights) / (np.squeeze(np.sum(lweights))))
-      rpreds.append(np.sum(lrpreds * lweights) / (np.squeeze(np.sum(lweights))))
-      btargets.append(np.sum(ltargets * lweights) / (np.squeeze(np.sum(lweights))))
-      bweights.append(np.squeeze(np.sum(lweights)))
-    # transposing the bpreds and rpreds which will be used as input to the inference step
-    bpreds = np.asarray(bpreds).T
-    rpreds = np.asarray(rpreds).T
-    btargets = np.asarray(btargets).T
-    bweights = np.asarray(bweights).T
-    # setting _calibrated to be True which is necessary in order to prevent it to re-calibrate
-    self._calibrated = True
-    if debug:
-      return bpreds, rpreds, btargets, bweights
-    return bpreds, rpreds
+    def _get_bin_boundaries(
+        self, n_samples: int, bins: int, similar_bins: bool
+    ) -> np.ndarray:
+        """
+        Calculates the sample indices that define bin boundaries
+
+        Args:
+          n_samples:
+            (int) number of samples
+          bins:
+            (int) number of bins. Needs to be smaller than or equal to n_samples.
+          similar_bins:
+            (bool) If True, samples will be distributed in bins of equal size (up to one sample).
+            If False, bins will be filled with step = N_samples//bins, and the last bin will contain all remaining samples.
+            Note that similar_bins=False can create a last bin with a very large number of samples.
+
+        Returns:
+          (np.ndarray) List of sample indices defining bin boundaries
+        """
+
+        if bins > n_samples:
+            raise ValueError(
+                "The number of bins needs to be less than or equal to the number of samples. "
" + f"Currently bins={bins} and n_samples={n_samples}" + ) + + step = n_samples // bins + + if similar_bins: + # dtype=int will floor the linspace + bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int) + else: + bin_boundaries = range(0, step * bins, step) + + bin_boundaries = np.append(bin_boundaries, n_samples) + + return bin_boundaries + + def calibrate( + self, + bins: int, + similar_bins: bool = False, + debug: bool = False, + ) -> Tuple[np.ndarray, np.ndarray]: + """Calibrates the IsotonicFeature into calibrated weights and bias. + + 1. Sorts the values of the feature class, based on the order of values + 2. Performs isotonic regression using sklearn.isotonic.isotonic_regression + 3. Performs the binning of the samples, in order to obtain the final weight and bias + which will be used for inference + + Note that this method can only be called once. + + Args: + bins (int): + number of bins. + similar_bins (bool): + If True, samples will be distributed in bins of equal size (up to one sample). + If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. + Note that equal_bins=False can create a last bins with a very large number of samples. + debug (bool): + Defaults to False. If debug is set to true, output other parameters useful for debugging. + + Returns: + [calibrated weight, calibrated bias] + """ + + if self._calibrated: + raise RuntimeError("Can only calibrate once") + + # parse through the dict to obtain the targets, weights and values + self._concat_arrays() + feature_targets = self._features_dict["targets"] + feature_values = self._features_dict["values"] + feature_weights = self._features_dict["weights"] + srtd_feature_values, srtd_feature_targets, srtd_feature_weights = sort_values( + inputs=feature_values, + target=feature_targets, + weight=feature_weights, + ) + calibrated_feature_values = isotonic_regression( + srtd_feature_targets, + sample_weight=srtd_feature_weights, + ) + # create the final outputs for the prediction of each class + bpreds = [] + btargets = [] + bweights = [] + rpreds = [] + + # Create bin boundaries + bin_boundaries = self._get_bin_boundaries( + len(calibrated_feature_values), + bins, + similar_bins=similar_bins, + ) + + for sidx, eidx in zip(bin_boundaries, bin_boundaries[1:]): + # separate each one of the arrays based on their respective bins + lpreds = srtd_feature_values[int(sidx) : int(eidx)] + lrpreds = calibrated_feature_values[int(sidx) : int(eidx)] + ltargets = srtd_feature_targets[int(sidx) : int(eidx)] + lweights = srtd_feature_weights[int(sidx) : int(eidx)] + + # calculate the outputs (including the bpreds and rpreds) + bpreds.append(np.sum(lpreds * lweights) / (np.squeeze(np.sum(lweights)))) + rpreds.append(np.sum(lrpreds * lweights) / (np.squeeze(np.sum(lweights)))) + btargets.append( + np.sum(ltargets * lweights) / (np.squeeze(np.sum(lweights))) + ) + bweights.append(np.squeeze(np.sum(lweights))) + # transposing the bpreds and rpreds which will be used as input to the inference step + bpreds = np.asarray(bpreds).T + rpreds = np.asarray(rpreds).T + btargets = np.asarray(btargets).T + bweights = np.asarray(bweights).T + # setting _calibrated to be True which is necessary in order to prevent it to re-calibrate + self._calibrated = True + if debug: + return bpreds, rpreds, btargets, bweights + + return bpreds, rpreds class IsotonicCalibrator(Calibrator): - ''' Accumulates features and their respective values for isotonic calibration. 
- Internally, each feature's values is accumulated via its own isotonicFeature object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and - 3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``. - - ''' - - def __init__(self, n_bin, similar_bins=False, **kwargs): - ''' Constructs an isotonicCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for isotonic. - Note that each feature actually maps to ``n_bin+1`` output IDs. - ''' - super(IsotonicCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._similar_bins = similar_bins - self._ys_input = [] - self._xs_input = [] - self._isotonic_feature_dict = {} - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: output of prediction of build_graph for calibrator - ''' - weights = output['weights'] if 'weights' in output else None - return self.accumulate(output['predictions'], output['targets'], weights) - - def accumulate(self, predictions, targets, weights=None): - ''' - Accumulate a single batch of class predictions, class targets and class weights. - These are accumulated until calibrate() is called. - - Arguments: - predictions: - float matrix of class values. Each dimension corresponds to a different class. - Shape is ``[n, d]``, where d is the number of classes. - targets: - float matrix of class targets. Each dimension corresponds to a different class. - Shape ``[n, d]``, where d is the number of classes. - weights: - Defaults to weights of 1. - 1D array containing the weights of each prediction. - ''' - if predictions.shape != targets.shape: - raise ValueError( - 'Expecting predictions.shape == targets.shape, got %s and %s instead' % - (str(predictions.shape), str(targets.shape))) - if weights is not None: - if weights.ndim != 1: - raise ValueError('Expecting 1D weight, got %dD instead' % weights.ndim) - elif weights.size != predictions.shape[0]: - raise ValueError( - 'Expecting predictions.shape[0] == weights.size, got %d != %d instead' % - (predictions.shape[0], weights.size)) - # iterate through the rows of predictions and sets one class to each row - if weights is None: - weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT) - for class_key in range(predictions.shape[1]): - # gets the predictions and targets for that class - class_predictions = predictions[:, class_key] - class_targets = targets[:, class_key] - if class_key not in self._isotonic_feature_dict: - isotonic_feature = IsotonicFeature(class_key) - self._isotonic_feature_dict[class_key] = isotonic_feature - else: - isotonic_feature = self._isotonic_feature_dict[class_key] - isotonic_feature.add_values({'values': class_predictions, 'weights': weights, - 'targets': class_targets}) - - def calibrate(self, debug=False): - ''' - Calibrates each IsotonicFeature after accumulation is complete. - Results are stored in ``self._ys_input`` and ``self._xs_input`` - - Arguments: - debug: - Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``. 
-    '''
-    super(IsotonicCalibrator, self).calibrate()
-    bias_temp = []
-    weight_temp = []
-    logging.info("Beginning isotonic calibration.")
-    isotonic_features_dict = self._isotonic_feature_dict
-    for class_id in isotonic_features_dict:
-      bpreds, rpreds = isotonic_features_dict[class_id].calibrate(bins=self._n_bin, similar_bins=self._similar_bins)
-      weight_temp.append(bpreds)
-      bias_temp.append(rpreds)
-    # save isotonic results onto a matrix
-    self._xs_input = np.array(weight_temp, dtype=np.float32)
-    self._ys_input = np.array(bias_temp, dtype=np.float32)
-    logging.info("Isotonic calibration finished.")
-    if debug:
-      return np.array(weight_temp), np.array(bias_temp)
-    return None
-
-  def save(self, save_dir, name="default", verbose=False):
-    '''Save the calibrator into the given save_directory.
-    Arguments:
-      save_dir:
-        name of the saving directory. Default (string): "default".
-    '''
-    if not self._calibrated:
-      raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()")
-
-    # This module allows for the calibrator to save be saved as part of
-    # Tensorflow Hub (this will allow it to be used in further steps)
-    logging.info("You probably do not need to save the isotonic layer. \
-      So feel free to set save to False in the Trainer. \
-      Additionally this only saves the layer not the whole graph.")
-
-    def calibrator_module():
-      '''
-      Way to save Isotonic layer
-      '''
-      # The input to isotonic is a dense layer
-      inputs = tf.placeholder(tf.float32)
-      calibrator_layer = self.to_layer()
-      output = calibrator_layer(inputs)
-      # creates the signature to the calibrator module
-      hub.add_signature(inputs=inputs, outputs=output, name=name)
-
-    # exports the module to the save_dir
-    spec = hub.create_module_spec(calibrator_module)
-    with tf.Graph().as_default():
-      module = hub.Module(spec)
-      with tf.Session() as session:
-        module.export(save_dir, session)
-
-  def to_layer(self):
-    """ Returns a twml.layers.Isotonic Layer that can be used for feature discretization.
-    """
-    if not self._calibrated:
-      raise RuntimeError("Expecting prior call to calibrate()")
-
-    isotonic_layer = twml.layers.Isotonic(
-      n_unit=self._xs_input.shape[0], n_bin=self._xs_input.shape[1],
-      xs_input=self._xs_input, ys_input=self._ys_input,
-      **self._kwargs)
+    """Accumulates features and their respective values for isotonic calibration.
+    Internally, each feature's values are accumulated via its own IsotonicFeature object.
+    The steps for calibration are typically as follows:
+    1. accumulate feature values from batches by calling ``accumulate()``;
+    2. calibrate all features into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and
+    3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``.
 
-    return isotonic_layer
+    """
 
-  def get_layer_args(self, name=None):
-    """ Returns layer args. See ``Calibrator.get_layer_args`` for more detailed documentation """
-    return {'n_unit': self._xs_input.shape[0], 'n_bin': self._xs_input.shape[1]}
+    def __init__(self, n_bin: int, similar_bins: bool = False, **kwargs):
+        """Constructs an IsotonicCalibrator instance.
+
+        Args:
+          n_bin:
+            the number of bins per feature to use for isotonic.
+            Note that each feature actually maps to ``n_bin+1`` output IDs.
+          similar_bins:
+            If True, samples will be distributed in bins of equal size (up to one sample).
+            Defaults to False. If False, bins will be filled with step = N_samples//bins, and the last bin will contain all remaining samples.
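+
+        Example (illustrative sketch; ``preds``, ``targets`` and the bin count
+        are caller-supplied placeholders):
+
+            calibrator = IsotonicCalibrator(n_bin=25000)
+            calibrator.accumulate(preds, targets)  # repeat for each batch
+            calibrator.calibrate()
+            isotonic_layer = calibrator.to_layer()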
+ """ + super(IsotonicCalibrator, self).__init__(**kwargs) + self._n_bin = n_bin + self._similar_bins = similar_bins + self._ys_input = [] + self._xs_input = [] + self._isotonic_feature_dict = {} + + def accumulate_feature(self, output: Dict[str, np.ndarray]) -> None: + """ + Wrapper around accumulate for trainer API. + Args: + output (dict): + output of prediction of build_graph for calibrator + """ + weights = output["weights"] if "weights" in output else None + self.accumulate(output["predictions"], output["targets"], weights) + + def accumulate( + self, predictions: np.ndarray, targets: np.ndarray, weights: np.ndarray = None + ) -> None: + """ + Accumulate a single batch of class predictions, class targets and class weights. + These are accumulated until calibrate() is called. + + Args: + predictions (np.ndarray): + float matrix of class values. Each dimension corresponds to a different class. + Shape is ``[n, d]``, where d is the number of classes. + targets (np.ndarray): + float matrix of class targets. Each dimension corresponds to a different class. + Shape ``[n, d]``, where d is the number of classes. + weights (np.ndarray): + Defaults to weights of 1. + 1D array containing the weights of each prediction. + """ + if predictions.shape != targets.shape: + raise ValueError( + "Expecting predictions.shape == targets.shape, got %s and %s instead" + % (str(predictions.shape), str(targets.shape)) + ) + if weights is not None: + if weights.ndim != 1: + raise ValueError("Expecting 1D weight, got %dD instead" % weights.ndim) + elif weights.size != predictions.shape[0]: + raise ValueError( + "Expecting predictions.shape[0] == weights.size, got %d != %d instead" + % (predictions.shape[0], weights.size) + ) + # iterate through the rows of predictions and sets one class to each row + if weights is None: + weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT) + for class_key in range(predictions.shape[1]): + # gets the predictions and targets for that class + class_predictions = predictions[:, class_key] + class_targets = targets[:, class_key] + if class_key not in self._isotonic_feature_dict: + isotonic_feature = IsotonicFeature(class_key) + self._isotonic_feature_dict[class_key] = isotonic_feature + else: + isotonic_feature = self._isotonic_feature_dict[class_key] + isotonic_feature.add_values( + { + "values": class_predictions, + "weights": weights, + "targets": class_targets, + } + ) + + def calibrate(self, debug: bool = False) -> Optional[Tuple[np.ndarray, np.ndarray]]: + """ + Calibrates each IsotonicFeature after accumulation is complete. + Results are stored in ``self._ys_input`` and ``self._xs_input`` + + Args: + debug: + Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``. + + Returns: + If debug is set to True, returns the ``xs_input`` and ``ys_input``. 
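+
+        Example (illustrative sketch; assumes ``accumulate()`` has been called
+        with at least one batch):
+
+            xs, ys = calibrator.calibrate(debug=True)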
+        """
+        super(IsotonicCalibrator, self).calibrate()
+        bias_temp = []
+        weight_temp = []
+        logging.info("Beginning isotonic calibration.")
+        isotonic_features_dict = self._isotonic_feature_dict
+        for class_id in isotonic_features_dict:
+            bpreds, rpreds = isotonic_features_dict[class_id].calibrate(
+                bins=self._n_bin, similar_bins=self._similar_bins
+            )
+            weight_temp.append(bpreds)
+            bias_temp.append(rpreds)
+        # save isotonic results onto a matrix
+        self._xs_input = np.array(weight_temp, dtype=np.float32)
+        self._ys_input = np.array(bias_temp, dtype=np.float32)
+        logging.info("Isotonic calibration finished.")
+        if debug:
+            return np.array(weight_temp), np.array(bias_temp)
+
+    def save(
+        self,
+        save_dir: str,
+        name: str = "default",
+        verbose: bool = False,
+    ):  # pylint: disable=unused-argument
+        """Save the calibrator into the given save_directory.
+        Args:
+          save_dir (str):
+            name of the saving directory.
+          name (str):
+            name of the calibrator. Default (string): "default".
+        """
+        if not self._calibrated:
+            raise RuntimeError(
+                "Expecting prior call to calibrate(). Cannot save() prior to calibrate()"
+            )
+
+        # This module allows the calibrator to be saved as part of
+        # Tensorflow Hub (this will allow it to be used in further steps)
+        logging.info(
+            "You probably do not need to save the isotonic layer. \
+            So feel free to set save to False in the Trainer. \
+            Additionally this only saves the layer not the whole graph."
+        )
+
+        def calibrator_module():
+            """Way to save Isotonic layer"""
+
+            # The input to isotonic is a dense layer
+            inputs = tf.placeholder(tf.float32)
+            calibrator_layer = self.to_layer()
+            output = calibrator_layer(inputs)
+            # creates the signature to the calibrator module
+            hub.add_signature(inputs=inputs, outputs=output, name=name)
+
+        # exports the module to the save_dir
+        spec = hub.create_module_spec(calibrator_module)
+        with tf.Graph().as_default():
+            module = hub.Module(spec)
+            with tf.Session() as session:
+                module.export(save_dir, session)
+
+    def to_layer(self) -> twml.layers.Isotonic:
+        """Returns a twml.layers.Isotonic Layer that can be used for feature discretization."""
+        if not self._calibrated:
+            raise RuntimeError("Expecting prior call to calibrate()")
+
+        isotonic_layer = twml.layers.Isotonic(
+            n_unit=self._xs_input.shape[0],
+            n_bin=self._xs_input.shape[1],
+            xs_input=self._xs_input,
+            ys_input=self._ys_input,
+            **self._kwargs,
+        )
+
+        return isotonic_layer
+
+    def get_layer_args(self, name: str = None) -> Dict[str, int]:
+        """Returns layer args.
See ``Calibrator.get_layer_args`` for more detailed documentation""" + return {"n_unit": self._xs_input.shape[0], "n_bin": self._xs_input.shape[1]} diff --git a/twml/twml/contrib/calibrators/mdl.py b/twml/twml/contrib/calibrators/mdl.py index 0fe3265a4..66d5d5512 100644 --- a/twml/twml/contrib/calibrators/mdl.py +++ b/twml/twml/contrib/calibrators/mdl.py @@ -1,118 +1,132 @@ # pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains MDLFeature and MDLCalibrator used for MDL calibration ''' +""" Contains MDLFeature and MDLCalibrator used for MDL calibration """ import os -from .percentile_discretizer import PercentileDiscretizerCalibrator, PercentileDiscretizerFeature - -from absl import logging import numpy as np import tensorflow.compat.v1 as tf +from absl import logging + import twml import twml.layers +from .percentile_discretizer import ( + PercentileDiscretizerCalibrator, + PercentileDiscretizerFeature, +) DEFAULT_SAMPLE_WEIGHT = 1 class MDLFeature(PercentileDiscretizerFeature): - ''' Accumulates and calibrates a single sparse MDL feature. ''' + """Accumulates and calibrates a single sparse MDL feature.""" class MDLCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for MDL calibration. - Internally, each feature's values is accumulated via its own ``MDLFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. - - ''' - - def to_layer(self, name=None): + """Accumulates features and their respective values for MDL calibration. + Internally, each feature's values is accumulated via its own ``MDLFeature`` object. + The steps for calibration are typically as follows: + 1. accumulate feature values from batches by calling ``accumulate()``; + 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and + 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. - Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is + def to_layer(self, name: str = None): + """ + Returns a twml.layers.PercentileDiscretizer Layer + that can be used for feature discretization. 
+ + Args: + name (str): + name-scope of the PercentileDiscretizer layer + """ + n_feature = len(self._discretizer_feature_dict) + max_discretizer_feature = n_feature * (self._n_bin + 1) + + if not self._calibrated: + raise RuntimeError("Expecting prior call to calibrate()") + + if self._bin_ids.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_ids.shape[0] != len(self._discretizer_feature_dict)" + ) + if self._bin_vals.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_vals.shape[0] != len(self._discretizer_feature_dict)" + ) + + # can add at most #features * (n_bin+1) new feature ids + if (1 << self._out_bits) <= max_discretizer_feature: + raise ValueError( + """Maximum number of features created by discretizer is %d but requested that the output be limited to %d values (%d bits), which is smaller than that. Please ensure the output has enough bits to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = twml.layers.MDL( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets, - **self._kwargs - ) - - return discretizer - - def save(self, save_dir, name='calibrator', verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory - name: - name for the graph scope. Passed to to_layer(name=name) to set - scope of layer. 
- ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - layer_args = self.get_layer_args() - - calibrator_filename = os.path.join(save_dir, name + '.json.tf') - calibrator_dict = { - 'layer_args': layer_args, - 'saved_layer_scope': name + '/', - } - twml.write_file(calibrator_filename, calibrator_dict, encode='json') - - if verbose: - logging.info("The layer graph and other information necessary ") - logging.info("for multi-phase training is saved in directory:") - logging.info(save_dir) - logging.info("This directory can be specified as --init_from_dir argument.") - logging.info("") - logging.info("Other information is available in: %s.json.tf", name) - logging.info("This file can be loaded with twml.read_file(decode='json) to obtain ") - logging.info("layer_args, saved_layer_scope and variable_names") - - graph = tf.Graph() - # save graph for tensorboard as well - writer = tf.summary.FileWriter(logdir=save_dir, graph=graph) - - with tf.Session(graph=graph) as sess: - self.write_summary(writer, sess) - writer.flush() + % (max_discretizer_feature, (1 << self._out_bits), self._out_bits) + ) + + # build feature_offsets, hash_map_keys, hash_map_values + feature_offsets = np.arange( + 0, max_discretizer_feature, self._n_bin + 1, dtype="int64" + ) + hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) + hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) + + discretizer = twml.layers.MDL( + n_feature=n_feature, + n_bin=self._n_bin, + name=name, + out_bits=self._out_bits, + hash_keys=hash_map_keys, + hash_values=hash_map_values, + bin_ids=self._bin_ids.flatten(), + bin_values=self._bin_vals.flatten(), + feature_offsets=feature_offsets, + **self._kwargs + ) + + return discretizer + + def save(self, save_dir, name="calibrator", verbose=False): + """Save the calibrator into the given save_directory. + Args: + save_dir: + name of the saving directory + name: + name for the graph scope. Passed to to_layer(name=name) to set + scope of layer. 
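+
+        Example (an illustrative sketch; the path is hypothetical and
+        ``calibrate()`` must already have been called)::
+
+            calibrator.save("/tmp/mdl", name="calibrator", verbose=True)
+            # writes /tmp/mdl/calibrator.json.tf (layer args + scope) and a
+            # TensorBoard summary via write_summary()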
+        """
+        if not self._calibrated:
+            raise RuntimeError(
+                "Expecting prior call to calibrate(). Cannot save() prior to calibrate()"
+            )
+
+        layer_args = self.get_layer_args()
+
+        calibrator_filename = os.path.join(save_dir, name + ".json.tf")
+        calibrator_dict = {
+            "layer_args": layer_args,
+            "saved_layer_scope": name + "/",
+        }
+        twml.write_file(calibrator_filename, calibrator_dict, encode="json")
+
+        if verbose:
+            logging.info("The layer graph and other information necessary ")
+            logging.info("for multi-phase training is saved in directory:")
+            logging.info(save_dir)
+            logging.info("This directory can be specified as --init_from_dir argument.")
+            logging.info("")
+            logging.info("Other information is available in: %s.json.tf", name)
+            logging.info(
+                "This file can be loaded with twml.read_file(decode='json') to obtain "
+            )
+            logging.info("layer_args, saved_layer_scope and variable_names")
+
+        graph = tf.Graph()
+        # save graph for tensorboard as well
+        writer = tf.summary.FileWriter(logdir=save_dir, graph=graph)
+
+        with tf.Session(graph=graph) as sess:
+            self.write_summary(writer, sess)
+            writer.flush()
diff --git a/twml/twml/contrib/calibrators/percentile_discretizer.py b/twml/twml/contrib/calibrators/percentile_discretizer.py
index eefce62c2..1e65c31e3 100644
--- a/twml/twml/contrib/calibrators/percentile_discretizer.py
+++ b/twml/twml/contrib/calibrators/percentile_discretizer.py
@@ -1,577 +1,613 @@
 # pylint: disable=arguments-differ,no-member,too-many-statements
-''' Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \
-    for PercentileDiscretizer calibration '''
+""" Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \
+    for PercentileDiscretizer calibration """
-
-from .calibrator import CalibrationFeature, Calibrator
-
 import os
+from typing import Any, Dict, Optional, Tuple
+
 import numpy as np
 import tensorflow.compat.v1 as tf
 import tensorflow_hub as hub
+
 import twml
 import twml.layers
+from .calibrator import CalibrationFeature, Calibrator
 
 DEFAULT_SAMPLE_WEIGHT = 1
 
 
 class PercentileDiscretizerFeature(CalibrationFeature):
-  ''' Accumulates and calibrates a single sparse PercentileDiscretizer feature. '''
-
-  @staticmethod
-  def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer):
-    '''
-    Determine how many training values fell into a given bin during calibration.
-    This is calculated by finding the index of the first appearance of each bin
-    boundary in values (values may repeat, so that isn't trivially in indices.)
-    Subtracting each bin boundary index from the next tells you how many values fall in
-    that bin.
-    To get this to calculate the last bin correctly, len(values) is appended to the
-    list of bound indices.
-
-    This assumes that ``bin_vals`` excludes np.inf bin boundaries when
-    PercentileDiscretizer was calibrated
-    with fewer values than bins.
-
-    Arguments:
-      values:
-        1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending
-      indices:
-        1D int32 ndarray of the indices (in values) of the bin boundaries
-      bin_vals:
-        1D ndarray containing the bin boundaries
-      bin_counts_buffer:
-        ndarray buffer for returning the PercentileDiscretizer histogram
-    '''
-    # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t.
x[i] != x[i+1] - # append index of the last bin since that cannot be empty with how - # PercentileDiscretizer is implemented - nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) - bin_start_indices = indices.take(nonempty_bins) - - # if multiples of a bin's lower bound value exist, find the first one - for (i, idx) in enumerate(bin_start_indices): - cur_idx = idx - while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: - bin_start_indices[i] = cur_idx = cur_idx - 1 - - # the end of each bin is the start of the next bin, - # until the last, which is the end of the array - # broadcast the counts to the nonempty bins, 0 otherwise - bin_counts_buffer[:] = 0 - bin_counts_buffer[nonempty_bins] = np.diff(np.append(bin_start_indices, values.size)) - - def calibrate( - self, - bin_vals, percentiles, percentile_indices, - bin_counts_buffer=None): - '''Calibrates the PercentileDiscretizerFeature into bin values for - use in PercentileDiscretizerCalibrator. - Note that this method can only be called once. - - Arguments: - bin_vals: - Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. - Will be updated with the results of the calibration. - A 1D ndarray. - percentiles: - 1D array of size n_bin with values ranging from 0 to 1. - For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` - percentile_indices: - Empty 1D array of size n_bin used to store intermediate results when - calling twml.twml_optim_nearest_interpolation(). - For example, np.empty(self._n_bin + 1, dtype=np.float32). - bin_counts_buffer: - optional ndarray buffer used for retaining count of values per PercentileDiscretizer - bucket (for debug and feature exploration purposes) - - Returns: - calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` - ''' - if self._calibrated: - raise RuntimeError("Can only calibrate once") - if bin_vals.ndim != 1: - raise RuntimeError("Expecting bin_vals row") - - # # concatenate values and weights buffers - self._concat_arrays() - feature_values = self._features_dict['values'] - feature_weights = self._features_dict['weights'] - - # get features ready for the bins, order array indices by feature values. 
- indices = np.argsort(feature_values) - - # get ordered values and weights using array indices - values = feature_values.take(indices) - weights = feature_weights.take(indices) - - # Normalizes the sum of weights to be between 0 and 1 - weights = np.cumsum(weights, out=feature_weights) - weights -= weights[0] - if weights[-1] > 0: # prevent zero-division - weights /= weights[-1] - - # Check if we have less values than bin_vals - if values.size < bin_vals.size: - # Fills all the bins with a value that won't ever be reached - bin_vals.fill(np.inf) - # Forces the first to be -inf - bin_vals[0] = -np.inf - # Copies the values as boundaries - bin_vals[1:values.size + 1] = values - - if bin_counts_buffer is not None: - # slice out bins with +/-np.inf boundary -- their count will be zero anyway - # we can't just assume all other bins will have 1 value since there can be dups - short_indices = np.arange(values.size, dtype=np.int32) - bin_counts_buffer.fill(0) - self._gather_debug_info( - values, short_indices, bin_vals[1:values.size + 1], - bin_counts_buffer[1:values.size + 1]) - - else: - # Gets the indices for the values that define the boundary for the bins - indices_float = np.arange(0, weights.size, dtype=np.float32) - - # Gets things in the correct shape for the linear interpolation - weights = weights.reshape(1, weights.size) - indices_float = indices_float.reshape(1, weights.size) - - # wrap ndarrays into twml.Array - percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1)) - weights_tarray = twml.Array(weights) - indices_float_tarray = twml.Array(indices_float) - percentile_indices_tarray = twml.Array(percentile_indices.reshape(percentiles.size, 1)) - - # Performs the binary search to find the indices corresponding to the percentiles - err = twml.CLIB.twml_optim_nearest_interpolation( - percentile_indices_tarray.handle, percentiles_tarray.handle, # output, input - weights_tarray.handle, indices_float_tarray.handle # xs, ys - ) - if err != 1000: - raise ValueError("""twml.CLIB.twml_optim_nearest_interpolation - caught an error (see previous stdout). Error code: """ % err) - - indices = indices[:bin_vals.size] - indices[:] = percentile_indices - indices[0] = 0 - indices[-1] = weights.size - 1 - - # Gets the values at those indices and copies them into bin_vals - values.take(indices, out=bin_vals) - - # get # of values per bucket - if bin_counts_buffer is not None: - self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer) - - self._calibrated = True + """Accumulates and calibrates a single sparse PercentileDiscretizer feature.""" + + @staticmethod + def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer): + """ + Determine how many training values fell into a given bin during calibration. + This is calculated by finding the index of the first appearance of each bin + boundary in values (values may repeat, so that isn't trivially in indices.) + Subtracting each bin boundary index from the next tells you how many values fall in + that bin. + To get this to calculate the last bin correctly, len(values) is appended to the + list of bound indices. + + This assumes that ``bin_vals`` excludes np.inf bin boundaries when + PercentileDiscretizer was calibrated + with fewer values than bins. 
+ + Args: + values: + 1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending + indices: + 1D int32 ndarray of the indices (in values) of the bin boundaries + bin_vals: + 1D ndarray containing the bin boundaries + bin_counts_buffer: + ndarray buffer for returning the PercentileDiscretizer histogram + """ + # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t. x[i] != x[i+1] + # append index of the last bin since that cannot be empty with how + # PercentileDiscretizer is implemented + nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) + bin_start_indices = indices.take(nonempty_bins) + + # if multiples of a bin's lower bound value exist, find the first one + for i, idx in enumerate(bin_start_indices): + cur_idx = idx + while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: + bin_start_indices[i] = cur_idx = cur_idx - 1 + + # the end of each bin is the start of the next bin, + # until the last, which is the end of the array + # broadcast the counts to the nonempty bins, 0 otherwise + bin_counts_buffer[:] = 0 + bin_counts_buffer[nonempty_bins] = np.diff( + np.append(bin_start_indices, values.size) + ) + + def calibrate( + self, bin_vals, percentiles, percentile_indices, bin_counts_buffer=None + ): + """Calibrates the PercentileDiscretizerFeature into bin values for + use in PercentileDiscretizerCalibrator. + Note that this method can only be called once. + + Args: + bin_vals: + Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. + Will be updated with the results of the calibration. + A 1D ndarray. + percentiles: + 1D array of size n_bin with values ranging from 0 to 1. + For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` + percentile_indices: + Empty 1D array of size n_bin used to store intermediate results when + calling twml.twml_optim_nearest_interpolation(). + For example, np.empty(self._n_bin + 1, dtype=np.float32). + bin_counts_buffer: + optional ndarray buffer used for retaining count of values per PercentileDiscretizer + bucket (for debug and feature exploration purposes) + + Returns: + calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` + """ + if self._calibrated: + raise RuntimeError("Can only calibrate once") + if bin_vals.ndim != 1: + raise RuntimeError("Expecting bin_vals row") + + # # concatenate values and weights buffers + self._concat_arrays() + feature_values = self._features_dict["values"] + feature_weights = self._features_dict["weights"] + + # get features ready for the bins, order array indices by feature values. 
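+        # illustrative example (not from the original source): for
+        # feature_values == [0.3, 0.1, 0.2], np.argsort returns [1, 2, 0], so
+        # feature_values.take(indices) below yields the ascending [0.1, 0.2, 0.3]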
+        indices = np.argsort(feature_values)
+
+        # get ordered values and weights using array indices
+        values = feature_values.take(indices)
+        weights = feature_weights.take(indices)
+
+        # Normalizes the sum of weights to be between 0 and 1
+        weights = np.cumsum(weights, out=feature_weights)
+        weights -= weights[0]
+        if weights[-1] > 0:  # prevent zero-division
+            weights /= weights[-1]
+
+        # Check if we have fewer values than bin_vals
+        if values.size < bin_vals.size:
+            # Fills all the bins with a value that won't ever be reached
+            bin_vals.fill(np.inf)
+            # Forces the first to be -inf
+            bin_vals[0] = -np.inf
+            # Copies the values as boundaries
+            bin_vals[1 : values.size + 1] = values
+
+            if bin_counts_buffer is not None:
+                # slice out bins with +/-np.inf boundary -- their count will be zero anyway
+                # we can't just assume all other bins will have 1 value since there can be dups
+                short_indices = np.arange(values.size, dtype=np.int32)
+                bin_counts_buffer.fill(0)
+                self._gather_debug_info(
+                    values,
+                    short_indices,
+                    bin_vals[1 : values.size + 1],
+                    bin_counts_buffer[1 : values.size + 1],
+                )
+
+        else:
+            # Gets the indices for the values that define the boundary for the bins
+            indices_float = np.arange(0, weights.size, dtype=np.float32)
+
+            # Gets things in the correct shape for the linear interpolation
+            weights = weights.reshape(1, weights.size)
+            indices_float = indices_float.reshape(1, weights.size)
+
+            # wrap ndarrays into twml.Array
+            percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1))
+            weights_tarray = twml.Array(weights)
+            indices_float_tarray = twml.Array(indices_float)
+            percentile_indices_tarray = twml.Array(
+                percentile_indices.reshape(percentiles.size, 1)
+            )
+
+            # Performs the binary search to find the indices corresponding to the percentiles
+            err = twml.CLIB.twml_optim_nearest_interpolation(
+                percentile_indices_tarray.handle,
+                percentiles_tarray.handle,  # output, input
+                weights_tarray.handle,
+                indices_float_tarray.handle,  # xs, ys
+            )
+            if err != 1000:
+                raise ValueError(
+                    """twml.CLIB.twml_optim_nearest_interpolation
+                    caught an error (see previous stdout). Error code: %d"""
+                    % err
+                )
+
+            indices = indices[: bin_vals.size]
+            indices[:] = percentile_indices
+            indices[0] = 0
+            indices[-1] = weights.size - 1
+
+            # Gets the values at those indices and copies them into bin_vals
+            values.take(indices, out=bin_vals)
+
+            # get # of values per bucket
+            if bin_counts_buffer is not None:
+                self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer)
+
+        self._calibrated = True
 
 
 class PercentileDiscretizerCalibrator(Calibrator):
-  ''' Accumulates features and their respective values for PercentileDiscretizer calibration.
-  Internally, each feature's values is accumulated via its own
-  ``PercentileDiscretizerFeature`` object.
-  The steps for calibration are typically as follows:
-
-  1. accumulate feature values from batches by calling ``accumulate()``;
-  2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and
-  3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``.
-
-  '''
-
-  def __init__(self, n_bin, out_bits, bin_histogram=True,
-               allow_empty_calibration=False, **kwargs):
-    ''' Constructs an PercentileDiscretizerCalibrator instance.
-
-    Arguments:
-      n_bin:
-        the number of bins per feature to use for PercentileDiscretizer.
-        Note that each feature actually maps to n_bin+1 output IDs.
-      out_bits:
-        The maximum number of bits to use for the output IDs.
- 2**out_bits must be greater than bin_ids.size or an error is raised. - bin_histogram: - When True (the default), gathers information during calibration - to build a bin_histogram. - allow_empty_calibration: - allows operation where we might not calibrate any features. - Default False to error out if no features were calibrated. - Typically, values of uncalibrated features pass through discretizers - untouched (though the feature ids will be truncated to obey out_bits). - ''' - super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._out_bits = out_bits - - self._bin_ids = None - self._bin_vals = np.empty(0, dtype=np.float32) # Note changed from 64 (v1) to 32 (v2) - - self._bin_histogram = bin_histogram - self._bin_histogram_dict = None - - self._hash_map_counter = 0 - self._hash_map = {} - - self._discretizer_feature_dict = {} - self._allow_empty_calibration = allow_empty_calibration - - @property - def bin_ids(self): - ''' - Gets bin_ids - ''' - return self._bin_ids - - @property - def bin_vals(self): - ''' - Gets bin_vals - ''' - return self._bin_vals - - @property - def hash_map(self): - ''' - Gets hash_map - ''' - return self._hash_map - - @property - def discretizer_feature_dict(self): - ''' - Gets feature_dict - ''' - return self._discretizer_feature_dict - - def accumulate_features(self, inputs, name): - ''' - Wrapper around accumulate for PercentileDiscretizer. - Arguments: - inputs: - batch that will be accumulated - name: - name of the tensor that will be accumulated - - ''' - sparse_tf = inputs[name] - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - weights = np.take(inputs["weights"], ids) - return self.accumulate(indices, sparse_tf.values, weights) - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: - output of prediction of build_graph for calibrator - ''' - return self.accumulate(output['feature_ids'], output['feature_values'], output['weights']) - - def accumulate(self, feature_keys, feature_vals, weights=None): - '''Accumulate a single batch of feature keys, values and weights. - - These are accumulate until ``calibrate()`` is called. - - Arguments: - feature_keys: - 1D int64 array of feature keys. - feature_vals: - 1D float array of feature values. Each element of this array - maps to the commensurate element in ``feature_keys``. - weights: - Defaults to weights of 1. - 1D array containing the weights of each feature key, value pair. - Typically, this is the weight of each sample (but you still need - to provide one weight per key,value pair). - Each element of this array maps to the commensurate element in feature_keys. 
- ''' - if feature_keys.ndim != 1: - raise ValueError('Expecting 1D feature_keys, got %dD' % feature_keys.ndim) - if feature_vals.ndim != 1: - raise ValueError('Expecting 1D feature_values, got %dD' % feature_vals.ndim) - if feature_vals.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == feature_values.size, got %d != %d' % - (feature_keys.size, feature_vals.size)) - if weights is not None: - weights = np.squeeze(weights) - if weights.ndim != 1: - raise ValueError('Expecting 1D weights, got %dD' % weights.ndim) - elif weights.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == weights.size, got %d != %d' % - (feature_keys.size, weights.size)) - if weights is None: - weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT) - unique_keys = np.unique(feature_keys) - for feature_id in unique_keys: - idx = np.where(feature_keys == feature_id) - if feature_id not in self._discretizer_feature_dict: - self._hash_map[feature_id] = self._hash_map_counter - # unlike v1, the hash_map_counter is incremented AFTER assignment. - # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3 - self._hash_map_counter += 1 - # creates a new cache if we never saw the feature before - discretizer_feature = PercentileDiscretizerFeature(feature_id) - self._discretizer_feature_dict[feature_id] = discretizer_feature - else: - discretizer_feature = self._discretizer_feature_dict[feature_id] - discretizer_feature.add_values({'values': feature_vals[idx], 'weights': weights[idx]}) - - def calibrate(self, debug=False): - ''' - Calibrates each PercentileDiscretizer feature after accumulation is complete. - - Arguments: - debug: - Boolean to request debug info be returned by the method. - (see Returns section below) - - The calibration results are stored in two matrices: - bin_ids: - 2D array of size number of accumulate ``features x n_bin+1``. - Contains the new IDs generated by PercentileDiscretizer. Each row maps to a feature. - Each row maps to different value bins. The IDs - are in the range ``1 -> bin_ids.size+1`` - bin_vals: - 2D array of the same size as bin_ids. - Each row maps to a feature. Each row contains the bin boundaries. - These boundaries represent feature values. - - Returns: - if debug is True, the method returns - - - 1D int64 array of feature_ids - - 2D float32 array copy of bin_vals (the bin boundaries) for each feature - - 2D int64 array of bin counts corresponding to the bin boundaries - - ''' - n_feature = len(self._discretizer_feature_dict) - if n_feature == 0 and not self._allow_empty_calibration: - raise RuntimeError("Need to accumulate some features for calibration\n" - "Likely, the calibration data is empty. 
This can\n" - "happen if the dataset is small, or if the following\n" - "cli args are set too low:\n" - " --discretizer_keep_rate (default=0.0008)\n" - " --discretizer_parts_downsampling_rate (default=0.2)\n" - "Consider increasing the values of these args.\n" - "To allow empty calibration data (and degenerate discretizer),\n" - "use the allow_empty_calibration input of the constructor.") - - self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) - self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) - - self._bin_vals.resize(n_feature, self._n_bin + 1) - - # buffers shared by PercentileDiscretizerFeature.calibrate() - percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) - - # Tensor from 0 to 1 in the number of steps provided - percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) - - if debug or self._bin_histogram: - debug_feature_ids = np.empty(n_feature, dtype=np.int64) - bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) - - # progress bar for calibration phase - progress_bar = tf.keras.utils.Progbar(n_feature) - - discretizer_features_dict = self._discretizer_feature_dict - for i, feature_id in enumerate(discretizer_features_dict): - if debug or self._bin_histogram: - debug_feature_ids[self._hash_map[feature_id]] = feature_id - bin_counts_buffer = bin_counts[self._hash_map[feature_id]] - else: - bin_counts_buffer = None - - # calibrate each PercentileDiscretizer feature (puts results in bin_vals) - discretizer_features_dict[feature_id].calibrate( - self._bin_vals[self._hash_map[feature_id]], # Gets feature-values - percentiles, percentile_indices, - bin_counts_buffer=bin_counts_buffer - ) - - # update progress bar 20 times - if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): - progress_bar.update(i + 1) - - super(PercentileDiscretizerCalibrator, self).calibrate() - - if self._bin_histogram: - # save bin histogram data for later - self._bin_histogram_dict = { - 'feature_ids': debug_feature_ids, - 'bin_counts': bin_counts, - 'bin_vals': self._bin_vals, - 'out_bits': self._out_bits, - } - - if debug: - return debug_feature_ids, self._bin_vals.copy(), bin_counts - - return None - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.layers.PercentileDiscretizer( - n_feature=n_feature, - n_bin=self._n_bin, - out_bits=self._out_bits, - bin_values=self._bin_vals.flatten(), - hash_keys=hash_map_keys, - hash_values=hash_map_values.astype(np.int64), - bin_ids=self._bin_ids.flatten().astype(np.int64), - feature_offsets=feature_offsets, - name=name, - **self._kwargs - ) - - def to_layer(self, name=None): - """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. + """Accumulates features and their respective values for PercentileDiscretizer calibration. + Internally, each feature's values is accumulated via its own + ``PercentileDiscretizerFeature`` object. 
+ The steps for calibration are typically as follows: - Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is - %d but requested that the output be limited to %d values (%d bits), - which is smaller than that. Please ensure the output has enough bits - to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = self._create_discretizer_layer(n_feature, hash_map_keys, - hash_map_values, feature_offsets, name) - - return discretizer - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - See twml.calibrator.Calibrator.get_layer_args for more detailed documentation. - ''' - layer_args = { - 'n_feature': len(self._discretizer_feature_dict), - 'n_bin': self._n_bin, - 'out_bits': self._out_bits, - } - - return layer_args - - def add_hub_signatures(self, name): - """ - Add Hub Signatures for each calibrator + 1. accumulate feature values from batches by calling ``accumulate()``; + 2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and + 3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``. - Arguments: - name: - Calibrator name - """ - sparse_tf = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - hub.add_signature( - inputs=sparse_tf, - outputs=calibrator_layer(sparse_tf, keep_inputs=False), - name=name) - - def write_summary(self, writer, sess=None): - """ - This method is called by save() to write a histogram of - PercentileDiscretizer feature bins to disk. A histogram is included for each - feature. - - Arguments: - writer: - tf.summary.FilteWriter instance. - used to add summaries to event files for inclusion in tensorboard. - sess: - tf.Session instance. Used to produces summaries for the writer. """ - bin_counts_ph = tf.placeholder(tf.int64) - bin_counts = self._bin_histogram_dict['bin_counts'] - # Record that distribution into a histogram summary - histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph) - for i in range(bin_counts.shape[0]): - bin_counts_summary = sess.run(histo, feed_dict={bin_counts_ph: bin_counts[i]}) - writer.add_summary(bin_counts_summary, global_step=i) - - def write_summary_json(self, save_dir, name="default"): - """ - Export bin information to HDFS. - - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". 
- """ - # Since the size is small: (# of bins) * (# of features), we always dump the file. - discretizer_export_bin_filename = os.path.join(save_dir, name + '_bin.json') - discretizer_export_bin_dict = { - 'feature_ids': self._bin_histogram_dict['feature_ids'].tolist(), - 'bin_boundaries': self._bin_histogram_dict['bin_vals'].tolist(), - 'output_bits': self._bin_histogram_dict['out_bits'] - } - twml.write_file(discretizer_export_bin_filename, discretizer_export_bin_dict, encode='json') - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory using TF Hub. - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - # creates the signature to the calibrator module - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=False), - name=name) - # and another signature for keep_inputs mode - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=True), - name=name + '_keep_inputs') - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - self.write_summary_json(save_dir, name) + def __init__( + self, + n_bin: int, + out_bits: int, + bin_histogram: bool = True, + allow_empty_calibration: bool = False, + **kwargs + ): + """Constructs an PercentileDiscretizerCalibrator instance. + + Args: + n_bin: + the number of bins per feature to use for PercentileDiscretizer. + Note that each feature actually maps to n_bin+1 output IDs. + out_bits: + The maximum number of bits to use for the output IDs. + 2**out_bits must be greater than bin_ids.size or an error is raised. + bin_histogram: + When True (the default), gathers information during calibration + to build a bin_histogram. + allow_empty_calibration: + allows operation where we might not calibrate any features. + Default False to error out if no features were calibrated. + Typically, values of uncalibrated features pass through discretizers + untouched (though the feature ids will be truncated to obey out_bits). 
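+
+        Example (an illustrative sketch; the parameter values are arbitrary)::
+
+            calibrator = PercentileDiscretizerCalibrator(n_bin=10, out_bits=16)
+            # each calibrated feature maps to n_bin + 1 = 11 output IDs, so
+            # to_layer() requires 2**16 > n_feature * 11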
+ """ + super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) + self._n_bin = n_bin + self._out_bits = out_bits + + self._bin_ids = None + self._bin_vals = np.empty( + 0, dtype=np.float32 + ) # Note changed from 64 (v1) to 32 (v2) + + self._bin_histogram = bin_histogram + self._bin_histogram_dict = None + + self._hash_map_counter = 0 + self._hash_map = {} + + self._discretizer_feature_dict = {} + self._allow_empty_calibration = allow_empty_calibration + + @property + def bin_ids(self) -> np.ndarray: + """Gets bin_ids""" + return self._bin_ids + + @property + def bin_vals(self) -> np.ndarray: + """Gets bin_vals""" + return self._bin_vals + + @property + def hash_map(self) -> Dict[str, int]: + """Gets hash_map""" + return self._hash_map + + @property + def discretizer_feature_dict(self) -> Dict[str, PercentileDiscretizerFeature]: + """Gets feature_dict""" + return self._discretizer_feature_dict + + def accumulate_features(self, inputs: Dict[str, Any], name: str) -> None: + """ + Wrapper around accumulate for PercentileDiscretizer. + Args: + inputs (dict): + batch that will be accumulated + name (str): + name of the tensor that will be accumulated + """ + sparse_tf = inputs[name] + indices = sparse_tf.indices[:, 1] + ids = sparse_tf.indices[:, 0] + weights = np.take(inputs["weights"], ids) + return self.accumulate(indices, sparse_tf.values, weights) + + def accumulate_feature(self, output: Dict[str, Any]) -> None: + """ + Wrapper around accumulate for trainer API. + Args: + output: + output of prediction of build_graph for calibrator + """ + return self.accumulate( + output["feature_ids"], output["feature_values"], output["weights"] + ) + + def accumulate( + self, + feature_keys: np.ndarray, + feature_vals: np.ndarray, + weights: np.ndarray = None, + ) -> None: + """Accumulate a single batch of feature keys, values and weights. + These are accumulate until ``calibrate()`` is called. + + Args: + feature_keys (np.ndarray): + 1D int64 array of feature keys. + feature_vals (np.ndarray): + 1D float array of feature values. Each element of this array + maps to the commensurate element in ``feature_keys``. + weights (np.ndarray, optional): + Defaults to weights of 1. + 1D array containing the weights of each feature key, value pair. + Typically, this is the weight of each sample (but you still need + to provide one weight per key,value pair). + Each element of this array maps to the commensurate element in feature_keys. 
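+
+        Example (an illustrative sketch with two feature keys and implicit
+        unit weights)::
+
+            feature_keys = np.array([101, 101, 202], dtype=np.int64)
+            feature_vals = np.array([0.5, 1.5, 2.0], dtype=np.float32)
+            calibrator.accumulate(feature_keys, feature_vals)
+            # values 0.5 and 1.5 accumulate under feature 101, and 2.0 under 202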
+        """
+        if feature_keys.ndim != 1:
+            raise ValueError("Expecting 1D feature_keys, got %dD" % feature_keys.ndim)
+        if feature_vals.ndim != 1:
+            raise ValueError("Expecting 1D feature_values, got %dD" % feature_vals.ndim)
+        if feature_vals.size != feature_keys.size:
+            raise ValueError(
+                "Expecting feature_keys.size == feature_values.size, got %d != %d"
+                % (feature_keys.size, feature_vals.size)
+            )
+        if weights is not None:
+            weights = np.squeeze(weights)
+            if weights.ndim != 1:
+                raise ValueError("Expecting 1D weights, got %dD" % weights.ndim)
+            elif weights.size != feature_keys.size:
+                raise ValueError(
+                    "Expecting feature_keys.size == weights.size, got %d != %d"
+                    % (feature_keys.size, weights.size)
+                )
+        if weights is None:
+            weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT)
+        unique_keys = np.unique(feature_keys)
+        for feature_id in unique_keys:
+            idx = np.where(feature_keys == feature_id)
+            if feature_id not in self._discretizer_feature_dict:
+                self._hash_map[feature_id] = self._hash_map_counter
+                # unlike v1, the hash_map_counter is incremented AFTER assignment.
+                # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3
+                self._hash_map_counter += 1
+                # creates a new cache if we never saw the feature before
+                discretizer_feature = PercentileDiscretizerFeature(feature_id)
+                self._discretizer_feature_dict[feature_id] = discretizer_feature
+            else:
+                discretizer_feature = self._discretizer_feature_dict[feature_id]
+            discretizer_feature.add_values(
+                {"values": feature_vals[idx], "weights": weights[idx]}
+            )
+
+    def calibrate(
+        self, debug: bool = False
+    ) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
+        """
+        Calibrates each PercentileDiscretizer feature after accumulation is complete.
+
+        The calibration results are stored in two matrices:
+            bin_ids:
+                2D array of size number of accumulated ``features x n_bin+1``.
+                Contains the new IDs generated by PercentileDiscretizer. Each row
+                maps to a feature; the IDs are in the range ``1 -> bin_ids.size+1``.
+            bin_vals:
+                2D array of the same size as bin_ids. Each row maps to a feature
+                and contains the bin boundaries (feature values).
+
+        Args:
+            debug (bool):
+                Boolean to request debug info be returned by the method.
+                (see Returns section below)
+
+        Returns:
+            if debug is True, the method returns
+
+            - 1D int64 array of feature_ids
+            - 2D float32 array copy of bin_vals (the bin boundaries) for each feature
+            - 2D int64 array of bin counts corresponding to the bin boundaries
+        """
+        n_feature = len(self._discretizer_feature_dict)
+        if n_feature == 0 and not self._allow_empty_calibration:
+            raise RuntimeError(
+                "Need to accumulate some features for calibration\n"
+                "Likely, the calibration data is empty. This can\n"
+                "happen if the dataset is small, or if the following\n"
+                "cli args are set too low:\n"
+                "  --discretizer_keep_rate (default=0.0008)\n"
+                "  --discretizer_parts_downsampling_rate (default=0.2)\n"
+                "Consider increasing the values of these args.\n"
+                "To allow empty calibration data (and degenerate discretizer),\n"
+                "use the allow_empty_calibration input of the constructor."
+ ) + + self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) + self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) + + self._bin_vals.resize(n_feature, self._n_bin + 1) + + # buffers shared by PercentileDiscretizerFeature.calibrate() + percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) + + # Tensor from 0 to 1 in the number of steps provided + percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) + + if debug or self._bin_histogram: + debug_feature_ids = np.empty(n_feature, dtype=np.int64) + bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) + + # progress bar for calibration phase + progress_bar = tf.keras.utils.Progbar(n_feature) + + discretizer_features_dict = self._discretizer_feature_dict + for i, feature_id in enumerate(discretizer_features_dict): + if debug or self._bin_histogram: + debug_feature_ids[self._hash_map[feature_id]] = feature_id + bin_counts_buffer = bin_counts[self._hash_map[feature_id]] + else: + bin_counts_buffer = None + + # calibrate each PercentileDiscretizer feature (puts results in bin_vals) + discretizer_features_dict[feature_id].calibrate( + self._bin_vals[self._hash_map[feature_id]], # Gets feature-values + percentiles, + percentile_indices, + bin_counts_buffer=bin_counts_buffer, + ) + + # update progress bar 20 times + if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): + progress_bar.update(i + 1) + + super(PercentileDiscretizerCalibrator, self).calibrate() + + if self._bin_histogram: + # save bin histogram data for later + self._bin_histogram_dict = { + "feature_ids": debug_feature_ids, + "bin_counts": bin_counts, + "bin_vals": self._bin_vals, + "out_bits": self._out_bits, + } + + if debug: + return debug_feature_ids, self._bin_vals.copy(), bin_counts + + return None + + def _create_discretizer_layer( + self, + n_feature: int, + hash_map_keys: np.ndarray, + hash_map_values: np.ndarray, + feature_offsets: np.ndarray, + name: Optional[str] = None, + ): + return twml.layers.PercentileDiscretizer( + n_feature=n_feature, + n_bin=self._n_bin, + out_bits=self._out_bits, + bin_values=self._bin_vals.flatten(), + hash_keys=hash_map_keys, + hash_values=hash_map_values.astype(np.int64), + bin_ids=self._bin_ids.flatten().astype(np.int64), + feature_offsets=feature_offsets, + name=name, + **self._kwargs + ) + + def to_layer(self, name: str = None): + """ + Returns a twml.layers.PercentileDiscretizer Layer + that can be used for feature discretization. + + Args: + name: + name-scope of the PercentileDiscretizer layer + """ + n_feature = len(self._discretizer_feature_dict) + max_discretizer_feature = n_feature * (self._n_bin + 1) + + if not self._calibrated: + raise RuntimeError("Expecting prior call to calibrate()") + if self._bin_ids.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_ids.shape[0] != len(self._discretizer_feature_dict)" + ) + if self._bin_vals.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_vals.shape[0] != len(self._discretizer_feature_dict)" + ) + + # can add at most #features * (n_bin+1) new feature ids + if (1 << self._out_bits) <= max_discretizer_feature: + raise ValueError( + """Maximum number of features created by discretizer is + %d but requested that the output be limited to %d values (%d bits), + which is smaller than that. 
Please ensure the output has enough bits
+                to represent at least the new features"""
+                % (max_discretizer_feature, (1 << self._out_bits), self._out_bits)
+            )
+
+        # build feature_offsets, hash_map_keys, hash_map_values
+        feature_offsets = np.arange(
+            0, max_discretizer_feature, self._n_bin + 1, dtype="int64"
+        )
+        hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64)
+        hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32)
+
+        discretizer = self._create_discretizer_layer(
+            n_feature, hash_map_keys, hash_map_values, feature_offsets, name
+        )
+
+        return discretizer
+
+    def get_layer_args(self) -> Dict[str, int]:
+        """
+        Returns layer arguments required to implement multi-phase training.
+        See twml.calibrator.Calibrator.get_layer_args for more detailed documentation.
+        """
+        layer_args = {
+            "n_feature": len(self._discretizer_feature_dict),
+            "n_bin": self._n_bin,
+            "out_bits": self._out_bits,
+        }
+
+        return layer_args
+
+    def add_hub_signatures(self, name: str):
+        """
+        Add Hub Signatures for each calibrator
+
+        Args:
+            name:
+                Calibrator name
+        """
+        sparse_tf = tf.sparse_placeholder(tf.float32)
+        calibrator_layer = self.to_layer()
+        hub.add_signature(
+            inputs=sparse_tf,
+            outputs=calibrator_layer(sparse_tf, keep_inputs=False),
+            name=name,
+        )
+
+    def write_summary(self, writer: tf.summary.FileWriter, sess: tf.Session = None):
+        """
+        This method is called by save() to write a histogram of
+        PercentileDiscretizer feature bins to disk. A histogram is included for each
+        feature.
+
+        Args:
+            writer:
+                tf.summary.FileWriter instance.
+                used to add summaries to event files for inclusion in tensorboard.
+            sess:
+                tf.Session instance. Used to produce summaries for the writer.
+        """
+        bin_counts_ph = tf.placeholder(tf.int64)
+        bin_counts = self._bin_histogram_dict["bin_counts"]
+
+        # Record that distribution into a histogram summary
+        histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph)
+        for i in range(bin_counts.shape[0]):
+            bin_counts_summary = sess.run(
+                histo, feed_dict={bin_counts_ph: bin_counts[i]}
+            )
+            writer.add_summary(bin_counts_summary, global_step=i)
+
+    def write_summary_json(self, save_dir: str, name: str = "default"):
+        """
+        Export bin information to HDFS.
+
+        Args:
+            save_dir (str):
+                name of the saving directory.
+            name (str):
+                prefix of the saved hub signature. Default (string): "default".
+        """
+        # Since the size is small: (# of bins) * (# of features), we always dump the file.
+        discretizer_export_bin_filename = os.path.join(save_dir, name + "_bin.json")
+        discretizer_export_bin_dict = {
+            "feature_ids": self._bin_histogram_dict["feature_ids"].tolist(),
+            "bin_boundaries": self._bin_histogram_dict["bin_vals"].tolist(),
+            "output_bits": self._bin_histogram_dict["out_bits"],
+        }
+        twml.write_file(
+            discretizer_export_bin_filename, discretizer_export_bin_dict, encode="json"
+        )
+
+    def save(
+        self, save_dir: str, name: str = "default", verbose: bool = False
+    ):  # pylint: disable=unused-argument
+        """Save the calibrator into the given save_directory using TF Hub.
+        Args:
+            save_dir:
+                name of the saving directory.
+            name:
+                prefix of the saved hub signature. Default (string): "default".
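+
+        Example (an illustrative sketch; the directory is hypothetical)::
+
+            calibrator.save("/tmp/discretizer", name="default")
+            # exports a TF Hub module with "default" and "default_keep_inputs"
+            # signatures and dumps the bin boundaries to default_bin.json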
+        """
+        if not self._calibrated:
+            raise RuntimeError(
+                "Expecting prior call to calibrate(). Cannot save() prior to calibrate()"
+            )
+
+        # This module allows for the calibrator to be saved as part of
+        # Tensorflow Hub (this will allow it to be used in further steps)
+        def calibrator_module():
+            # Note that this is usually expecting a sparse_placeholder
+            inputs = tf.sparse_placeholder(tf.float32)
+            calibrator_layer = self.to_layer()
+            # creates the signature to the calibrator module
+            hub.add_signature(
+                inputs=inputs,
+                outputs=calibrator_layer(inputs, keep_inputs=False),
+                name=name,
+            )
+            # and another signature for keep_inputs mode
+            hub.add_signature(
+                inputs=inputs,
+                outputs=calibrator_layer(inputs, keep_inputs=True),
+                name=name + "_keep_inputs",
+            )
+
+        # exports the module to the save_dir
+        spec = hub.create_module_spec(calibrator_module)
+        with tf.Graph().as_default():
+            module = hub.Module(spec)
+            with tf.Session() as session:
+                module.export(save_dir, session)
+
+        self.write_summary_json(save_dir, name)
diff --git a/twml/twml/contrib/eventbus/input_fn.py b/twml/twml/contrib/eventbus/input_fn.py
index c184d9434..d9ae7ab54 100644
--- a/twml/twml/contrib/eventbus/input_fn.py
+++ b/twml/twml/contrib/eventbus/input_fn.py
@@ -1,7 +1,9 @@
-from reader import EventBusPipedBinaryRecordReader
+from typing import Callable, Generator, Optional
+
 import tensorflow.compat.v1 as tf
-import twml
+from reader import EventBusPipedBinaryRecordReader
+import twml
 
 """
 This module provides input function for DeepBird v2 training.
@@ -9,51 +11,74 @@
 """
 
 
-def get_eventbus_data_record_generator(eventbus_reader):
-  """
-  This module provides a data record generater from EventBus reader.
-
-  Args:
-    eventbus_reader: EventBus reader
-
-  Returns:
-    gen: Data record generater
-  """
-  eventbus_reader.initialize()
-  counter = [0]
-
-  def gen():
-    while True:
-      record = eventbus_reader.read()
-      if eventbus_reader.debug:
-        tf.logging.warn("counter: {}".format(counter[0]))
-        with open('tmp_record_{}.bin'.format(counter[0]), 'wb') as f:
-          f.write(record)
-        counter[0] = counter[0] + 1
-      yield record
-  return gen
-
-
-def get_eventbus_data_record_dataset(eventbus_reader, parse_fn, batch_size):
-  """
-  This module generates batch data for training from a data record generator.
-  """
-  dataset = tf.data.Dataset.from_generator(
-      get_eventbus_data_record_generator(eventbus_reader), tf.string, tf.TensorShape([]))
-  return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=4).prefetch(buffer_size=10)
-
-
-def get_train_input_fn(feature_config, params, parse_fn=None):
-  """
-  This module provides input function for DeepBird v2 training.
-  It gets batched training data from data record generator.
-  """
-  eventbus_reader = EventBusPipedBinaryRecordReader(
-      params.jar_file, params.num_eb_threads, params.subscriber_id,
-      filter_str=params.filter_str, debug=params.debug)
-
-  train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn(
-      feature_config, ["ids", "keys", "values", "batch_size", "weights"])
-
-  return lambda: get_eventbus_data_record_dataset(
-      eventbus_reader, train_parse_fn, params.train_batch_size)
+def get_eventbus_data_record_generator(
+    eventbus_reader: EventBusPipedBinaryRecordReader,
+) -> Generator[bytes, None, None]:
+    """
+    This function provides a data record generator from an EventBus reader.
+
+    Args:
+        eventbus_reader: EventBus reader
+
+    Returns:
+        gen: Data record generator
+    """
+    eventbus_reader.initialize()
+    counter = [0]
+
+    def gen() -> Generator[bytes, None, None]:
+        while True:
+            record = eventbus_reader.read()
+            if eventbus_reader.debug:
+                tf.logging.warn(f"counter: {counter[0]}")
+                with open(f"tmp_record_{counter[0]}.bin", "wb") as f:
+                    f.write(record)
+                counter[0] = counter[0] + 1
+            yield record
+
+    return gen
+
+
+def get_eventbus_data_record_dataset(
+    eventbus_reader: EventBusPipedBinaryRecordReader,
+    parse_fn: Callable[[tf.Tensor], tf.Tensor],
+    batch_size: int,
+) -> tf.data.Dataset:
+    """This function generates batched training data from a data record generator."""
+
+    dataset = tf.data.Dataset.from_generator(
+        get_eventbus_data_record_generator(eventbus_reader),
+        tf.string,
+        tf.TensorShape([]),
+    )
+    return (
+        dataset.batch(batch_size)
+        .map(parse_fn, num_parallel_calls=4)
+        .prefetch(buffer_size=10)
+    )
+
+
+def get_train_input_fn(
+    feature_config: dict,
+    params: twml.Params,
+    parse_fn: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+) -> Callable[[], tf.data.Dataset]:
+    """
+    This function provides the input function for DeepBird v2 training.
+    It gets batched training data from the data record generator.
+    """
+    eventbus_reader = EventBusPipedBinaryRecordReader(
+        params.jar_file,
+        params.num_eb_threads,
+        params.subscriber_id,
+        filter_str=params.filter_str,
+        debug=params.debug,
+    )
+
+    train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn(
+        feature_config, ["ids", "keys", "values", "batch_size", "weights"]
+    )
+
+    return lambda: get_eventbus_data_record_dataset(
+        eventbus_reader, train_parse_fn, params.train_batch_size
+    )
diff --git a/twml/twml/contrib/eventbus/reader.py b/twml/twml/contrib/eventbus/reader.py
index 2f8e2749e..605cadd1a 100644
--- a/twml/twml/contrib/eventbus/reader.py
+++ b/twml/twml/contrib/eventbus/reader.py
@@ -2,6 +2,7 @@
 import logging
 import subprocess
 from threading import Lock
+from typing import Any, Optional
 
 """
 This module provides a binary data record reader for EventBus data.
@@ -12,108 +13,135 @@ class BinaryRecordReader(object): - def initialize(self): - pass + def initialize(self): + """Initialize the reader""" + pass - def read(self): - """Read raw bytes for one record - """ - raise NotImplementedError + def read(self): + """Read raw bytes for one record""" + raise NotImplementedError - def close(self): - pass + def close(self): + """Close the reader""" + pass class ReadableWrapper(object): - def __init__(self, internal): - self.internal = internal + def __init__(self, internal: io.BufferedReader): + self.internal = internal - def __getattr__(self, name): - return getattr(self.internal, name) + def __getattr__(self, name: str) -> Any: + return getattr(self.internal, name) - def readable(self): - return True + def readable(self) -> bool: + return True class EventBusPipedBinaryRecordReader(BinaryRecordReader): - - JAVA = '/usr/lib/jvm/java-11-twitter/bin/java' - RECORD_SEPARATOR_HEX = [ - 0x29, 0xd8, 0xd5, 0x06, 0x58, 0xcd, 0x4c, 0x29, - 0xb2, 0xbc, 0x57, 0x99, 0x21, 0x71, 0xbd, 0xff - ] - RECORD_SEPARATOR = ''.join([chr(i) for i in RECORD_SEPARATOR_HEX]) - RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) - CHUNK_SIZE = 8192 - - def __init__(self, jar_file, num_eb_threads, subscriber_id, - filter_str=None, buffer_size=32768, debug=False): - self.jar_file = jar_file - self.num_eb_threads = num_eb_threads - self.subscriber_id = subscriber_id - self.filter_str = filter_str if filter_str else '""' - self.buffer_size = buffer_size - self.lock = Lock() - self._pipe = None - self._buffered_reader = None - self._bytes_buffer = None - - self.debug = debug - - def initialize(self): - if not self._pipe: - self._pipe = subprocess.Popen( - [ - self.JAVA, '-jar', self.jar_file, - '-subscriberId', self.subscriber_id, - '-numThreads', str(self.num_eb_threads), - '-dataFilter', self.filter_str, - '-debug' if self.debug else '' - ], - stdout=subprocess.PIPE - ) - self._buffered_reader = io.BufferedReader( - ReadableWrapper(self._pipe.stdout), self.buffer_size) - self._bytes_buffer = io.BytesIO() - else: - logging.warning('Already initialized') - - def _find_next_record(self): - tail = [''] - while True: - chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE) - index = chunk.find(self.RECORD_SEPARATOR) - if index < 0: - self._bytes_buffer.write(chunk[:-self.RECORD_SEPARATOR_LENGTH]) - tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH:] - else: - self._bytes_buffer.write(chunk[:index]) - return chunk[(index + self.RECORD_SEPARATOR_LENGTH):] - - def _read(self): - with self.lock: - remaining = self._find_next_record() - record = self._bytes_buffer.getvalue() - # clean up buffer - self._bytes_buffer.close() - self._bytes_buffer = io.BytesIO() - self._bytes_buffer.write(remaining) - - return record - - def read(self): - while True: - try: - return self._read() - except Exception as e: - logging.error("Error reading bytes for next record: {}".format(e)) - if self.debug: - raise - - def close(self): - try: - self._bytes_buffer.close() - self._buffered_reader.close() - self._pipe.terminate() - except Exception as e: - logging.error("Error closing reader: {}".format(e)) + JAVA = "/usr/lib/jvm/java-11-twitter/bin/java" + RECORD_SEPARATOR_HEX = [ + 0x29, + 0xD8, + 0xD5, + 0x06, + 0x58, + 0xCD, + 0x4C, + 0x29, + 0xB2, + 0xBC, + 0x57, + 0x99, + 0x21, + 0x71, + 0xBD, + 0xFF, + ] + RECORD_SEPARATOR = "".join([chr(i) for i in RECORD_SEPARATOR_HEX]) + RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) + CHUNK_SIZE = 8192 + + def __init__( + self, + jar_file: str, + num_eb_threads: int, + 
subscriber_id: str,
+        filter_str: Optional[str] = None,
+        buffer_size: int = 32768,
+        debug: bool = False,
+    ):
+        self.jar_file = jar_file
+        self.num_eb_threads = num_eb_threads
+        self.subscriber_id = subscriber_id
+        self.filter_str = filter_str if filter_str else '""'
+        self.buffer_size = buffer_size
+        self.lock = Lock()
+        self._pipe = None
+        self._buffered_reader = None
+        self._bytes_buffer = None
+        self.debug = debug
+
+    def initialize(self) -> None:
+        if not self._pipe:
+            self._pipe = subprocess.Popen(
+                [
+                    self.JAVA,
+                    "-jar",
+                    self.jar_file,
+                    "-subscriberId",
+                    self.subscriber_id,
+                    "-numThreads",
+                    str(self.num_eb_threads),
+                    "-dataFilter",
+                    self.filter_str,
+                    "-debug" if self.debug else "",
+                ],
+                stdout=subprocess.PIPE,
+            )
+            self._buffered_reader = io.BufferedReader(
+                ReadableWrapper(self._pipe.stdout), self.buffer_size
+            )
+            self._bytes_buffer = io.BytesIO()
+        else:
+            logging.warning("Already initialized")
+
+    def _find_next_record(self) -> Optional[bytes]:
+        tail = [""]
+        while True:
+            chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE)
+            index = chunk.find(self.RECORD_SEPARATOR)
+            if index < 0:
+                self._bytes_buffer.write(chunk[: -self.RECORD_SEPARATOR_LENGTH])
+                tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH :]
+            else:
+                self._bytes_buffer.write(chunk[:index])
+                return chunk[(index + self.RECORD_SEPARATOR_LENGTH) :]
+
+    def _read(self) -> bytes:
+        with self.lock:
+            remaining = self._find_next_record()
+            record = self._bytes_buffer.getvalue()
+
+            # clean up buffer
+            self._bytes_buffer.close()
+            self._bytes_buffer = io.BytesIO()
+            self._bytes_buffer.write(remaining)
+
+        return record
+
+    def read(self) -> bytes:
+        while True:
+            try:
+                return self._read()
+            except Exception as e:
+                logging.error(f"Error reading bytes for next record: {e}")
+                if self.debug:
+                    raise
+
+    def close(self) -> None:
+        try:
+            self._bytes_buffer.close()
+            self._buffered_reader.close()
+            self._pipe.terminate()
+        except Exception as e:
+            logging.error(f"Error closing reader: {e}")
diff --git a/twml/twml/contrib/export/__init__.py b/twml/twml/contrib/export/__init__.py
index 99892dcfa..2a6e0f86d 100644
--- a/twml/twml/contrib/export/__init__.py
+++ b/twml/twml/contrib/export/__init__.py
@@ -1,2 +1,2 @@
-from . import export_fn # noqa: F401
-from . import exporters # noqa: F401
+from . import export_fn  # noqa: F401
+from . import exporters  # noqa: F401
diff --git a/twml/twml/contrib/export/export_fn.py b/twml/twml/contrib/export/export_fn.py
index 6e59fff07..5a338aae9 100644
--- a/twml/twml/contrib/export/export_fn.py
+++ b/twml/twml/contrib/export/export_fn.py
@@ -1,263 +1,313 @@
 """
 Functions for exporting models for different modes.
 """
-from collections import OrderedDict
 import os
+from typing import List, Optional
 
 import tensorflow.compat.v1 as tf
-from tensorflow.python.estimator.export import export
-import twml
 import yaml
+from tensorflow.python.estimator.export import export
-
-def get_sparse_batch_supervised_input_receiver_fn(feature_config, keep_fields=None):
-  """Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors
-  with labels and weights as defined in feature_config.
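The reader above scans a chunked stream for a 16-byte separator, carrying the last len(SEP) bytes across reads so a separator split between two chunks is still found. A self-contained sketch of that scan over plain bytes (EOF handling is added for the toy stream; the real reader blocks on a live pipe instead):

import io

# Separator and chunk size mirrored from EventBusPipedBinaryRecordReader.
SEP = bytes([0x29, 0xD8, 0xD5, 0x06, 0x58, 0xCD, 0x4C, 0x29,
             0xB2, 0xBC, 0x57, 0x99, 0x21, 0x71, 0xBD, 0xFF])
CHUNK_SIZE = 8192

def split_records(stream):
    """Yield one record at a time from `stream`, splitting on SEP."""
    buf = io.BytesIO()
    tail = b""
    while True:
        data = stream.read(CHUNK_SIZE)
        chunk = tail + data
        index = chunk.find(SEP)
        while index >= 0:
            buf.write(chunk[:index])
            yield buf.getvalue()
            buf = io.BytesIO()
            chunk = chunk[index + len(SEP):]
            index = chunk.find(SEP)
        if not data:
            return  # EOF reached; a trailing partial record is discarded
        # Keep len(SEP) trailing bytes so a separator that straddles two
        # reads is still found on the next pass.
        buf.write(chunk[:-len(SEP)])
        tail = chunk[-len(SEP):]

print(list(split_records(io.BytesIO(b"alpha" + SEP + b"beta" + SEP))))
# -> [b'alpha', b'beta']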
- This input_receiver_fn is required for exporting models with 'train' mode to be trained with
- Java API
-
-  Args:
-    feature_config (FeatureConfig): deepbird v2 feature config object
-    keep_fields (list): list of fields to keep
-
-  Returns:
-    supervised_input_receiver_fn: input_receiver_fn used for train mode
-  """
-  def supervised_input_receiver_fn():
-    serialized_request = tf.placeholder(dtype=tf.uint8, name='request')
-    receiver_tensors = {'request': serialized_request}
-
-    bpr = twml.contrib.readers.HashedBatchPredictionRequest(serialized_request, feature_config)
-    features = bpr.get_sparse_features() if keep_fields is None else bpr.get_features(keep_fields)
-    features['weights'] = bpr.weights
-    labels = bpr.labels
-    features, labels = bpr.apply_filter(features, labels)
-
-    return export.SupervisedInputReceiver(features, labels, receiver_tensors)
-
-  return supervised_input_receiver_fn
-
-
-def update_build_graph_fn_for_train(build_graph_fn):
-  """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse
-  similar to the export_output_fns for serving.
-  The key difference here is that
-  1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of
-  creating an export_output object. This is because of the way estimators export model in 'train'
-  mode doesn't take custom export_output
-  2. We only do it when `mode == 'train'` to avoid altering the graph when exporting
-  for 'infer' mode
-
-  Args:
-    build_graph_fn (Callable): deepbird v2 build graph function
-
-  Returns:
-    new_build_graph_fn: An updated build_graph_fn that inserts serialized BatchPredictResponse
-    to graph output when in 'train' mode
-  """
-  def new_build_graph_fn(features, label, mode, params, config=None):
-    output = build_graph_fn(features, label, mode, params, config)
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      output.update(
-        twml.export_output_fns.batch_prediction_continuous_output_fn(output)[
-          tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs
-      )
-    return output
-  return new_build_graph_fn
+import twml
+from twml.feature_config import FeatureConfig
+
+
+def get_sparse_batch_supervised_input_receiver_fn(
+    feature_config: FeatureConfig, keep_fields: Optional[List[str]] = None
+) -> callable:
+    """
+    Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors
+    with labels and weights as defined in feature_config.
This input_receiver_fn is required + for exporting models with 'train' mode to be trained with Java API + + Args: + feature_config (FeatureConfig): + deepbird v2 feature config object + keep_fields (list): + list of fields to keep + + Returns: + supervised_input_receiver_fn: input_receiver_fn used for train mode + """ + + def supervised_input_receiver_fn(): + serialized_request = tf.placeholder(dtype=tf.uint8, name="request") + receiver_tensors = {"request": serialized_request} + + bpr = twml.contrib.readers.HashedBatchPredictionRequest( + serialized_request, feature_config + ) + features = ( + bpr.get_sparse_features() + if keep_fields is None + else bpr.get_features(keep_fields) + ) + features["weights"] = bpr.weights + labels = bpr.labels + features, labels = bpr.apply_filter(features, labels) + + return export.SupervisedInputReceiver(features, labels, receiver_tensors) + + return supervised_input_receiver_fn + + +def update_build_graph_fn_for_train(build_graph_fn: callable): + """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse + similar to the export_output_fns for serving. + The key difference here is that + 1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of + creating an export_output object. This is because of the way estimators export model in 'train' + mode doesn't take custom export_output + 2. We only do it when `mode == 'train'` to avoid altering the graph when exporting for 'infer' mode + + Args: + build_graph_fn (Callable): + deepbird v2 build graph function + + Returns: + new_build_graph_fn: + An updated build_graph_fn that inserts serialized BatchPredictResponse to graph + output when in 'train' mode + """ + + def new_build_graph_fn(features, label, mode, params, config=None): + output = build_graph_fn(features, label, mode, params, config) + if mode == tf.estimator.ModeKeys.TRAIN: + output.update( + twml.export_output_fns.batch_prediction_continuous_output_fn(output)[ + tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + ].outputs + ) + return output + + return new_build_graph_fn def export_model_for_train_and_infer( - trainer, feature_config, keep_fields, export_dir, as_text=False): - """Function for exporting model with both 'train' and 'infer' mode. - - This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' - and the other with tag 'serve', and it can be loaded in Java API with either tag depending on - the use case - - Args: - trainer (DataRecordTrainer): deepbird v2 DataRecordTrainer - feature_config (FeatureConfig): deepbird v2 feature config - keep_fields (list of string): list of field keys, e.g. - ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') - export_dir (str): a directory (local or hdfs) to export model to - as_text (bool): if True, write 'saved_model.pb' as binary file, else write - 'saved_model.pbtxt' as human readable text file. 
Default False - """ - train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( - feature_config, keep_fields) - predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( - feature_config, keep_fields) - trainer._export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) - trainer._estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map={ - tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, - tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn - }, - as_text=as_text, - ) - - trainer.export_model_effects(export_dir) - - -def export_all_models_with_receivers(estimator, export_dir, - train_input_receiver_fn, - eval_input_receiver_fn, - predict_input_receiver_fn, - export_output_fn, - export_modes=('train', 'eval', 'predict'), - register_model_fn=None, - feature_spec=None, - checkpoint_path=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - estimator: - Should be of type tf.estimator.Estimator. - You can get this from trainer using trainer.estimator - export_dir: - Directory to export the model. - train_input_receiver_fn: - Input receiver for train interface. - eval_input_receiver_fn: - Input receiver for eval interface. - predict_input_receiver_fn: - Input receiver for predict interface. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". - Defaults to ["train", "eval", "predict"] - register_model_fn: - An optional function which is called with export_dir after models are exported. - Defaults to None. - Returns: - The timestamped directory the models are exported to. - """ - # TODO: Fix for hogwild / distributed training. - - if export_dir is None: - raise ValueError("export_dir can not be None") - export_dir = twml.util.sanitize_hdfs_path(export_dir) - input_receiver_fn_map = {} - - if "train" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn - - if "eval" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn - - if "predict" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn - - export_dir = estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map=input_receiver_fn_map, - checkpoint_path=checkpoint_path, - ) - - if register_model_fn is not None: - register_model_fn(export_dir, feature_spec, log_features) - - return export_dir - - -def export_all_models(trainer, - export_dir, - parse_fn, - serving_input_receiver_fn, - export_output_fn=None, - export_modes=('train', 'eval', 'predict'), - feature_spec=None, - checkpoint=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - trainer: - An object of type twml.trainers.Trainer. - export_dir: - Directory to export the model. - parse_fn: - The parse function used parse the inputs for train and eval. - serving_input_receiver_fn: - The input receiver function used during serving. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". 
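Stepping back to the receiver pattern used throughout this file: a SupervisedInputReceiver pairs receiver_tensors (the serialized-request placeholder) with decoded features and labels. A minimal, self-contained sketch with toy constants standing in for the BatchPredictionRequest decoding (the feature and label values are placeholders, not twml behavior):

import tensorflow.compat.v1 as tf
from tensorflow.python.estimator.export import export

def toy_supervised_input_receiver_fn():
    # One serialized request placeholder in; toy features/labels out.
    serialized_request = tf.placeholder(dtype=tf.uint8, name="request")
    receiver_tensors = {"request": serialized_request}
    features = {"weights": tf.constant([1.0])}
    labels = tf.constant([0.0])
    return export.SupervisedInputReceiver(features, labels, receiver_tensors)

with tf.Graph().as_default():
    receiver = toy_supervised_input_receiver_fn()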
- Defaults to ["train", "eval", "predict"] - feature_spec: - A dictionary obtained from FeatureConfig.get_feature_spec() to serialize - as feature_spec.yaml in export_dir. - Defaults to None - Returns: - The timestamped directory the models are exported to. - """ - # Only export from chief in hogwild or distributed modes. - if trainer.params.get('distributed', False) and not trainer.estimator.config.is_chief: - tf.logging.info("Trainer.export_model ignored due to instance not being chief.") - return - - if feature_spec is None: - if getattr(trainer, '_feature_config') is None: - raise ValueError("feature_spec is set to None." - "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function") - else: - feature_spec = trainer._feature_config.get_feature_spec() - - export_dir = twml.util.sanitize_hdfs_path(export_dir) - old_export_output_fn = trainer._export_output_fn - trainer._export_output_fn = export_output_fn - supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn(parse_fn) - if not checkpoint: - checkpoint = trainer.best_or_latest_checkpoint - - export_dir = export_all_models_with_receivers(estimator=trainer.estimator, - export_dir=export_dir, - train_input_receiver_fn=supervised_input_receiver_fn, - eval_input_receiver_fn=supervised_input_receiver_fn, - predict_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=export_output_fn, - export_modes=export_modes, - register_model_fn=trainer.export_model_effects, - feature_spec=feature_spec, - checkpoint_path=checkpoint, - log_features=log_features) - trainer._export_output_fn = old_export_output_fn - return export_dir - - -def export_feature_spec(dir_path, feature_spec_dict): - """ - Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml. - """ - def ordered_dict_representer(dumper, data): - return dumper.represent_mapping('tag:yaml.org,2002:map', data.items()) - - try: - # needed for Python 2 - yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) - yaml.add_representer(unicode, yaml.representer.SafeRepresenter.represent_unicode) - except NameError: - # 'unicode' type doesn't exist on Python 3 - # PyYAML handles unicode correctly in Python 3 - pass - - yaml.add_representer(OrderedDict, ordered_dict_representer) - - fbase = "feature_spec.yaml" - fname = fbase.encode('utf-8') if type(dir_path) != str else fbase - file_path = os.path.join(dir_path, fname) - with tf.io.gfile.GFile(file_path, mode='w') as f: - yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) - tf.logging.info("Exported feature spec to %s" % file_path) - - return file_path + trainer: twml.DataRecordTrainer, + feature_config: FeatureConfig, + keep_fields: List[str], + export_dir: str, + as_text: bool = False, +): + """Function for exporting model with both 'train' and 'infer' mode. + + This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' + and the other with tag 'serve', and it can be loaded in Java API with either tag depending on + the use case + + Args: + trainer (DataRecordTrainer): + deepbird v2 DataRecordTrainer + feature_config (FeatureConfig): + deepbird v2 feature config + keep_fields (list[string]): + list of field keys, e.g. ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') + export_dir (str): + a directory (local or hdfs) to export model to + as_text (bool): + if True, write 'saved_model.pb' as binary file, else write 'saved_model.pbtxt' as human readable text file. 
Default False + """ + train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( + feature_config, keep_fields + ) + predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( + feature_config, keep_fields + ) + trainer._export_output_fn = ( + twml.export_output_fns.batch_prediction_continuous_output_fn + ) + trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) + trainer._estimator._export_all_saved_models( + export_dir_base=export_dir, + input_receiver_fn_map={ + tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, + tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn, + }, + as_text=as_text, + ) + + trainer.export_model_effects(export_dir) + + +def export_all_models_with_receivers( + estimator: tf.estimator.Estimator, + export_dir: str, + train_input_receiver_fn: callable, + eval_input_receiver_fn: callable, + predict_input_receiver_fn: callable, + export_output_fn: callable, + export_modes: List[str] = ["train", "eval", "predict"], + register_model_fn: callable = None, + feature_spec: dict = None, + checkpoint_path: str = None, + log_features: bool = True, +) -> str: + """ + Function for exporting a model with train, eval, and infer modes. + + Args: + estimator (tf.estimator.Estimator): + You can get this from trainer using trainer.estimator + export_dir (str): + Directory to export the model. + train_input_receiver_fn (Callable): + Input receiver for train interface. + eval_input_receiver_fn (Callable): + Input receiver for eval interface. + predict_input_receiver_fn (Callable): + Input receiver for predict interface. + export_output_fn (Callable): + export_output_fn to be used for serving. + export_modes (list[str]): + A list to Specify what modes to export. Can be "train", "eval", "predict". + Defaults to ["train", "eval", "predict"] + register_model_fn (Callable): + An optional function which is called with export_dir after models are exported. + Defaults to None. + feature_spec (dict): + An optional dict of feature names to tf.FixedLenFeature or tf.VarLenFeature. + Defaults to None. + checkpoint_path (str): + An optional path to a specific checkpoint to export. If None, the latest checkpoint + in export_dir is used. Defaults to None. + log_features (bool): + If True, log the features to the console. Defaults to True. + Returns: + The timestamped directory the models are exported to. + """ + # TODO: Fix for hogwild / distributed training. 
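The mode-to-receiver map assembled below is a plain dict from tf.estimator.ModeKeys to receiver functions. A standalone sketch of the same wiring (the helper name is illustrative), using a tuple default where the signatures above default to a shared mutable list:

import tensorflow.compat.v1 as tf
from typing import Callable, Dict, Tuple

def build_input_receiver_fn_map(
    train_fn: Callable,
    eval_fn: Callable,
    predict_fn: Callable,
    export_modes: Tuple[str, ...] = ("train", "eval", "predict"),
) -> Dict[str, Callable]:
    # Mirrors the map-building below; a tuple default is immutable, so the
    # default cannot be mutated and shared between calls.
    by_mode = {
        "train": (tf.estimator.ModeKeys.TRAIN, train_fn),
        "eval": (tf.estimator.ModeKeys.EVAL, eval_fn),
        "predict": (tf.estimator.ModeKeys.PREDICT, predict_fn),
    }
    return {key: fn for mode, (key, fn) in by_mode.items() if mode in export_modes}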
+ + if export_dir is None: + raise ValueError("export_dir can not be None") + export_dir = twml.util.sanitize_hdfs_path(export_dir) + input_receiver_fn_map = {} + + if "train" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn + + if "eval" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn + + if "predict" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn + + export_dir = estimator._export_all_saved_models( + export_dir_base=export_dir, + input_receiver_fn_map=input_receiver_fn_map, + checkpoint_path=checkpoint_path, + ) + + if register_model_fn is not None: + register_model_fn(export_dir, feature_spec, log_features) + + return export_dir + + +def export_all_models( + trainer: twml.trainers.Trainer, + export_dir: str, + parse_fn: callable, + serving_input_receiver_fn: callable, + export_output_fn: callable = None, + export_modes: List[str] = ["train", "eval", "predict"], + feature_spec: dict = None, + checkpoint: str = None, + log_features: bool = True, +) -> str: + """ + Function for exporting a model with train, eval, and infer modes. + + Args: + trainer: + An object of type twml.trainers.Trainer. + export_dir: + Directory to export the model. + parse_fn: + The parse function used parse the inputs for train and eval. + serving_input_receiver_fn: + The input receiver function used during serving. + export_output_fn: + export_output_fn to be used for serving. + export_modes: + A list to Specify what modes to export. Can be "train", "eval", "predict". + Defaults to ["train", "eval", "predict"] + feature_spec: + A dictionary obtained from FeatureConfig.get_feature_spec() to serialize + as feature_spec.yaml in export_dir. + Defaults to None + + Returns: + The timestamped directory the models are exported to. + """ + # Only export from chief in hogwild or distributed modes. + if ( + trainer.params.get("distributed", False) + and not trainer.estimator.config.is_chief + ): + tf.logging.info("Trainer.export_model ignored due to instance not being chief.") + return + + if feature_spec is None: + if getattr(trainer, "_feature_config") is None: + raise ValueError( + "feature_spec is set to None." 
+ "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function" + ) + else: + feature_spec = trainer._feature_config.get_feature_spec() + + export_dir = twml.util.sanitize_hdfs_path(export_dir) + old_export_output_fn = trainer._export_output_fn + trainer._export_output_fn = export_output_fn + supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn( + parse_fn + ) + if not checkpoint: + checkpoint = trainer.best_or_latest_checkpoint + + export_dir = export_all_models_with_receivers( + estimator=trainer.estimator, + export_dir=export_dir, + train_input_receiver_fn=supervised_input_receiver_fn, + eval_input_receiver_fn=supervised_input_receiver_fn, + predict_input_receiver_fn=serving_input_receiver_fn, + export_output_fn=export_output_fn, + export_modes=export_modes, + register_model_fn=trainer.export_model_effects, + feature_spec=feature_spec, + checkpoint_path=checkpoint, + log_features=log_features, + ) + trainer._export_output_fn = old_export_output_fn + return export_dir + + +def export_feature_spec(dir_path: str, feature_spec_dict: dict) -> str: + """Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml""" + + def ordered_dict_representer(dumper, data: dict): + return dumper.represent_mapping("tag:yaml.org,2002:map", data.items()) + + try: + # needed for Python 2 + yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) + yaml.add_representer( + unicode, yaml.representer.SafeRepresenter.represent_unicode + ) + except NameError: + pass + + yaml.add_representer(dict, ordered_dict_representer) + + fbase = "feature_spec.yaml" + fname = fbase.encode("utf-8") if type(dir_path) != str else fbase + file_path = os.path.join(dir_path, fname) + with tf.io.gfile.GFile(file_path, mode="w") as f: + yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) + tf.logging.info("Exported feature spec to %s" % file_path) + + return file_path # Keep the alias for compatibility. diff --git a/twml/twml/contrib/export/exporters.py b/twml/twml/contrib/export/exporters.py index 122955cbc..f0a696a37 100644 --- a/twml/twml/contrib/export/exporters.py +++ b/twml/twml/contrib/export/exporters.py @@ -2,144 +2,202 @@ Wrappers around tf.estimator.Exporters to export models and save checkpoints. """ import os +from typing import List import tensorflow.compat.v1 as tf from tensorflow.python.estimator import exporter + import twml class _AllSavedModelsExporter(tf.estimator.Exporter): - """Internal exporter class to be used for exporting models for different modes.""" - - def __init__(self, - name, - input_receiver_fn_map, - backup_checkpoints, - assets_extra=None, - as_text=False): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - assets_extra: Additional assets to be included in the exported model. - as_text: Specifies if the exported model should be in a human readable text format. 
- """ - self._name = name - self._input_receiver_fn_map = input_receiver_fn_map - self._backup_checkpoints = backup_checkpoints - self._assets_extra = assets_extra - self._as_text = as_text - - @property - def name(self): - return self._name - - def export(self, estimator, export_path, checkpoint_path, eval_result, - is_the_final_export): - del is_the_final_export - - export_path = twml.util.sanitize_hdfs_path(export_path) - checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) - - if self._backup_checkpoints: - backup_path = os.path.join(export_path, "checkpoints") - # Ensure backup_path is created. makedirs passes if dir already exists. - tf.io.gfile.makedirs(backup_path) - twml.util.backup_checkpoint(checkpoint_path, backup_path, empty_backup=False) - - export_result = estimator.experimental_export_all_saved_models( - export_path, - self._input_receiver_fn_map, - assets_extra=self._assets_extra, - as_text=self._as_text, - checkpoint_path=checkpoint_path) - - return export_result + """Internal exporter class to be used for exporting models for different modes.""" + + def __init__( + self, + name: str, + input_receiver_fn_map: dict, + backup_checkpoints: bool, + assets_extra: List[str] = None, + as_text: bool = False, + ): + """ + Args: + name (str): + A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map (dict): + A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints (bool): + A flag to specify if backups of checkpoints need to be made. + assets_extra (list): + Additional assets to be included in the exported model. + as_text (bool): + Specifies if the exported model should be in a human readable text format. + """ + self._name = name + self._input_receiver_fn_map = input_receiver_fn_map + self._backup_checkpoints = backup_checkpoints + self._assets_extra = assets_extra + self._as_text = as_text + + @property + def name(self) -> str: + return self._name + + def export( + self, + estimator: tf.estimator.Estimator, + export_path: str, + checkpoint_path: str, + eval_result: dict, + is_the_final_export: bool = True, + ): # pylint: disable=unused-argument + del is_the_final_export + + export_path = twml.util.sanitize_hdfs_path(export_path) + checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) + + if self._backup_checkpoints: + backup_path = os.path.join(export_path, "checkpoints") + # Ensure backup_path is created. makedirs passes if dir already exists. + tf.io.gfile.makedirs(backup_path) + twml.util.backup_checkpoint( + checkpoint_path, backup_path, empty_backup=False + ) + + export_result = estimator.experimental_export_all_saved_models( + export_path, + self._input_receiver_fn_map, + assets_extra=self._assets_extra, + as_text=self._as_text, + checkpoint_path=checkpoint_path, + ) + + return export_result class BestExporter(tf.estimator.BestExporter): - """ - This class inherits from tf.estimator.BestExporter with the following differences: - - It also creates a backup of the best checkpoint. - - It can export the model for multiple modes. - - A backup / export is performed everytime the evaluated metric is better - than previous models. - """ - - def __init__(self, - name='best_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - event_file_pattern='eval/*.tfevents.*', - compare_fn=exporter._loss_smaller, - assets_extra=None, - as_text=False, - exports_to_keep=5): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. 
- input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) + This class inherits from tf.estimator.BestExporter with the following differences: + - It also creates a backup of the best checkpoint. + - It can export the model for multiple modes. - super(BestExporter, self).__init__( - name, serving_input_receiver_fn, event_file_pattern, compare_fn, - assets_extra, as_text, exports_to_keep) - - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") + A backup / export is performed every time the evaluated metric is better + than previous models. + """ - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) + def __init__( + self, + name: str = "best_exporter", + input_receiver_fn_map: dict = None, + backup_checkpoints: bool = True, + event_file_pattern: str = "eval/*.tfevents.*", + compare_fn: callable = exporter._loss_smaller, + assets_extra: List[str] = None, + as_text: bool = False, + exports_to_keep: int = 5, + ): + """ + Args: + name (str): + A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map (dict): + A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints (bool): + A flag to specify if backups of checkpoints need to be made. + event_file_pattern (str): + A glob pattern for the event files in the evaluation directory. + compare_fn (callable): + A function that takes two evaluation results and returns True if the first + one is better than the second one. + assets_extra (list): + Additional assets to be included in the exported model. + as_text (bool): + Specifies if the exported model should be in a human readable text format. + exports_to_keep (int): + The maximum number of exports to keep. Older exports are deleted. + Note: + Check the following documentation for more information about the remaining args: + https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter + """ + serving_input_receiver_fn = input_receiver_fn_map.get( + tf.estimator.ModeKeys.PREDICT + ) + + super(BestExporter, self).__init__( + name, + serving_input_receiver_fn, + event_file_pattern, + compare_fn, + assets_extra, + as_text, + exports_to_keep, + ) + + if not hasattr(self, "_saved_model_exporter"): + raise AttributeError( + "_saved_model_exporter needs to exist for this exporter to work." + " This is potentially broken because of an internal change in Tensorflow" + ) + + # Override the saved_model_exporter with SaveAllmodelsexporter + self._saved_model_exporter = _AllSavedModelsExporter( + name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text + ) class LatestExporter(tf.estimator.LatestExporter): - """ - This class inherits from tf.estimator.LatestExporter with the following differences: - - It also creates a backup of the latest checkpoint. - - It can export the model for multiple modes. 
- - A backup / export is performed everytime the evaluated metric is better - than previous models. - """ - - def __init__(self, - name='latest_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - assets_extra=None, - as_text=False, - exports_to_keep=5): """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter - """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) - - super(LatestExporter, self).__init__( - name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep) + This class inherits from tf.estimator.LatestExporter with the following differences: + - It also creates a backup of the latest checkpoint. + - It can export the model for multiple modes. - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") + A backup / export is performed every time the evaluated metric is better + than previous models. + """ - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) + def __init__( + self, + name: str = "latest_exporter", + input_receiver_fn_map: dict = None, + backup_checkpoints: bool = True, + assets_extra: List[str] = None, + as_text: bool = False, + exports_to_keep: int = 5, + ): + """ + Args: + name (str): + A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map (dict): + A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints (bool): + A flag to specify if backups of checkpoints need to be made. + assets_extra (list[str]): + Additional assets to be included in the exported model. + as_text (bool): + Specifies if the exported model should be in a human readable text format. + exports_to_keep (int): + The number of exports to keep. + Note: + Check the following documentation for more information about the remaining args: + https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter + """ + serving_input_receiver_fn = input_receiver_fn_map.get( + tf.estimator.ModeKeys.PREDICT + ) + + super(LatestExporter, self).__init__( + name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep + ) + + if not hasattr(self, "_saved_model_exporter"): + raise AttributeError( + "_saved_model_exporter needs to exist for this exporter to work." 
+ " This is potentially broken because of an internal change in Tensorflow" + ) + + # Override the saved_model_exporter with SaveAllmodelsexporter + self._saved_model_exporter = _AllSavedModelsExporter( + name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text + ) diff --git a/twml/twml/contrib/feature_config.py b/twml/twml/contrib/feature_config.py index 833695751..c29bb63a6 100644 --- a/twml/twml/contrib/feature_config.py +++ b/twml/twml/contrib/feature_config.py @@ -2,84 +2,83 @@ Feature configuration for DeepBird jobs returns dictionary of sparse and dense Features """ from twitter.deepbird.io.legacy.contrib import feature_config + import twml class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - - # Override the class in the spec. - doc["class"] = "twml.contrib.FeatureConfig" + def get_feature_spec(self) -> dict: + """Generates a serialization-friendly dict representing this FeatureConfig.""" - return doc + doc = super(FeatureConfig, self).get_feature_spec() + # Override the class in the spec. + doc["class"] = "twml.contrib.FeatureConfig" + return doc class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - # Overwrite self.build() to return twml.FeatureConfig instead - def build(self): - """ - Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder. - """ + # Overwrite self.build() to return twml.FeatureConfig instead + def build(self) -> FeatureConfig: + """Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder.""" - ( - keep_tensors, - keep_sparse_tensors, - feature_map, - features_add, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() + ( + keep_tensors, + keep_sparse_tensors, + feature_map, + features_add, + feature_name_to_feature_parser, + feature_in_bq_name, + ) = self._build() - discretize_dict = {} - for config in self._sparse_extraction_configs: - if config.discretize_num_bins and config.discretize_output_size_bits: - if config.discretize_type == "percentile": - calibrator = twml.contrib.calibrators.PercentileDiscretizerCalibrator - elif config.discretize_type == "hashed_percentile": - calibrator = twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator - elif config.discretize_type == "hashing": - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator - else: - raise ValueError("Unsupported discretizer type: " + config.discretize_type) - discretize_dict[config.output_name] = calibrator( - config.discretize_num_bins, - config.discretize_output_size_bits, - allow_empty_calibration=config.allow_empty_calibration, - ) - elif config.discretize_num_bins or config.discretize_output_size_bits: - raise ValueError( - "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" - ) + discretize_dict = {} + for config in self._sparse_extraction_configs: + if config.discretize_num_bins and config.discretize_output_size_bits: + if config.discretize_type == "percentile": + calibrator = ( + twml.contrib.calibrators.PercentileDiscretizerCalibrator + ) + elif config.discretize_type == "hashed_percentile": + calibrator = ( + twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator + ) + elif config.discretize_type == "hashing": + calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator + else: + raise ValueError( + "Unsupported discretizer type: " 
+ config.discretize_type + ) + discretize_dict[config.output_name] = calibrator( + config.discretize_num_bins, + config.discretize_output_size_bits, + allow_empty_calibration=config.allow_empty_calibration, + ) + elif config.discretize_num_bins or config.discretize_output_size_bits: + raise ValueError( + "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" + ) - return FeatureConfig( - features={}, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=keep_tensors, - sparse_tensor_types=keep_sparse_tensors, - feature_types=feature_map, - sparse_extraction_configs=self._sparse_extraction_configs, - feature_extraction_configs=self._feature_extraction_configs, - feature_group_extraction_configs=self._feature_group_extraction_configs, - image_configs=self._image_configs, - discretize_config=discretize_dict, - feature_ids=features_add, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=feature_name_to_feature_parser, - feature_in_bq_name=feature_in_bq_name, - ) + return FeatureConfig( + features={}, + labels=self._labels, + weight=self._weight, + filters=self._filter_features, + tensor_types=keep_tensors, + sparse_tensor_types=keep_sparse_tensors, + feature_types=feature_map, + sparse_extraction_configs=self._sparse_extraction_configs, + feature_extraction_configs=self._feature_extraction_configs, + feature_group_extraction_configs=self._feature_group_extraction_configs, + image_configs=self._image_configs, + discretize_config=discretize_dict, + feature_ids=features_add, + decode_mode=self._decode_mode, + legacy_sparse=self._legacy_sparse, + feature_name_to_feature_parser=feature_name_to_feature_parser, + feature_in_bq_name=feature_in_bq_name, + ) TensorExtractionConfig = feature_config.TensorExtractionConfig - FeatureGroupExtractionConfig = feature_config.FeatureGroupExtractionConfig - ImageExtractionConfig = feature_config.ImageExtractionConfig - _set_tensor_namedtuple = feature_config._set_tensor_namedtuple diff --git a/twml/twml/contrib/feature_config_parsers.py b/twml/twml/contrib/feature_config_parsers.py index 83c402e2e..2fb3dd4a3 100644 --- a/twml/twml/contrib/feature_config_parsers.py +++ b/twml/twml/contrib/feature_config_parsers.py @@ -1,224 +1,247 @@ """Utility functions to create FeatureConfig objects from feature_spec.yaml files""" import os import re +from typing import Dict import tensorflow.compat.v1 as tf import yaml -from twml.feature_config import FeatureConfigBuilder -from twml.contrib.feature_config import FeatureConfigBuilder as FeatureConfigBuilderV2 - -def _get_config_version(config_dict): - doc = config_dict - supported_classes = { - "twml.FeatureConfig": "v1", - "twml.contrib.FeatureConfig": "v2" - } - if "class" not in doc: - raise ValueError("'class' key not found") - if doc["class"] not in supported_classes.keys(): - raise ValueError("Class %s not supported. Supported clases are %s" - % (doc["class"], supported_classes.keys())) - return supported_classes[doc["class"]] - - -def _validate_config_dict_v1(config_dict): - """ - Validate spec exported by twml.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.FeatureConfig: Malformed feature_spec. 
%s" % msg) - - if doc["class"] != "twml.FeatureConfig": - malformed_error("'class' is not twml.FeatureConfig") - if "format" not in doc: - malformed_error("'format' key not found") - - # validate spec exported by twml.FeatureConfig - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - if "filters" not in doc: - malformed_error("'filters' key not found") - elif type(doc["filters"]) != list: - malformed_error("'filters' is not a list") - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _validate_config_dict_v2(config_dict): - """ - Validate spec exported by twml.contrib.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. %s" % msg) - - if doc["class"] != "twml.contrib.FeatureConfig": - malformed_error("'class' is not twml.contrib.FeatureConfig") - if "format" not in doc: - malformed_error("'format key not found'") - - # validate spec exported by twml.contrib.FeatureConfig (basic validation only) - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparseTensors", "discretizeConfig"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - list_keys = ["sparseFeatureGroups", "denseFeatureGroups", "denseFeatures", "images", "filters"] - for key in list_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != list: - malformed_error("'%s' is not a list" % key) - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _create_feature_config_v1(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilder(data_spec_path) - - if config_dict["format"] == "exported": - # add features - for feature_info in config_dict["features"].values(): - feature_name = re.escape(feature_info["featureName"]) - feature_group = feature_info["featureGroup"] - fc_builder.add_feature(feature_name, feature_group) - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - # feature filters - for feature_name in config_dict["filters"]: - fc_builder.add_filter(feature_name) - # weight - if config_dict["weight"]: - weight_feature = list(config_dict["weight"].values())[0]["featureName"] - fc_builder.define_weight(weight_feature) - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def _create_feature_config_v2(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilderV2(data_spec_path) - - if config_dict["format"] == "exported": - # add sparse group extraction configs - for sparse_group in config_dict["sparseFeatureGroups"]: - fids = sparse_group["features"].keys() - fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_features_as_hashed_sparse( - feature_regexes=[re.escape(fname) for 
fname in fnames], - output_tensor_name=sparse_group["outputName"], - hash_space_size_bits=sparse_group["hashSpaceBits"], - discretize_num_bins=sparse_group["discretize"]["numBins"], - discretize_output_size_bits=sparse_group["discretize"]["outputSizeBits"], - discretize_type=sparse_group["discretize"]["type"], - type_filter=sparse_group["filterType"]) - - # add dense group extraction configs - for dense_group in config_dict["denseFeatureGroups"]: - fids = dense_group["features"].keys() - fnames = [dense_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_feature_group( - feature_regexes=[re.escape(fname) for fname in fnames], - group_name=dense_group["outputName"], - type_filter=dense_group["filterType"], - default_value=dense_group["defaultValue"]) - - # add dense feature configs - for dense_features in config_dict["denseFeatures"]: - fids = dense_features["features"].keys() - fnames = [dense_features["features"][fid]["featureName"] for fid in fids] - default_value = dense_features["defaultValue"] - if len(fnames) == 1 and type(default_value) != dict: - fc_builder.extract_feature( - feature_name=re.escape(fnames[0]), - expected_shape=dense_features["expectedShape"], - default_value=dense_features["defaultValue"]) - else: - fc_builder.extract_features( - feature_regexes=[re.escape(fname) for fname in fnames], - default_value_map=dense_features["defaultValue"]) - - # add image feature configs - for image in config_dict["images"]: - fc_builder.extract_image( - feature_name=image["featureName"], - preprocess=image["preprocess"], - out_type=tf.as_dtype(image["outType"].lower()), - channels=image["channels"], - default_image=image["defaultImage"], - ) - - # add other tensor features (non-image) - tensor_fnames = [] - image_fnames = [img["featureName"] for img in config_dict["images"]] - for tensor_fname in config_dict["tensors"]: - if tensor_fname not in image_fnames: - tensor_fnames.append(tensor_fname) - for sparse_tensor_fname in config_dict["sparseTensors"]: - tensor_fnames.append(sparse_tensor_fname) - fc_builder.extract_tensors(tensor_fnames) - - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def create_feature_config_from_dict(config_dict, data_spec_path): - """ - Create a FeatureConfig object from a feature spec dict. - """ - config_version = _get_config_version(config_dict) - if config_version == "v1": - _validate_config_dict_v1(config_dict) - feature_config = _create_feature_config_v1(config_dict, data_spec_path) - elif config_version == "v2": - _validate_config_dict_v2(config_dict) - feature_config = _create_feature_config_v2(config_dict, data_spec_path) - else: - raise ValueError("version not supported") - - return feature_config - - -def create_feature_config(config_path, data_spec_path): - """ - Create a FeatureConfig object from a feature_spec.yaml file. 
- """ - _, ext = os.path.splitext(config_path) - if ext not in ['.yaml', '.yml']: - raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") - - with tf.io.gfile.GFile(config_path, mode='r') as fs: - config_dict = yaml.safe_load(fs) - - return create_feature_config_from_dict(config_dict, data_spec_path) +from twml.contrib.feature_config import FeatureConfigBuilder as FeatureConfigBuilderV2 +from twml.feature_config import FeatureConfig, FeatureConfigBuilder + + +def _get_config_version(config_dict: dict) -> str: + """Returns the version of the feature spec""" + + doc = config_dict.copy() + supported_classes = {"twml.FeatureConfig": "v1", "twml.contrib.FeatureConfig": "v2"} + if "class" not in doc: + raise ValueError("'class' key not found") + if doc["class"] not in supported_classes.keys(): + raise ValueError( + "Class %s not supported. Supported clases are %s" + % (doc["class"], supported_classes.keys()) + ) + return supported_classes[doc["class"]] + + +def _validate_config_dict_v1(config_dict: dict) -> None: + """Validate spec exported by twml.FeatureConfig""" + + doc = config_dict + + def malformed_error(msg: str): + raise ValueError("twml.FeatureConfig: Malformed feature_spec. " + msg) + + if doc["class"] != "twml.FeatureConfig": + malformed_error("'class' is not twml.FeatureConfig") + if "format" not in doc: + malformed_error("'format' key not found") + + # validate spec exported by twml.FeatureConfig + if doc["format"] == "exported": + dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] + for key in dict_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + elif isinstance(doc[key], dict): + malformed_error("'%s' is not a dict" % key) + if "filters" not in doc: + malformed_error("'filters' key not found") + elif isinstance(doc["filters"], list): + malformed_error("'filters' is not a list") + # validate spec provided by modeler + elif doc["format"] == "manual": + raise NotImplementedError("Manual config support not yet implemented") + else: + malformed_error("'format' must be 'exported' or 'manual'") + + +def _validate_config_dict_v2(config_dict: dict) -> None: + """Validate spec exported by twml.contrib.FeatureConfig""" + + doc = config_dict + + def malformed_error(msg: str): + raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. 
" + msg) + + if doc["class"] != "twml.contrib.FeatureConfig": + malformed_error("'class' is not twml.contrib.FeatureConfig") + if "format" not in doc: + malformed_error("'format key not found'") + + # validate spec exported by twml.contrib.FeatureConfig (basic validation only) + if doc["format"] == "exported": + dict_keys = [ + "features", + "labels", + "weight", + "tensors", + "sparseTensors", + "discretizeConfig", + ] + for key in dict_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + if isinstance(doc[key], dict): + malformed_error("'%s' is not a dict" % key) + list_keys = [ + "sparseFeatureGroups", + "denseFeatureGroups", + "denseFeatures", + "images", + "filters", + ] + for key in list_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + if type(doc[key]) != list: + malformed_error("'%s' is not a list" % key) + + # validate spec provided by modeler + elif doc["format"] == "manual": + raise NotImplementedError("Manual config support not yet implemented") + else: + malformed_error("'format' must be 'exported' or 'manual'") + + +def _create_feature_config_v1( + config_dict: Dict[str, str], data_spec_path: str +) -> FeatureConfig: + """Create a FeatureConfig object from a feature spec""" + + fc_builder = FeatureConfigBuilder(data_spec_path) + + if config_dict["format"] == "exported": + # add features + for feature_info in config_dict["features"].values(): + feature_name = re.escape(feature_info["featureName"]) + feature_group = feature_info["featureGroup"] + fc_builder.add_feature(feature_name, feature_group) + # add labels + labels = [] + for label_info in config_dict["labels"].values(): + labels.append(label_info["featureName"]) + fc_builder.add_labels(labels) + # feature filters + for feature_name in config_dict["filters"]: + fc_builder.add_filter(feature_name) + # weight + if config_dict["weight"]: + weight_feature = list(config_dict["weight"].values())[0]["featureName"] + fc_builder.define_weight(weight_feature) + else: + raise ValueError("Format '%s' not implemented" % config_dict["format"]) + + return fc_builder.build() + + +def _create_feature_config_v2(config_dict: dict, data_spec_path: str) -> FeatureConfig: + """Create a FeatureConfig object from a feature spec""" + + fc_builder = FeatureConfigBuilderV2(data_spec_path) + + if config_dict["format"] == "exported": + # add sparse group extraction configs + for sparse_group in config_dict["sparseFeatureGroups"]: + fids = sparse_group["features"].keys() + fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] + fc_builder.extract_features_as_hashed_sparse( + feature_regexes=[re.escape(fname) for fname in fnames], + output_tensor_name=sparse_group["outputName"], + hash_space_size_bits=sparse_group["hashSpaceBits"], + discretize_num_bins=sparse_group["discretize"]["numBins"], + discretize_output_size_bits=sparse_group["discretize"][ + "outputSizeBits" + ], + discretize_type=sparse_group["discretize"]["type"], + type_filter=sparse_group["filterType"], + ) + + # add dense group extraction configs + for dense_group in config_dict["denseFeatureGroups"]: + fids = dense_group["features"].keys() + fnames = [dense_group["features"][fid]["featureName"] for fid in fids] + fc_builder.extract_feature_group( + feature_regexes=[re.escape(fname) for fname in fnames], + group_name=dense_group["outputName"], + type_filter=dense_group["filterType"], + default_value=dense_group["defaultValue"], + ) + + # add dense feature configs + for dense_features in config_dict["denseFeatures"]: + fids 
= dense_features["features"].keys() + fnames = [dense_features["features"][fid]["featureName"] for fid in fids] + default_value = dense_features["defaultValue"] + if len(fnames) == 1 and type(default_value) != dict: + fc_builder.extract_feature( + feature_name=re.escape(fnames[0]), + expected_shape=dense_features["expectedShape"], + default_value=dense_features["defaultValue"], + ) + else: + fc_builder.extract_features( + feature_regexes=[re.escape(fname) for fname in fnames], + default_value_map=dense_features["defaultValue"], + ) + + # add image feature configs + for image in config_dict["images"]: + fc_builder.extract_image( + feature_name=image["featureName"], + preprocess=image["preprocess"], + out_type=tf.as_dtype(image["outType"].lower()), + channels=image["channels"], + default_image=image["defaultImage"], + ) + + # add other tensor features (non-image) + tensor_fnames = [] + image_fnames = [img["featureName"] for img in config_dict["images"]] + for tensor_fname in config_dict["tensors"]: + if tensor_fname not in image_fnames: + tensor_fnames.append(tensor_fname) + for sparse_tensor_fname in config_dict["sparseTensors"]: + tensor_fnames.append(sparse_tensor_fname) + fc_builder.extract_tensors(tensor_fnames) + + # add labels + labels = [] + for label_info in config_dict["labels"].values(): + labels.append(label_info["featureName"]) + fc_builder.add_labels(labels) + + else: + raise ValueError("Format '%s' not implemented" % config_dict["format"]) + return fc_builder.build() + + +def create_feature_config_from_dict( + config_dict: dict, data_spec_path: str +) -> FeatureConfig: + """Create a FeatureConfig object from a feature spec dict.""" + + config_version = _get_config_version(config_dict) + if config_version == "v1": + _validate_config_dict_v1(config_dict) + feature_config = _create_feature_config_v1(config_dict, data_spec_path) + elif config_version == "v2": + _validate_config_dict_v2(config_dict) + feature_config = _create_feature_config_v2(config_dict, data_spec_path) + else: + raise ValueError("version not supported") + return feature_config + + +def create_feature_config(config_path: str, data_spec_path: str) -> FeatureConfig: + """Create a FeatureConfig object from a feature_spec.yaml file.""" + + _, ext = os.path.splitext(config_path) + if ext not in [".yaml", ".yml"]: + raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") + + with tf.io.gfile.GFile(config_path, mode="r") as fs: + config_dict = yaml.safe_load(fs) + + return create_feature_config_from_dict(config_dict, data_spec_path) diff --git a/twml/twml/contrib/feature_importances/feature_importances.py b/twml/twml/contrib/feature_importances/feature_importances.py index a8bfcc129..34e6b2228 100644 --- a/twml/twml/contrib/feature_importances/feature_importances.py +++ b/twml/twml/contrib/feature_importances/feature_importances.py @@ -2,25 +2,28 @@ import time from collections import defaultdict +from queue import Queue +from typing import Any, Dict, List, Tuple from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient from com.twitter.mlmetastore.modelrepo.core import FeatureImportance, FeatureNames +from requests.exceptions import HTTPError, RetryError +from tensorflow.compat.v1 import logging from twitter.deepbird.io.util import match_feature_regex_list -from twml.contrib.feature_importances.helpers import ( - _get_feature_name_from_config, - _get_feature_types_from_records, - _get_metrics_hook, - _expand_prefix, - longest_common_prefix, - write_list_to_hdfs_gfile) from 
twml.contrib.feature_importances.feature_permutation import PermutedInputFnFactory
+from twml.contrib.feature_importances.helpers import (
+    _expand_prefix,
+    _get_feature_name_from_config,
+    _get_feature_types_from_records,
+    _get_metrics_hook,
+    longest_common_prefix,
+    write_list_to_hdfs_gfile,
+)
 from twml.tracking import ExperimentTracker
-
-from tensorflow.compat.v1 import logging
-from requests.exceptions import HTTPError, RetryError
-from queue import Queue
-
+from twml import contrib
+from twml.trainers.data_record_trainer import DataRecordTrainer
+from twml.trainers.trainer import Trainer

 SERIAL = "serial"
 TREE = "tree"
@@ -31,384 +34,562 @@
 LOSS = "loss"


-def _repartition(feature_list_queue, fnames_ftypes, split_feature_group_on_period):
-  """
-  Iterate through letters to partition each feature by prefix, and then put each tuple
-  (prefix, feature_partition) into the feature_list_queue
-  Args:
-    prefix (str): The prefix shared by each feature in list_of_feature_types
-    feature_list_queue (Queue<(str, list<(str, str)>)>): The queue of feature groups
-    fnames_ftypes (list<(str, str)>): List of (fname, ftype) pairs. Each fname begins with prefix
-    split_feature_group_on_period (str): If true, require that feature groups end in a period
-  Returns:
-    Updated queue with each group in fnames_ftypes
-  """
-  assert len(fnames_ftypes) > 1
-
-  split_character = "." if split_feature_group_on_period else None
-  # Compute the longest prefix of the words
-  prefix = longest_common_prefix(
-    strings=[fname for fname, _ in fnames_ftypes], split_character=split_character)
-
-  # Separate the features by prefix
-  prefix_to_features = defaultdict(list)
-  for fname, ftype in fnames_ftypes:
-    assert fname.startswith(prefix)
-    new_prefix = _expand_prefix(fname=fname, prefix=prefix, split_character=split_character)
-    prefix_to_features[new_prefix].append((fname, ftype))
-
-  # Add all of the new partitions to the queue
-  for new_prefix, fname_ftype_list in prefix_to_features.items():
-    extended_new_prefix = longest_common_prefix(
-      strings=[fname for fname, _ in fname_ftype_list], split_character=split_character)
-    assert extended_new_prefix.startswith(new_prefix)
-    feature_list_queue.put((extended_new_prefix, fname_ftype_list))
-  return feature_list_queue
-
-
-def _infer_if_is_metric_larger_the_better(stopping_metric):
-  # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC), as opposed to
-  # larger numbers being worse (e.g. LOSS)
-  if stopping_metric is None:
-    raise ValueError("Error: Stopping Metric cannot be None")
-  elif stopping_metric.startswith(LOSS):
-    logging.info("Interpreting {} to be a metric where larger numbers are worse".format(stopping_metric))
-    is_metric_larger_the_better = False
-  else:
-    logging.info("Interpreting {} to be a metric where larger numbers are better".format(stopping_metric))
-    is_metric_larger_the_better = True
-  return is_metric_larger_the_better
-
-
-def _check_whether_tree_should_expand(baseline_performance, computed_performance, sensitivity, stopping_metric, is_metric_larger_the_better):
-  """
-  Returns True if
-  - the metric is positive (e.g. ROC_AUC) and computed_performance is nontrivially smaller than the baseline_performance
-  - the metric is negative (e.g. LOSS) and computed_performance is nontrivially larger than the baseline_performance
-  """
-  difference = ((baseline_performance[stopping_metric] - computed_performance[stopping_metric]) /
-                baseline_performance[stopping_metric])
-
-  if not is_metric_larger_the_better:
-    difference = -difference
-
-  logging.info(
-    "Found a {} difference of {}. Sensitivity is {}.".format("positive" if is_metric_larger_the_better else "negative", difference, sensitivity))
-  return difference > sensitivity
+def _repartition(
+    feature_list_queue: "Queue[Tuple[str, List[Tuple[str, str]]]]",
+    fnames_ftypes: List[Tuple[str, str]],
+    split_feature_group_on_period: bool,
+) -> "Queue[Tuple[str, List[Tuple[str, str]]]]":
+    """
+    Partition the features by prefix, and then put each tuple
+    (prefix, feature_partition) into the feature_list_queue
+
+    Args:
+      feature_list_queue (Queue<(str, list<(str, str)>)>):
+        The queue of feature groups
+      fnames_ftypes (list<(str, str)>):
+        List of (fname, ftype) pairs. Each fname begins with a shared prefix
+      split_feature_group_on_period (bool):
+        If true, require that feature groups end in a period
+
+    Returns:
+      Updated queue with each group in fnames_ftypes
+    """
+    assert len(fnames_ftypes) > 1
+
+    split_character = "." if split_feature_group_on_period else None
+    # Compute the longest prefix of the words
+    prefix = longest_common_prefix(
+        strings=[fname for fname, _ in fnames_ftypes], split_character=split_character
+    )
+
+    # Separate the features by prefix
+    prefix_to_features = defaultdict(list)
+    for fname, ftype in fnames_ftypes:
+        assert fname.startswith(prefix)
+        new_prefix = _expand_prefix(
+            fname=fname, prefix=prefix, split_character=split_character
+        )
+        prefix_to_features[new_prefix].append((fname, ftype))
+
+    # Add all of the new partitions to the queue
+    for new_prefix, fname_ftype_list in prefix_to_features.items():
+        extended_new_prefix = longest_common_prefix(
+            strings=[fname for fname, _ in fname_ftype_list],
+            split_character=split_character,
+        )
+        assert extended_new_prefix.startswith(new_prefix)
+        feature_list_queue.put((extended_new_prefix, fname_ftype_list))
+    return feature_list_queue
+
+
+def _infer_if_is_metric_larger_the_better(stopping_metric: str) -> bool:
+    # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC),
+    # as opposed to larger numbers being worse (e.g. LOSS)
+    if stopping_metric is None:
+        raise ValueError("Error: Stopping Metric cannot be None")
+    elif stopping_metric.startswith(LOSS):
+        logging.info(
+            f"Interpreting {stopping_metric} to be a metric where larger numbers are worse"
+        )
+        return False
+    else:
+        logging.info(
+            f"Interpreting {stopping_metric} to be a metric where larger numbers are better"
+        )
+        return True
+
+
+def _check_whether_tree_should_expand(
+    baseline_performance: dict,
+    computed_performance: dict,
+    sensitivity: float,
+    stopping_metric: str,
+    is_metric_larger_the_better: bool,
+) -> bool:
+    """
+    Returns True if
+    - the metric is positive (e.g. ROC_AUC) and computed_performance is non-trivially smaller than the baseline_performance
+    - the metric is negative (e.g. LOSS) and computed_performance is non-trivially larger than the baseline_performance
+    """

+    difference = (
+        baseline_performance[stopping_metric] - computed_performance[stopping_metric]
+    ) / baseline_performance[stopping_metric]
+
+    if not is_metric_larger_the_better:
+        difference *= -1
+
+    logging.info(
+        f"Found a {'positive' if is_metric_larger_the_better else 'negative'} difference of {difference}. Sensitivity is {sensitivity}."
+    )
+    return difference > sensitivity


 def _compute_multiple_permuted_performances_from_trainer(
-  factory, fname_ftypes, trainer, parse_fn, record_count):
-  """Compute performances with fname and fype permuted
-  """
-  metrics_hook = _get_metrics_hook(trainer)
-  trainer._estimator.evaluate(
-    input_fn=factory.get_permuted_input_fn(
-      batch_size=trainer._params.eval_batch_size, parse_fn=parse_fn, fname_ftypes=fname_ftypes),
-    steps=(record_count + trainer._params.eval_batch_size) // trainer._params.eval_batch_size,
-    hooks=[metrics_hook],
-    checkpoint_path=trainer.best_or_latest_checkpoint)
-  return metrics_hook.metric_values
-
-
-def _get_extra_feature_group_performances(factory, trainer, parse_fn, extra_groups, feature_to_type, record_count):
-  """Compute performance differences for the extra feature groups
-  """
-  extra_group_feature_performance_results = {}
-  for group_name, raw_feature_regex_list in extra_groups.items():
-    start = time.time()
-    fnames = match_feature_regex_list(
-      features=feature_to_type.keys(),
-      feature_regex_list=[regex for regex in raw_feature_regex_list],
-      preprocess=False,
-      as_dict=False)
-
-    fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames]
-
-    logging.info("Extracted extra group {} with features {}".format(group_name, fnames_ftypes))
-    extra_group_feature_performance_results[group_name] = _compute_multiple_permuted_performances_from_trainer(
-      factory=factory, fname_ftypes=fnames_ftypes,
-      trainer=trainer, parse_fn=parse_fn, record_count=record_count)
-    logging.info("\n\nImportances computed for {} in {} seconds \n\n".format(
-      group_name, int(time.time() - start)))
-  return extra_group_feature_performance_results
+    factory: PermutedInputFnFactory,
+    fname_ftypes: List[Tuple[str, str]],
+    trainer: Trainer,
+    parse_fn: callable,
+    record_count: int,
+) -> dict:
+    """Compute performances with fname and ftype permuted"""
+    metrics_hook = _get_metrics_hook(trainer)
+    trainer._estimator.evaluate(
+        input_fn=factory.get_permuted_input_fn(
+            batch_size=trainer._params.eval_batch_size,
+            parse_fn=parse_fn,
+            fname_ftypes=fname_ftypes,
+        ),
+        steps=(record_count + trainer._params.eval_batch_size)
+        // trainer._params.eval_batch_size,
+        hooks=[metrics_hook],
+        checkpoint_path=trainer.best_or_latest_checkpoint,
+    )
+    return metrics_hook.metric_values
+
+
+def _get_extra_feature_group_performances(
+    factory: PermutedInputFnFactory,
+    trainer: Trainer,
+    parse_fn: callable,
+    extra_groups: dict,
+    feature_to_type: dict,
+    record_count: int,
+) -> dict:
+    """Compute performance differences for the extra feature groups"""
+    extra_group_feature_performance_results = {}
+    for group_name, raw_feature_regex_list in extra_groups.items():
+        start = time.time()
+        fnames = match_feature_regex_list(
+            features=feature_to_type.keys(),
+            feature_regex_list=[regex for regex in raw_feature_regex_list],
+            preprocess=False,
+            as_dict=False,
+        )
+
+        fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames]
+
+        logging.info(
+            f"Extracted extra group {group_name} with features {fnames_ftypes}"
+        )
+
extra_group_feature_performance_results[ + group_name + ] = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fnames_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + f"\n\nImportance computed for {group_name} in {float(time.time() - start):.3f} seconds \n\n" + ) + return extra_group_feature_performance_results def _feature_importances_tree_algorithm( - data_dir, trainer, parse_fn, fnames, stopping_metric, file_list=None, datarecord_filter_fn=None, split_feature_group_on_period=True, - record_count=99999, is_metric_larger_the_better=None, sensitivity=0.025, extra_groups=None, dont_build_tree=False): - """Tree algorithm for feature and feature group importances. This algorithm build a prefix tree of - the feature names and then traverses the tree with a BFS. At each node (aka group of features with - a shared prefix) the algorithm computes the performance of the model when we permute all features - in the group. The algorithm only zooms-in on groups that impact the performance by more than - sensitivity. As a result, features that affect the model performance by less than sensitivity will - not have an exact importance. - Args: - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - trainer: (DataRecordTrainer): A DataRecordTrainer object - parse_fn: (function): The parse_fn used by eval_input_fn - fnames (list): The list of feature names - stopping_metric (str): The metric to use to determine when to stop expanding trees - file_list (list): The list of filenames. Exactly one of file_list and data_dir should be - provided - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - split_feature_group_on_period (boolean): If true, split feature groups by period rather than on - optimal prefix - record_count (int): The number of records to compute importances over - is_metric_larger_the_better (boolean): If true, assume that stopping_metric is a metric where larger - values are better (e.g. ROC-AUC) - sensitivity (float): The smallest change in performance to continue to expand the tree - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group. You should only supply a value for this argument if you have a set - of features that you want to evaluate as a group but don't share a prefix - dont_build_tree (boolean): If True, don't build the tree and only compute the extra_groups importances - Returns: - A dictionary that contains the individual and group feature importances - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - baseline_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=[], - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - out = {"None": baseline_performance} - - if stopping_metric not in baseline_performance: - raise ValueError("The stopping metric '{}' not found in baseline_performance. 
Metrics are {}".format( - stopping_metric, list(baseline_performance.keys()))) - - is_metric_larger_the_better = ( - is_metric_larger_the_better if is_metric_larger_the_better is not None else _infer_if_is_metric_larger_the_better(stopping_metric)) - logging.info("Using {} as the stopping metric for the tree algorithm".format(stopping_metric)) - - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - all_feature_types = list(feature_to_type.items()) - - individual_feature_performances = {} - feature_group_performances = {} - if dont_build_tree: - logging.info("Not building feature importance trie. Will only compute importances for the extra_groups") - else: - logging.info("Building feature importance trie") - # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where - # each feature in list_of_feature_type_pairs will have have the prefix "prefix" - feature_list_queue = _repartition( - feature_list_queue=Queue(), fnames_ftypes=all_feature_types, split_feature_group_on_period=split_feature_group_on_period) - - while not feature_list_queue.empty(): - # Pop the queue. We should never have an empty list in the queue - prefix, fnames_ftypes = feature_list_queue.get() - assert len(fnames_ftypes) > 0 - - # Compute performance from permuting all features in fname_ftypes - logging.info( - "\n\nComputing importances for {} ({}...). {} elements left in the queue \n\n".format( - prefix, fnames_ftypes[:5], feature_list_queue.qsize())) - start = time.time() - computed_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - prefix, int(time.time() - start))) - if len(fnames_ftypes) == 1: - individual_feature_performances[fnames_ftypes[0][0]] = computed_performance - else: - feature_group_performances[prefix] = computed_performance - # Dig deeper into the features in fname_ftypes only if there is more than one feature in the - # list and the performance drop is nontrivial - logging.info("Checking performance for {} ({}...)".format(prefix, fnames_ftypes[:5])) - check = _check_whether_tree_should_expand( - baseline_performance=baseline_performance, computed_performance=computed_performance, - sensitivity=sensitivity, stopping_metric=stopping_metric, is_metric_larger_the_better=is_metric_larger_the_better) - if len(fnames_ftypes) > 1 and check: - logging.info("Expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - feature_list_queue = _repartition( - feature_list_queue=feature_list_queue, fnames_ftypes=fnames_ftypes, split_feature_group_on_period=split_feature_group_on_period) - else: - logging.info("Not expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - - # Baseline performance is grouped in with individual_feature_importance_results - individual_feature_performance_results = dict( - out, **{k: v for k, v in individual_feature_performances.items()}) - group_feature_performance_results = {k: v for k, v in feature_group_performances.items()} - - if extra_groups is not None: - logging.info("Computing performances for extra groups {}".format(extra_groups.keys())) - for group_name, performances in _get_extra_feature_group_performances( + data_dir: str, + trainer: DataRecordTrainer, + parse_fn: callable, + fnames: List[str], + stopping_metric: str, + file_list: List[str] = None, + datarecord_filter_fn: callable = None, + 
split_feature_group_on_period: bool = True, + record_count: int = 99999, + is_metric_larger_the_better: bool = None, + sensitivity: float = 0.025, + extra_groups: Dict[str, List[str]] = None, + dont_build_tree: bool = False, +) -> Dict[str, Dict[str, Any]]: + """Tree algorithm for feature and feature group importance. This algorithm build a prefix tree of + the feature names and then traverses the tree with a BFS. At each node (aka group of features with + a shared prefix) the algorithm computes the performance of the model when we permute all features + in the group. The algorithm only zooms-in on groups that impact the performance by more than + sensitivity. As a result, features that affect the model performance by less than sensitivity will + not have an exact importance. + Args: + data_dir (str): + The location of the training or testing data to compute importance over. + If None, the trainer._eval_files are used + trainer (DataRecordTrainer): + A DataRecordTrainer object + parse_fn (function): + The parse_fn used by eval_input_fn + fnames (list): + The list of feature names + stopping_metric (str): + The metric to use to determine when to stop expanding trees + file_list (list): + The list of filenames. Exactly one of file_list and data_dir should be provided + datarecord_filter_fn (function): + a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format + and return a boolean value, to indicate if this data record should be kept in feature importance module or not. + split_feature_group_on_period (boolean): + If true, split feature groups by period rather than on optimal prefix + record_count (int): + The number of records to compute importance over + is_metric_larger_the_better (boolean): + If true, assume that stopping_metric is a metric where larger values are better (e.g. ROC-AUC) + sensitivity (float): + The smallest change in performance to continue to expand the tree + extra_groups (dict>): + A dictionary mapping the name of extra feature groups to the list of + the names of the features in the group. You should only supply a value for this argument if you have a set + of features that you want to evaluate as a group but don't share a prefix + dont_build_tree (boolean): + If True, don't build the tree and only compute the extra_groups importance + Returns: + A dictionary that contains the individual and group feature importance + """ + factory = PermutedInputFnFactory( + data_dir=data_dir, + record_count=record_count, + file_list=file_list, + datarecord_filter_fn=datarecord_filter_fn, + ) + baseline_performance = _compute_multiple_permuted_performances_from_trainer( factory=factory, + fname_ftypes=[], trainer=trainer, parse_fn=parse_fn, - extra_groups=extra_groups, - feature_to_type=feature_to_type, - record_count=record_count).items(): - group_feature_performance_results[group_name] = performances - else: - logging.info("Not computing performances for extra groups") - - return {INDIVIDUAL: individual_feature_performance_results, - GROUP: group_feature_performance_results} + record_count=record_count, + ) + out = {"None": baseline_performance} + + if stopping_metric not in baseline_performance: + raise ValueError( + f"The stopping metric '{stopping_metric}' not found in baseline_performance. 
Metrics are {baseline_performance.keys()}" + ) + + is_metric_larger_the_better = ( + is_metric_larger_the_better + if is_metric_larger_the_better is not None + else _infer_if_is_metric_larger_the_better(stopping_metric) + ) + logging.info( + f"Using {stopping_metric} as the stopping metric for the tree algorithm" + ) + + feature_to_type = _get_feature_types_from_records( + records=factory.records, fnames=fnames + ) + all_feature_types = list(feature_to_type.items()) + + individual_feature_performances = {} + feature_group_performances = {} + if dont_build_tree: + logging.info( + "Not building feature importance trie. Will only compute importance for the extra_groups" + ) + else: + logging.info("Building feature importance trie") + # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where + # each feature in list_of_feature_type_pairs will have have the prefix "prefix" + feature_list_queue = _repartition( + feature_list_queue=Queue(), + fnames_ftypes=all_feature_types, + split_feature_group_on_period=split_feature_group_on_period, + ) + + while not feature_list_queue.empty(): + # Pop the queue. We should never have an empty list in the queue + prefix, fnames_ftypes = feature_list_queue.get() + assert len(fnames_ftypes) > 0 + + # Compute performance from permuting all features in fname_ftypes + logging.info( + f"\n\nComputing importances for {prefix} ({fnames_ftypes[:5]}...). {feature_list_queue.qsize()} elements left in the queue \n\n" + ) + start = time.time() + computed_performance = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fnames_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + f"\n\nImportance computed for {prefix} in {float(time.time() - start):.3f} seconds \n\n" + ) + if len(fnames_ftypes) == 1: + individual_feature_performances[ + fnames_ftypes[0][0] + ] = computed_performance + else: + feature_group_performances[prefix] = computed_performance + # Dig deeper into the features in fname_ftypes only if there is more than one feature in the + # list and the performance drop is nontrivial + logging.info(f"Checking performance for {prefix} ({fnames_ftypes[:5]}...)") + check = _check_whether_tree_should_expand( + baseline_performance=baseline_performance, + computed_performance=computed_performance, + sensitivity=sensitivity, + stopping_metric=stopping_metric, + is_metric_larger_the_better=is_metric_larger_the_better, + ) + if len(fnames_ftypes) > 1 and check: + logging.info(f"Expanding {prefix} ({fnames_ftypes[:5]}...)") + feature_list_queue = _repartition( + feature_list_queue=feature_list_queue, + fnames_ftypes=fnames_ftypes, + split_feature_group_on_period=split_feature_group_on_period, + ) + else: + logging.info(f"Not expanding {prefix} ({fnames_ftypes[:5]}...)") + + # Baseline performance is grouped in with individual_feature_importance_results + individual_feature_performance_results = dict( + out, **{k: v for k, v in individual_feature_performances.items()} + ) + group_feature_performance_results = { + k: v for k, v in feature_group_performances.items() + } + + if extra_groups is not None: + logging.info(f"Computing performances for extra groups {extra_groups.keys()}") + for group_name, performances in _get_extra_feature_group_performances( + factory=factory, + trainer=trainer, + parse_fn=parse_fn, + extra_groups=extra_groups, + feature_to_type=feature_to_type, + record_count=record_count, + ).items(): + group_feature_performance_results[group_name] = 
performances + else: + logging.info("Not computing performances for extra groups") + + return { + INDIVIDUAL: individual_feature_performance_results, + GROUP: group_feature_performance_results, + } def _feature_importances_serial_algorithm( - data_dir, trainer, parse_fn, fnames, file_list=None, datarecord_filter_fn=None, factory=None, record_count=99999): - """Serial algorithm for feature importances. This algorithm computes the - importance of each feature. - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - - out = {} - for fname, ftype in list(feature_to_type.items()) + [(None, None)]: - logging.info("\n\nComputing importances for {}\n\n".format(fname)) - start = time.time() - fname_ftypes = [(fname, ftype)] if fname is not None else [] - out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fname_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - fname, int(time.time() - start))) - # The serial algorithm does not compute group feature results. - return {INDIVIDUAL: out, GROUP: {}} + data_dir: str, + trainer: Trainer, + parse_fn: callable, + fnames: List[str], + file_list: List[str] = None, + datarecord_filter_fn: callable = None, + factory: PermutedInputFnFactory = None, + record_count: int = 99999, +): + """Serial algorithm for feature importances. This algorithm computes the + importance of each feature. + """ + factory = PermutedInputFnFactory( + data_dir=data_dir, + record_count=record_count, + file_list=file_list, + datarecord_filter_fn=datarecord_filter_fn, + ) + feature_to_type = _get_feature_types_from_records( + records=factory.records, fnames=fnames + ) + + out = {} + for fname, ftype in list(feature_to_type.items()) + [(None, None)]: + logging.info(f"\n\nComputing importances for {fname}\n\n") + start = time.time() + fname_ftypes = [(fname, ftype)] if fname is not None else [] + out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fname_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + f"\n\nImportances computed for {fname} in {float(time.time() - start):.3f} seconds \n\n" + ) + # The serial algorithm does not compute group feature results. + return {INDIVIDUAL: out, GROUP: {}} def _process_feature_name_for_mldash(feature_name): - # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as - # part of a url - return feature_name.replace("/", "__") + # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as + # part of a url + return feature_name.replace("/", "__") def compute_feature_importances( - trainer, data_dir=None, feature_config=None, algorithm=TREE, parse_fn=None, datarecord_filter_fn=None, **kwargs): - """Perform a feature importance analysis on a trained model - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - feature_config (contrib.FeatureConfig): The feature config object. 
If this is not provided, it - is taken from the trainer - algorithm (str): The algorithm to use - parse_fn: (function): The parse_fn used by eval_input_fn. By default this is - feature_config.get_parse_fn() - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - - # We only use the trainer's eval files if an override data_dir is not provided - if data_dir is None: - logging.info("Using trainer._eval_files (found {} as files)".format(trainer._eval_files)) - file_list = trainer._eval_files - else: - logging.info("data_dir provided. Looking at {} for data.".format(data_dir)) - file_list = None - - feature_config = feature_config or trainer._feature_config - out = {} - if not feature_config: - logging.warn("WARN: Not computing feature importance because trainer._feature_config is None") - out = None - else: - parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn() - fnames = _get_feature_name_from_config(feature_config) - logging.info("Computing importances for {}".format(fnames)) - logging.info("Using the {} feature importance computation algorithm".format(algorithm)) - algorithm = { - SERIAL: _feature_importances_serial_algorithm, - TREE: _feature_importances_tree_algorithm}[algorithm] - out = algorithm(data_dir=data_dir, trainer=trainer, parse_fn=parse_fn, fnames=fnames, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn, **kwargs) - return out + trainer: DataRecordTrainer, + data_dir: str = None, + feature_config: contrib.feature_config = None, + algorithm: str = TREE, + parse_fn: callable = None, + datarecord_filter_fn: callable = None, + **kwargs, +): + """Perform a feature importance analysis on a trained model + Args: + trainer (DataRecordTrainer): + A DataRecordTrainer object + data_dir (str): + The location of the training or testing data to compute importances over. + If None, the trainer._eval_files are used + feature_config (contrib.FeatureConfig): + The feature config object. If this is not provided, it + is taken from the trainer + algorithm (str): + The algorithm to use + parse_fn (function): + The parse_fn used by eval_input_fn. By default this is feature_config.get_parse_fn() + datarecord_filter_fn (function): + a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format and + return a boolean value, to indicate if this data record should be kept in feature importance module or not. + """ + + # We only use the trainer's eval files if an override data_dir is not provided + if data_dir is None: + logging.info( + f"Using trainer._eval_files (found {trainer._eval_files} as files)" + ) + file_list = trainer._eval_files + else: + logging.info(f"data_dir provided. 
Looking at {data_dir} for data.") + file_list = None + + feature_config = feature_config or trainer._feature_config + out = {} + if not feature_config: + logging.warn( + "WARN: Not computing feature importance because trainer._feature_config is None" + ) + out = None + else: + parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn() + fnames = _get_feature_name_from_config(feature_config) + logging.info(f"Computing importances for {fnames}") + logging.info(f"Using the {algorithm} feature importance computation algorithm") + algorithm = { + SERIAL: _feature_importances_serial_algorithm, + TREE: _feature_importances_tree_algorithm, + }[algorithm] + out = algorithm( + data_dir=data_dir, + trainer=trainer, + parse_fn=parse_fn, + fnames=fnames, + file_list=file_list, + datarecord_filter_fn=datarecord_filter_fn, + **kwargs, + ) + return out def write_feature_importances_to_hdfs( - trainer, feature_importances, output_path=None, metric="roc_auc"): - """Publish a feature importance analysis to hdfs as a tsv - Args: - (see compute_feature_importances for other args) - trainer (Trainer) - feature_importances (dict): Dictionary of feature importances - output_path (str): The remote or local file to write the feature importances to. If not - provided, this is inferred to be the trainer save dir - metric (str): The metric to write to tsv - """ - # String formatting appends (Individual) or (Group) to feature name depending on type - perfs = {"{} ({})".format(k, importance_key) if k != "None" else k: v[metric] - for importance_key, importance_value in feature_importances.items() - for k, v in importance_value.items()} - - output_path = ("{}/feature_importances-{}".format( - trainer._save_dir[:-1] if trainer._save_dir.endswith('/') else trainer._save_dir, - output_path if output_path is not None else str(time.time()))) - - if len(perfs) > 0: - logging.info("Writing feature_importances for {} to hdfs".format(perfs.keys())) - entries = [ - { - "name": name, - "drop": perfs["None"] - perfs[name], - "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8), - "perf": perfs[name] - } for name in perfs.keys()] - out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"] - for entry in sorted(entries, key=lambda d: d["drop"]): - out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry)) - logging.info("\n".join(out)) - write_list_to_hdfs_gfile(out, output_path) - logging.info("Wrote feature feature_importances to {}".format(output_path)) - else: - logging.info("Not writing feature_importances to hdfs") - return output_path - - -def write_feature_importances_to_ml_dash(trainer, feature_importances, feature_config=None): - # type: (DataRecordTrainer, FeatureConfig, dict) -> None - """Publish feature importances + all feature names to ML Metastore - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - feature_config (contrib.FeatureConfig): The feature config object. 
If this is not provided, it - is taken from the trainer - feature_importances (dict, default=None): Dictionary of precomputed feature importances - feature_importance_metric (str, default=None): The metric to write to ML Dashboard - """ - experiment_tracking_path = trainer.experiment_tracker.tracking_path\ - if trainer.experiment_tracker.tracking_path\ - else ExperimentTracker.guess_path(trainer._save_dir) - - logging.info('Computing feature importances for run: {}'.format(experiment_tracking_path)) - - feature_importance_list = [] - for key in feature_importances: - for feature, imps in feature_importances[key].items(): - logging.info('FEATURE NAME: {}'.format(feature)) - feature_name = feature.split(' (').pop(0) - for metric_name, value in imps.items(): - try: - imps[metric_name] = float(value) - logging.info('Wrote feature importance value {} for metric: {}'.format(str(value), metric_name)) - except Exception as ex: - logging.error("Skipping writing metric:{} to ML Metastore due to invalid metric value: {} or value type: {}. Exception: {}".format(metric_name, str(value), type(value), str(ex))) - pass - - feature_importance_list.append(FeatureImportance( - run_id=experiment_tracking_path, - feature_name=_process_feature_name_for_mldash(feature_name), - feature_importance_metrics=imps, - is_group=key == GROUP - )) - -# setting feature config to match the one used in compute_feature_importances - feature_config = feature_config or trainer._feature_config - feature_names = FeatureNames( - run_id=experiment_tracking_path, - names=list(feature_config.features.keys()) - ) - - try: - client = ModelRepoClient() - logging.info('Writing feature importances to ML Metastore') - client.add_feature_importances(feature_importance_list) - logging.info('Writing feature names to ML Metastore') - client.add_feature_names(feature_names) - except (HTTPError, RetryError) as err: - logging.error('Feature importance is not being written due to: ' - 'HTTPError when attempting to write to ML Metastore: \n{}.'.format(err)) + trainer: Trainer, + feature_importances: Dict, + output_path: str = None, + metric: str = "roc_auc", +) -> str: + """Publish a feature importance analysis to hdfs as a tsv + Args: + (see compute_feature_importances for other args) + trainer (Trainer) + feature_importances (dict): + Dictionary of feature importances + output_path (str): + The remote or local file to write the feature importances to. 
If not + provided, this is inferred to be the trainer save dir + metric (str): + The metric to write to tsv + """ + # String formatting appends (Individual) or (Group) to feature name depending on type + perfs = { + f"{k} ({importance_key})" if k != "None" else k: v[metric] + for importance_key, importance_value in feature_importances.items() + for k, v in importance_value.items() + } + + output_path = f"{trainer._save_dir[:-1] if trainer._save_dir.endswith('/') else trainer._save_dir}/feature_importances-{output_path if output_path is not None else str(time.time())}" + if len(perfs) > 0: + logging.info(f"Writing feature_importances for {perfs.keys()} to hdfs") + entries = [ + { + "name": name, + "drop": perfs["None"] - perfs[name], + "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8), + "perf": perfs[name], + } + for name in perfs.keys() + ] + out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"] + for entry in sorted(entries, key=lambda d: d["drop"]): + out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry)) + logging.info("\n".join(out)) + write_list_to_hdfs_gfile(out, output_path) + logging.info(f"Wrote feature feature_importances to {output_path}") + else: + logging.info("Not writing feature_importances to hdfs") + return output_path + + +def write_feature_importances_to_ml_dash( + trainer: DataRecordTrainer, + feature_importances: Dict[str, Dict[str, Dict[str, float]]], + feature_config: contrib.FeatureConfig = None, +) -> None: + """Publish feature importances + all feature names to ML Metastore + Args: + trainer (DataRecordTrainer): + A DataRecordTrainer object + feature_importances (dict, default=None): + Dictionary of precomputed feature importances + feature_config (contrib.FeatureConfig): + The feature config object. If this is not provided, it is taken from the trainer + """ + experiment_tracking_path = ( + trainer.experiment_tracker.tracking_path + if trainer.experiment_tracker.tracking_path + else ExperimentTracker.guess_path(trainer._save_dir) + ) + + logging.info(f"Computing feature importances for run: {experiment_tracking_path}") + feature_importance_list = [] + for key in feature_importances: + for feature, imps in feature_importances[key].items(): + logging.info(f"FEATURE NAME: {feature}") + feature_name = feature.split(" (").pop(0) + for metric_name, value in imps.items(): + try: + imps[metric_name] = float(value) + logging.info( + f"Wrote feature importance value {value} for metric: {metric_name}" + ) + except Exception as ex: + logging.error( + f"Skipping writing metric:{metric_name} to ML Metastore due to invalid metric value: {value} or value type: {type(value)}. 
Exception: {ex}" + ) + + feature_importance_list.append( + FeatureImportance( + run_id=experiment_tracking_path, + feature_name=_process_feature_name_for_mldash(feature_name), + feature_importance_metrics=imps, + is_group=key == GROUP, + ) + ) + + # setting feature config to match the one used in compute_feature_importances + feature_config = feature_config or trainer._feature_config + feature_names = FeatureNames( + run_id=experiment_tracking_path, names=list(feature_config.features.keys()) + ) + + try: + client = ModelRepoClient() + logging.info("Writing feature importances to ML Metastore") + client.add_feature_importances(feature_importance_list) + logging.info("Writing feature names to ML Metastore") + client.add_feature_names(feature_names) + except (HTTPError, RetryError) as err: + logging.error( + "Feature importance is not being written due to: " + f"HTTPError when attempting to write to ML Metastore: \n{err}." + ) diff --git a/twml/twml/contrib/feature_importances/feature_permutation.py b/twml/twml/contrib/feature_importances/feature_permutation.py index 809f5fde0..c523106dc 100644 --- a/twml/twml/contrib/feature_importances/feature_permutation.py +++ b/twml/twml/contrib/feature_importances/feature_permutation.py @@ -1,129 +1,190 @@ -from copy import deepcopy import random import types +from copy import deepcopy +from typing import Callable, List, Tuple +import tensorflow.compat.v1 as tf +from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error +from tensorflow.compat.v1 import logging from twitter.deepbird.util.thrift.simple_converters import ( - bytes_to_thrift_object, thrift_object_to_bytes) + bytes_to_thrift_object, + thrift_object_to_bytes, +) -from tensorflow.compat.v1 import logging -from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error -import tensorflow.compat.v1 as tf import twml class PermutedInputFnFactory(object): + def __init__( + self, + data_dir: str, + record_count: int, + file_list: List[str] = None, + datarecord_filter_fn: Callable[[DataRecord], bool] = None, + ): + """ + Args: + data_dir (str): + The location of the records on hdfs + record_count (int): + The number of records to process + file_list (list[str], default=None): + The list of data files on HDFS. If provided, use this instead of data_dir + datarecord_filter_fn (function): + a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format + and return a boolean value, to indicate if this data record should be kept in feature importance module or not. + """ + if not (data_dir is None) ^ (file_list is None): + raise ValueError( + f"Exactly one of data_dir and file_list can be provided. 
Got {data_dir} for data_dir and {file_list} for file_list" + ) + + file_list = ( + file_list + if file_list is not None + else twml.util.list_files(twml.util.preprocess_path(data_dir)) + ) + _next_batch = twml.input_fns.default_input_fn( + file_list, 1, lambda x: x, num_threads=2, shuffle=True, shuffle_files=True + ) + self.records = [] + # Validate datarecord_filter_fn + if datarecord_filter_fn is not None and not isinstance( + datarecord_filter_fn, types.FunctionType + ): + raise TypeError("datarecord_filter_fn is not function type") + with tf.Session() as sess: + for i in range(record_count): + try: + record = bytes_to_thrift_object( + sess.run(_next_batch)[0], DataRecord + ) + if datarecord_filter_fn is None or datarecord_filter_fn(record): + self.records.append(record) + except tf.errors.OutOfRangeError: + logging.info( + f"Stopping after reading {i} records out of {record_count}" + ) + break + if datarecord_filter_fn: + logging.info( + f"datarecord_filter_fn has been applied; keeping {len(self.records)} records out of {record_count}" + ) + + def _get_record_generator(self) -> Tuple[bytes]: + return (thrift_object_to_bytes(r) for r in self.records) + + def get_permuted_input_fn( + self, batch_size: int, parse_fn: callable, fname_ftypes: List[Tuple[str, str]] + ) -> callable: + """Get an input function that passes in a preset number of records that have been feature permuted + + Args: + batch_size (int): The batch size to use + parse_fn (function): The function to parse inputs + fname_ftypes: (list<(str, str)>): The names and types of the features to permute + + Returns: + A function that returns a batch of permuted records + """ + + def permuted_parse_pyfn(bytes_array: List[bytes]) -> List[bytes]: + """Parse a list of bytes into a list of parsed bytes""" + + out = [] + for b in bytes_array: + rec = bytes_to_thrift_object(b, DataRecord) + if fname_ftypes: + rec = _permutate_features( + rec, fname_ftypes=fname_ftypes, records=self.records + ) + out.append(thrift_object_to_bytes(rec)) + return [out] + + def permuted_parse_fn(bytes_tensor: tf.Tensor) -> tf.Tensor: + """Parse a tensor of bytes into a tensor of parsed bytes""" + parsed_bytes_tensor = parse_fn( + tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string) + ) + return parsed_bytes_tensor + + def input_fn( + batch_size: int = batch_size, parse_fn: callable = parse_fn, factory=self + ) -> tf.Tensor: + """The input function to return""" + + return ( + tf.data.Dataset.from_generator(self._get_record_generator, tf.string) + .batch(batch_size) + .map(permuted_parse_fn, 4) + .make_one_shot_iterator() + .get_next() + ) + + return input_fn + + +def _permutate_features( + rec: DataRecord, fname_ftypes: List[Tuple[str, str]], records: List[DataRecord] +) -> DataRecord: + """Replace a feature value with a value from random selected record - def __init__(self, data_dir, record_count, file_list=None, datarecord_filter_fn=None): - """ - Args: - data_dir (str): The location of the records on hdfs - record_count (int): The number of records to process - file_list (list, default=None): The list of data files on HDFS. If provided, use this instead - of data_dir - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - if not (data_dir is None) ^ (file_list is None): - raise ValueError("Exactly one of data_dir and file_list can be provided. 
Got {} for data_dir and {} for file_list".format( - data_dir, file_list)) - - file_list = file_list if file_list is not None else twml.util.list_files(twml.util.preprocess_path(data_dir)) - _next_batch = twml.input_fns.default_input_fn(file_list, 1, lambda x: x, - num_threads=2, shuffle=True, shuffle_files=True) - self.records = [] - # Validate datarecord_filter_fn - if datarecord_filter_fn is not None and not isinstance(datarecord_filter_fn, types.FunctionType): - raise TypeError("datarecord_filter_fn is not function type") - with tf.Session() as sess: - for i in range(record_count): - try: - record = bytes_to_thrift_object(sess.run(_next_batch)[0], DataRecord) - if datarecord_filter_fn is None or datarecord_filter_fn(record): - self.records.append(record) - except tf.errors.OutOfRangeError: - logging.info("Stopping after reading {} records out of {}".format(i, record_count)) - break - if datarecord_filter_fn: - logging.info("datarecord_filter_fn has been applied; keeping {} records out of {}".format(len(self.records), record_count)) - - def _get_record_generator(self): - return (thrift_object_to_bytes(r) for r in self.records) - - def get_permuted_input_fn(self, batch_size, parse_fn, fname_ftypes): - """Get an input function that passes in a preset number of records that have been feature permuted Args: - parse_fn (function): The function to parse inputs - fname_ftypes: (list<(str, str)>): The names and types of the features to permute + rec: (datarecord): + A datarecord returned from DataRecordGenerator + fname_ftypes: (list<(str, str)>): + The names and types of the features to permute + records: (list): + The records to sample from + + Returns: + The record with the feature permuted """ - def permuted_parse_pyfn(bytes_array): - out = [] - for b in bytes_array: - rec = bytes_to_thrift_object(b, DataRecord) - if fname_ftypes: - rec = _permutate_features(rec, fname_ftypes=fname_ftypes, records=self.records) - out.append(thrift_object_to_bytes(rec)) - return [out] - - def permuted_parse_fn(bytes_tensor): - parsed_bytes_tensor = parse_fn(tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string)) - return parsed_bytes_tensor - - def input_fn(batch_size=batch_size, parse_fn=parse_fn, factory=self): - return (tf.data.Dataset - .from_generator(self._get_record_generator, tf.string) - .batch(batch_size) - .map(permuted_parse_fn, 4) - .make_one_shot_iterator() - .get_next()) - return input_fn - - -def _permutate_features(rec, fname_ftypes, records): - """Replace a feature value with a value from random selected record - Args: - rec: (datarecord): A datarecord returned from DataRecordGenerator - fname_ftypes: (list<(str, str)>): The names and types of the features to permute - records: (list): The records to sample from - Returns: - The record with the feature permuted - """ - rec_new = deepcopy(rec) - rec_replace = random.choice(records) - - # If the replacement datarecord does not have the feature type entirely, add it in - # to make the logic a bit simpler - for fname, feature_type in fname_ftypes: - fid = twml.feature_id(fname)[0] - if rec_replace.__dict__.get(feature_type, None) is None: - rec_replace.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - if rec_new.__dict__.get(feature_type, None) is None: - rec_new.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - - if feature_type != 'binaryFeatures': - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, dict()): - # If the 
replacement datarecord does not contain the feature but the original does - del rec_new.__dict__[feature_type][fid] - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = dict() - rec_new.__dict__[feature_type][fid] = rec_replace.__dict__[feature_type][fid] - else: - # If neither datarecord contains this feature - pass - else: - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, set()): - # If the replacement datarecord does not contain the feature but the original does - rec_new.__dict__[feature_type].remove(fid) - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = set() - rec_new.__dict__[feature_type].add(fid) - # If neither datarecord contains this feature - else: - # If neither datarecord contains this feature - pass - return rec_new + rec_new = deepcopy(rec) + rec_replace = random.choice(records) + + # If the replacement datarecord does not have the feature type entirely, add it in + # to make the logic a bit simpler + for fname, feature_type in fname_ftypes: + fid = twml.feature_id(fname)[0] + if rec_replace.__dict__.get(feature_type, None) is None: + rec_replace.__dict__[feature_type] = ( + dict() if feature_type != "binaryFeatures" else set() + ) + if rec_new.__dict__.get(feature_type, None) is None: + rec_new.__dict__[feature_type] = ( + dict() if feature_type != "binaryFeatures" else set() + ) + + if feature_type != "binaryFeatures": + if fid not in rec_replace.__dict__[ + feature_type + ] and fid in rec_new.__dict__.get(feature_type, dict()): + # If the replacement datarecord does not contain the feature but the original does + del rec_new.__dict__[feature_type][fid] + elif fid in rec_replace.__dict__[feature_type]: + # If the replacement datarecord does contain the feature + if rec_new.__dict__[feature_type] is None: + rec_new.__dict__[feature_type] = dict() + rec_new.__dict__[feature_type][fid] = rec_replace.__dict__[ + feature_type + ][fid] + else: + # If neither datarecord contains this feature + pass + else: + if fid not in rec_replace.__dict__[ + feature_type + ] and fid in rec_new.__dict__.get(feature_type, set()): + # If the replacement datarecord does not contain the feature but the original does + rec_new.__dict__[feature_type].remove(fid) + elif fid in rec_replace.__dict__[feature_type]: + # If the replacement datarecord does contain the feature + if rec_new.__dict__[feature_type] is None: + rec_new.__dict__[feature_type] = set() + rec_new.__dict__[feature_type].add(fid) + # If neither datarecord contains this feature + else: + # If neither datarecord contains this feature + pass + return rec_new diff --git a/twml/twml/contrib/feature_importances/helpers.py b/twml/twml/contrib/feature_importances/helpers.py index f3f600e8b..f794c33bf 100644 --- a/twml/twml/contrib/feature_importances/helpers.py +++ b/twml/twml/contrib/feature_importances/helpers.py @@ -1,96 +1,113 @@ import uuid +from typing import List +import tensorflow.compat.v1 as tf from tensorflow.compat.v1 import logging + import twml -import tensorflow.compat.v1 as tf -def write_list_to_hdfs_gfile(list_to_write, output_path): - """Use tensorflow gfile to write a list to a location on hdfs""" - locname = "/tmp/{}".format(str(uuid.uuid4())) - with open(locname, "w") as f: - for row in list_to_write: - 
f.write("%s\n" % row) - tf.io.gfile.copy(locname, output_path, overwrite=False) - - -def decode_str_or_unicode(str_or_unicode): - return str_or_unicode.decode() if hasattr(str_or_unicode, 'decode') else str_or_unicode - - -def longest_common_prefix(strings, split_character): - """ - Args: - string (list): The list of strings to find the longest common prefix of - split_character (str): If not None, require that the return string end in this character or - be the length of the entire string - Returns: - The string corresponding to the longest common prefix - """ - sorted_strings = sorted(strings) - s1, s2 = sorted_strings[0], sorted_strings[-1] - if s1 == s2: - # If the strings are the same, just return the full string - out = s1 - else: - # If the strings are not the same, return the longest common prefix optionally ending in split_character - ix = 0 - for i in range(min(len(s1), len(s2))): - if s1[i] != s2[i]: - break - if split_character is None or s1[i] == split_character: - ix = i + 1 - out = s1[:ix] - return out - - -def _expand_prefix(fname, prefix, split_character): - if len(fname) == len(prefix): - # If the prefix is already the full feature, just take the feature name - out = fname - elif split_character is None: - # Advance the prefix by one character - out = fname[:len(prefix) + 1] - else: - # Advance the prefix to the next instance of split_character or the end of the string - for ix in range(len(prefix), len(fname)): - if fname[ix] == split_character: - break - out = fname[:ix + 1] - return out - - -def _get_feature_types_from_records(records, fnames): - # This method gets the types of the features in fnames by looking at the datarecords themselves. - # The reason why we do this rather than extract the feature types from the feature_config is - # that the feature naming conventions in the feature_config are different from those in the - # datarecords. 
-  fids = [twml.feature_id(fname)[0] for fname in fnames]
-  feature_to_type = {}
-  for record in records:
-    for feature_type, values in record.__dict__.items():
-      if values is not None:
-        included_ids = set(values)
-        for fname, fid in zip(fnames, fids):
-          if fid in included_ids:
-            feature_to_type[fname] = feature_type
-  return feature_to_type
-
-
-def _get_metrics_hook(trainer):
-  def get_metrics_fn(trainer=trainer):
-    return {k: v[0]for k, v in trainer.current_estimator_spec.eval_metric_ops.items()}
-  return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn)
-
-
-def _get_feature_name_from_config(feature_config):
-  """Extract the names of the features on a feature config object
-  """
-  decoded_feature_names = []
-  for f in feature_config.get_feature_spec()['features'].values():
-    try:
-      fname = decode_str_or_unicode(f['featureName'])
-    except UnicodeEncodeError as e:
-      logging.error("Encountered decoding exception when decoding %s: %s" % (f, e))
-    decoded_feature_names.append(fname)
-  return decoded_feature_names
+def write_list_to_hdfs_gfile(list_to_write: List[str], output_path: str) -> None:
+    """Use tensorflow gfile to write a list to a location on hdfs"""
+    locname = f"/tmp/{str(uuid.uuid4())}"
+    with open(locname, "w") as f:
+        for row in list_to_write:
+            f.write("%s\n" % row)
+    tf.io.gfile.copy(locname, output_path, overwrite=False)
+
+
+def decode_str_or_unicode(str_or_unicode: str) -> str:
+    if hasattr(str_or_unicode, "decode"):
+        return str_or_unicode.decode()
+    return str_or_unicode
+
+
+def longest_common_prefix(strings: List[str], split_character: str) -> str:
+    """
+    Args:
+      strings (list): The list of strings to find the longest common prefix of
+      split_character (str): If not None, require that the return string end in this character or
+        be the length of the entire string
+    Returns:
+      The string corresponding to the longest common prefix
+    """
+    sorted_strings = sorted(strings)
+    s1, s2 = sorted_strings[0], sorted_strings[-1]
+    if s1 == s2:
+        # If the strings are the same, just return the full string
+        out = s1
+    else:
+        # If the strings are not the same, return the longest common prefix optionally ending in split_character
+        ix = 0
+        for i in range(min(len(s1), len(s2))):
+            if s1[i] != s2[i]:
+                break
+            if split_character is None or s1[i] == split_character:
+                ix = i + 1
+        out = s1[:ix]
+    return out
+
+
+def _expand_prefix(fname: str, prefix: str, split_character: str) -> str:
+    """Expand the prefix of a feature name to the next split_character or the end of the string"""
+
+    if len(fname) == len(prefix):
+        # If the prefix is already the full feature, just take the feature name
+        out = fname
+    elif split_character is None:
+        # Advance the prefix by one character
+        out = fname[: len(prefix) + 1]
+    else:
+        # Advance the prefix to the next instance of split_character or the end of the string
+        for ix in range(len(prefix), len(fname)):
+            if fname[ix] == split_character:
+                break
+        out = fname[: ix + 1]
+    return out
+
+
+def _get_feature_types_from_records(
+    records: List["DataRecord"], fnames: List[str]
+) -> dict:
+    """Get the types of the features in fnames by looking at the datarecords themselves"""
+
+    # This method gets the types of the features in fnames by looking at the datarecords themselves.
+    # The reason why we do this rather than extract the feature types from the feature_config is
+    # that the feature naming conventions in the feature_config are different from those in the
+    # datarecords.
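Concretely, the inference loop that follows behaves like this sketch, with plain dicts standing in for thrift DataRecords and made-up feature ids (the real code derives ids from feature names via twml.feature_id):

records = [
    {"continuousFeatures": {101: 0.5}, "binaryFeatures": {202}},
    {"continuousFeatures": {101: 1.0}, "binaryFeatures": None},
]
fname_to_fid = {"user.age": 101, "user.verified": 202}  # hypothetical ids

feature_to_type = {}
for record in records:
    for feature_type, values in record.items():  # record.__dict__ in the real code
        if values is not None:
            included_ids = set(values)
            for fname, fid in fname_to_fid.items():
                if fid in included_ids:
                    feature_to_type[fname] = feature_type
print(feature_to_type)
# {'user.age': 'continuousFeatures', 'user.verified': 'binaryFeatures'}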
+ fids = [twml.feature_id(fname)[0] for fname in fnames] + feature_to_type = {} + for record in records: + for feature_type, values in record.__dict__.items(): + if values is not None: + included_ids = set(values) + for fname, fid in zip(fnames, fids): + if fid in included_ids: + feature_to_type[fname] = feature_type + return feature_to_type + + +def _get_metrics_hook(trainer: twml.Trainer) -> tf.train.SessionRunHook: + """Get a hook that returns the metrics from the current estimator spec""" + + def get_metrics_fn(trainer=trainer): + return { + k: v[0] for k, v in trainer.current_estimator_spec.eval_metric_ops.items() + } + + return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn) + + +def _get_feature_name_from_config(feature_config: twml.FeatureConfig) -> List[str]: + """Extract the names of the features on a feature config object""" + + decoded_feature_names = [] + for f in feature_config.get_feature_spec()["features"].values(): + try: + fname = decode_str_or_unicode(f["featureName"]) + except UnicodeEncodeError as e: + logging.error( + "Encountered decoding exception when decoding %s: %s" % (f, e) + ) + decoded_feature_names.append(fname) + return decoded_feature_names diff --git a/twml/twml/contrib/hooks.py b/twml/twml/contrib/hooks.py index 6d68831fc..d76cefbab 100644 --- a/twml/twml/contrib/hooks.py +++ b/twml/twml/contrib/hooks.py @@ -1,42 +1,50 @@ import datetime +from typing import Union -from absl import logging import pytz import tensorflow.compat.v1 as tf +from absl import logging class StopAtTimeHook(tf.train.SessionRunHook): - """ - Hook that stops training at a fixed datetime - """ - - def __init__(self, stop_time): """ - Arguments: - stop_time: - a datetime.datetime or a datetime.timedelta specifying when to stop. - For naive datetime.datetime objects (with no time zone specified), - UTC time zone is assumed. + Hook that stops training at a fixed datetime """ - if isinstance(stop_time, datetime.timedelta): - self._stop_datetime = pytz.utc.localize(datetime.datetime.utcnow() + stop_time) - elif isinstance(stop_time, datetime.datetime): - if stop_time.tzinfo is None: - self._stop_datetime = pytz.utc.localize(stop_time) - else: - self._stop_datetime = stop_time.astimezone(pytz.UTC) - else: - raise ValueError("Expecting datetime or timedelta for stop_time arg") - self._stop_requested = False - def after_run(self, run_context, run_values): - delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow()) - if delta.total_seconds() <= 0: - logging.info("StopAtTimeHook reached stop_time; requesting stop") - run_context.request_stop() - self._stop_requested = True + def __init__(self, stop_time: Union[datetime.datetime, datetime.timedelta]): + """ + Args: + stop_time: + a datetime.datetime or a datetime.timedelta specifying when to stop. + For naive datetime.datetime objects (with no time zone specified), + UTC time zone is assumed. 
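        Example (illustrative; assumes an Estimator-style training loop):

            hook = StopAtTimeHook(datetime.timedelta(hours=8))
            estimator.train(input_fn, hooks=[hook])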
+ """ + if isinstance(stop_time, datetime.timedelta): + self._stop_datetime = pytz.utc.localize( + datetime.datetime.utcnow() + stop_time + ) + elif isinstance(stop_time, datetime.datetime): + if stop_time.tzinfo is None: + self._stop_datetime = pytz.utc.localize(stop_time) + else: + self._stop_datetime = stop_time.astimezone(pytz.UTC) + else: + raise ValueError("Expecting datetime or timedelta for stop_time arg") + self._stop_requested = False + + def after_run( + self, + run_context: tf.train.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + """Called after each call to run().""" + delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow()) + if delta.total_seconds() <= 0: + logging.info("StopAtTimeHook reached stop_time; requesting stop") + run_context.request_stop() + self._stop_requested = True - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + @property + def stop_requested(self) -> bool: + """true if this hook requested a stop""" + return self._stop_requested diff --git a/twml/twml/contrib/initializers.py b/twml/twml/contrib/initializers.py index 52bad3a19..fd7c73abe 100644 --- a/twml/twml/contrib/initializers.py +++ b/twml/twml/contrib/initializers.py @@ -1,61 +1,76 @@ +from typing import Optional + import numpy as np import tensorflow.compat.v1 as tf - TWML_INIT_FEED_KEY = "TWML_INIT_FEED_COLLECTION" class PartitionConstant(tf.keras.initializers.Constant): - """A constant initializer that supports partitions""" - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if not isinstance(self.value, np.ndarray): - raise ValueError( - "Currently, PartitionConstant only supports " - "partitioning on np.ndarrays. Got {}".format(type(self.value).__name__)) - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value + """A constant initializer that supports partitions""" + + def __call__( + self, + shape: tf.TensorShape, + dtype: Optional[tf.Dtype] = None, + partition_info: Optional[tf.VariablePartitionInfo] = None, + ) -> tf.Tensor: + if partition_info is not None: + if not isinstance(self.value, np.ndarray): + raise ValueError( + "Currently, PartitionConstant only supports " + f"partitioning on np.ndarrays. Got {type(self.value).__name__}" + ) + offsets = partition_info.var_offset + indices = tuple( + [slice(offset, offset + size) for offset, size in zip(offsets, shape)] + ) + subset = self.value[indices] + return subset + return self.value partition_constant_initializer = PartitionConstant class PlaceholderInitializer(tf.keras.initializers.Initializer): - """A placeholder initializer that supports partitions""" - - def __init__(self, shape, dtype): - self.dtype = dtype - self.value = tf.placeholder(dtype=dtype, shape=shape) - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if self.dtype != dtype: - raise ValueError("dtype does not match placeholder dtype") - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value - - -def get_init_feed_dict(): - """Get the init feed dictionary to be used when running the init op.""" - # Get the reference to the collection. 
- init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) - init_feed_dict = {} - for d in init_feed_collection: - init_feed_dict.update(d) - return init_feed_dict - - -def clear_init_feed_collection(): - """Clear the init feed collection.""" - init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) - while init_feed_collection: - init_feed_collection.pop() + """A placeholder initializer that supports partitions""" + + def __init__(self, shape: tf.TensorShape, dtype: tf.Dtype = tf.float32): + self.dtype = dtype + self.value = tf.placeholder(dtype=dtype, shape=shape) + + def __call__( + self, + shape: tf.TensorShape, + dtype: Optional[tf.Dtype] = None, + partition_info: Optional[tf.VariablePartitionInfo] = None, + ) -> tf.Tensor: + if partition_info is not None: + if self.dtype != dtype: + raise ValueError("dtype does not match placeholder dtype") + offsets = partition_info.var_offset + indices = tuple( + [slice(offset, offset + size) for offset, size in zip(offsets, shape)] + ) + subset = self.value[indices] + return subset + else: + return self.value + + +def get_init_feed_dict() -> dict: + """Get the init feed dictionary to be used when running the init op.""" + # Get the reference to the collection. + init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) + init_feed_dict = {} + for d in init_feed_collection: + init_feed_dict.update(d) + return init_feed_dict + + +def clear_init_feed_collection() -> None: + """Clear the init feed collection.""" + init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) + while init_feed_collection: + init_feed_collection.pop() diff --git a/twml/twml/contrib/layers/__init__.py b/twml/twml/contrib/layers/__init__.py index aa6e7d7e4..ac29dcc7c 100644 --- a/twml/twml/contrib/layers/__init__.py +++ b/twml/twml/contrib/layers/__init__.py @@ -1,11 +1,12 @@ # pylint: disable=wildcard-import """ This module contains all contrib Layers. """ +from .embedding_lookup import EmbeddingLookup # noqa: F401 +from .factorization_machine import FactorizationMachine # noqa: F401 +from .full_dense import FullDense, full_dense # noqa: F401 from .hashed_percentile_discretizer import HashedPercentileDiscretizer # noqa: F401 from .hashing_discretizer import HashingDiscretizer # noqa: F401 from .mask_layer import MaskLayer # noqa: F401 -from .embedding_lookup import EmbeddingLookup # noqa: F401 -from .factorization_machine import FactorizationMachine # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 from .stacked_rnn import StackedRNN, stacked_rnn # noqa: F401 -from .zscore_normalization import ZscoreNormalization, zscore_normalization # noqa: F401 +from .zscore_normalization import ZscoreNormalization # noqa: F401 +from .zscore_normalization import zscore_normalization diff --git a/twml/twml/contrib/layers/embedding_lookup.py b/twml/twml/contrib/layers/embedding_lookup.py index c83dc7edd..b86565da0 100644 --- a/twml/twml/contrib/layers/embedding_lookup.py +++ b/twml/twml/contrib/layers/embedding_lookup.py @@ -1,12 +1,12 @@ +import argparse import os import re import time +from typing import Dict, Optional, Tuple -from collections import OrderedDict - -from absl import logging import numpy as np import tensorflow.compat.v1 as tf +from absl import logging from tensorflow.python.ops.lookup_ops import index_table_from_tensor import twml @@ -17,403 +17,435 @@ def load_initializers_from_csv( - embedding_path, vocab_size=-1, embedding_size=None, separator=None, vocab=None -): - """ - Loads embeddings saved in the `glove format `_. 
- The glove format is a txt file separated by spaces. - Each line looks like: "word 0.00001 0.2334 ...". - - Arguments: - embedding_path: - path to the embeddings file on HDFS (hdfs://default/...) - or its local_path (/path/to/...). - The embedding_path may also specify a pattern. In which case, the embeddings - are read in the lexical order of the filenames that match the order. - vocab_size: - the maximum size of the vocabulary. The top ``vocab_size`` words in the file - are included in the vocabulary. If you specify a positive vocab_size, - the words are expected to be in descending order of frequency. - This allows the embeddings to be easily filtered to top vocab_size words. - Reducing the vocab_size acts as a regularizer, preventing the model to overfit on rarer words. - A negative vocab_size loads all embeddings. - Reducing the vocab_size may also help with memory issues, - allowing the embedding initializers to fit inside the graph. - embedding_size: - Defaults to None. If None, the embedding size is infered from the file name. - For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both infrered - as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is - inferred from the first line in the file. If ``embedding_size`` is provided, - only the last ``embedding_size`` values of each line are considered. This - allows the line parser to recover from partial word parsing errors. - separator: - Specifies the separator to use when splitting each line into values. - Default value is a whitespace (same as glove format). - vocab: - OrderedDict mapping words to np.array embedding vectors. Initializes the vocabulary. - Duplicate words found in the file are ignored. - Defaults to a vocabulary of two words:: - - vocab = OrderedDict() - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - Returns: - tuple of (vocab_initializer, weight_initializer, shape) - - vocab_initializer: - A tf.constant_initializer containing a vector of word strings of size vocab_size. - weight_initializer: - A twml.contrib.initializers.partition_constant_initializer containing - the weight matrix of embeddings of size vocab_size x embedding_size. - shape: - A tuple containing of (vocab_size, embedding_size). - - """ - - start = time.time() - - embedding_path = twml.util.sanitize_hdfs_path(embedding_path) - - is_user_vocab = True - if vocab is None: - vocab = OrderedDict() - vocab[''] = True - vocab[''] = True - is_user_vocab = False - elif not isinstance(vocab, OrderedDict): - raise RuntimeError( - "Expecting vocab argument of type OrderedDict or None. " - "Got type %s instead." % type(vocab).__name__ - ) + embedding_path: str, + vocab_size: int = -1, + embedding_size: int = None, + separator: str = " ", + vocab: Dict[str, np.ndarray] = None, +) -> Tuple[ + tf.constant_initializer, tf.keras.initializers.PartitionedConstant, Tuple[int, int] +]: + """ + Loads embeddings saved in the `glove format `_. + The glove format is a txt file separated by spaces. + Each line looks like: "word 0.00001 0.2334 ...". + + Args: + embedding_path: + path to the embeddings file on HDFS (hdfs://default/...) + or its local_path (/path/to/...). + The embedding_path may also specify a pattern. In which case, the embeddings + are read in the lexical order of the filenames that match the order. + vocab_size: + the maximum size of the vocabulary. The top ``vocab_size`` words in the file + are included in the vocabulary. 
If you specify a positive vocab_size,
+            the words are expected to be in descending order of frequency.
+            This allows the embeddings to be easily filtered to top vocab_size words.
+            Reducing the vocab_size acts as a regularizer, preventing the model
+            from overfitting on rarer words.
+            A negative vocab_size loads all embeddings.
+            Reducing the vocab_size may also help with memory issues,
+            allowing the embedding initializers to fit inside the graph.
+        embedding_size:
+            Defaults to None. If None, the embedding size is inferred from the file name.
+            For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both be inferred
+            as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is
+            inferred from the first line in the file. If ``embedding_size`` is provided,
+            only the last ``embedding_size`` values of each line are considered. This
+            allows the line parser to recover from partial word parsing errors.
+        separator:
+            Specifies the separator to use when splitting each line into values.
+            Default value is a whitespace (same as glove format).
+        vocab:
+            dict mapping words to np.array embedding vectors. Initializes the vocabulary.
+            Duplicate words found in the file are ignored.
+            Defaults to a vocabulary of two words::
+
+                vocab = dict()
+                vocab[''] = np.random.randn(embedding_size)
+                vocab[''] = np.random.randn(embedding_size)
-  if embedding_size is None:
-    embedding_file = os.path.basename(embedding_path)
-    match = re.search(r"[^\d]([\d]+)d", embedding_file)
-    if match is not None:
-      embedding_size = int(match.group(1))
+    Returns:
+        tuple of (vocab_initializer, weight_initializer, shape)
+        vocab_initializer:
+            A tf.constant_initializer containing a vector of word strings of size vocab_size.
+        weight_initializer:
+            A twml.contrib.initializers.partition_constant_initializer containing
+            the weight matrix of embeddings of size vocab_size x embedding_size.
+        shape:
+            A tuple of (vocab_size, embedding_size).
+    """
-  if embedding_size is not None and not isinstance(embedding_size, int):
-    raise RuntimeError(
-      "Expecting embedding_size argument of type int or None. "
-      "Got type %s, instead." % type(embedding_size).__name__
+    start = time.time()
+
+    embedding_path = twml.util.sanitize_hdfs_path(embedding_path)
+
+    is_user_vocab = True
+    if vocab is None:
+        vocab = dict()
+        vocab[""] = True
+        vocab[""] = True
+        is_user_vocab = False
+
+    elif not isinstance(vocab, dict):
+        raise RuntimeError(
+            "Expecting vocab argument of type dict or None. "
+            "Got type %s instead." % type(vocab).__name__
+        )
+
+    if embedding_size is None:
+        embedding_file = os.path.basename(embedding_path)
+        match = re.search(r"[^\d]([\d]+)d", embedding_file)
+        if match is not None:
+            embedding_size = int(match.group(1))
+
+    if embedding_size is not None and not isinstance(embedding_size, int):
+        raise RuntimeError(
+            "Expecting embedding_size argument of type int or None. "
+            "Got type %s, instead." % type(embedding_size).__name__
+        )
+
+    embedding_paths = sorted(tf.io.gfile.glob(embedding_path))
+
+    if len(embedding_paths) > 1:
+        raise ValueError("You are most likely using the wrong --embedding.path")
+
+    embedding_path = embedding_paths[0]
+    logging.info("Reading embeddings file from path %s.." % embedding_path)
+
+    with tf.io.gfile.GFile(embedding_path) as f:
+        lines = f.readlines()
+
+    logging.info("Done reading embeddings file from path %s."
% embedding_path) + + logging.info("Parsing vocabulary and embeddings...") + + for line in lines: + # Word and weights separated by space + values = line.strip().split(separator) + # Word is first symbol on each line + word = values[0] + + if word not in vocab: + if embedding_size is None or embedding_size <= 0: + # get all elements after the first one. + word_weights = values[1:] + embedding_size = len(word_weights) + else: + # get the last embedding_size elements + word_weights = values[-min(embedding_size, len(values) - 1) :] + + try: + if len(word_weights) != embedding_size: + raise ValueError + + word_weights = np.asarray(word_weights, dtype=np.float32) + vocab[word] = word_weights + except ValueError: + logging.info( + "Wasn't able to load embeddings for word '%s'. Ignoring it" % word + ) + + vocab_len = len(vocab) + if vocab_size > 0 and vocab_len == vocab_size: + # Limit vocabulary to top terms + break + elif (vocab_len % 1000) == 0: + logging.info("Loaded %d words into vocab" % vocab_len) + + else: + logging.info("found duplicate word: %s" % word) + + if not is_user_vocab: + vocab[""] = np.random.randn(embedding_size) + vocab[""] = np.random.randn(embedding_size) + + words = list(vocab.keys()) + weights = list(vocab.values()) + + weights = np.asarray(weights, dtype=np.float32) + assert weights.shape[0] == len(vocab) + assert weights.shape[1] == embedding_size + + vocab_initializer = tf.constant_initializer(words, tf.string) + weight_initializer = twml.contrib.initializers.PartitionConstant( + weights, tf.float32 ) - embedding_paths = sorted(tf.io.gfile.glob(embedding_path)) - - if len(embedding_paths) > 1: - raise ValueError( - "You are most likely using a the wrong --embedding.path" + logging.info( + "Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start) ) + return vocab_initializer, weight_initializer, weights.shape - embedding_path = embedding_paths[0] - logging.info("Reading embeddings file from path %s.." % embedding_path) - - with tf.io.gfile.GFile(embedding_path) as f: - lines = f.readlines() - - logging.info("Done reading embeddings file from path %s." % embedding_path) - - logging.info("Parsing vocbulary and embeddings...") - - for line in lines: - # Word and weights separated by space - values = line.strip().split(separator) - # Word is first symbol on each line - word = values[0] - - if word not in vocab: - if embedding_size is None or embedding_size <= 0: - # get all elements after the first one. - word_weights = values[1:] - embedding_size = len(word_weights) - else: - # get the last embedding_size elements - word_weights = values[-min(embedding_size, len(values) - 1) :] - - try: - if len(word_weights) != embedding_size: - raise ValueError - - word_weights = np.asarray(word_weights, dtype=np.float32) - vocab[word] = word_weights - except ValueError: - logging.info("Wasn't able to load embeddings for word '%s'. 
Ignoring it" % word) - - vocab_len = len(vocab) - if vocab_size > 0 and vocab_len == vocab_size: - # Limit vocabulary to top terms - break - elif (vocab_len % 1000) == 0: - logging.info("Loaded %d words into vocab" % vocab_len) - - else: - logging.info("found duplicate word: %s" % word) - - if not is_user_vocab: - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - words = list(vocab.keys()) - weights = list(vocab.values()) - - weights = np.asarray(weights, dtype=np.float32) - assert weights.shape[0] == len(vocab) - assert weights.shape[1] == embedding_size - - vocab_initializer = tf.constant_initializer(words, tf.string) - weight_initializer = twml.contrib.initializers.PartitionConstant(weights, tf.float32) - - logging.info("Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start)) - return vocab_initializer, weight_initializer, weights.shape - - -def add_parser_arguments(parser): - """ - Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. - These can be used to call an initializer loader function like - the ``load_initializers_from_csv`` function. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument( - "--embedding.path", - "--embedding_path", - dest="embedding_path", - type=str, - default=None, - help="When specified, loads glove embeddings from .txt glove file", - ) - parser.add_argument( - "--embedding.vocab_size", - "--embedding_vocab_size", - dest="embedding_vocab_size", - type=int, - default=-1, - help="Size of vocabulary. Uses this many of the most frequent terms. Defaults to -1 (use full vocab).", - ) - - return parser - -class EmbeddingLookup(twml.layers.Layer): - """Layer for looking up embeddings. - Transforms a sequence of strings to a sequence of embeddings. - - Arguments: - vocab_size: - The number of word strings and embeddings in the vocabulary. - output_size: - Long or Integer, dimensionality of the output space. The embedding vector size. - vocab_initializer: - Initializer function for the vocabulary. Required. The initializer should - return a list of strings of size vocab_size. - weight_initializer: - Initializer function for the weight matrix of size vocab_size x output_size. - This argument defaults to zeros_initializer(). - This is valid when the EmbeddingLookup is the first layer of - parameters but should be changed otherwise. - trainable: - Boolean, if `True` adds variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - Defaults to True: trains the embeddings. - num_oov_buckets: - The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket - ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not - specified, index `OOV_WORD_ID` is used for OOV strings. - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column, does not support yet) - weight_regularizer: - Regularizer function for the weight matrix. 
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - dtype: - Defaults to tf.float32. Specifies the dtype of the weights. - use_placeholder: - Defaults to True. - If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`. - If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support. - checkpoint_dir: - Default to None. - If set to the path of a checkpoint, load embedding from the checkpoint. - convert_to_lowercase: - Default to True. - Converting all string inputs to lowercase. - - Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`. - """ - - def __init__( - self, - vocab_size, - output_size, - vocab_initializer, - weight_initializer=None, - trainable=True, - num_oov_buckets=None, - oov_word_id=None, - name=None, - num_partitions=1, - partition_axis=0, - weight_regularizer=None, - dtype=None, - use_placeholder=True, - checkpoint_dir=None, - convert_to_lowercase=True, - **kwargs, - ): - if dtype is None: - # prevents a bug where the parent class defaults to the type of the first input tensor. - dtype = tf.float32 - super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. - - is_constant_init = isinstance(weight_initializer, tf.keras.initializers.Constant) - if use_placeholder and (not is_constant_init) and (weight_initializer is not None): - raise ValueError("Weight initializer should be a `Constant` or `None`.") - - if weight_initializer is None: - self.weight_initializer = tf.zeros_initializer() - else: - self.weight_initializer = weight_initializer - self.use_placeholder = use_placeholder - self.checkpoint_dir = checkpoint_dir - self.convert_to_lowercase = convert_to_lowercase - - self.vocab_initializer = vocab_initializer - self.vocab_size = vocab_size - self.output_size = output_size - self.num_partitions = num_partitions - self.partition_axis = partition_axis - self.weight_regularizer = weight_regularizer - self.trainable = trainable - self.oov_word_id = oov_word_id - self.num_oov_buckets = num_oov_buckets - - if self.oov_word_id is not None and self.num_oov_buckets is not None: - raise ValueError("At most one of oov_word_id or num_oov_buckets should be specified") - elif self.oov_word_id is None and self.num_oov_buckets is None: - self.oov_word_id = OOV_WORD_ID # use the default OOV word id - - if partition_axis != 0: - raise NotImplementedError("embedding_lookup only supports partition_axis = 0") - - def build(self, input_shapes): +def add_parser_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """ - creates the ``vocab`` and ``weight`` Variables - of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively. 
- """ - partitioner = None - - additional_buckets_for_oov = self.num_oov_buckets if self.num_oov_buckets is not None else 0 - shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] - - if self.use_placeholder: - embedding_weight_initializer = twml.contrib.initializers.PlaceholderInitializer( - shape, self.dtype - ) - tf.add_to_collection( - twml.contrib.initializers.TWML_INIT_FEED_KEY, - {embedding_weight_initializer.value: self.weight_initializer.value}, - ) - else: - embedding_weight_initializer = self.weight_initializer - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self.vocab = self.add_variable( - 'vocab', - initializer=self.vocab_initializer, - shape=[self.vocab_size], - dtype=tf.string, - trainable=False, - ) + Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. + These can be used to call an initializer loader function like + the ``load_initializers_from_csv`` function. - self.weight = self.add_variable( - 'weight', - initializer=None if self.checkpoint_dir is not None else embedding_weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=self.trainable, - partitioner=partitioner, - ) - if self.checkpoint_dir is not None: - twml.trainers.trainer.init_from_checkpoint(self.checkpoint_dir, {'weight': self.weight.name}) - - self.built = True - - def call( - self, inputs, debug=False, oov_summaries=False, **kwargs - ): # pylint: disable=unused-argument - """Converts word strings to word ids using the vocabulary lookup table. - Then converts the word ids to their commensurate embedding vector. - - Arguments: - inputs: - A tensor of word strings. Typically, of size batch_size x seq_len. - debug: - When True, prints the input strings and their commensurate input_ids. - Defaults to False. - oov_summaries: - When True, log the out-of-vocabulary (OOV) rate to TensorBoard - Defaults to False. + Args: + parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser Returns: - The mapping of input word strings to output embedding vectors. - Given an input of shape ``batch_size x seq_len``, the output has shape - ``batch_size x seq_len x embedding_size``. 
+ argparse.ArgumentParser instance with discretizer-specific arguments added """ - if self.convert_to_lowercase: - inputs = tf.strings.lower(inputs) - if self.num_oov_buckets is None: - lookup_table = index_table_from_tensor(self.vocab, default_value=self.oov_word_id) - else: - lookup_table = index_table_from_tensor(self.vocab, num_oov_buckets=self.num_oov_buckets) - input_ids = lookup_table.lookup(inputs) - - if oov_summaries: - oov_count = tf.reduce_sum( - tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) - ) - valid_count = tf.reduce_sum( - tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) - ) - oov_rate = oov_count / valid_count - tf.summary.scalar('OOV_rate', oov_rate) - - if debug: - - def print_debug(): - return tf.print("input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140) - - with tf.control_dependencies([twml.util.do_every_n_steps(print_debug, 1000)]): - input_ids = tf.identity(input_ids) - - output_embeddings = tf.nn.embedding_lookup( - params=self.weight, ids=input_ids, partition_strategy='div' + + parser.add_argument( + "--embedding.path", + "--embedding_path", + dest="embedding_path", + type=str, + default=None, + help="When specified, loads glove embeddings from .txt glove file", + ) + parser.add_argument( + "--embedding.vocab_size", + "--embedding_vocab_size", + dest="embedding_vocab_size", + type=int, + default=-1, + help="Size of vocabulary. Uses this many of the most frequent terms. Defaults to -1 (use full vocab).", ) - output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) - output_embeddings.set_shape(output_shape) + return parser + + +class EmbeddingLookup(twml.layers.Layer): + """Layer for looking up embeddings. + Transforms a sequence of strings to a sequence of embeddings. + + Args: + vocab_size: + The number of word strings and embeddings in the vocabulary. + output_size: + Long or Integer, dimensionality of the output space. The embedding vector size. + vocab_initializer: + Initializer function for the vocabulary. Required. The initializer should + return a list of strings of size vocab_size. + weight_initializer: + Initializer function for the weight matrix of size vocab_size x output_size. + This argument defaults to zeros_initializer(). + This is valid when the EmbeddingLookup is the first layer of + parameters but should be changed otherwise. + trainable: + Boolean, if `True` adds variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + Defaults to True: trains the embeddings. + num_oov_buckets: + The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket + ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not + specified, index `OOV_WORD_ID` is used for OOV strings. + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column, does not support yet) + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + dtype: + Defaults to tf.float32. Specifies the dtype of the weights. + use_placeholder: + Defaults to True. 
+            If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`.
+            If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support.
+        checkpoint_dir:
+            Defaults to None.
+            If set to the path of a checkpoint, load embedding from the checkpoint.
+        convert_to_lowercase:
+            Defaults to True.
+            Converts all string inputs to lowercase.
+
+    Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`.
+    """
+
+    def __init__(
+        self,
+        vocab_size: int,
+        output_size: int,
+        vocab_initializer: tf.keras.initializers.Initializer,
+        weight_initializer: Optional[tf.keras.initializers.Initializer] = None,
+        trainable: bool = True,
+        num_oov_buckets: Optional[int] = None,
+        oov_word_id: Optional[int] = None,
+        name: Optional[str] = None,
+        num_partitions: int = 1,
+        partition_axis: int = 0,
+        weight_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+        dtype: tf.DType = tf.float32,
+        use_placeholder: bool = True,
+        checkpoint_dir: Optional[str] = None,
+        convert_to_lowercase: bool = True,
+        **kwargs,
+    ):
+        super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs)
+        # Weights initialization is set to 0s. This is safe for full sparse layers because
+        # you are supposed to learn your embedding from the label.
+
+        is_constant_init = isinstance(
+            weight_initializer, tf.keras.initializers.Constant
+        )
+        if (
+            use_placeholder
+            and (not is_constant_init)
+            and (weight_initializer is not None)
+        ):
+            raise ValueError("Weight initializer should be a `Constant` or `None`.")
+
+        # Fall back to zeros when no initializer is given, and keep a
+        # reference so that build() can use it.
+        if weight_initializer is None:
+            self.weight_initializer = tf.zeros_initializer()
+        else:
+            self.weight_initializer = weight_initializer
+        self.use_placeholder = use_placeholder
+        self.checkpoint_dir = checkpoint_dir
+        self.convert_to_lowercase = convert_to_lowercase
+
+        self.vocab_initializer = vocab_initializer
+        self.vocab_size = vocab_size
+        self.output_size = output_size
+        self.num_partitions = num_partitions
+        self.partition_axis = partition_axis
+        self.weight_regularizer = weight_regularizer
+        self.trainable = trainable
+        self.oov_word_id = oov_word_id
+        self.num_oov_buckets = num_oov_buckets
+
+        if self.oov_word_id is not None and self.num_oov_buckets is not None:
+            raise ValueError(
+                "At most one of oov_word_id or num_oov_buckets should be specified"
+            )
+        elif self.oov_word_id is None and self.num_oov_buckets is None:
+            self.oov_word_id = OOV_WORD_ID  # use the default OOV word id
+
+        if partition_axis != 0:
+            raise NotImplementedError(
+                "embedding_lookup only supports partition_axis = 0"
+            )
+
+    def build(self, input_shapes: tf.TensorShape) -> None:
+        """
+        creates the ``vocab`` and ``weight`` Variables
+        of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively.
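+
+        When ``use_placeholder`` is True, the embedding weights are routed
+        through a placeholder registered in the init-feed collection, so the
+        init op must be run with that feed dict (illustrative sketch)::
+
+            with tf.Session() as sess:
+                sess.run(
+                    tf.global_variables_initializer(),
+                    feed_dict=twml.contrib.initializers.get_init_feed_dict(),
+                )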
+ """ + partitioner = None + + additional_buckets_for_oov = ( + self.num_oov_buckets if self.num_oov_buckets is not None else 0 + ) + shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] + + if self.use_placeholder: + embedding_weight_initializer = ( + twml.contrib.initializers.PlaceholderInitializer(shape, self.dtype) + ) + tf.add_to_collection( + twml.contrib.initializers.TWML_INIT_FEED_KEY, + {embedding_weight_initializer.value: self.weight_initializer.value}, + ) + else: + embedding_weight_initializer = self.weight_initializer + + if self.num_partitions: + partition_axis = int(self.partition_axis) + partitioner = tf.fixed_size_partitioner( + self.num_partitions, axis=partition_axis + ) + else: + # Regular variables do not like it when you pass both constant tensors and shape + if not callable(self.weight_initializer): + shape = None + + self.vocab = self.add_variable( + "vocab", + initializer=self.vocab_initializer, + shape=[self.vocab_size], + dtype=tf.string, + trainable=False, + ) + + self.weight = self.add_variable( + "weight", + initializer=None + if self.checkpoint_dir is not None + else embedding_weight_initializer, + regularizer=self.weight_regularizer, + shape=shape, + dtype=self.dtype, + trainable=self.trainable, + partitioner=partitioner, + ) + if self.checkpoint_dir is not None: + twml.trainers.trainer.init_from_checkpoint( + self.checkpoint_dir, {"weight": self.weight.name} + ) + + self.built = True + + def call( + self, + inputs: tf.Tensor, + debug: bool = False, + oov_summaries: bool = False, + **kwargs, + ): # pylint: disable=unused-argument + """Converts word strings to word ids using the vocabulary lookup table. + Then converts the word ids to their commensurate embedding vector. + + Args: + inputs: + A tensor of word strings. Typically, of size batch_size x seq_len. + debug: + When True, prints the input strings and their commensurate input_ids. + Defaults to False. + oov_summaries: + When True, log the out-of-vocabulary (OOV) rate to TensorBoard + Defaults to False. + + Returns: + The mapping of input word strings to output embedding vectors. + Given an input of shape ``batch_size x seq_len``, the output has shape + ``batch_size x seq_len x embedding_size``. 
+ """ + if self.convert_to_lowercase: + inputs = tf.strings.lower(inputs) + if self.num_oov_buckets is None: + lookup_table = index_table_from_tensor( + self.vocab, default_value=self.oov_word_id + ) + else: + lookup_table = index_table_from_tensor( + self.vocab, num_oov_buckets=self.num_oov_buckets + ) + input_ids = lookup_table.lookup(inputs) + + if oov_summaries: + oov_count = tf.reduce_sum( + tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) + ) + valid_count = tf.reduce_sum( + tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) + ) + oov_rate = oov_count / valid_count + tf.summary.scalar("OOV_rate", oov_rate) + + if debug: + + def print_debug(): + return tf.print( + "input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140 + ) + + with tf.control_dependencies( + [twml.util.do_every_n_steps(print_debug, 1000)] + ): + input_ids = tf.identity(input_ids) + + output_embeddings = tf.nn.embedding_lookup( + params=self.weight, ids=input_ids, partition_strategy="div" + ) + + output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) + output_embeddings.set_shape(output_shape) + + return output_embeddings diff --git a/twml/twml/contrib/layers/factorization_machine.py b/twml/twml/contrib/layers/factorization_machine.py index 3b8adae42..2484d0a8d 100644 --- a/twml/twml/contrib/layers/factorization_machine.py +++ b/twml/twml/contrib/layers/factorization_machine.py @@ -3,177 +3,198 @@ Implementing factorization Layer """ -from twitter.deepbird.sparse.sparse_ops import _pad_empty_outputs +from typing import Optional import tensorflow.compat.v1 as tf +from twitter.deepbird.sparse.sparse_ops import _pad_empty_outputs + import twml from twml.layers.layer import Layer class FactorizationMachine(Layer): - """factorization machine layer class. - This layer implements the factorization machine operation. - The paper is "Factorization Machines" by Steffen Rendle. - TDD: go/tf-fm-tdd - - Arguments: - num_latent_variables: - num of latent variables - The number of parameter in this layer is num_latent_variables x n where n is number of - input features. - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - use_binary_values: - Assume all non zero values are 1. Defaults to False. 
- This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - """ - - def __init__(self, - num_latent_variables=10, - weight_initializer=None, - activation=None, - trainable=True, - name=None, - use_sparse_grads=True, - use_binary_values=False, - weight_regularizer=None, - substract_self_cross=True, - **kwargs): - super(FactorizationMachine, self).__init__(trainable=trainable, name=name, **kwargs) - - if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.num_latent_variables = num_latent_variables - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.substract_self_cross = substract_self_cross - - def build(self, input_shape): - """ - creates``weight`` Variable of shape``[input_size, num_latent_variables]``. - - """ - - shape = [input_shape[1], self.num_latent_variables] - - # There is a 2GB limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - requested_size = input_shape[1] * self.num_latent_variables * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor can not be larger than 2GB. " % - "Requested Dimensions(%d, %d) of type %s (%d bytes total)" - (input_shape[1], self.num_latent_variables, dtype.name)) - - if not callable(self.weight_initializer): - shape = None - - # dense tensor - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - ) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """factorization machine layer class. + This layer implements the factorization machine operation. + The paper is "Factorization Machines" by Steffen Rendle. + TDD: go/tf-fm-tdd Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - + num_latent_variables: + num of latent variables + The number of parameter in this layer is num_latent_variables x n where n is number of + input features. + weight_initializer: + Initializer function for the weight matrix. + This argument defaults to zeros_initializer(). + This is valid when the FullSparse is the first layer of + parameters but should be changed otherwise. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activation: + Activation function (callable). Set it to None to maintain a linear activation. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). 
If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + This parameter can also be a list of binary values if `inputs` passed to `call` a list. """ - raise NotImplementedError - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A SparseTensor - Returns: - - If `inputs` is `SparseTensor`, then returns a number with cross info - """ - # The following are given: - # - inputs is a sparse tensor, we call it sp_x. - # - The dense_v tensor is a dense matrix, whose row i - # corresponds to the vector V_i. - # weights has shape [num_features, k] - sp_x = inputs - if isinstance(inputs, twml.SparseTensor): - sp_x = inputs.to_tf() - elif not isinstance(sp_x, tf.SparseTensor): - raise TypeError("The sp_x must be of type tf.SparseTensor or twml.SparseTensor") - - indices = sp_x.indices[:, 1] - batch_ids = sp_x.indices[:, 0] - values = tf.reshape(sp_x.values, [-1, 1], name=self.name) - if self.use_sparse_grads: - v = tf.nn.embedding_lookup(self.weight, indices) - # if (self.use_binary_values): - # values = tf.ones(tf.shape(values), dtype=values.dtype) - v_times_x = v * values - # First term: Sum_k [Sum_i (v_ik * x_i)]^2 - all_crosses = tf.segment_sum(v_times_x, batch_ids, name=self.name) - all_crosses_squared = tf.reduce_sum((all_crosses * all_crosses), 1) - - if self.substract_self_cross: - # Second term: Sum_k Sum_i [ (v_ik * x_i)^2 ] - v_times_x_2 = v_times_x**2 - self_crosses = tf.reduce_sum(tf.segment_sum(v_times_x_2, batch_ids, name=self.name), 1) - outputs = all_crosses_squared - self_crosses - else: - outputs = all_crosses_squared - else: - # need to check if prediction is faster with code below - crossTerm = tf.reduce_sum((tf.sparse_tensor_dense_matmul(sp_x, self.weight)**2), 1) - - if self.substract_self_cross: - # compute self-cross term - self_crossTerm = tf.reduce_sum(tf.segment_sum((tf.gather(self.weight, indices) * values)**2, batch_ids), 1) - outputs = crossTerm - self_crossTerm - else: - outputs = crossTerm - - if self.activation is not None: - outputs = self.activation(outputs) - - outputs = tf.reshape(outputs, [-1, 1], name=self.name) - outputs = _pad_empty_outputs(outputs, tf.cast(sp_x.dense_shape[0], tf.int32)) - # set more explicit and static shape to avoid shape inference error - # valueError: The last dimension of the inputs to `Dense` should be defined. 
Found `None`
-    outputs.set_shape([None, 1])
-    return outputs
+    def __init__(
+        self,
+        num_latent_variables: int = 10,
+        weight_initializer: tf.keras.initializers.Initializer = tf.zeros_initializer(),
+        activation: Optional[tf.keras.activations.Activation] = None,
+        trainable: bool = True,
+        name: Optional[str] = None,
+        use_sparse_grads: bool = True,
+        use_binary_values: bool = False,
+        weight_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+        substract_self_cross: bool = True,
+        **kwargs
+    ):
+        super(FactorizationMachine, self).__init__(
+            trainable=trainable,
+            name=name,
+            **kwargs,
+        )
+
+        self.weight_initializer = weight_initializer
+        self.num_latent_variables = num_latent_variables
+        self.activation = activation
+        self.use_sparse_grads = use_sparse_grads
+        self.use_binary_values = use_binary_values
+        self.weight_regularizer = weight_regularizer
+        self.substract_self_cross = substract_self_cross
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        """creates `weight` Variable of shape `[input_size, num_latent_variables]`."""
+
+        shape = [input_shape[1], self.num_latent_variables]
+
+        # There is a 2GB limitation for each tensor because of protobuf.
+        # 2**30 is 1GB. 2 * (2**30) is 2GB.
+        dtype = tf.as_dtype(self.dtype)
+        requested_size = input_shape[1] * self.num_latent_variables * dtype.size
+        if requested_size >= (1 << 31):
+            raise ValueError(
+                "Weight tensor can not be larger than 2GB. "
+                "Requested Dimensions(%d, %d) of type %s (%d bytes total)"
+                % (
+                    input_shape[1],
+                    self.num_latent_variables,
+                    dtype.name,
+                    requested_size,
+                )
+            )
+
+        if not callable(self.weight_initializer):
+            shape = None
+
+        # dense tensor
+        self.weight = self.add_variable(
+            "weight",
+            initializer=self.weight_initializer,
+            regularizer=self.weight_regularizer,
+            shape=shape,
+            dtype=self.dtype,
+            trainable=True,
+        )
+
+        self.built = True
+
+    def compute_output_shape(
+        self, input_shape: tf.TensorShape
+    ):  # pylint: disable=unused-argument
+        """Computes the output shape of the layer given the input shape.
+
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
+
+        Raises NotImplementedError.
+        """
+        raise NotImplementedError
+
+    def call(
+        self, inputs: tf.SparseTensor, **kwargs
+    ):  # pylint: disable=unused-argument
+        """The logic of the layer lives here.
+
+        Args:
+            inputs:
+                A SparseTensor
+
+        Returns:
+            - If `inputs` is `SparseTensor`, then returns a number with cross info
+        """
+        # The following are given:
+        # - inputs is a sparse tensor, we call it sp_x.
+        # - The dense_v tensor is a dense matrix, whose row i
+        #   corresponds to the vector V_i.
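+        # Cross terms follow the standard FM identity
+        #   sum_{i<j} <v_i, v_j> x_i x_j
+        #     = 1/2 * sum_k [ (sum_i v_ik x_i)^2 - sum_i (v_ik x_i)^2 ],
+        # i.e. "square of sums minus sum of squares" per latent dimension k.
+        # Note: this implementation returns the bracketed difference without
+        # the conventional 1/2 factor.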
+ # weights has shape [num_features, k] + sp_x = inputs + if isinstance(inputs, twml.SparseTensor): + sp_x = inputs.to_tf() + elif not isinstance(sp_x, tf.SparseTensor): + raise TypeError( + "The sp_x must be of type tf.SparseTensor or twml.SparseTensor" + ) + + indices = sp_x.indices[:, 1] + batch_ids = sp_x.indices[:, 0] + values = tf.reshape(sp_x.values, [-1, 1], name=self.name) + if self.use_sparse_grads: + v = tf.nn.embedding_lookup(self.weight, indices) + # if (self.use_binary_values): + # values = tf.ones(tf.shape(values), dtype=values.dtype) + v_times_x = v * values + # First term: Sum_k [Sum_i (v_ik * x_i)]^2 + all_crosses = tf.segment_sum(v_times_x, batch_ids, name=self.name) + all_crosses_squared = tf.reduce_sum((all_crosses * all_crosses), 1) + + if self.substract_self_cross: + # Second term: Sum_k Sum_i [ (v_ik * x_i)^2 ] + v_times_x_2 = v_times_x**2 + self_crosses = tf.reduce_sum( + tf.segment_sum(v_times_x_2, batch_ids, name=self.name), 1 + ) + outputs = all_crosses_squared - self_crosses + else: + outputs = all_crosses_squared + else: + # need to check if prediction is faster with code below + crossTerm = tf.reduce_sum( + (tf.sparse_tensor_dense_matmul(sp_x, self.weight) ** 2), 1 + ) + + if self.substract_self_cross: + # compute self-cross term + self_crossTerm = tf.reduce_sum( + tf.segment_sum( + (tf.gather(self.weight, indices) * values) ** 2, batch_ids + ), + 1, + ) + outputs = crossTerm - self_crossTerm + else: + outputs = crossTerm + + if self.activation is not None: + outputs = self.activation(outputs) + + outputs = tf.reshape(outputs, [-1, 1], name=self.name) + outputs = _pad_empty_outputs(outputs, tf.cast(sp_x.dense_shape[0], tf.int32)) + # set more explicit and static shape to avoid shape inference error + # valueError: The last dimension of the inputs to `Dense` should be defined. Found `None` + outputs.set_shape([None, 1]) + return outputs diff --git a/twml/twml/contrib/layers/full_dense.py b/twml/twml/contrib/layers/full_dense.py index ad78a91a4..63990ba4e 100644 --- a/twml/twml/contrib/layers/full_dense.py +++ b/twml/twml/contrib/layers/full_dense.py @@ -2,379 +2,402 @@ """ Implementing Full Dense Layer """ -from twml.layers import Layer +from typing import List, Optional, Tuple, Union import tensorflow.compat.v1 as tf +from tensorflow import keras from tensorflow.python.layers import core +from twml.layers import Layer + class FullDense(Layer): - """ - Full-connected, Dense input layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. 
- weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: - Activation function (callable). Set it to None to maintain a linear activation. - use_bias: - Boolean whether to include a bias parameter in the layer - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weights: - list of underlying weight and bias matrix components. no guarantee on order of elements - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. 
- """ - - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - **kwargs): - super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) - self._output_sizes = self._get_output_partition_sizes(output_size, num_partitions) - self._units = output_size - self._activation = activation - self._weight_initializer = weight_initializer - self._bias_initializer = bias_initializer - self._weight_regularizer = weight_regularizer - self._bias_regularizer = bias_regularizer - self._weight_constraint = weight_constraint - self._bias_constraint = bias_constraint - self._use_bias = use_bias - # NOTE - many initializers depend on fan_in and fan_out - # - as such, initialization here may be different than - # - for a non-partitioned FullDense - self._parts = [core.Dense(units=out_size, - activation=activation, - use_bias=use_bias, - kernel_initializer=weight_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - **kwargs) for out_size in self._output_sizes] - - @staticmethod - def _get_output_partition_sizes(out_size, num_parts): - """ Returns the appropriate output sizes of the partitions """ - boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] - return [k - j for j, k in zip(boundaries[:], boundaries[1:])] - - def build(self, input_shapes): - """ Create the appropriately sized weights and biases in each layer partition """ - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." 
% input_shapes)
-    else:
-      input_shape = input_shapes
-
-    for part in self._parts:
-      part.build(input_shape)
-
-    self.built = True
-
-  @property
-  def units(self):
-    """ Returns the number of output units of the layer """
-    return self._units
-
-  @property
-  def output_size(self):
-    """ Returns the number of output units of the layer """
-    return self._units
-
-  @property
-  def activation(self):
-    """ Returns the activation function """
-    return self._activation
-
-  @property
-  def weight_initializer(self):
-    """ Returns the weight_initializer """
-    return self._weight_initializer
-
-  @property
-  def weight_regularizer(self):
-    """ Returns the weight_regularizer """
-    return self._weight_regularizer
-
-  @property
-  def weight_constraint(self):
-    """ Returns the weight_constraint """
-    return self._weight_constraint
-
-  @property
-  def bias_initializer(self):
-    """ Returns the bias_initializer """
-    return self._bias_initializer
-
-  @property
-  def bias_regularizer(self):
-    """ Returns the bias_regularizer """
-    return self._bias_regularizer
-
-  @property
-  def bias_constraint(self):
-    """ Returns the bias_constraint """
-    return self._bias_constraint
-
-  @property
-  def use_bias(self):
-    """ Returns whether a bias is used in the layer """
-    return self._use_bias
-
-  @property
-  def trainable_variables(self):
-    """ Returns the trainable variables of the layer """
-    trainable_vars = []
-    for pt in self._parts:
-      trainable_vars += pt.trainable_variables
-    return trainable_vars
-
-  @property
-  def trainable_weights(self):
-    """ Returns the trainable variables of the layer """
-    return self.trainable_variables
-
-  @property
-  def non_trainable_variables(self):
-    """ Returns the non-trainable variables of the layer """
-    non_trainable_vars = []
-    for pt in self._parts:
-      non_trainable_vars += pt.non_trainable_variables
-    return non_trainable_vars
-
-  @property
-  def non_trainable_weights(self):
-    """ Returns the non-trainable variables of the layer """
-    return self.non_trainable_variables
-
-  @property
-  def variables(self):
-    """ Returns a list of all weights and biases in this layer """
-    layer_vars = []
-    for pt in self._parts:
-      layer_vars += pt.weights
-    return layer_vars
-
-  @property
-  def weights(self):
-    """ Returns a list of all weights and biases in this layer """
-    return self.variables
-
-  @property
-  def dtype(self):
-    """ Returns the dtype of the layers weights """
-    return self._parts[0].dtype
-
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """The logic of the layer lives here.
-
-    Arguments:
-      inputs:
-        A dense Tensor or a list of such.
-        If `inputs` is a list, all tensors must have same `dense_shape`.
+    """
+    Fully-connected, Dense input layer class.
+    This layer implements the operation:
+
+    .. code-block:: python
+
+        outputs = activation(inputs.weight + bias)
+
+    Where ``activation`` is the activation function passed as the ``activation``
+    argument (if not ``None``), ``weight`` is a weights matrix created by the layer,
+    and ``bias`` is a bias vector created by the layer.
+
+    However, this layer breaks up ``weight`` into ``num_partitions`` parts,
+    for the purpose of even distribution of weights across parameter servers
+    for distributed training.
+
+    Note - This layer is created to allow distributed training optimizations,
+    but can also be used for single node training (e.g. hogwild) without
+    code modification
+
+    Args:
+        output_size:
+            Integer or Long, dimensionality of the output space.
+        weight_initializer:
+            Initializer function for the weight matrix.
+        weight_regularizer:
+            Regularizer function for the weight matrix.
+            Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect.
+        weight_constraint:
+            An optional projection function to be applied to the
+            weight after being updated by an `Optimizer` (e.g. used to implement
+            norm constraints or value constraints for layer weights). The function
+            must take as input the unprojected variable and must return the
+            projected variable (which must have the same shape). Constraints are
+            not safe to use when doing asynchronous distributed training.
+        bias_constraint:
+            An optional projection function to be applied to the
+            bias after being updated by an `Optimizer`.
+        num_partitions:
+            Number of pieces to partition the weights into. This layer does
+            column partitioning of the weights, which is equivalent to
+            processing the input tensor with multiple fully connected layers
+            of smaller output size, and then concatenating these outputs
+            (see the example at the end of this docstring).
+        activation:
+            Activation function (callable). Set it to None to maintain a linear activation.
+        use_bias:
+            Boolean whether to include a bias parameter in the layer
+        bias_initializer:
+            Initializer function for the bias.
+        bias_regularizer:
+            Regularizer function for the bias.
+            Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect.
+        activity_regularizer:
+            Regularizer function for the output.
+        trainable:
+            Boolean, if `True` also add variables to the graph collection
+            ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable
+            `_).
+        name:
+            String, the name of the layer. Layers with the same name will
+            share weights, but to avoid mistakes we require ``reuse=True`` in such cases.
+
+    Properties:
+        output_size:
+            Python integer, dimensionality of the output space.
+        activation:
+            Activation function (callable).
+        weight_initializer:
+            Initializer instance (or name) for the weight matrix.
+        bias_initializer:
+            Initializer instance (or name) for the bias.
+        weights:
+            list of underlying weight and bias matrix components. no guarantee on order of elements
+        weight_regularizer:
+            Regularizer instance for the weight matrix (callable)
+        bias_regularizer:
+            Regularizer instance for the bias (callable).
+        activity_regularizer:
+            Regularizer instance for the output (callable)
+        weight_constraint:
+            Constraint function for the weight matrix.
+        bias_constraint:
+            Constraint function for the bias.
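+
+    Example (illustrative sketch): with ``output_size=10`` and
+    ``num_partitions=3`` the weight columns are split 3/3/4, so the layer
+    behaves like three smaller Dense layers whose outputs are concatenated::
+
+        layer = FullDense(output_size=10, num_partitions=3, activation=tf.nn.relu)
+        outputs = layer(inputs)  # same shape as a single Dense(10) output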
+ """ + + def __init__( + self, + output_size: int, + weight_initializer: Optional[keras.initializers.Initializer] = None, + weight_regularizer: Optional[keras.regularizers.Regularizer] = None, + weight_constraint: Optional[keras.constraints.Constraint] = None, + bias_constraint: Optional[keras.constraints.Constraint] = None, + num_partitions: int = 3, + activation: Optional[tf.keras.activations.Activation] = None, + use_bias: bool = True, + bias_initializer: keras.initializers.Initializer = tf.zeros_initializer(), + bias_regularizer: Optional[keras.regularizers.Regularizer] = None, + activity_regularizer: Optional[keras.regularizers.Regularizer] = None, + trainable: bool = True, + name: Optional[str] = None, + **kwargs + ): + super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) + self._output_sizes = self._get_output_partition_sizes( + output_size, num_partitions + ) + self._units = output_size + self._activation = activation + self._weight_initializer = weight_initializer + self._bias_initializer = bias_initializer + self._weight_regularizer = weight_regularizer + self._bias_regularizer = bias_regularizer + self._weight_constraint = weight_constraint + self._bias_constraint = bias_constraint + self._use_bias = use_bias + # NOTE - many initializers depend on fan_in and fan_out + # - as such, initialization here may be different than + # - for a non-partitioned FullDense + self._parts = [ + core.Dense( + units=out_size, + activation=activation, + use_bias=use_bias, + kernel_initializer=weight_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=weight_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + for out_size in self._output_sizes + ] + + @staticmethod + def _get_output_partition_sizes(out_size: int, num_parts: int) -> List[int]: + """Returns the appropriate output sizes of the partitions""" + boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] + return [k - j for j, k in zip(boundaries[:], boundaries[1:])] + + def build(self, input_shapes: Union[tf.TensorShape, List[tf.TensorShape]]): + """Create the appropriately sized weights and biases in each layer partition""" + if isinstance(input_shapes, (list, tuple)): + input_shape = input_shapes[0] + is_compatible = True + for other_shape in input_shapes[1:]: + is_compatible &= input_shape.is_compatible_with(other_shape) + if not is_compatible: + raise ValueError("Input shapes %s are not compatible." 
% input_shapes)
+        else:
+            input_shape = input_shapes
+
+        for part in self._parts:
+            part.build(input_shape)
+
+        self.built = True
+
+    @property
+    def units(self) -> int:
+        """Returns the number of output units of the layer"""
+        return self._units
+
+    @property
+    def output_size(self) -> int:
+        """Returns the number of output units of the layer"""
+        return self._units
+
+    @property
+    def activation(self) -> Optional[tf.keras.activations.Activation]:
+        """Returns the activation function"""
+        return self._activation
+
+    @property
+    def weight_initializer(self) -> Optional[keras.initializers.Initializer]:
+        """Returns the weight_initializer"""
+        return self._weight_initializer
+
+    @property
+    def weight_regularizer(self) -> Optional[keras.regularizers.Regularizer]:
+        """Returns the weight_regularizer"""
+        return self._weight_regularizer
+
+    @property
+    def weight_constraint(self) -> Optional[keras.constraints.Constraint]:
+        """Returns the weight_constraint"""
+        return self._weight_constraint
+
+    @property
+    def bias_initializer(self) -> Optional[keras.initializers.Initializer]:
+        """Returns the bias_initializer"""
+        return self._bias_initializer
+
+    @property
+    def bias_regularizer(self) -> Optional[keras.regularizers.Regularizer]:
+        """Returns the bias_regularizer"""
+        return self._bias_regularizer
+
+    @property
+    def bias_constraint(self) -> Optional[keras.constraints.Constraint]:
+        """Returns the bias_constraint"""
+        return self._bias_constraint
+
+    @property
+    def use_bias(self) -> bool:
+        """Returns whether a bias is used in the layer"""
+        return self._use_bias
+
+    @property
+    def trainable_variables(self) -> List[tf.Variable]:
+        """Returns the trainable variables of the layer"""
+        trainable_vars = []
+        for pt in self._parts:
+            trainable_vars += pt.trainable_variables
+        return trainable_vars
+
+    @property
+    def trainable_weights(self) -> List[tf.Variable]:
+        """Returns the trainable variables of the layer"""
+        return self.trainable_variables
+
+    @property
+    def non_trainable_variables(self) -> List[tf.Variable]:
+        """Returns the non-trainable variables of the layer"""
+        non_trainable_vars = []
+        for pt in self._parts:
+            non_trainable_vars += pt.non_trainable_variables
+        return non_trainable_vars
+
+    @property
+    def non_trainable_weights(self) -> List[tf.Variable]:
+        """Returns the non-trainable variables of the layer"""
+        return self.non_trainable_variables
+
+    @property
+    def variables(self) -> List[tf.Variable]:
+        """Returns a list of all weights and biases in this layer"""
+        layer_vars = []
+        for pt in self._parts:
+            layer_vars += pt.weights
+        return layer_vars
+
+    @property
+    def weights(self) -> List[tf.Variable]:
+        """Returns a list of all weights and biases in this layer"""
+        return self.variables
+
+    @property
+    def dtype(self) -> tf.DType:
+        """Returns the dtype of the layer's weights"""
+        return self._parts[0].dtype
+
+    def call(
+        self,
+        inputs: Union[
+            tf.SparseTensor, Union[List[tf.SparseTensor], Tuple[tf.SparseTensor]]
+        ],
+        **kwargs
+    ):  # pylint: disable=unused-argument
+        """The logic of the layer lives here.
+
+        Args:
+            inputs:
+                A dense Tensor or a list of such.
+                If `inputs` is a list, all tensors must have same `dense_shape`.
+
+        Returns:
+            - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`.
+            - If `inputs` is a `list[SparseTensor]`, then returns
+              `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`.
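The call body that follows concatenates each partition's output and then sums across the input list. A small standalone numpy sketch of that combine step (illustrative only; shapes and values are hypothetical):

import numpy as np

# Hypothetical partition outputs: two inputs, three partitions of sizes 3+3+4.
parts_per_input = [
    [np.ones((2, 3)), np.ones((2, 3)), np.ones((2, 4))],    # input 0
    [np.zeros((2, 3)), np.zeros((2, 3)), np.zeros((2, 4))],  # input 1
]
# Per input: concatenate partition outputs along the last axis -> (2, 10).
outputs = [np.concatenate(parts, axis=-1) for parts in parts_per_input]
# Across inputs: elementwise sum, the role tf.accumulate_n plays in call().
combined = sum(outputs)
assert combined.shape == (2, 10)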
+ """ + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + outputs = [] + for inp in inputs: + part_outputs = [part(inp) for part in self._parts] + outputs.append(tf.concat(part_outputs, axis=-1)) + + return tf.accumulate_n(outputs) + + +def full_dense( + inputs: tf.Tensor, + output_size: int, + weight_initializer: Optional[keras.initializers.Initializer] = None, + weight_regularizer: Optional[keras.regularizers.Regularizer] = None, + weight_constraint: Optional[keras.constraints.Constraint] = None, + bias_constraint: Optional[keras.constraints.Constraint] = None, + num_partitions: int = 3, + activation: Optional[tf.keras.activations.Activation] = None, + use_bias: bool = True, + bias_initializer: keras.initializers.Initializer = tf.zeros_initializer(), + bias_regularizer: Optional[keras.regularizers.Regularizer] = None, + activity_regularizer: Optional[keras.regularizers.Regularizer] = None, + trainable: bool = True, + name: Optional[str] = None, + reuse: Optional[bool] = None, + **kwargs +): + """Functional interface for the fully-connected dense-input layer. + This layer implements the operation: + `outputs = activation(inputs.weight + bias)` + Where `activation` is the activation function passed as the `activation` + argument (if not `None`), `weight` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). + + However, this layer breaks up ``weight`` into ``num_partitions`` parts, + for the purpose of even disribution of weights across parameter servers + for distributed training. + + Note - This layer is created to allow distributed training optimizations, + but can also be used for single node training (e.g. hogwild) without + code modification + + Args: + inputs: Tensor input. + output_size: Integer or Long, dimensionality of the output space. + weight_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.get_variable`. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + num_partitions: + Number of pieces to partition the weights into. This layer does + column partitioning of the weights, which is equivalent to + processing the input tensor with multiple fully connected layers + of smaller output size, and then concatenating these outputs + activation: Activation function (callable). Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + bias_initializer: + Initializer function for the bias. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. 
+ trainable: + Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: + String, the name of the layer. + reuse: + Boolean, whether to reuse the weights of a previous layer + by the same name. Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`. + Output tensor with shape `inputs.shape[:-1] + [output_size]`. """ if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - outputs = [] - for inp in inputs: - part_outputs = [part(inp) for part in self._parts] - outputs.append(tf.concat(part_outputs, axis=-1)) - - return tf.accumulate_n(outputs) - - -def full_dense(inputs, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - reuse=None, - **kwargs): - """Functional interface for the fully-connected dense-input layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - inputs: Tensor input. - output_size: Integer or Long, dimensionality of the output space. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. 
- trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor with shape `inputs.shape[:-1] + [output_size]`. - """ - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - dtype = inputs[0].dtype.base_dtype - - layer = FullDense(output_size=output_size, - weight_initializer=weight_initializer, - weight_regularizer=weight_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - num_partitions=num_partitions, - activation=activation, - use_bias=use_bias, - bias_initializer=bias_initializer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - trainable=trainable, - name=name, - dtype=dtype, - _scope=name, - _reuse=reuse, - **kwargs) - - return layer(inputs) + inputs = [inputs] + + dtype = inputs[0].dtype.base_dtype + + layer = FullDense( + output_size=output_size, + weight_initializer=weight_initializer, + weight_regularizer=weight_regularizer, + weight_constraint=weight_constraint, + bias_constraint=bias_constraint, + num_partitions=num_partitions, + activation=activation, + use_bias=use_bias, + bias_initializer=bias_initializer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + dtype=dtype, + _scope=name, + _reuse=reuse, + **kwargs + ) + + return layer(inputs) diff --git a/twml/twml/contrib/layers/hashed_percentile_discretizer.py b/twml/twml/contrib/layers/hashed_percentile_discretizer.py index b32c3be8d..3dc99a6e1 100644 --- a/twml/twml/contrib/layers/hashed_percentile_discretizer.py +++ b/twml/twml/contrib/layers/hashed_percentile_discretizer.py @@ -4,14 +4,16 @@ """ -from twitter.deepbird.util.hashing import ( - integer_multiplicative_hashing_uniform, - integer_multiplicative_hashing, -) # noqa: F401 +from typing import Callable, Optional -from libtwml import percentile_discretizer_bin_indices import numpy as np import tensorflow.compat.v1 as tf +from libtwml import percentile_discretizer_bin_indices +from twitter.deepbird.util.hashing import ( # noqa: F401 + integer_multiplicative_hashing, + integer_multiplicative_hashing_uniform, +) + import twml from twml.layers.layer import Layer from twml.layers.partition import Partition @@ -19,199 +21,219 @@ class HashedPercentileDiscretizer(Layer): - """ - HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator - after accumulating data - and performing minimum description length (PercentileDiscretizer) calibration. - - HashedPercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an HashedPercentileDiscretizer - bin. - Each HashedPercentileDiscretizer input feature is converted to n_bin bins. - Each HashedPercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values - per bin is roughly equal (for each given HashedPercentileDiscretizer feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - The difference between this layer and PercentileDiscretizer is that the - DeterministicPercentileDiscretize always assigns the same output id in the SparseTensor to the - same input feature id + bin. 
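Stepping back to FullDense._get_output_partition_sizes above: a minimal standalone sketch of the same arithmetic (re-stated here for illustration), showing that the parts are near-equal and always sum to output_size:

def get_output_partition_sizes(out_size, num_parts):
    # Integer boundaries 0 <= b_0 <= ... <= b_num_parts == out_size.
    boundaries = [out_size * n // num_parts for n in range(num_parts + 1)]
    return [k - j for j, k in zip(boundaries, boundaries[1:])]

assert get_output_partition_sizes(100, 3) == [33, 33, 34]  # sums to 100
assert get_output_partition_sizes(7, 3) == [2, 2, 3]       # sums to 7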
This is useful if you want to user transfer learning on pre-trained
-  sparse to dense embedding layers, but re-calibrate your discretizer on newer data.
-  """
-
-  def __init__(self, n_feature, n_bin, out_bits,
-               bin_values=None, hash_keys=None, hash_values=None,
-               bin_ids=None, feature_offsets=None,
-               hash_fn=integer_multiplicative_hashing_uniform, **kwargs):
-    """
-    Creates a non-initialized `HashedPercentileDiscretizer` object.
-    Before using the table you will have to initialize it. After initialization
-    the table will be immutable.
-
-    Parent class args:
-      see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
-      for documentation of parent class arguments.
-
-    Required args:
-      n_feature:
-        number of unique features accumulated during HashedPercentileDiscretizer calibration.
-        This is the number of features in the hash map.
-        Used to initialize bin_values, hash_keys, hash_values,
-        bin_ids, bin_values and feature_offsets.
-      n_bin:
-        number of HashedPercentileDiscretizer bins used for
-        HashedPercentileDiscretizer calibration. Used to initialize bin_values, hash_keys,
-        hash_values, bin_ids, bin_values and feature_offsets.
-      out_bits:
-        Determines the maximum value for output feature IDs.
-        The dense_shape of the SparseTensor returned by lookup(x)
-        will be [x.shape[0], 1 << output_bits].
-
-    Optional args:
-      hash_keys:
-        contains the features ID that HashedPercentileDiscretizer discretizes and knows
-        about. The hash map (hash_keys->hash_values) is used for two reasons:
-        1. divide inputs into two feature spaces:
-           HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer
-        2. transate the HashedPercentileDiscretizer features into a hash_feature ID that
-           HashedPercentileDiscretizer understands.
-        The hash_map is expected to contain n_feature items.
-      hash_values:
-        translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer.
-      bin_ids:
-        a 1D Tensor of size n_feature * n_bin + 1 which contains
-        unique IDs to which the HashedPercentileDiscretizer features will be translated to.
-        For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
-        the most efficient output space.
-      bin_values:
-        a 1D Tensor aligned with bin_ids.
-        For a given hash_feature ID j, it's value bin's are indexed between
-        `j*n_bin` and `j*n_bin + n_bin-1`.
-        As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
-        and a inputs value between
-        `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
-      feature_offsets:
-        a 1D Tensor specifying the starting location of bins for a given feature id.
-        For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
-      hash_fn:
-        a function that takes in `feature_ids`, `bucket_indices` and `output_size` and
-        hashes the bucketed features into the `output_size` buckets. The default uses knuth's
-        multiplicative hashing
+    HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator
+    after accumulating data
+    and performing minimum description length (PercentileDiscretizer) calibration.
+
+    HashedPercentileDiscretizer takes sparse continuous features and converts them to sparse
+    binary features. Each binary output feature is associated with a HashedPercentileDiscretizer
+    bin.
+    Each HashedPercentileDiscretizer input feature is converted to n_bin bins.
+    Each HashedPercentileDiscretizer calibration tries to find bin delimiters such
+    that the number of feature values
+    per bin is roughly equal (for each given HashedPercentileDiscretizer feature).
+    Note that if an input feature is rarely used, its associated output bins/features will be too.
+    The difference between this layer and PercentileDiscretizer is that the
+    DeterministicPercentileDiscretize always assigns the same output id in the SparseTensor to the
+    same input feature id + bin. This is useful if you want to use transfer learning on pre-trained
+    sparse to dense embedding layers, but re-calibrate your discretizer on newer data.
+    """
-    super(HashedPercentileDiscretizer, self).__init__(**kwargs)
-
-    max_discretizer_feature = n_feature * (n_bin + 1)
-    self._n_feature = n_feature
-    self._n_bin = n_bin
-
-    if not self.built:
-      self.build(input_shape=None)
-
-    # build variables
-    self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
-    self._out_bits = out_bits
-
-    hash_keys = hash_keys
-    if hash_keys is None:
-      hash_keys = np.empty(n_feature, dtype=np.int64)
-
-    hash_values = hash_values
-    if hash_values is None:
-      hash_values = np.empty(n_feature, dtype=np.int64)
-    initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values)
-    self.hash_map = tf.lookup.StaticHashTable(initializer, -1)
-    self.bin_ids = bin_ids
-    if bin_ids is None:
-      bin_ids = np.empty(max_discretizer_feature, dtype=np.int64)
-
-    self.bin_values = bin_values
-    if bin_values is None:
-      bin_values = np.empty(max_discretizer_feature, dtype=np.float32)
-
-    self.feature_offsets = feature_offsets
-    if feature_offsets is None:
-      feature_offsets = np.empty(n_feature, dtype=np.int64)
-
-    self.hash_fn = hash_fn
-
-  def build(self, input_shape):  # pylint: disable=unused-argument
-    """
-    Creates the variables of the layer:
-    hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size.
-    """
-    # build layers
-    self.partition = Partition()
-    self.stitch = Stitch()
-    # make sure this is last
-    self.built = True
-
-  def call(self, inputs, **kwargs):
-    """Looks up `keys` in a table, outputs the corresponding values.
-
-    Implements HashedPercentileDiscretizer inference where inputs are intersected with a
-    hash_map.
-    Part of the inputs are discretized using twml.discretizer
-    to produce a discretizer_output SparseTensor.
-    This SparseTensor is then joined with the original inputs SparseTensor,
-    but only for the inputs keys that did not get discretized.
-
-    Args:
-      inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for
-        discretization. It has a dense_shape of [batch_size, input_size]
-      name: A name for the operation (optional).
-    Returns:
-      A `SparseTensor` of the same type as `inputs`.
-      Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits].
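A short note on out_bits, assuming the semantics documented above: it caps the output feature space at 1 << out_bits, so every hashed (feature id, bin index) pair must land in that range. An illustrative standalone sketch (the masking shown here is only one way to reduce a hash; the actual reduction lives in hash_fn):

out_bits = 22
output_size = 1 << out_bits  # 4194304 possible output feature IDs

# Any hash of a (feature_id, bin_index) pair is reduced into [0, output_size),
# e.g. by keeping only the low out_bits bits.
hashed = 123_456_789_123  # hypothetical raw hash value
assert 0 <= (hashed & (output_size - 1)) < output_size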
- """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - hashed_keys = self.hash_map.lookup(keys) - hashed_keys = tf.cast(hashed_keys, tf.int64) - - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - found = tf.reshape(found, [-1]) - continuous_feature_ids = tf.boolean_mask(keys, found) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_discretizer_keys, discretizer_in_keys = key - non_discretizer_vals, discretizer_in_vals = vals - - non_discretizer_keys = twml.util.limit_bits(non_discretizer_keys, self._out_bits) - self.non_discretizer_keys = non_discretizer_keys - - # run HashedPercentileDiscretizer on the keys/values it knows about - output = percentile_discretizer_bin_indices(discretizer_in_keys, - discretizer_in_vals, - self.bin_ids, - self.bin_values, - self.feature_offsets) - discretizer_bucket_idxs, discretizer_vals = output - new_discretizer_keys = self.hash_fn(continuous_feature_ids, discretizer_bucket_idxs, - self.output_size) - # Stitch the keys and values from discretizer and non discretizer indices back, with help - # of the Stitch Layer - self.discretizer_out_keys = new_discretizer_keys - - concat_data = self.stitch([non_discretizer_vals, discretizer_vals], - [non_discretizer_keys, new_discretizer_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() + def __init__( + self, + n_feature: int, + n_bin: int, + out_bits: int, + bin_values: Optional[tf.Tensor] = None, + hash_keys: Optional[tf.Tensor] = None, + hash_values: Optional[tf.Tensor] = None, + bin_ids: Optional[tf.Tensor] = None, + feature_offsets: Optional[tf.Tensor] = None, + hash_fn: Callable[ + [tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor + ] = integer_multiplicative_hashing_uniform, + **kwargs + ): + """ + Creates a non-initialized `HashedPercentileDiscretizer` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during HashedPercentileDiscretizer calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of HashedPercentileDiscretizer bins used for + HashedPercentileDiscretizer calibration. Used to initialize bin_values, hash_keys, + hash_values, bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + bin_values: + a 1D Tensor aligned with bin_ids. + For a given hash_feature ID j, it's value bin's are indexed between + `j*n_bin` and `j*n_bin + n_bin-1`. 
+                As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
+                and an input value between
+                `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
+            hash_keys:
+                contains the features ID that HashedPercentileDiscretizer discretizes and knows
+                about. The hash map (hash_keys->hash_values) is used for two reasons:
+                1. divide inputs into two feature spaces:
+                   HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer
+                2. translate the HashedPercentileDiscretizer features into a hash_feature ID that
+                   HashedPercentileDiscretizer understands.
+                The hash_map is expected to contain n_feature items.
+            hash_values:
+                translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer.
+            bin_ids:
+                a 1D Tensor of size n_feature * n_bin + 1 which contains
+                unique IDs to which the HashedPercentileDiscretizer features will be translated to.
+                For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
+                the most efficient output space.
+            feature_offsets:
+                a 1D Tensor specifying the starting location of bins for a given feature id.
+                For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
+            hash_fn:
+                a function that takes in `feature_ids`, `bucket_indices` and `output_size` and
+                hashes the bucketed features into the `output_size` buckets. The default uses
+                Knuth's multiplicative hashing.
+        """
+        super(HashedPercentileDiscretizer, self).__init__(**kwargs)
+
+        max_discretizer_feature = n_feature * (n_bin + 1)
+        self._n_feature = n_feature
+        self._n_bin = n_bin
+
+        if not self.built:
+            self.build(input_shape=None)
+
+        # build variables
+        self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
+        self._out_bits = out_bits
+
+        hash_keys = hash_keys
+        if hash_keys is None:
+            hash_keys = np.empty(n_feature, dtype=np.int64)
+
+        hash_values = hash_values
+        if hash_values is None:
+            hash_values = np.empty(n_feature, dtype=np.int64)
+
+        initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values)
+        self.hash_map = tf.lookup.StaticHashTable(initializer, -1)
+        self.bin_ids = bin_ids
+        if bin_ids is None:
+            bin_ids = np.empty(max_discretizer_feature, dtype=np.int64)
+
+        self.bin_values = bin_values
+        if bin_values is None:
+            bin_values = np.empty(max_discretizer_feature, dtype=np.float32)
+
+        self.feature_offsets = feature_offsets
+        if feature_offsets is None:
+            feature_offsets = np.empty(n_feature, dtype=np.int64)
+
+        self.hash_fn = hash_fn
+
+    def build(self, input_shape):  # pylint: disable=unused-argument
+        """
+        Creates the variables of the layer:
+        hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size.
+        """
+        # build layers
+        self.partition = Partition()
+        self.stitch = Stitch()
+        # make sure this is last
+        self.built = True
+
+    def call(self, inputs: twml.SparseTensor, **kwargs) -> twml.SparseTensor:
+        """Looks up `keys` in a table, outputs the corresponding values.
+
+        Implements HashedPercentileDiscretizer inference where inputs are intersected with a
+        hash_map.
+        Part of the inputs are discretized using twml.discretizer
+        to produce a discretizer_output SparseTensor.
+        This SparseTensor is then joined with the original inputs SparseTensor,
+        but only for the inputs keys that did not get discretized.
+
+        Args:
+            inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for
+                discretization. It has a dense_shape of [batch_size, input_size]
+            name: A name for the operation (optional).
+        Returns:
+            A `SparseTensor` of the same type as `inputs`.
+ Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. + """ + if isinstance(inputs, tf.SparseTensor): + inputs = twml.SparseTensor.from_tf(inputs) + + assert isinstance(inputs, twml.SparseTensor) + + # sparse column indices + ids = inputs.ids + # sparse row indices + keys = inputs.indices + # sparse values + vals = inputs.values + + hashed_keys = self.hash_map.lookup(keys) + hashed_keys = tf.cast(hashed_keys, tf.int64) + + found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) + partition_ids = tf.cast(found, tf.int32) + + found = tf.reshape(found, [-1]) + continuous_feature_ids = tf.boolean_mask(keys, found) + + vals, key, indices = self.partition( + partition_ids, vals, tf.where(found, hashed_keys, keys) + ) + non_discretizer_keys, discretizer_in_keys = key + non_discretizer_vals, discretizer_in_vals = vals + + non_discretizer_keys = twml.util.limit_bits( + non_discretizer_keys, self._out_bits + ) + self.non_discretizer_keys = non_discretizer_keys + + # run HashedPercentileDiscretizer on the keys/values it knows about + output = percentile_discretizer_bin_indices( + discretizer_in_keys, + discretizer_in_vals, + self.bin_ids, + self.bin_values, + self.feature_offsets, + ) + discretizer_bucket_idxs, discretizer_vals = output + new_discretizer_keys = self.hash_fn( + continuous_feature_ids, discretizer_bucket_idxs, self.output_size + ) + # Stitch the keys and values from discretizer and non discretizer indices back, with help + # of the Stitch Layer + self.discretizer_out_keys = new_discretizer_keys + + concat_data = self.stitch( + [non_discretizer_vals, discretizer_vals], + [non_discretizer_keys, new_discretizer_keys], + indices, + ) + + concat_vals, concat_keys = concat_data + + # Generate output shape using _compute_output_shape + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_shape = [batch_size, self.output_size] + return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() diff --git a/twml/twml/contrib/layers/hashing_discretizer.py b/twml/twml/contrib/layers/hashing_discretizer.py index 2a8244f4b..19ee51f9d 100644 --- a/twml/twml/contrib/layers/hashing_discretizer.py +++ b/twml/twml/contrib/layers/hashing_discretizer.py @@ -4,153 +4,170 @@ """ +from typing import Optional + import libtwml import tensorflow.compat.v1 as tf + import twml from twml.constants import HashingDiscretizerOptions from twml.layers.layer import Layer class HashingDiscretizer(Layer): - """A layer that discretizes continuous features, with hashed feature assignments - - HashingDiscretizer converts sparse continuous features into sparse - binary features. Each binary output feature indicates the presence of a - value in a HashingDiscretizer bin. + """A layer that discretizes continuous features, with hashed feature assignments - Each calibrated HashingDiscretizer input feature is converted to n_bin+1 bins. + HashingDiscretizer converts sparse continuous features into sparse + binary features. Each binary output feature indicates the presence of a + value in a HashingDiscretizer bin. - - n_bin bin boundaries for each feature (i.e. 
len(bin_vals[id])==n_bin) defines n_bin+1 bins - - bin assignment = sum(bin_vals 0: - # pass all inputs to the c++ op - # the op determines whether to discretize (when a feature is calibrated), - # or whether to simply limit bits and pass through (when not calibrated) - # NOTE - Hashing is done in C++ - discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( - input_ids=keys, # Input - input_vals=vals, # Input - bin_vals=self._bin_vals, # Input - feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr - n_bin=self._n_bin, # Attr - output_bits=self._out_bits, # Attr - cost_per_unit=self.cost_per_unit, # Attr - options=self._options, # Attr - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_size = tf.convert_to_tensor(1 << self._out_bits, tf.int64) - output_shape = [batch_size, output_size] - - return twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() + def __init__( + self, + feature_ids: tf.Tensor, + bin_vals: tf.Tensor, + n_bin: int, + out_bits: int, + cost_per_unit: int = 500, + options: Optional[HashingDiscretizerOptions] = None, + **kwargs + ): + """ + Creates a non-initialized `HashingDiscretizer` object. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + feature_ids (1D int64 numpy array): + - list of feature IDs that have been calibrated and have corresponding + bin boundary values in the bin_vals array + - bin values for feature feature_ids[i] live at bin_vals[i*n_bin:(i+1)*n_bin] + bin_vals (1D float numpy array): + - These are the bin boundary values for each calibrated feature + - len(bin_vals) = n_bin*len(feature_ids) + n_bin (int): + - number of HashingDiscretizer bins is actually n_bin + 1 + - ***Note*** that if a value N is passed for the value of n_bin to + HashingDiscretizerCalibrator, then HashingDiscretizerCalibrator + will generate N+1 bin boundaries for each feature, and hence there + will actually be N+2 potential bins for each feature + out_bits (int): + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + cost_per_unit (int): + - heuristic for intra op multithreading. approximate nanoseconds per input value. + options (int or None for default): + - Selects behavior of the op. Default is lower_bound and integer_multiplicative_hashing. + - Use values in twml.constants.HashingDiscretizerOptions to select options as follows + choose exactly one of HashingDiscretizerOptions.{SEARCH_LOWER_BOUND, SEARCH_LINEAR, SEARCH_UPPER_BOUND} + choose exactly one of HashingDiscretizerOptions.{HASH_32BIT, HASH_64BIT} + Bitwise OR these together to construct the options input. 
+ For example, `options=(HashingDiscretizerOptions.SEARCH_UPPER_BOUND | HashingDiscretizerOptions.HASH_64BIT)` + """ + + super(HashingDiscretizer, self).__init__(**kwargs) + self._feature_ids = feature_ids + self._bin_vals = bin_vals + self._n_bin = n_bin + self._out_bits = out_bits + self.cost_per_unit = cost_per_unit + if options is None: + options = ( + HashingDiscretizerOptions.SEARCH_LOWER_BOUND + | HashingDiscretizerOptions.HASH_32BIT + ) + self._options = options + + if not self.built: + self.build(input_shape=None) + + def build(self, input_shape): # pylint: disable=unused-argument + """Creates the variables of the layer""" + # make sure this is last + self.built = True + + def call( + self, inputs: tf.SparseTensor, name: Optional[str] = None + ) -> tf.SparseTensor: # pylint: disable=unused-argument + """ + Implements HashingDiscretizer inference on a twml.SparseTensor. + Alternatively, accepts a tf.SparseTensor that can be converted + to twml.SparseTensor. + + Performs discretization of input values. + i.e. bucket_val = bucket(val | feature_id) + + This bucket mapping depends on the calibration (i.e. the bin boundaries). + However, (feature_id, bucket_val) pairs are mapped to new_feature_id in + a way that is independent of the calibration procedure + + Args: + inputs: A 2D SparseTensor that is input to HashingDiscretizer for + discretization. It has a dense_shape of [batch_size, input_size] + name: A name for the operation (optional). + + Returns: + A tf.SparseTensor, created from twml.SparseTensor.to_tf() + Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. + """ + if isinstance(inputs, tf.SparseTensor): + inputs = twml.SparseTensor.from_tf(inputs) + + assert isinstance(inputs, twml.SparseTensor) + + # sparse column indices + ids = inputs.ids + # sparse row indices + keys = inputs.indices + # sparse values + vals = inputs.values + + if len(self._feature_ids) > 0: + # pass all inputs to the c++ op + # the op determines whether to discretize (when a feature is calibrated), + # or whether to simply limit bits and pass through (when not calibrated) + # NOTE - Hashing is done in C++ + discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( + input_ids=keys, # Input + input_vals=vals, # Input + bin_vals=self._bin_vals, # Input + feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr + n_bin=self._n_bin, # Attr + output_bits=self._out_bits, # Attr + cost_per_unit=self.cost_per_unit, # Attr + options=self._options, # Attr + ) + else: + discretizer_keys = twml.util.limit_bits(keys, self._out_bits) + discretizer_vals = vals + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_size = tf.convert_to_tensor((1 << self._out_bits), tf.int64) + output_shape = [batch_size, output_size] + + return twml.SparseTensor( + ids, discretizer_keys, discretizer_vals, output_shape + ).to_tf() diff --git a/twml/twml/contrib/layers/mask_layer.py b/twml/twml/contrib/layers/mask_layer.py index f5e788c7b..0e632eeba 100644 --- a/twml/twml/contrib/layers/mask_layer.py +++ b/twml/twml/contrib/layers/mask_layer.py @@ -1,29 +1,31 @@ +import tensorflow.compat.v1 as tf + from twml.contrib.pruning import apply_mask from twml.layers import Layer class MaskLayer(Layer): - """ - This layer corresponds to `twml.contrib.pruning.apply_mask`. - - It applies a binary mask to mask out channels of a given tensor. The masks can be - optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. - """ + """ + This layer corresponds to `twml.contrib.pruning.apply_mask`. 
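To make the options bitfield described above concrete, a hedged sketch of how a caller might combine the flags (it relies only on the HashingDiscretizerOptions constants named in the docstring; the concrete flag values live in twml.constants):

from twml.constants import HashingDiscretizerOptions

# Default behavior reconstructed in __init__ when options=None:
default_options = (
    HashingDiscretizerOptions.SEARCH_LOWER_BOUND
    | HashingDiscretizerOptions.HASH_32BIT
)

# Exactly one search strategy and one hash width, OR'ed together:
custom_options = (
    HashingDiscretizerOptions.SEARCH_UPPER_BOUND
    | HashingDiscretizerOptions.HASH_64BIT
)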
- def call(self, inputs, **kwargs): + It applies a binary mask to mask out channels of a given tensor. The masks can be + optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. """ - Applies a binary mask to the channels of the input. - Arguments: - inputs: - input tensor - **kwargs: - additional keyword arguments + def call(self, inputs: tf.Tensor, **kwargs): + """ + Applies a binary mask to the channels of the input. - Returns: - Masked tensor - """ - return apply_mask(inputs) + Args: + inputs: + input tensor + **kwargs: + additional keyword arguments + + Returns: + Masked tensor + """ + return apply_mask(inputs) - def compute_output_shape(self, input_shape): - return input_shape + def compute_output_shape(self, input_shape: tf.TensorShape) -> tf.TensorShape: + return input_shape diff --git a/twml/twml/contrib/layers/stacked_rnn.py b/twml/twml/contrib/layers/stacked_rnn.py index e05f5d853..2a86e9493 100644 --- a/twml/twml/contrib/layers/stacked_rnn.py +++ b/twml/twml/contrib/layers/stacked_rnn.py @@ -1,189 +1,235 @@ +from typing import Callable, List, Union +import tensorflow.compat.v1 as tf from twitter.deepbird.compat.v1.rnn import stack_bidirectional_dynamic_rnn -import tensorflow.compat.v1 as tf -import tensorflow import twml -def _get_rnn_cell_creator(cell_type): - if cell_type == "LSTM": - Cell = tf.nn.rnn_cell.LSTMCell - elif cell_type == "GRU": - Cell = tf.nn.rnn_cell.GRUCell - else: - raise ValueError("cell_type: %s is not supported." - "It should be one of 'LSTM' or 'GRU'." % cell_type) - return Cell - - -def _apply_dropout_wrapper(rnn_cells, dropout): - """ Apply dropout wrapper around each cell if necessary """ - if rnn_cells is None: - return None - - cells = [] - for i, dropout_rate in enumerate(dropout): - cell = rnn_cells[i] - if dropout_rate > 0: - cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=(1.0 - dropout_rate)) - cells.append(cell) - return cells - - -def _create_bidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells_forward = [Cell(output_size) for output_size in num_units] - cells_backward = [Cell(output_size) for output_size in num_units] - cells_forward = _apply_dropout_wrapper(cells_forward, dropout) - cells_backward = _apply_dropout_wrapper(cells_backward, dropout) - - def stacked_rnn_cell(inputs, sequence_lengths): +def _get_rnn_cell_creator(cell_type: str): + if cell_type == "LSTM": + Cell = tf.nn.rnn_cell.LSTMCell + elif cell_type == "GRU": + Cell = tf.nn.rnn_cell.GRUCell + else: + raise ValueError( + "cell_type: %s is not supported." + "It should be one of 'LSTM' or 'GRU'." 
% cell_type
+        )
+    return Cell
+
+
+def _apply_dropout_wrapper(
+    rnn_cells: List[tf.nn.rnn_cell.RNNCell],
+    dropout: List[float],
+) -> List[tf.nn.rnn_cell.RNNCell]:
+    """Apply dropout wrapper around each cell if necessary"""
+
+    # Treat None or empty inputs as "nothing to wrap". Short-circuiting here
+    # avoids calling len() on None, which any([..., len(rnn_cells) == 0, ...])
+    # would have done eagerly.
+    if not rnn_cells or not dropout:
+        return None
+
+    cells = []
+    for i, dropout_rate in enumerate(dropout):
+        cell = rnn_cells[i]
+        if dropout_rate > 0:
+            cell = tf.nn.rnn_cell.DropoutWrapper(
+                cell, input_keep_prob=(1.0 - dropout_rate)
+            )
+        cells.append(cell)
+    return cells
+
+
+def _create_bidirectional_rnn_cell(
+    num_units: List[int],
+    dropout: List[float],
+    cell_type: str,
+) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
+    """Create a bidirectional RNN cell."""
+
+    # Compare against the cell type explicitly; a bare truthiness check would
+    # always pick "lstm", since any non-empty string is truthy.
+    scope_name = "lstm" if cell_type == "LSTM" else "gru"
    with tf.variable_scope(scope_name):
-      outputs, final_states, _ = stack_bidirectional_dynamic_rnn(
-        cells_fw=cells_forward, cells_bw=cells_backward, inputs=inputs,
-        sequence_length=sequence_lengths, dtype=inputs.dtype)
-      return final_states[-1][-1]
-
-  return stacked_rnn_cell
-
-
-def _create_unidirectional_rnn_cell(num_units, dropout, cell_type):
-  scope_name = "lstm" if cell_type else "gru"
-  with tf.variable_scope(scope_name):
-    Cell = _get_rnn_cell_creator(cell_type)
-    cells = [Cell(output_size) for output_size in num_units]
-    cells = _apply_dropout_wrapper(cells, dropout)
-    multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells)
-
-  def stacked_rnn_cell(inputs, sequence_lengths):
+    with tf.variable_scope(scope_name):
+        Cell = _get_rnn_cell_creator(cell_type)
+        cells_forward = [Cell(output_size) for output_size in num_units]
+        cells_backward = [Cell(output_size) for output_size in num_units]
+        cells_forward = _apply_dropout_wrapper(cells_forward, dropout)
+        cells_backward = _apply_dropout_wrapper(cells_backward, dropout)
+
+    def stacked_rnn_cell(inputs: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
+        """Apply the stacked bidirectional RNN to the inputs."""
+
+        with tf.variable_scope(scope_name):
+            outputs, final_states, _ = stack_bidirectional_dynamic_rnn(
+                cells_fw=cells_forward,
+                cells_bw=cells_backward,
+                inputs=inputs,
+                sequence_length=sequence_lengths,
+                dtype=inputs.dtype,
+            )
+        return final_states[-1][-1]
+
+    return stacked_rnn_cell
+
+
+def _create_unidirectional_rnn_cell(
+    num_units: List[int],
+    dropout: List[float],
+    cell_type: str,
+) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
+    """Create a unidirectional RNN cell."""
+
+    scope_name = "lstm" if cell_type == "LSTM" else "gru"
    with tf.variable_scope(scope_name):
-      outputs, final_states = tf.nn.static_rnn(
-        multi_cell,
-        tf.unstack(inputs, axis=1),
-        dtype=inputs.dtype,
-        sequence_length=sequence_lengths)
-      return final_states[-1].h
-
-  return stacked_rnn_cell
-
-
-def _create_regular_rnn_cell(num_units, dropout, cell_type, is_bidirectional):
-  if is_bidirectional:
-    return _create_bidirectional_rnn_cell(num_units, dropout, cell_type)
-  else:
+        Cell = _get_rnn_cell_creator(cell_type)
+        cells = [Cell(output_size) for output_size in num_units]
+        cells = _apply_dropout_wrapper(cells, dropout)
+        multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells)
+
+    def stacked_rnn_cell(inputs: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
+        """Apply the stacked unidirectional RNN to the inputs."""
+
+        with tf.variable_scope(scope_name):
+            outputs, final_states = tf.nn.static_rnn(
+                multi_cell,
+                tf.unstack(inputs, axis=1),
+                dtype=inputs.dtype,
+                sequence_length=sequence_lengths,
+            )
+        return final_states[-1].h
+
+    return stacked_rnn_cell
+
+
+def _create_regular_rnn_cell(
+    num_units: List[int],
+    dropout:
List[float],
+    cell_type: str,
+    is_bidirectional: bool = True,
+) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
+    if is_bidirectional:
+        return _create_bidirectional_rnn_cell(num_units, dropout, cell_type)
+    return _create_unidirectional_rnn_cell(num_units, dropout, cell_type)


 class StackedRNN(twml.layers.Layer):
-  """
-  Layer for stacking RNN modules.
-  This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs.
-
-  Arguments:
-    num_units:
-      A list specifying the number of units per layer.
-    dropout:
-      Dropout applied to the input of each cell.
-      If list, has to dropout used for each layer.
-      If number, the same amount of dropout is used everywhere.
-      Defaults to 0.
-    is_training:
-      Flag to specify if the layer is used in training mode or not.
-    cell_type:
-      Sepcifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented.
-    is_bidirectional:
-      Specifies if the stacked RNN layer is bidirectional.
-      This is for forward compatibility, this is not yet implemented.
-      Defaults to False.
-  """
-
-  def __init__(self,
-               num_units,
-               dropout=0,
-               is_training=True,
-               cell_type="LSTM",
-               is_bidirectional=False,
-               name="stacked_rnn"):
-
-    super(StackedRNN, self).__init__(name=name)
-
-    if (is_bidirectional):
-      raise NotImplementedError("Bidirectional RNN is not yet implemented")
-
-    if (cell_type != "LSTM"):
-      raise NotImplementedError("Only LSTMs are supported")
-
-    if not isinstance(num_units, (list, tuple)):
-      num_units = [num_units]
-    else:
-      num_units = num_units
-
-    self.num_layers = len(num_units)
-    if not isinstance(dropout, (tuple, list)):
-      dropout = [dropout] * self.num_layers
-    else:
-      dropout = dropout
-
-    self.is_training = is_training
-
-    is_gpu_available = twml.contrib.utils.is_gpu_available()
-    same_unit_size = all(size == num_units[0] for size in num_units)
-    same_dropout_rate = any(val == dropout[0] for val in dropout)
-
-    self.stacked_rnn_cell = None
-    self.num_units = num_units
-    self.dropout = dropout
-    self.cell_type = cell_type
-    self.is_bidirectional = is_bidirectional
-
-  def build(self, input_shape):
-    self.stacked_rnn_cell = _create_regular_rnn_cell(self.num_units,
-                                                     self.dropout,
-                                                     self.cell_type,
-                                                     self.is_bidirectional)
-
-  def call(self, inputs, sequence_lengths):
    """
-    Arguments:
-      inputs:
-        A tensor of size [batch_size, max_sequence_length, embedding_size].
-      sequence_lengths:
-        The length of each input sequence in the batch. Should be of size [batch_size].
-    Returns:
-      final_output
-        The output of at the end of sequence_length.
+    Layer for stacking RNN modules.
+    This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs.
+
+    Args:
+        num_units: int or list
+            A list specifying the number of units per layer.
+        dropout: float or list
+            Dropout applied to the input of each cell.
+            If a list, gives the dropout used for each layer.
+            If number, the same amount of dropout is used everywhere.
+            Defaults to 0.
+        is_training: bool
+            Flag to specify if the layer is used in training mode or not.
+        cell_type: str
+            Specifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented.
+        is_bidirectional: bool
+            Specifies if the stacked RNN layer is bidirectional.
+            This is for forward compatibility, this is not yet implemented.
+            Defaults to False.
+        name: str
+            Name of the layer.
+ """ + + def __init__( + self, + num_units: Union[int, List[int]], + dropout: Union[float, List[float]] = 0.0, + is_training: bool = True, + cell_type: str = "LSTM", + is_bidirectional: bool = False, + name: str = "stacked_rnn", + ): + super(StackedRNN, self).__init__(name=name) + + if is_bidirectional: + raise NotImplementedError("Bidirectional RNN is not yet implemented") + + assert cell_type in ["LSTM", "GRU"] + if cell_type != "LSTM": + raise NotImplementedError("Only LSTMs are supported") + + # Make sure num_units is a list + if not isinstance(num_units, (list, tuple)): + num_units = [num_units] + + # Make sure dropout is a list + self.num_layers = len(num_units) + if not isinstance(dropout, (tuple, list)): + dropout = [dropout] * self.num_layers + + # Check if all parameters are valid + is_gpu_available = twml.contrib.utils.is_gpu_available() + same_unit_size = all(size == num_units[0] for size in num_units) + same_dropout_rate = any(val == dropout[0] for val in dropout) + + # set all class variables + self.is_training = is_training + self.stacked_rnn_cell = None + self.num_units = num_units + self.dropout = dropout + self.cell_type = cell_type + self.is_bidirectional = is_bidirectional + + def build(self, input_shape: tf.TensorShape): + self.stacked_rnn_cell = _create_regular_rnn_cell( + self.num_units, self.dropout, self.cell_type, self.is_bidirectional + ) + + def call(self, inputs: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: + """ + Args: + inputs: + A tensor of size [batch_size, max_sequence_length, embedding_size]. + sequence_lengths: + The length of each input sequence in the batch. Should be of size [batch_size]. + Returns: + final_output + The output of at the end of sequence_length. + """ + return self.stacked_rnn_cell(inputs, sequence_lengths) + + +def stacked_rnn( + inputs: tf.Tensor, + sequence_lengths: tf.Tensor, + num_units: List[int], + dropout: Union[float, List[float]] = 0.0, + is_training: bool = True, + cell_type: str = "LSTM", + is_bidirectional: bool = False, + name: str = "stacked_rnn", +) -> StackedRNN: + """Functional interface for StackedRNN + + Args: + inputs: + A tensor of size [batch_size, max_sequence_length, embedding_size]. + sequence_lengths: + The length of each input sequence in the batch. Should be of size [batch_size]. + num_units: + A list specifying the number of units per layer. + dropout: + Dropout applied to the input of each cell. + If list, has to dropout used for each layer. + If number, the same amount of dropout is used everywhere. + Defaults to 0. + is_training: + Flag to specify if the layer is used in training mode or not. + cell_type: + Specifies the type of RNN. Can be "LSTM" or "GRU". + is_bidirectional: + Specifies if the stacked RNN layer is bidirectional. + Defaults to False. + + Returns + outputs, state. """ - return self.stacked_rnn_cell(inputs, sequence_lengths) - - -def stacked_rnn(inputs, sequence_lengths, num_units, - dropout=0, is_training=True, - cell_type="LSTM", is_bidirectional=False, name="stacked_rnn"): - """Functional interface for StackedRNN - Arguments: - inputs: - A tensor of size [batch_size, max_sequence_length, embedding_size]. - sequence_lengths: - The length of each input sequence in the batch. Should be of size [batch_size]. - num_units: - A list specifying the number of units per layer. - dropout: - Dropout applied to the input of each cell. - If list, has to dropout used for each layer. - If number, the same amount of dropout is used everywhere. - Defaults to 0. 
- is_training: - Flag to specify if the layer is used in training mode or not. - cell_type: - Sepcifies the type of RNN. Can be "LSTM" or "GRU". - is_bidirectional: - Specifies if the stacked RNN layer is bidirectional. - Defaults to False. - Returns - outputs, state. - """ - rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name) - return rnn(inputs, sequence_lengths) + rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name) + return rnn(inputs, sequence_lengths) diff --git a/twml/twml/contrib/layers/zscore_normalization.py b/twml/twml/contrib/layers/zscore_normalization.py index 8a1064965..05da7d4d1 100644 --- a/twml/twml/contrib/layers/zscore_normalization.py +++ b/twml/twml/contrib/layers/zscore_normalization.py @@ -1,247 +1,294 @@ """ Contains the twml.layers.ZscoreNormalization layer. """ -from twml.layers.layer import Layer -import tensorflow.compat.v1 as tf +from typing import Optional, Tuple, Union +import tensorflow.compat.v1 as tf from tensorflow.python.training import moving_averages +from twml.layers.layer import Layer + # This is copied from tensorflow.contrib.framework.python.ops.add_model_variable in 1.15 # Not available in 2.x # TODO: Figure out if this is really necessary. -def _add_model_variable(var): - """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection. - Args: - var: a variable. - """ - if var not in tf.get_collection(tf.GraphKeys.MODEL_VARIABLES): - tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, var) - +def _add_model_variable(var: tf.Variable) -> None: + """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection. -def update_moving_variable(batch_var, moving_var, decay, zero_debias=True, name=None): - update_op = moving_averages.assign_moving_average( - moving_var, batch_var, decay, zero_debias=zero_debias, name=None) - _add_model_variable(moving_var) - with tf.control_dependencies([update_op]): - return tf.identity(moving_var) + Args: + var: a variable. + """ + if var not in tf.get_collection(tf.GraphKeys.MODEL_VARIABLES): + tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, var) -class ZscoreNormalization(Layer): - """ - Perform z-score normalization using moving mean and std. - Missing values are not included during mean/std calculation - This layer should only be used right after input layer. - - Args: - decay: - using large decay to include longer moving means. - data_type: - use float64 to prevent overflow during variance calculation. - name: - Layer name - Returns: - A layer representing the output of the ZscoreNormalization transformation. - """ - - def __init__( - self, - decay=0.9999, - data_type=tf.float64, - name=None, - **kwargs): - super(ZscoreNormalization, self).__init__(name=name, **kwargs) - self.epsilon = tf.constant(1., data_type) - self.decay = decay - self.data_type = data_type - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the moving_mean and moving_var tf.Variables of the layer.""" - input_dim = input_shape[1] - self.moving_mean = self.add_variable( - '{}_mean/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.moving_var = self.add_variable( - '{}_variance/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.built = True - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. 
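Looking back at the stacked_rnn functional interface above, a minimal usage sketch with hypothetical shapes (it assumes the module import path matches the file layout, twml.contrib.layers.stacked_rnn):

import tensorflow.compat.v1 as tf
from twml.contrib.layers.stacked_rnn import stacked_rnn

# Batch of 8 sequences, up to 20 steps, 64-dim embeddings.
inputs = tf.placeholder(tf.float32, [8, 20, 64])
sequence_lengths = tf.placeholder(tf.int32, [8])

# Two stacked LSTM layers with 30% input dropout on each cell.
final_output = stacked_rnn(
    inputs,
    sequence_lengths,
    num_units=[128, 64],
    dropout=0.3,
    cell_type="LSTM",
)
# final_output has shape [8, 64]: the last layer's state at sequence end.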
+def update_moving_variable( + batch_var: tf.Variable, + moving_var: tf.Variable, + decay: float, + zero_debias: bool = True, + name: Optional[str] = None, +) -> tf.Variable: + """Update moving variable using batch variable. Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + batch_var: a variable. + moving_var: a variable. + decay: decay rate. + zero_debias: whether to use zero debias. + name: name of the operation. + Returns: + A variable representing the updated moving variable. """ - return input_shape - - def _training_pass(self, input, dense_mask, input_dtype, handle_single, zero_debias): - epsilon = self.epsilon - moving_mean, moving_var = self.moving_mean, self.moving_var - # calculate the number of exisiting value for each feature - tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0) - mask_ones = tf.cast(tensor_batch_num, tf.bool) - eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon) - # the following filled 0 with epision - tensor_batch_num_eps = tf.where(mask_ones, - tensor_batch_num, - eps_vector - ) - tensor_batch_num_eps_broacast = tf.expand_dims(tensor_batch_num_eps, 0) - tensor_batch_divided = input / tensor_batch_num_eps_broacast - tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0) - - # update moving mean here, and use it to calculate the std. - tensor_moving_mean = update_moving_variable(tensor_batch_mean, moving_mean, self.decay, - zero_debias, name="mean_ema_op") - - tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0) - tensor_batch_sub_mean = tf.where(dense_mask, - tensor_batch_sub_mean, - tf.zeros_like(tensor_batch_sub_mean)) - # divided by sqrt(n) before square, and then do summation for numeric stability. - broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0) - tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps - tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div) - tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0) - - # update moving var here, dont replace 0 with eps before updating. - tensor_moving_var = update_moving_variable(tensor_batch_var, moving_var, self.decay, - zero_debias, name="var_ema_op") - - # if std is 0, replace it with epsilon - tensor_moving_std = tf.sqrt(tensor_moving_var) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - eps_vector, - tensor_moving_std) - - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - - if handle_single: - # if std==0 and value not missing, reset it to 1. 
-      moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0)
-      moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0)
-      missing_input_norm = tf.where(
-        tf.math.logical_and(dense_mask, moving_var_mask_zero),
-        tf.ones_like(missing_input_norm),
-        missing_input_norm
-      )
-    if input_dtype != self.data_type:
-      missing_input_norm = tf.cast(missing_input_norm, input_dtype)
-    return missing_input_norm
-
-  def _infer_pass(self, input, dense_mask, input_dtype, handle_single):
-    epsilon = tf.cast(self.epsilon, input_dtype)
-    testing_moving_mean = tf.cast(self.moving_mean, input_dtype)
-    tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype)
-
-    broad_mean = tf.expand_dims(testing_moving_mean, 0)
-    tensor_batch_sub_mean = input - broad_mean
-
-    tensor_batch_sub_mean = tf.where(dense_mask,
-                                     tensor_batch_sub_mean,
-                                     tf.zeros_like(tensor_batch_sub_mean)
-                                     )
-    tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0),
-                                     tf.fill(tf.shape(tensor_moving_std), epsilon),
-                                     tensor_moving_std)
-    missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0)
-    if handle_single:
-      # if std==0 and value not missing, reset it to 1.
-      moving_var_broad = tf.expand_dims(tensor_moving_std, 0)
-      moving_var_mask_zero = tf.math.logical_not(tf.cast(moving_var_broad, tf.bool))
-
-      missing_input_norm = tf.where(tf.math.logical_and(dense_mask, moving_var_mask_zero),
-                                    tf.ones_like(missing_input_norm),
-                                    missing_input_norm
-                                    )
-    return missing_input_norm
-
-  def call(
-      self,
-      input,
-      is_training,
-      dense_mask=None,
-      zero_debias=True,
-      handle_single=False):
+    update_op = moving_averages.assign_moving_average(
+        moving_var, batch_var, decay, zero_debias=zero_debias, name=name
+    )
+    _add_model_variable(moving_var)
+    with tf.control_dependencies([update_op]):
+        return tf.identity(moving_var)
+
+
+class ZscoreNormalization(Layer):
     """
+    Perform z-score normalization using moving mean and std.
+    Missing values are not included during the mean/std calculation.
+    This layer should only be used right after the input layer.
+
     Args:
-    -----------
-    input: B x D : float32/float64
-      missing value must be set to 0.
-    is_training: bool
-      training phase or testing phase
-    dense_mask: B x D : bool
-      missing value should be marked as 0, non-missing as 1. same shape as input
-    zero_debias: bool
-      bias correction of the moving average. (biased towards 0 in the beginning.
-      see adam paper. https://arxiv.org/abs/1412.6980)
-    handle_single: bool
-      if std==0, and feature is not missing value, set the value to 1, instead of 0.
-      This is super rare if input only consists of continous feature.
-      But if one-hot feature is included,
-      they will all have same values 1, in that case, make sure to set handle_single to true.
+        decay:
+            using large decay to include longer moving means.
+        data_type:
+            use float64 to prevent overflow during variance calculation.
+        name:
+            Layer name
+
+    Returns:
+        A layer representing the output of the ZscoreNormalization transformation.
""" - if dense_mask is None: - dense_mask = tf.math.logical_not(tf.equal(input, 0)) - input_dtype = input.dtype + def __init__(self, decay=0.9999, data_type=tf.float64, name=None, **kwargs): + super(ZscoreNormalization, self).__init__(name=name, **kwargs) + self.epsilon = tf.constant(1.0, data_type) + self.decay = decay + self.data_type = data_type + + def build(self, input_shape: tf.TensorShape): + """Creates the moving_mean and moving_var tf.Variables of the layer.""" + input_dim = input_shape[1] + self.moving_mean = self.add_variable( + f"{self.name}_mean/EMA", + initializer=tf.constant_initializer(), + shape=[input_dim], + dtype=self.data_type, + trainable=False, + ) + self.moving_var = self.add_variable( + f"{self.name}_variance/EMA", + initializer=tf.constant_initializer(), + shape=[input_dim], + dtype=self.data_type, + trainable=False, + ) + self.built = True + + def compute_output_shape( + self, input_shape: Union[tf.TensorShape, Tuple[tf.TensorShape]] + ) -> tf.TensorShape: + """Computes the output shape of the layer given the input shape.""" + + return input_shape + + def _training_pass( + self, + input: tf.Tensor, + dense_mask: tf.Tensor, + input_dtype: tf.DType, + handle_single: bool = False, + zero_debias: bool = True, + ) -> tf.Tensor: + """Perform z-score normalization in training mode.""" - if is_training: - if input_dtype != self.data_type: - input = tf.cast(input, self.data_type) - return self._training_pass(input, dense_mask, input_dtype, handle_single, zero_debias) - else: - return self._infer_pass(input, dense_mask, input_dtype, handle_single) + epsilon = self.epsilon + moving_mean, moving_var = self.moving_mean, self.moving_var + # calculate the number of exisiting value for each feature + tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0) + mask_ones = tf.cast(tensor_batch_num, tf.bool) + eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon) + # the following filled 0 with epision + tensor_batch_num_eps = tf.where(mask_ones, tensor_batch_num, eps_vector) + tensor_batch_num_eps_broacast = tf.expand_dims(tensor_batch_num_eps, 0) + tensor_batch_divided = input / tensor_batch_num_eps_broacast + tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0) + + # update moving mean here, and use it to calculate the std. + tensor_moving_mean = update_moving_variable( + tensor_batch_mean, moving_mean, self.decay, zero_debias, name="mean_ema_op" + ) + + tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0) + tensor_batch_sub_mean = tf.where( + dense_mask, tensor_batch_sub_mean, tf.zeros_like(tensor_batch_sub_mean) + ) + # divided by sqrt(n) before square, and then do summation for numeric stability. + broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0) + tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps + tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div) + tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0) + + # update moving var here, dont replace 0 with eps before updating. 
+        tensor_moving_var = update_moving_variable(
+            tensor_batch_var, moving_var, self.decay, zero_debias, name="var_ema_op"
+        )
+
+        # if std is 0, replace it with epsilon
+        tensor_moving_std = tf.sqrt(tensor_moving_var)
+        tensor_moving_std_eps = tf.where(
+            tf.equal(tensor_moving_std, 0), eps_vector, tensor_moving_std
+        )
+
+        missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(
+            tensor_moving_std_eps, 0
+        )
+
+        if handle_single:
+            # if std==0 and value not missing, reset it to 1.
+            moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0)
+            moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0)
+            missing_input_norm = tf.where(
+                tf.math.logical_and(dense_mask, moving_var_mask_zero),
+                tf.ones_like(missing_input_norm),
+                missing_input_norm,
+            )
+        if input_dtype != self.data_type:
+            missing_input_norm = tf.cast(missing_input_norm, input_dtype)
+        return missing_input_norm
+
+    def _infer_pass(
+        self,
+        input: tf.Tensor,
+        dense_mask: tf.Tensor,
+        input_dtype: tf.DType,
+        handle_single: bool = False,
+    ) -> tf.Tensor:
+        """Perform z-score normalization in inference mode."""
+
+        epsilon = tf.cast(self.epsilon, input_dtype)
+        testing_moving_mean = tf.cast(self.moving_mean, input_dtype)
+        tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype)
+
+        broad_mean = tf.expand_dims(testing_moving_mean, 0)
+        tensor_batch_sub_mean = input - broad_mean
+
+        tensor_batch_sub_mean = tf.where(
+            dense_mask, tensor_batch_sub_mean, tf.zeros_like(tensor_batch_sub_mean)
+        )
+        tensor_moving_std_eps = tf.where(
+            tf.equal(tensor_moving_std, 0),
+            tf.fill(tf.shape(tensor_moving_std), epsilon),
+            tensor_moving_std,
+        )
+        missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(
+            tensor_moving_std_eps, 0
+        )
+        if handle_single:
+            # if std==0 and value not missing, reset it to 1.
+            moving_var_broad = tf.expand_dims(tensor_moving_std, 0)
+            moving_var_mask_zero = tf.math.logical_not(
+                tf.cast(moving_var_broad, tf.bool)
+            )
+
+            missing_input_norm = tf.where(
+                tf.math.logical_and(dense_mask, moving_var_mask_zero),
+                tf.ones_like(missing_input_norm),
+                missing_input_norm,
+            )
+        return missing_input_norm
+
+    def call(
+        self,
+        input: tf.Tensor,
+        is_training: bool = True,
+        dense_mask: Optional[tf.Tensor] = None,
+        zero_debias: bool = True,
+        handle_single: bool = False,
+    ) -> tf.Tensor:
+        """
+        Args:
+            input: B x D : float32/float64
+                missing value must be set to 0.
+            is_training: bool
+                training phase or testing phase
+            dense_mask: B x D : bool
+                missing value should be marked as 0, non-missing as 1. Same shape as input.
+            zero_debias: bool
+                bias correction of the moving average. (biased towards 0 in the beginning.
+                see adam paper. https://arxiv.org/abs/1412.6980)
+            handle_single: bool
+                if std==0, and feature is not missing value, set the value to 1, instead of 0.
+                This is super rare if input only consists of continuous features.
+                But if one-hot features are included,
+                they will all have the same value 1; in that case, make sure to set handle_single to true.
+ """ + + if dense_mask is None: + dense_mask = tf.math.logical_not(tf.equal(input, 0)) + input_dtype = input.dtype + + if is_training: + if input_dtype != self.data_type: + input = tf.cast(input, self.data_type) + return self._training_pass( + input, dense_mask, input_dtype, handle_single, zero_debias + ) + else: + return self._infer_pass(input, dense_mask, input_dtype, handle_single) def zscore_normalization( - input, - is_training, - decay=0.9999, - data_type=tf.float64, - name=None, - dense_mask=None, - zero_debias=True, - handle_single=False, **kwargs): - """ - Args: - ------------ - input: B x D : float32/float64 - missing value must be set to 0. - is_training: bool - training phase or testing phase - decay: - using large decay to include longer moving means. - data_type: - use float64 to zprevent overflow during variance calculation. - name: - Layer name - dense_mask: B x D : bool - missing value should be marked as 0, non-missing as 1. same shape as input - zero_debias: bool - bias correction of the moving average. (biased towards 0 in the beginning. - see adam paper. https://arxiv.org/abs/1412.6980) - handle_single: bool - if std==0, and feature is not missing value, set the value to 1, instead of 0. - This is super rare if input only consists of continous feature. - But if one-hot feature is included, - they will all have same values 1, in that case, make sure to set handle_single to true. - """ - - norm_layer = ZscoreNormalization(decay=decay, data_type=data_type, name=name, **kwargs) - return norm_layer(input, - is_training, - dense_mask=dense_mask, - zero_debias=zero_debias, - handle_single=handle_single) + input: tf.Tensor, + is_training: bool = True, + decay: float = 0.9999, + data_type: tf.DType = tf.float64, + name: Optional[str] = None, + dense_mask: Optional[tf.Tensor] = None, + zero_debias: bool = True, + handle_single: bool = False, + **kwargs, +): + """ + Args: + input: B x D : float32/float64 + missing value must be set to 0. + is_training: bool + training phase or testing phase + decay: float + using large decay to include longer moving means. + data_type: tf.DType + use float64 to zprevent overflow during variance calculation. + name: str + Layer name + dense_mask: B x D : bool + missing value should be marked as 0, non-missing as 1. same shape as input + zero_debias: bool + bias correction of the moving average. (biased towards 0 in the beginning. + see adam paper. https://arxiv.org/abs/1412.6980) + handle_single: bool + if std == 0, and feature is not missing value, set the value to 1, instead of 0. + This is super rare if input only consists of continuous feature. + But if one-hot feature is included, + they will all have same values 1, in that case, make sure to set handle_single to true. 
+ """ + + norm_layer = ZscoreNormalization( + decay=decay, data_type=data_type, name=name, **kwargs + ) + return norm_layer( + input, + is_training, + dense_mask=dense_mask, + zero_debias=zero_debias, + handle_single=handle_single, + ) diff --git a/twml/twml/contrib/metrics/__init__.py b/twml/twml/contrib/metrics/__init__.py index 37e6563c9..f2e26dafe 100644 --- a/twml/twml/contrib/metrics/__init__.py +++ b/twml/twml/contrib/metrics/__init__.py @@ -1,5 +1,5 @@ # pylint: disable=wildcard-import """This module contains experimental metric(s) for search and ranking""" -from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 from .metrics import * # noqa: F401 +from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 diff --git a/twml/twml/contrib/metrics/metrics.py b/twml/twml/contrib/metrics/metrics.py index dea1a5273..d39e2bef5 100644 --- a/twml/twml/contrib/metrics/metrics.py +++ b/twml/twml/contrib/metrics/metrics.py @@ -2,208 +2,324 @@ Module containing extra tensorflow metrics used at Twitter. This module conforms to conventions used by tf.metrics.*. In particular, each metric constructs two subgraphs: value_op and update_op: - - The value op is used to fetch the current metric value. - - The update_op is used to accumulate into the metric. + - The value op is used to fetch the current metric value. + - The update_op is used to accumulate into the metric. Note: similar to tf.metrics.*, metrics in here do not support multi-label learning. We will have to write wrapper classes to create one metric per label. Note: similar to tf.metrics.*, batches added into a metric via its update_op are cumulative! - """ -from collections import OrderedDict +from typing import Callable, List, Optional, Tuple, Union import tensorflow.compat.v1 as tf -from twml.metrics import get_multi_binary_class_metric_fn +from twml.metrics import get_multi_binary_class_metric_fn # checkstyle: noqa -def get_partial_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1, predcols=None): - - def get_eval_metric_ops(graph_output, labels, weights): - if predcols is None: - preds = graph_output['output'] - else: - if isinstance(predcols, int): - predcol_list=[predcols] - else: - predcol_list=list(predcols) - for col in predcol_list: - assert 0 <= col < graph_output['output'].shape[class_dim], 'Invalid Prediction Column Index !' - preds = tf.gather(graph_output['output'], indices=predcol_list, axis=class_dim) # [batchSz, num_col] - labels = tf.gather(labels, indices=predcol_list, axis=class_dim) # [batchSz, num_col] - - predInfo = {'output': preds} - if 'threshold' in graph_output: - predInfo['threshold'] = graph_output['threshold'] - if 'hard_output' in graph_output: - predInfo['hard_output'] = graph_output['hard_output'] - - metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim) - metrics_op_res = metrics_op(predInfo, labels, weights) - return metrics_op_res - - return get_eval_metric_ops - +def get_partial_multi_binary_class_metric_fn( + metrics: List[str], + classes: Optional[List[str]] = None, + class_dim: int = 1, + predcols: Optional[Union[int, List[int]]] = None, +) -> callable: + def get_eval_metric_ops( + graph_output: dict, labels: tf.Tensor, weights: tf.Tensor + ) -> dict: + if predcols is None: + preds = graph_output["output"] + else: + if isinstance(predcols, int): + predcol_list = [predcols] + else: + predcol_list = list(predcols) + for col in predcol_list: + assert ( + 0 <= col < graph_output["output"].shape[class_dim] + ), "Invalid Prediction Column Index !" 
+            preds = tf.gather(
+                graph_output["output"], indices=predcol_list, axis=class_dim
+            )  # [batchSz, num_col]
+            labels = tf.gather(
+                labels, indices=predcol_list, axis=class_dim
+            )  # [batchSz, num_col]
+
+        predInfo = {"output": preds}
+        if "threshold" in graph_output:
+            predInfo["threshold"] = graph_output["threshold"]
+        if "hard_output" in graph_output:
+            predInfo["hard_output"] = graph_output["hard_output"]
+
+        metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim)
+        metrics_op_res = metrics_op(predInfo, labels, weights)
+        return metrics_op_res
+
+    return get_eval_metric_ops


 # Numeric Prediction Performance among TopK Predictions
-def mean_numeric_label_topK(labels, predictions, weights, name, topK_id):
-  top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
-  return tf.metrics.mean(values=top_k_labels, name=name)
+def mean_numeric_label_topK(
+    labels: tf.Tensor,
+    predictions: tf.Tensor,
+    weights: tf.Tensor,
+    name: str,
+    topK_id: tf.Tensor,
+) -> Tuple[tf.Tensor, tf.Tensor]:
+    top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
+    return tf.metrics.mean(values=top_k_labels, name=name)
+
+
+def mean_gated_numeric_label_topK(
+    labels: tf.Tensor,
+    predictions: tf.Tensor,
+    weights: tf.Tensor,
+    name: str,
+    topK_id: tf.Tensor,
+    bar: float = 2.0,
+) -> Tuple[tf.Tensor, tf.Tensor]:
+    assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float"
+    top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
+    gated_top_k_labels = tf.cast(top_k_labels > bar * 1.0, tf.int32)
+    return tf.metrics.mean(values=gated_top_k_labels, name=name)

-def mean_gated_numeric_label_topK(labels, predictions, weights, name, topK_id, bar=2.0):
-  assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float"
-  top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
-  gated_top_k_labels = tf.cast(top_k_labels > bar*1.0, tf.int32)
-  return tf.metrics.mean(values=gated_top_k_labels, name=name)

 SUPPORTED_NUMERIC_METRICS = {
-  'mean_numeric_label_topk': mean_numeric_label_topK,
-  'mean_gated_numeric_label_topk': mean_gated_numeric_label_topK
+    "mean_numeric_label_topk": mean_numeric_label_topK,
+    "mean_gated_numeric_label_topk": mean_gated_numeric_label_topK,
 }
-DEFAULT_NUMERIC_METRICS = ['mean_numeric_label_topk', 'mean_gated_numeric_label_topk']
+DEFAULT_NUMERIC_METRICS = ["mean_numeric_label_topk", "mean_gated_numeric_label_topk"]

-
-def get_metric_topK_fn_helper(targetMetrics, supportedMetrics_op, metrics=None, topK=(5,5,5), predcol=None, labelcol=None):
-  """
-  :param targetMetrics: Target Metric List
-  :param supportedMetrics_op: Supported Metric Operators Dict
-  :param metrics: Metric Set to evaluate
-  :param topK: (topK_min, topK_max, topK_delta) Tuple
-  :param predcol: Prediction Column Index
-  :param labelcol: Label Column Index
-  :return:
-  """
-  # pylint: disable=dict-keys-not-iterating
-  if targetMetrics is None or supportedMetrics_op is None:
-    raise ValueError("Invalid Target Metric List/op !")
-
-  targetMetrics = set([m.lower() for m in targetMetrics])
-  if metrics is None:
-    metrics = list(targetMetrics)
-  else:
-    metrics = [m.lower() for m in metrics if m.lower() in targetMetrics]
-
-  num_k = int((topK[1]-topK[0])/topK[2]+1)
-  topK_list = [topK[0]+d*topK[2] for d in range(num_k)]
-  if 1 not in topK_list:
-    topK_list = [1] + topK_list
-
-
-  def get_eval_metric_ops(graph_output, labels, weights):
+def get_metric_topK_fn_helper(
+    targetMetrics: List[str],
+    supportedMetrics_op: dict,
+
metrics: Optional[List[str]] = None, + topK: Tuple[int] = (5, 5, 5), + predcol: Optional[int] = None, + labelcol: Optional[int] = None, +) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + Helper function to get metric function for topK evaluation + + Args: + targetMetrics (list[str]): + Target Metric List + supportedMetrics_op (dict): + Supported Metric Operators + metrics (list[str], optional): + Metric Set to evaluate + topK (tuple[int], optional): + (topK_min, topK_max, topK_delta) + predcol (int, optional): + Prediction Column Index + labelcol (int, optional): + Label Column Index + + Returns: + callable: + Metric Function """ - eval_metric_ops = OrderedDict() - if predcol is None: - pred = graph_output['output'] + # pylint: disable=dict-keys-not-iterating + if targetMetrics is None or supportedMetrics_op is None: + raise ValueError("Invalid Target Metric List/op !") + + targetMetrics = set([m.lower() for m in targetMetrics]) + if metrics is None: + metrics = list(targetMetrics) else: - assert 0 <= predcol < graph_output['output'].shape[1], 'Invalid Prediction Column Index !' - assert labelcol is not None - pred = tf.reshape(graph_output['output'][:, predcol], shape=[-1, 1]) - labels = tf.reshape(labels[:, labelcol], shape=[-1, 1]) - numOut = graph_output['output'].shape[1] - pred_score = tf.reshape(graph_output['output'][:, numOut-1], shape=[-1, 1]) - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in supportedMetrics_op: - metric_factory = supportedMetrics_op.get(metric_name) - - if 'topk' not in metric_name: - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) + metrics = [m.lower() for m in metrics if m.lower() in targetMetrics] + + num_k = int((topK[1] - topK[0]) / topK[2] + 1) + topK_list = [topK[0] + d * topK[2] for d in range(num_k)] + if 1 not in topK_list: + topK_list = [1] + topK_list + + def get_eval_metric_ops( + graph_output: dict, labels: tf.Tensor, weights: tf.Tensor + ) -> dict: + """ + Get Evaluation Metric Ops + + Args: + graph_output (dict): + Graph Output + labels (tf.Tensor): + Labels + weights (tf.Tensor): + Weights + + Returns: + dict: + Evaluation Metric Ops + """ + eval_metric_ops = dict() + + if predcol is None: + pred = graph_output["output"] else: - for K in topK_list: - K_min = tf.minimum(K, tf.shape(pred_score)[0]) - topK_id = tf.nn.top_k(tf.reshape(pred_score, shape=[-1]), k=K_min)[1] # [topK] - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name+'__k_'+str(K), - topK_id=topK_id) - eval_metric_ops[metric_name+'__k_'+str(K)] = (value_op, update_op) - - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - - -def get_numeric_metric_fn(metrics=None, topK=(5,5,5), predcol=None, labelcol=None): - if metrics is None: - metrics = list(DEFAULT_NUMERIC_METRICS) - metrics = list(set(metrics)) - - metric_op = get_metric_topK_fn_helper(targetMetrics=list(DEFAULT_NUMERIC_METRICS), - supportedMetrics_op=SUPPORTED_NUMERIC_METRICS, - metrics=metrics, topK=topK, predcol=predcol, labelcol=labelcol) - return metric_op - - - -def 
get_single_binary_task_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False):
-  """
-  graph_output['output']: [BatchSz, 1] [pred_Task1]
-  labels: [BatchSz, 2] [Task1, NumericLabel]
-  """
-  def get_eval_metric_ops(graph_output, labels, weights):
-    metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames)
-    classnames_unw = ['unweighted_'+cs for cs in classnames]
-    metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames_unw)
-
-    metrics_base_res = metric_op_base(graph_output, labels, weights)
-    metrics_unw_res = metric_op_unw(graph_output, labels, None)
-    metrics_base_res.update(metrics_unw_res)
-
-    if use_topK:
-      metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=0, labelcol=1)
-      metrics_numeric_res = metric_op_numeric(graph_output, labels, weights)
-      metrics_base_res.update(metrics_numeric_res)
-    return metrics_base_res
-
-  return get_eval_metric_ops
-
-
-def get_dual_binary_tasks_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False):
-  """
-  graph_output['output']: [BatchSz, 3] [pred_Task1, pred_Task2, Score]
-  labels: [BatchSz, 3] [Task1, Task2, NumericLabel]
-  """
-  def get_eval_metric_ops(graph_output, labels, weights):
-
-    metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames)
-    classnames_unw = ['unweighted_'+cs for cs in classnames]
-    metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames_unw)
-
-    metrics_base_res = metric_op_base(graph_output, labels, weights)
-    metrics_unw_res = metric_op_unw(graph_output, labels, None)
-    metrics_base_res.update(metrics_unw_res)
+            assert (
+                0 <= predcol < graph_output["output"].shape[1]
+            ), "Invalid Prediction Column Index !"
+            assert labelcol is not None
+            pred = tf.reshape(graph_output["output"][:, predcol], shape=[-1, 1])
+            labels = tf.reshape(labels[:, labelcol], shape=[-1, 1])
+            numOut = graph_output["output"].shape[1]
+            pred_score = tf.reshape(graph_output["output"][:, numOut - 1], shape=[-1, 1])
+
+        # add metrics to eval_metric_ops dict
+        for metric_name in metrics:
+            metric_name = metric_name.lower()  # metric names are case-insensitive.
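The `(topK_min, topK_max, topK_delta)` tuple handled in this helper expands into a list of ranking cut-offs (with 1 prepended when absent), and the gated metric averages how many of the top-K-scored labels clear the `bar`. A one-shot NumPy sketch of both pieces (hypothetical helper names; the TF versions above return streaming value/update op pairs instead of a scalar):

import numpy as np

def expand_topk(topK=(5, 5, 5)):
    # same arithmetic as topK_list above, including the prepended 1
    num_k = int((topK[1] - topK[0]) / topK[2] + 1)
    topk_list = [topK[0] + d * topK[2] for d in range(num_k)]
    return topk_list if 1 in topk_list else [1] + topk_list

def gated_mean_label_topk(labels, scores, k, bar=2.0):
    k = min(k, len(scores))               # mirrors the tf.minimum(K, ...) guard
    topk_id = np.argsort(-scores)[:k]     # indices of the K highest scores
    return np.mean(labels[topk_id] > bar)

assert expand_topk((5, 15, 5)) == [1, 5, 10, 15]
scores = np.array([0.9, 0.1, 0.7])
labels = np.array([3.0, 0.0, 1.0])
# top-2 by score picks labels [3.0, 1.0]; only 3.0 clears bar=2.0 -> 0.5
assert gated_mean_label_topk(labels, scores, k=2) == 0.5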
+
+            if metric_name in supportedMetrics_op:
+                metric_factory = supportedMetrics_op.get(metric_name)
+
+                if "topk" not in metric_name:
+                    value_op, update_op = metric_factory(
+                        labels=labels,
+                        predictions=pred,
+                        weights=weights,
+                        name=metric_name,
+                    )
+                    eval_metric_ops[metric_name] = (value_op, update_op)
+                else:
+                    for K in topK_list:
+                        K_min = tf.minimum(K, tf.shape(pred_score)[0])
+                        topK_id = tf.nn.top_k(
+                            tf.reshape(pred_score, shape=[-1]), k=K_min
+                        )[1]  # [topK]
+                        value_op, update_op = metric_factory(
+                            labels=labels,
+                            predictions=pred,
+                            weights=weights,
+                            name=metric_name + "__k_" + str(K),
+                            topK_id=topK_id,
+                        )
+                        eval_metric_ops[metric_name + "__k_" + str(K)] = (
+                            value_op,
+                            update_op,
+                        )
+            else:
+                raise ValueError("Cannot find the metric named " + metric_name)
+        return eval_metric_ops
+
+    return get_eval_metric_ops
+
+
+def get_numeric_metric_fn(
+    metrics: Optional[List[str]] = None,
+    topK: Tuple[int] = (5, 5, 5),
+    predcol: Optional[int] = None,
+    labelcol: Optional[int] = None,
+) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]:
+    if metrics is None:
+        metrics = list(DEFAULT_NUMERIC_METRICS)
+    metrics = list(set(metrics))
+
+    metric_op = get_metric_topK_fn_helper(
+        targetMetrics=list(DEFAULT_NUMERIC_METRICS),
+        supportedMetrics_op=SUPPORTED_NUMERIC_METRICS,
+        metrics=metrics,
+        topK=topK,
+        predcol=predcol,
+        labelcol=labelcol,
+    )
+    return metric_op
+
+
+def get_single_binary_task_metric_fn(
+    metrics: List[str],
+    classnames: List[str],
+    topK: Tuple[int] = (5, 5, 5),
+    use_topK: bool = False,
+) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]:
+    """
+    graph_output['output']: [BatchSz, 1] [pred_Task1]
+    labels: [BatchSz, 2] [Task1, NumericLabel]
+    """
-
-    if use_topK:
-      metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=2, labelcol=2)
-      metrics_numeric_res = metric_op_numeric(graph_output, labels, weights)
-      metrics_base_res.update(metrics_numeric_res)
-    return metrics_base_res
+    def get_eval_metric_ops(graph_output, labels, weights):
+        metric_op_base = get_partial_multi_binary_class_metric_fn(
+            metrics, predcols=0, classes=classnames
+        )
+        classnames_unw = ["unweighted_" + cs for cs in classnames]
+        metric_op_unw = get_partial_multi_binary_class_metric_fn(
+            metrics, predcols=0, classes=classnames_unw
+        )
+
+        metrics_base_res = metric_op_base(graph_output, labels, weights)
+        metrics_unw_res = metric_op_unw(graph_output, labels, None)
+        metrics_base_res.update(metrics_unw_res)
+
+        if use_topK:
+            metric_op_numeric = get_numeric_metric_fn(
+                metrics=None, topK=topK, predcol=0, labelcol=1
+            )
+            metrics_numeric_res = metric_op_numeric(graph_output, labels, weights)
+            metrics_base_res.update(metrics_numeric_res)
+        return metrics_base_res
+
+    return get_eval_metric_ops
+
+
+def get_dual_binary_tasks_metric_fn(
+    metrics: List[str],
+    classnames: List[str],
+    topK: Tuple[int] = (5, 5, 5),
+    use_topK: bool = False,
+) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]:
+    """
+    Args:
+        metrics (List[str]):
+            List of metrics to use
+        classnames (List[str]):
+            List of class names
+        topK (Tuple[int]):
+            Top K
+        use_topK (bool):
+            Whether to use top K
+
+    Returns:
+        callable:
+            Evaluation Metric Ops
+    """
-
-  return get_eval_metric_ops
+    def get_eval_metric_ops(graph_output: dict, labels: tf.Tensor, weights: tf.Tensor):
+        """
+        Args:
+            graph_output (dict):
+                Graph Output
+            labels (tf.Tensor):
+                Labels
+            weights (tf.Tensor):
+                Weights
+
+        Returns:
+            dict:
+                Evaluation Metric Ops
+        """
+        metric_op_base = get_partial_multi_binary_class_metric_fn(
+            metrics,
predcols=[0, 1], classes=classnames + ) + classnames_unw = ["unweighted_" + cs for cs in classnames] + metric_op_unw = get_partial_multi_binary_class_metric_fn( + metrics, predcols=[0, 1], classes=classnames_unw + ) + + metrics_base_res = metric_op_base(graph_output, labels, weights) + metrics_unw_res = metric_op_unw(graph_output, labels, None) + metrics_base_res.update(metrics_unw_res) + + if use_topK: + metric_op_numeric = get_numeric_metric_fn( + metrics=None, topK=topK, predcol=2, labelcol=2 + ) + metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) + metrics_base_res.update(metrics_numeric_res) + return metrics_base_res + + return get_eval_metric_ops diff --git a/twml/twml/contrib/metrics/search_metrics.py b/twml/twml/contrib/metrics/search_metrics.py index 7d7a502f1..7038a2eb8 100644 --- a/twml/twml/contrib/metrics/search_metrics.py +++ b/twml/twml/contrib/metrics/search_metrics.py @@ -12,281 +12,302 @@ """ -from collections import OrderedDict from functools import partial +from typing import Callable, Dict, List, Optional, Tuple import tensorflow.compat.v1 as tf from tensorflow.python.eager import context from tensorflow.python.framework import dtypes, ops from tensorflow.python.ops import array_ops, state_ops + import twml from twml.contrib.utils import math_fns -def ndcg(labels, predictions, - metrics_collections=None, - updates_collections=None, - name=None, - top_k_int=1): - # pylint: disable=unused-argument - """ - Compute full normalized discounted cumulative gain (ndcg) based on predictions - ndcg = dcg_k/idcg_k, k is a cut off ranking postion - There are a few variants of ndcg - The dcg (discounted cumulative gain) formula used in - twml.contrib.metrics.ndcg is:: - - \\sum_{i=1}^k \frac{2^{relevance\\_score} -1}{\\log_{2}(i + 1)} - - k is the length of items to be ranked in a batch/query - Notice that whether k will be replaced with a fixed value requires discussions - The scores in predictions are transformed to order and relevance scores to calculate ndcg - A relevance score means how relevant a DataRecord is to a particular query - - Arguments: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Returns: - ndcg: A `Tensor` representing the ndcg score. - update_op: A update operation used to accumulate data into this metric. 
- """ - with tf.variable_scope(name, 'ndcg', (labels, predictions)): - label_scores = tf.to_float(labels, name='label_to_float') - predicted_scores = tf.to_float(predictions, name='predictions_to_float') - - if context.executing_eagerly(): - raise RuntimeError('ndcg is not supported when eager execution ' - 'is enabled.') - - total_ndcg = _metric_variable([], dtypes.float32, name='total_ndcg') - count_query = _metric_variable([], dtypes.float32, name='query_count') - - # actual ndcg cutoff position top_k_int - max_prediction_size = array_ops.size(predicted_scores) - top_k_int = tf.minimum(max_prediction_size, top_k_int) - # the ndcg score of the batch - ndcg = math_fns.cal_ndcg(label_scores, - predicted_scores, top_k_int=top_k_int) - # add ndcg of the current batch to total_ndcg - update_total_op = state_ops.assign_add(total_ndcg, ndcg) - with ops.control_dependencies([ndcg]): - # count_query stores the number of queries - # count_query increases by 1 for each batch/query - update_count_op = state_ops.assign_add(count_query, 1) - - mean_ndcg = math_fns.safe_div(total_ndcg, count_query, 'mean_ndcg') - update_op = math_fns.safe_div(update_total_op, update_count_op, 'update_mean_ndcg_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_ndcg) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean_ndcg, update_op +def ndcg( + labels: tf.Tensor, + predictions: tf.Tensor, + metrics_collections: Optional[tf.Tensor] = None, + updates_collections: Optional[tf.Tensor] = None, + name: Optional[str] = None, + top_k_int: int = 1, +) -> Tuple[tf.Tensor, tf.Tensor]: + # pylint: disable=unused-argument + """ + Compute full normalized discounted cumulative gain (ndcg) based on predictions + ndcg = dcg_k/idcg_k, k is a cut off ranking postion + There are a few variants of ndcg + The dcg (discounted cumulative gain) formula used in + twml.contrib.metrics.ndcg is:: + + \\sum_{i=1}^k \frac{2^{relevance\\_score} -1}{\\log_{2}(i + 1)} + + k is the length of items to be ranked in a batch/query + Notice that whether k will be replaced with a fixed value requires discussions + The scores in predictions are transformed to order and relevance scores to calculate ndcg + A relevance score means how relevant a DataRecord is to a particular query + + Args: + labels (tf.Tensor): + the ground truth value. + predictions (tf.Tensor): + the predicted values, whose shape must match labels. Ignored for CTR computation. + metrics_collections (tf.Tensor): + optional list of collections to add this metric into. + updates_collections (tf.Tensor): + optional list of collections to add the associated update_op into. + name (str): + an optional variable_scope name. + + Returns: + ndcg: A `Tensor` representing the ndcg score. + update_op: A update operation used to accumulate data into this metric. + """ + with tf.variable_scope(name, "ndcg", (labels, predictions)): + label_scores = tf.to_float(labels, name="label_to_float") + predicted_scores = tf.to_float(predictions, name="predictions_to_float") + + if context.executing_eagerly(): + raise RuntimeError( + "ndcg is not supported when eager execution " "is enabled." + ) + total_ndcg = _metric_variable([], dtypes.float32, name="total_ndcg") + count_query = _metric_variable([], dtypes.float32, name="query_count") -# Copied from metrics_impl.py with minor modifications. 
-# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39
-def _metric_variable(shape, dtype, validate_shape=True, name=None):
-  """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections."""
+        # actual ndcg cutoff position top_k_int
+        max_prediction_size = array_ops.size(predicted_scores)
+        top_k_int = tf.minimum(max_prediction_size, top_k_int)
+        # the ndcg score of the batch
+        ndcg = math_fns.cal_ndcg(label_scores, predicted_scores, top_k_int=top_k_int)
+        # add ndcg of the current batch to total_ndcg
+        update_total_op = state_ops.assign_add(total_ndcg, ndcg)
+        with ops.control_dependencies([ndcg]):
+            # count_query stores the number of queries
+            # count_query increases by 1 for each batch/query
+            update_count_op = state_ops.assign_add(count_query, 1)

-  return tf.Variable(
-    lambda: tf.zeros(shape, dtype),
-    trainable=False,
-    collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES],
-    validate_shape=validate_shape,
-    name=name)
+        mean_ndcg = math_fns.safe_div(total_ndcg, count_query, "mean_ndcg")
+        update_op = math_fns.safe_div(
+            update_total_op, update_count_op, "update_mean_ndcg_op"
+        )

+        if metrics_collections:
+            ops.add_to_collections(metrics_collections, mean_ndcg)

-# binary metric_name: (metric, requires thresholded output)
+        if updates_collections:
+            ops.add_to_collections(updates_collections, update_op)
+
+        return mean_ndcg, update_op
+
+
+# Copied from metrics_impl.py with minor modifications.
+# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39
+def _metric_variable(
+    shape: Tuple[int, ...],
+    dtype: tf.DType,
+    validate_shape: bool = True,
+    name: Optional[str] = None,
+) -> tf.Variable:
+    """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections."""
+
+    return tf.Variable(
+        lambda: tf.zeros(shape, dtype),
+        trainable=False,
+        collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES],
+        validate_shape=validate_shape,
+        name=name,
+    )
+
+
+# binary metric_name: (metric, requires thresholded output)
 SUPPORTED_BINARY_CLASS_METRICS = {
-  # TWML binary metrics
-  'rce': (twml.metrics.rce, False),
-  'nrce': (partial(twml.metrics.rce, normalize=True), False),
-  # CTR measures positive sample ratio. This terminology is inherited from Ads.
-  'ctr': (twml.metrics.ctr, False),
-  # predicted CTR measures predicted positive ratio.
-  'predicted_ctr': (twml.metrics.predicted_ctr, False),
-  # thresholded metrics
-  'accuracy': (tf.metrics.accuracy, True),
-  'precision': (tf.metrics.precision, True),
-  'recall': (tf.metrics.recall, True),
-  # tensorflow metrics
-  'roc_auc': (partial(tf.metrics.auc, curve='ROC'), False),
-  'pr_auc': (partial(tf.metrics.auc, curve='PR'), False),
+    # TWML binary metrics
+    "rce": (twml.metrics.rce, False),
+    "nrce": (partial(twml.metrics.rce, normalize=True), False),
+    # CTR measures positive sample ratio. This terminology is inherited from Ads.
+    "ctr": (twml.metrics.ctr, False),
+    # predicted CTR measures predicted positive ratio.
+ "predicted_ctr": (twml.metrics.predicted_ctr, False), + # thresholded metrics + "accuracy": (tf.metrics.accuracy, True), + "precision": (tf.metrics.precision, True), + "recall": (tf.metrics.recall, True), + # tensorflow metrics + "roc_auc": (partial(tf.metrics.auc, curve="ROC"), False), + "pr_auc": (partial(tf.metrics.auc, curve="PR"), False), } # search metric_name: metric SUPPORTED_SEARCH_METRICS = { - # TWML search metrics - # ndcg needs the raw prediction scores to sort - 'ndcg': ndcg, + # TWML search metrics + # ndcg needs the raw prediction scores to sort + "ndcg": ndcg, } -def get_search_metric_fn(binary_metrics=None, search_metrics=None, - ndcg_top_ks=[1, 3, 5, 10], use_binary_metrics=False): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for ranking. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions. Required. - threshold: - Only used in SUPPORTED_BINARY_CLASS_METRICS - If the lables are 0s and 1s - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - Only used in SUPPORTED_BINARY_CLASS_METRICS - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Arguments: - only used in pointwise learning-to-rank - - binary_metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When binary_metrics is None (the default), it defaults to all supported metrics - - search_metrics (list of String): - a list of metrics of interest. E.g. ['ndcg'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ndcg - - NOTE: ndcg works for ranking-relatd problems. - A batch contains all DataRecords that belong to the same query - If pair_in_batch_mode used in scalding -- a batch contains a pair of DataRecords - that belong to the same query and have different labels -- ndcg does not apply in here. 
-
-  When search_metrics is None (the default), it defaults to all supported search metrics
-  currently only 'ndcg'
-
-  ndcg_top_ks (list of integers):
-    The cut-off ranking postions for a query
-    When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10]
-
-  use_binary_metrics:
-    False (default)
-    Only set it to true in pointwise learning-to-rank
-  """
-  # pylint: disable=dict-keys-not-iterating
-
-  if ndcg_top_ks is None or not ndcg_top_ks:
-    ndcg_top_ks = [1, 3, 5, 10]
-
-  if search_metrics is None:
-    search_metrics = list(SUPPORTED_SEARCH_METRICS.keys())
-
-  if binary_metrics is None and use_binary_metrics:
-    # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metics as well
-    # they are only used in pointwise learing-to-rank
-    binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys())
-
-  def get_eval_metric_ops(graph_output, labels, weights):
+def get_search_metric_fn(
+    binary_metrics: Optional[List[str]] = None,
+    search_metrics: Optional[List[str]] = None,
+    ndcg_top_ks: List[int] = [1, 3, 5, 10],
+    use_binary_metrics: bool = False,
+) -> Callable[[Dict[str, tf.Tensor], tf.Tensor, tf.Tensor], Dict[str, tf.Tensor]]:
     """
-  graph_output:
-    dict that is returned by build_graph given input features.
-  labels:
-    target labels associated to batch.
-  weights:
-    weights of the samples..
+    Returns a function having signature:
+
+    .. code-block:: python
+
+        def get_eval_metric_ops(graph_output, labels, weights):
+            ...
+            return eval_metric_ops
+
+    where the returned eval_metric_ops is a dict of common evaluation metric
+    Ops for ranking. See `tf.estimator.EstimatorSpec
+    `_
+    for a description of eval_metric_ops. The graph_output is the result
+    dict returned by build_graph. Labels and weights are tf.Tensors.
+
+    The following graph_output keys are recognized:
+        output:
+            the raw predictions. Required.
+        threshold:
+            Only used in SUPPORTED_BINARY_CLASS_METRICS
+            If the labels are 0s and 1s
+            A value between 0 and 1 used to threshold the output into a hard_output.
+            Defaults to 0.5 when threshold and hard_output are missing.
+            Either threshold or hard_output can be provided, but not both.
+        hard_output:
+            Only used in SUPPORTED_BINARY_CLASS_METRICS
+            A thresholded output. Either threshold or hard_output can be provided, but not both.
+
+    Args:
+        binary_metrics (list of String):
+            only used in pointwise learning-to-rank
+            a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+            These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+            Supported metrics:
+                - ctr (same as positive sample ratio.)
+                - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+                - nrce (normalized rce, do not use this one if you do not understand what it is)
+                - pr_auc
+                - roc_auc
+                - accuracy (percentage of predictions that are correct)
+                - precision (true positives) / (true positives + false positives)
+                - recall (true positives) / (true positives + false negatives)
+            NOTE: accuracy / precision / recall apply to binary classification problems only.
+            I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+            is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+            precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+            your predictions into hard 0/1 labels.
+            When binary_metrics is None (the default), it defaults to all supported metrics
+        search_metrics (list of String):
+            a list of metrics of interest. E.g.
['ndcg'] + These metrics are evaluated and reported to tensorboard *during the eval phases only*. + Supported metrics: + - ndcg + NOTE: ndcg works for ranking-related problems. + A batch contains all DataRecords that belong to the same query + If pair_in_batch_mode used in scalding -- a batch contains a pair of DataRecords + that belong to the same query and have different labels -- ndcg does not apply in here. + When search_metrics is None (the default), it defaults to all supported search metrics + currently only 'ndcg' + ndcg_top_ks (list of integers): + The cut-off ranking positions for a query + When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10] + use_binary_metrics: + False (default) + Only set it to true in pointwise learning-to-rank """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - # hard_preds is a tensor - # check hard_preds is None and then check if it is empty - if hard_preds is None or tf.equal(tf.size(hard_preds), 0): - hard_preds = tf.greater_equal(preds, threshold) - - # add search metrics to eval_metric_ops dict - for metric_name in search_metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - search_metric_factory = SUPPORTED_SEARCH_METRICS.get(metric_name) - if search_metric_factory: - if metric_name == 'ndcg': - for top_k in ndcg_top_ks: - # metric name will show as ndcg_1, ndcg_10, ... - metric_name_ndcg_top_k = metric_name + '_' + str(top_k) - top_k_int = tf.constant(top_k, dtype=tf.int32) - # Note: having weights in ndcg does not make much sense - # Because ndcg already has position weights/discounts - # Thus weights are not applied in ndcg metric - value_op, update_op = search_metric_factory( - labels=labels, - predictions=preds, - name=metric_name_ndcg_top_k, - top_k_int=top_k_int) - eval_metric_ops[metric_name_ndcg_top_k] = (value_op, update_op) - else: - raise ValueError('Cannot find the search metric named ' + metric_name) - - if use_binary_metrics: - # add binary metrics to eval_metric_ops dict - for metric_name in binary_metrics: - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - metric_name = metric_name.lower() # metric name are case insensitive. 
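The dcg formula documented for `ndcg` above is easy to check numerically. A NumPy sketch of one batch's score, assuming labels serve directly as relevance scores and predictions only determine the ranking order (the exact transforms live in `twml.contrib.utils.math_fns.cal_ndcg`, which is not shown in this diff, so this is an illustrative variant, not the library implementation):

import numpy as np

def dcg_at_k(relevances, k):
    # sum_{i=1..k} (2^rel_i - 1) / log2(i + 1), as in the docstring above
    rel = np.asarray(relevances, dtype=float)[:k]
    positions = np.arange(1, len(rel) + 1)
    return np.sum((2.0 ** rel - 1.0) / np.log2(positions + 1))

def ndcg_at_k(labels, predictions, k):
    order = np.argsort(-np.asarray(predictions))       # rank by prediction
    dcg = dcg_at_k(np.asarray(labels)[order], k)
    idcg = dcg_at_k(np.sort(labels)[::-1], k)          # ideal (label-sorted) order
    return dcg / idcg if idcg > 0 else 0.0

labels = [3, 2, 0, 1]
predictions = [0.2, 0.9, 0.1, 0.4]   # ranks item 1 first, then 3, then 0
print(ndcg_at_k(labels, predictions, k=3))   # ~0.76 for this mis-ordering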
-      binary_metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
-      if binary_metric_factory:
-        value_op, update_op = binary_metric_factory(
-          labels=labels,
-          predictions=(hard_preds if requires_threshold else preds),
-          weights=weights,
-          name=metric_name)
-        eval_metric_ops[metric_name] = (value_op, update_op)
-      else:
-        raise ValueError('Cannot find the binary metric named ' + metric_name)
-
-    return eval_metric_ops
-
-  return get_eval_metric_ops
+    # pylint: disable=dict-keys-not-iterating
+
+    if ndcg_top_ks is None or not ndcg_top_ks:
+        ndcg_top_ks = [1, 3, 5, 10]
+
+    if search_metrics is None:
+        search_metrics = list(SUPPORTED_SEARCH_METRICS.keys())
+
+    if binary_metrics is None and use_binary_metrics:
+        # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metrics as well
+        # they are only used in pointwise learning-to-rank
+        binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys())
+
+    def get_eval_metric_ops(
+        graph_output: Dict[str, tf.Tensor], labels: tf.Tensor, weights: tf.Tensor
+    ) -> Dict[str, tf.Tensor]:
+        """
+        graph_output:
+            dict that is returned by build_graph given input features.
+        labels:
+            target labels associated to batch.
+        weights:
+            weights of the samples.
+        """
+
+        eval_metric_ops = dict()
+
+        preds = graph_output["output"]
+
+        threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5
+
+        hard_preds = graph_output.get("hard_output")
+        # hard_preds is a tensor
+        # check hard_preds is None and then check if it is empty
+        if hard_preds is None or tf.equal(tf.size(hard_preds), 0):
+            hard_preds = tf.greater_equal(preds, threshold)
+
+        # add search metrics to eval_metric_ops dict
+        for metric_name in search_metrics:
+            metric_name = metric_name.lower()  # metric names are case-insensitive.
+
+            if metric_name in eval_metric_ops:
+                # avoid adding duplicate metrics.
+                continue
+
+            search_metric_factory = SUPPORTED_SEARCH_METRICS.get(metric_name)
+            if search_metric_factory:
+                if metric_name == "ndcg":
+                    for top_k in ndcg_top_ks:
+                        # metric name will show as ndcg_1, ndcg_10, ...
+                        metric_name_ndcg_top_k = metric_name + "_" + str(top_k)
+                        top_k_int = tf.constant(top_k, dtype=tf.int32)
+                        # Note: having weights in ndcg does not make much sense
+                        # Because ndcg already has position weights/discounts
+                        # Thus weights are not applied in ndcg metric
+                        value_op, update_op = search_metric_factory(
+                            labels=labels,
+                            predictions=preds,
+                            name=metric_name_ndcg_top_k,
+                            top_k_int=top_k_int,
+                        )
+                        eval_metric_ops[metric_name_ndcg_top_k] = (value_op, update_op)
+            else:
+                raise ValueError("Cannot find the search metric named " + metric_name)
+
+        if use_binary_metrics:
+            # add binary metrics to eval_metric_ops dict
+            for metric_name in binary_metrics:
+                if metric_name in eval_metric_ops:
+                    # avoid adding duplicate metrics.
+                    continue
+
+                metric_name = metric_name.lower()  # metric names are case-insensitive.
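The defaulting logic above (raw `output`, optional `threshold`, optional `hard_output`) reduces to a small rule that is worth seeing in isolation. A NumPy sketch with a hypothetical helper name; only metrics flagged as requiring thresholded output receive the hard 0/1 predictions:

import numpy as np

def resolve_hard_preds(graph_output):
    preds = graph_output["output"]
    threshold = graph_output.get("threshold", 0.5)   # same 0.5 default as above
    hard = graph_output.get("hard_output")
    return hard if hard is not None else preds >= threshold

graph_output = {"output": np.array([0.2, 0.5, 0.8])}
print(resolve_hard_preds(graph_output))   # [False  True  True]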
+                (
+                    binary_metric_factory,
+                    requires_threshold,
+                ) = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
+                if binary_metric_factory:
+                    value_op, update_op = binary_metric_factory(
+                        labels=labels,
+                        predictions=(hard_preds if requires_threshold else preds),
+                        weights=weights,
+                        name=metric_name,
+                    )
+                    eval_metric_ops[metric_name] = (value_op, update_op)
+                else:
+                    raise ValueError(
+                        "Cannot find the binary metric named " + metric_name
+                    )
+
+        return eval_metric_ops
+
+    return get_eval_metric_ops
diff --git a/twml/twml/contrib/optimizers/__init__.py b/twml/twml/contrib/optimizers/__init__.py
index 112b2b410..c140e55af 100644
--- a/twml/twml/contrib/optimizers/__init__.py
+++ b/twml/twml/contrib/optimizers/__init__.py
@@ -1,4 +1,6 @@
 # pylint: disable=wildcard-import
 """This module contains experimental optimizer classes"""
-from .deep_gradient_compression_optimizer import DeepGradientCompressionOptimizer  # noqa: F401
+from .deep_gradient_compression_optimizer import (
+    DeepGradientCompressionOptimizer,
+)  # noqa: F401
 from .pruning_optimizer import PruningOptimizer  # noqa: F401
diff --git a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py
index 2c71ed13f..4447feb90 100644
--- a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py
+++ b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py
@@ -8,173 +8,225 @@
 # TODO: Test how much communication overhead this DeepGradientCompressionOptimizer can reduce under
 # multi-GPU and distributed setting.
-import tensorflow.compat.v1 as tf
-
+from typing import List, Optional

-def compute_threshold(grad, density):
-  """
-  A utility function to compute the threshold for gradient sparsification, given the gradient
-  tensor and the density.
-  Args:
-    grad(tf.Tensor):
-      Gradient tensor for some variable.
-    density(float):
-      Density degree when sparsifying gradients.
-  Returns(float):
-    Threshold for gradient sparsification.
-  """
-  flat_grad = tf.reshape(grad, [-1])
-  abs_flat_grad = tf.abs(flat_grad)
-  size = tf.shape(abs_flat_grad)[0]
-  k = tf.maximum(tf.constant(1),
-                 tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32))
-  topk, _ = tf.nn.top_k(abs_flat_grad, k, False)
-  return topk[-1]
-
-
-def get_top_row_indices(values, density):
-  """
-  A utility function to get indices of most significant rows, given the density degree.
-  Args:
-    values(tf.Tensor):
-      Gradient or locally accumulated gradient for some variable.
-    density(float):
-      Density degree when filtering out rows.
-  Returns(list(int)):
-    Indices of most significant rows.
-  """
-  abs_values = tf.abs(values)
-
-  try:
-    row_num = tf.shape(abs_values)[0]
-    k = tf.maximum(tf.constant(1),
-                   tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32))
-    row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True))
-    _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False)
-    # print "abs_values", abs_values, "row_sums", row_sums
-    return top_row_indices
-    # return tf.range(row_num)
-
-  except ValueError:  # if the tensor is 0-D or 1-D
-    return None
+import tensorflow.compat.v1 as tf

-class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer):
-  """
-  A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887).
- """ - - def __init__(self, learning_rate, use_locking=False, name="Sparse", - density=1.0, - density_decay=False, - density_decay_steps=10000, - density_decay_rate=0.5, - min_density=0.1, - accumulation=False): - super(DeepGradientCompressionOptimizer, self).__init__(learning_rate, use_locking, name) - self._initial_density_t = tf.convert_to_tensor(density) - self._density_decay = density_decay - dtype = self._initial_density_t.dtype - self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) - self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) - self._min_density_t = tf.convert_to_tensor(min_density, dtype) - self._accumulation = accumulation - - def _prepare(self): - super(DeepGradientCompressionOptimizer, self)._prepare() - if not self._density_decay: - self._density_t = self._initial_density_t - else: - dtype = self._initial_density_t.dtype - global_step = tf.cast(tf.train.get_global_step(), dtype) - p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) - decayed_density = tf.multiply(self._initial_density_t, - tf.pow(self._density_decay_rate_t, p)) - self._density_t = tf.maximum(self._min_density_t, decayed_density) - - def _create_slots(self, var_list): +def compute_threshold(grad: tf.Tensor, density: float) -> float: """ - Create a slot variable to accumulate gradients locally for each variable in `var_list`. + A utility function to compute the threshold for gradient sparsification, given the gradient + tensor and the density. Args: - var_list(list(tf.Variable)): - List of variables to accumulate gradients locally for. + grad(tf.Tensor): + Gradient tensor for some variable. + density (float): + Density degree when sparsifying gradients. + Returns: + (float) Threshold for gradient sparsification. 
""" - for var in var_list: - self._zeros_slot(var, "g_buffer", self._name) - - def _apply_dense(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(grad, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") - - g_buffer = tf.assign_add(g_buffer, grad) - - top_row_indices = get_top_row_indices(g_buffer, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) - - return tf.group(*[update_var, update_g_buffer]) - - def _apply_sparse_duplicate_indices(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad.values, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices(grad, var) # noqa: E501 - - sparsified_values = tf.gather(grad.values, top_row_indices) - sparsified_indices = tf.gather(grad.indices, top_row_indices) - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + flat_grad = tf.reshape(grad, [-1]) + abs_flat_grad = tf.abs(flat_grad) + size = tf.shape(abs_flat_grad)[0] + k = tf.maximum( + tf.constant(1), + tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32), + ) + topk, _ = tf.nn.top_k(abs_flat_grad, k, False) + return topk[-1] + + +def get_top_row_indices(values: tf.Tensor, density: float) -> List[int]: + """ + A utility function to get indices of most significant rows, given the density degree. + Args: + values(tf.Tensor): + Gradient or locally accumulated gradient for some variable. + density(float): + Density degree when filtering out rows. + Returns(list(int)): + Indices of most significant rows. + """ + abs_values = tf.abs(values) - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) + try: + row_num = tf.shape(abs_values)[0] + k = tf.maximum( + tf.constant(1), + tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32), + ) + row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True)) + _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False) + # print "abs_values", abs_values, "row_sums", row_sums + return top_row_indices + # return tf.range(row_num) - else: - g_buffer = self.get_slot(var, "g_buffer") + except ValueError: # if the tensor is 0-D or 1-D + return None - g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) - top_row_indices = get_top_row_indices(g_buffer, self._density_t) +class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer): + """ + A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887). 
+ """ - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, - self)._apply_sparse_duplicate_indices(grad, var) + def __init__( + self, + learning_rate: float, + use_locking: bool = False, + name: str = "Sparse", + density: float = 1.0, + density_decay: bool = False, + density_decay_steps: int = 10000, + density_decay_rate: float = 0.5, + min_density: float = 0.1, + accumulation: bool = False, + ): + super(DeepGradientCompressionOptimizer, self).__init__( + learning_rate, use_locking, name + ) + self._initial_density_t = tf.convert_to_tensor(density) + self._density_decay = density_decay + dtype = self._initial_density_t.dtype + self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) + self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) + self._min_density_t = tf.convert_to_tensor(min_density, dtype) + self._accumulation = accumulation + + def _prepare(self) -> None: + super(DeepGradientCompressionOptimizer, self)._prepare() + if not self._density_decay: + self._density_t = self._initial_density_t + else: + dtype = self._initial_density_t.dtype + global_step = tf.cast(tf.train.get_global_step(), dtype) + p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) + decayed_density = tf.multiply( + self._initial_density_t, tf.pow(self._density_decay_rate_t, p) + ) + self._density_t = tf.maximum(self._min_density_t, decayed_density) + + def _create_slots(self, var_list: List[tf.Variable]) -> None: + """ + Create a slot variable to accumulate gradients locally for each variable in `var_list`. + Args: + var_list(list(tf.Variable)): + List of variables to accumulate gradients locally for. + """ + for var in var_list: + self._zeros_slot(var, "g_buffer", self._name) + + def _apply_dense(self, grad: tf.Tensor, var: tf.Variable) -> tf.Operation: + """ + Apply dense gradients to variables. + + Args: + grad(tf.Tensor): + Dense gradients to apply. + var(tf.Variable): + Variable to apply gradients to. + + Returns: + (tf.Operation) Operation to apply dense gradients to variables. + """ + if not self._accumulation: + top_row_indices = get_top_row_indices(grad, self._density_t) + + if top_row_indices is None: + return super(DeepGradientCompressionOptimizer, self)._apply_dense( + grad, var + ) + + sparsified_values = tf.gather(grad, top_row_indices) + sparsified_indices = top_row_indices + + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + else: + g_buffer = self.get_slot(var, "g_buffer") + + g_buffer = tf.assign_add(g_buffer, grad) + + top_row_indices = get_top_row_indices(g_buffer, self._density_t) + + if top_row_indices is None: + return super(DeepGradientCompressionOptimizer, self)._apply_dense( + grad, var + ) + + sparsified_values = tf.gather(g_buffer, top_row_indices) + sparsified_indices = top_row_indices + + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + update_var = super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + update_g_buffer = tf.scatter_update( + g_buffer, sparsified_indices, tf.zeros_like(sparsified_values) + ) + + return tf.group(*[update_var, update_g_buffer]) + + def _apply_sparse_duplicate_indices( + self, grad: tf.IndexedSlices, var: tf.Variable + ) -> tf.Operation: + """ + Apply sparse gradients to variables. + + Args: + grad(tf.IndexedSlices): + Sparse gradients to apply. 
+ var(tf.Variable): + Variable to apply gradients to. + + Returns: + (tf.Operation) Operation to apply sparse gradients to variables. + """ + + if not self._accumulation: + top_row_indices = get_top_row_indices(grad.values, self._density_t) + + if top_row_indices is None: + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices( + grad, var + ) # noqa: E501 + + sparsified_values = tf.gather(grad.values, top_row_indices) + sparsified_indices = tf.gather(grad.indices, top_row_indices) + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + else: + g_buffer = self.get_slot(var, "g_buffer") + g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) + top_row_indices = get_top_row_indices(g_buffer, self._density_t) - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices + if top_row_indices is None: + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(grad, var) - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + sparsified_values = tf.gather(g_buffer, top_row_indices) + sparsified_indices = top_row_indices + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) + update_var = super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) + update_g_buffer = tf.scatter_update( + g_buffer, sparsified_indices, tf.zeros_like(sparsified_values) + ) - return tf.group(*[update_var, update_g_buffer]) + return tf.group(*[update_var, update_g_buffer]) diff --git a/twml/twml/contrib/optimizers/pruning_optimizer.py b/twml/twml/contrib/optimizers/pruning_optimizer.py index 2bcd612ed..40f2fc007 100644 --- a/twml/twml/contrib/optimizers/pruning_optimizer.py +++ b/twml/twml/contrib/optimizers/pruning_optimizer.py @@ -6,159 +6,168 @@ To make a layer prunable, use `twml.contrib.pruning.apply_mask`: - dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu) - dense1 = apply_mask(dense1) + dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu) + dense1 = apply_mask(dense1) To prune the network, apply PruningOptimizer to any cross-entropy loss: - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) + loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - optimizer = PruningOptimizer(learning_rate=0.001, momentum=0.5) - minimize = optimizer.minimize( - loss=loss, - prune_every=10, - burn_in=100, - global_step=tf.train.get_global_step()) + optimizer = PruningOptimizer(learning_rate=0.001, momentum=0.5) + minimize = optimizer.minimize( + loss=loss, + prune_every=10, + burn_in=100, + global_step=tf.train.get_global_step()) """ +from typing import Optional + import tensorflow.compat.v1 as tf -from twml.contrib.pruning import computational_cost, prune, update_pruning_signals -from twml.contrib.pruning import MASK_COLLECTION +from twml.contrib.pruning import ( + MASK_COLLECTION, + computational_cost, + prune, + update_pruning_signals, +) class PruningOptimizer(tf.train.MomentumOptimizer): - """ - Updates parameters with SGD and pruning masks using Fisher pruning. 
- - Arguments: - learning_rate: float - Learning rate of SGD - - momentum: float - Momentum used by SGD - - use_locking: bool - If `True`, use locks for update operations - - name: str - Optional name prefix for the operations created when applying gradients - - use_nesterov: bool - If `True`, use Nesterov momentum - """ - - def __init__( - self, - learning_rate, - momentum=0.9, - use_locking=False, - name="PruningOptimizer", - use_nesterov=False): - super(PruningOptimizer, self).__init__( - learning_rate=learning_rate, - momentum=momentum, - use_locking=use_locking, - name=name, - use_nesterov=use_nesterov) - - def minimize( - self, - loss, - prune_every=100, - burn_in=0, - decay=.96, - flops_weight='AUTO', - flops_target=0, - update_params=None, - method='Fisher', - *args, - **kwargs): """ - Create operations to minimize loss and to prune features. - - A pruning signal measures the importance of feature maps. This is weighed against the - computational cost of computing a feature map. Features are then iteratively pruned - based on a weighted average of feature importance S and computational cost C (in FLOPs): - - $$S + w * C$$ - - Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not - necessarily optimal. - - Arguments: - loss: tf.Tensor - The value to minimize - - prune_every: int - One entry of a mask is set to zero only every few update steps - - burn_in: int - Pruning starts only after this many parameter updates - - decay: float - Controls exponential moving average of pruning signals - - flops_weight: float or str - Controls the targeted trade-off between computational complexity and performance - - flops_target: float - Stop pruning when computational complexity is less or this many floating point ops - - update_params: tf.Operation - Optional training operation used instead of MomentumOptimizer to update parameters - - method: str - Method used to compute pruning signal (currently only supports 'Fisher') - - Returns: - A `tf.Operation` updating parameters and pruning masks - - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + Updates parameters with SGD and pruning masks using Fisher pruning. 
+ + Args: + learning_rate: float + Learning rate of SGD + momentum: float + Momentum used by SGD + use_locking: bool + If `True`, use locks for update operations + name: str + Optional name prefix for the operations created when applying gradients + use_nesterov: bool + If `True`, use Nesterov momentum """ - # gradient-based updates of parameters - if update_params is None: - update_params = super(PruningOptimizer, self).minimize(loss, *args, **kwargs) - - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # estimate computational cost per data point - batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0] - cost = tf.divide(computational_cost(loss), batch_size, name='computational_cost') - - tf.summary.scalar('computational_cost', cost) - - if masks: - signals = update_pruning_signals(loss, masks=masks, decay=decay, method=method) - - # estimate computational cost per feature map - costs = tf.gradients(cost, masks) - - # trade off computational complexity and performance - if flops_weight.upper() == 'AUTO': - signals = [s / (c + 1e-6) for s, c in zip(signals, costs)] - elif not isinstance(flops_weight, float) or flops_weight != 0.: - signals = [s - flops_weight * c for s, c in zip(signals, costs)] - - counter = tf.Variable(0, name='pruning_counter') - counter = tf.assign_add(counter, 1, use_locking=True) - - # only prune every so often after a burn-in phase - pruning_cond = tf.logical_and(counter > burn_in, tf.equal(counter % prune_every, 0)) - - # stop pruning after reaching threshold - if flops_target > 0: - pruning_cond = tf.logical_and(pruning_cond, tf.greater(cost, flops_target)) - - update_masks = tf.cond( - pruning_cond, - lambda: prune(signals, masks=masks), - lambda: tf.group(masks)) - - return tf.group([update_params, update_masks]) - - # no masks found - return update_params + def __init__( + self, + learning_rate: float, + momentum: float = 0.9, + use_locking: bool = False, + name: str = "PruningOptimizer", + use_nesterov: bool = False, + ): + super(PruningOptimizer, self).__init__( + learning_rate=learning_rate, + momentum=momentum, + use_locking=use_locking, + name=name, + use_nesterov=use_nesterov, + ) + + def minimize( + self, + loss: tf.Tensor, + prune_every: int = 100, + burn_in: int = 0, + decay: float = 0.96, + flops_weight: str = "AUTO", + flops_target: int = 0, + update_params: Optional[tf.Operation] = None, + method: str = "Fisher", + *args, + **kwargs + ) -> tf.Operation: + """ + Create operations to minimize loss and to prune features. + + A pruning signal measures the importance of feature maps. This is weighed against the + computational cost of computing a feature map. Features are then iteratively pruned + based on a weighted average of feature importance S and computational cost C (in FLOPs): + + $$S + w * C$$ + + Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not + necessarily optimal. 
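+
+        For example (illustrative numbers): with pruning signals S = [0.2, 0.05]
+        and per-feature costs C = [100.0, 10.0] FLOPs, 'AUTO' ranks features by
+        S / (C + 1e-6), i.e. [0.002, 0.005], so the first feature (expensive
+        relative to its importance) is pruned first; a fixed float weight w
+        ranks them by S - w * C instead.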
+ + Args: + loss: tf.Tensor + The value to minimize + prune_every: int + One entry of a mask is set to zero only every few update steps + burn_in: int + Pruning starts only after this many parameter updates + decay: float + Controls exponential moving average of pruning signals + flops_weight: float or str + Controls the targeted trade-off between computational complexity and performance + flops_target: float + Stop pruning when computational complexity is less or this many floating point ops + update_params: tf.Operation + Optional training operation used instead of MomentumOptimizer to update parameters + method: str + Method used to compute pruning signal (currently only supports 'Fisher') + + Returns: + A `tf.Operation` updating parameters and pruning masks + + References: + * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + """ + + # gradient-based updates of parameters + if update_params is None: + update_params = super(PruningOptimizer, self).minimize( + loss, *args, **kwargs + ) + + masks = tf.get_collection(MASK_COLLECTION) + + with tf.variable_scope("pruning_opt", reuse=True): + # estimate computational cost per data point + batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0] + cost = tf.divide( + computational_cost(loss), batch_size, name="computational_cost" + ) + + tf.summary.scalar("computational_cost", cost) + + if masks: + signals = update_pruning_signals( + loss, masks=masks, decay=decay, method=method + ) + + # estimate computational cost per feature map + costs = tf.gradients(cost, masks) + + # trade off computational complexity and performance + if flops_weight.upper() == "AUTO": + signals = [s / (c + 1e-6) for s, c in zip(signals, costs)] + elif not isinstance(flops_weight, float) or flops_weight != 0.0: + signals = [s - flops_weight * c for s, c in zip(signals, costs)] + + counter = tf.Variable(0, name="pruning_counter") + counter = tf.assign_add(counter, 1, use_locking=True) + + # only prune every so often after a burn-in phase + pruning_cond = tf.logical_and( + counter > burn_in, tf.equal(counter % prune_every, 0) + ) + + # stop pruning after reaching threshold + if flops_target > 0: + pruning_cond = tf.logical_and( + pruning_cond, tf.greater(cost, flops_target) + ) + + update_masks = tf.cond( + pruning_cond, + lambda: prune(signals, masks=masks), + lambda: tf.group(masks), + ) + + return tf.group([update_params, update_masks]) + + # no masks found + return update_params diff --git a/twml/twml/contrib/parsers.py b/twml/twml/contrib/parsers.py index a27f2acbd..448b724fa 100644 --- a/twml/twml/contrib/parsers.py +++ b/twml/twml/contrib/parsers.py @@ -1,21 +1,21 @@ -''' +""" Contains implementations of functions to parse the contrib.FeatureConfig Modelers can use the functions in this module as the the train/eval_parse_fn of the DataRecordTrainer constructor to customize how to parse their datasets. Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. 
-''' +""" -from twitter.deepbird.io.legacy.contrib.parsers import ( - _convert_to_fixed_length_tensor, # noqa: F401 - _get_input_receiver_fn_feature_dict, # noqa: F401 - _merge_dictionaries, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_keras_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_string_tensor_parse_fn, # noqa: F401 - get_string_tensor_serving_input_receiver_fn, # noqa: F401 - get_supervised_input_receiver_fn_feature_dict, # noqa: F401 - parse_string_tensor, # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import _merge_dictionaries # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import get_keras_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import parse_string_tensor # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( # noqa: F401 + _convert_to_fixed_length_tensor, + _get_input_receiver_fn_feature_dict, + get_features_as_tensor_dict, + get_serving_input_receiver_fn_feature_dict, + get_string_tensor_parse_fn, + get_string_tensor_serving_input_receiver_fn, + get_supervised_input_receiver_fn_feature_dict, ) diff --git a/twml/twml/contrib/pruning.py b/twml/twml/contrib/pruning.py index b6ddee693..950f3c9d1 100644 --- a/twml/twml/contrib/pruning.py +++ b/twml/twml/contrib/pruning.py @@ -3,361 +3,395 @@ In particular, it provides tools for dealing with masks: - features = apply_mask(features) + features = apply_mask(features) The function `apply_mask` applies a binary mask to the channels of a given tensor. Consider the following loss: - logits = tf.matmul(features, weights) - loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + logits = tf.matmul(features, weights) + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) Each mask has a corresponding pruning signal. The function `update_pruning_signals` will update and return these signals: - signals = update_pruning_signals(loss) + signals = update_pruning_signals(loss) The pruning operation will zero out the mask entry with the smallest corresponding pruning signal: - prune(signals) + prune(signals) The following function allows us to estimate the computational cost of a graph (number of FLOPs): - cost = computational_cost(loss) + cost = computational_cost(loss) To compute the cost of each feature per data point, we can do: - costs = tf.gradients(cost / batch_size, masks) + costs = tf.gradients(cost / batch_size, masks) The current implementation of `computational_cost` is designed to work with standard feed-forward and convolutional network architectures only, but may fail with more complicated architectures. """ +from typing import List, Optional, Set, Union + import numpy as np import tensorflow.compat.v1 as tf -MASK_COLLECTION = 'pruning/masks' -MASK_EXTENDED_COLLECTION = 'pruning/masks_extended' -OP_COLLECTION = 'pruning/ops' - - -def apply_mask(tensor, name='pruning'): - """ - Point-wise multiplies a tensor with a binary mask. - - During training, pruning is simulated by setting entries of the mask to zero. 
- - Arguments: - tensor: tf.Tensor - A tensor where the last dimension represents channels which will be masked - - Returns: - `tf.Tensor` with same shape as `tensor` - """ - - tensor_shape = tensor.shape - - with tf.variable_scope(name, reuse=True): - # allocate masks and corresponding pruning signals - mask = tf.Variable(tf.ones(tensor.shape.as_list()[-1]), trainable=False, name='mask') - pruning_signal = tf.Variable(tf.zeros_like(mask), trainable=False, name='signal') - - # extending masks is a trick to get a separate gradient for each data point - mask_extended = extend_mask(mask, tensor) - - # store extended mask, pruning signal, and other vars for easy access later - mask.extended = mask_extended - mask.pruning_signal = pruning_signal - mask.tensor = tensor - - # mask tensor - tensor = tf.multiply(tensor, mask_extended) - tensor.set_shape(tensor_shape) - tensor._mask = mask - - tf.add_to_collection(MASK_COLLECTION, mask) - tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended) - tf.add_to_collection(OP_COLLECTION, tensor.op) - - return tensor - - -def extend_mask(mask, tensor): - """ - Repeats the mask for each data point stored in a tensor. - - If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional - tensor with A copies or `mask`. - - Arguments: - mask: tf.Tensor - The mask which will be extended - - tensor: tf.Tensor - The tensor to which the extended mask will be applied - - Returns: - The extended mask - """ - - batch_size = tf.shape(tensor)[:1] - ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype) - multiples = tf.concat([batch_size, ones], 0) - mask_shape = tf.concat([ones, [-1]], 0) - return tf.tile(tf.reshape(mask, mask_shape), multiples) - - -def find_input_mask(tensor): - """ - Find ancestral mask affecting the number of pruned channels of a tensor. - - Arguments: - tensor: tf.Tensor - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if hasattr(tensor, '_mask'): - return tensor._mask - if tensor.op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D', 'Transpose']: - # op produces a new number of channels, preceding mask therefore irrelevant - return None - if not tensor.op.inputs: - return None - for input in tensor.op.inputs: - mask = find_input_mask(input) - if mask is not None: - return mask - - -def find_output_mask(tensor): - """ - Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. - - Arguments: - tensor: tf.Tensor or tf.Variable - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if isinstance(tensor, tf.Variable): - return find_output_mask(tensor.op.outputs[0]) - if hasattr(tensor, '_mask'): - return tensor._mask - for op in tensor.consumers(): - if len(op.outputs) != 1: - continue - if op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D']: - # masks of descendants are only relevant if tensor is right-multiplied - if tensor == op.inputs[1]: - return find_output_mask(op.outputs[0]) - return None - mask = find_output_mask(op.outputs[0]) - if mask is not None: - return mask - - -def find_mask(tensor): - """ - Returns masks indicating channels of the tensor that are effectively removed from the graph. 
-
-  Arguments:
-    tensor: tf.Tensor
-      Tensor for which to compute a mask
-
-  Returns:
-    A `tf.Tensor` with binary entries indicating disabled channels
-  """
-
-  input_mask = find_input_mask(tensor)
-  output_mask = find_output_mask(tensor)
-  if input_mask is None:
-    return output_mask
-  if output_mask is None:
-    return input_mask
-  if input_mask is output_mask:
-    return input_mask
-  return input_mask * output_mask
-
-
-def pruned_shape(tensor):
-  """
-  Computes the shape of a tensor after taking into account pruning of channels.
-
-  Note that the shape will only differ in the last dimension, even if other dimensions are also
-  effectively disabled by pruning masks.
-
-  Arguments:
-    tensor: tf.Tensor
-      Tensor for which to compute a pruned shape
-
-  Returns:
-    A `tf.Tensor[tf.float32]` representing the pruned shape
-  """
-
-  mask = find_mask(tensor)
-
-  if mask is None:
-    return tf.cast(tf.shape(tensor), tf.float32)
-
-  return tf.concat([
-    tf.cast(tf.shape(tensor)[:-1], mask.dtype),
-    tf.reduce_sum(mask, keepdims=True)], 0)
-
-
-def computational_cost(op_or_tensor, _observed=None):
-  """
-  Estimates the computational complexity of a pruned graph (number of floating point operations).
-
-  This function currently only supports sequential graphs such as those of MLPs and
-  simple CNNs with 2D convolutions in NHWC format.
-
-  Note that the computational cost returned by this function is proportional to batch size.
-
-  Arguments:
-    op_or_tensor: tf.Tensor or tf.Operation
-      Root node of graph for which to compute computational cost
-
-  Returns:
-    A `tf.Tensor` representing a number of floating point operations
-  """
+MASK_COLLECTION = "pruning/masks"
+MASK_EXTENDED_COLLECTION = "pruning/masks_extended"
+OP_COLLECTION = "pruning/ops"
+

-  cost = tf.constant(0.)
+def apply_mask(tensor: tf.Tensor, name: str = "pruning") -> tf.Tensor:
+    """
+    Point-wise multiplies a tensor with a binary mask.
+    During training, pruning is simulated by setting entries of the mask to zero.
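+
+    Usage sketch (mirrors the pruning-optimizer module docstring; illustrative):
+
+        dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu)
+        dense1 = apply_mask(dense1)  # masks the 50 output channels of dense1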
- # exclude cost of computing extended pruning masks - masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] - if op_or_tensor in masks_extended: - return cost - - # convert tensor to op - op = op_or_tensor.op if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) else op_or_tensor - - # make sure cost of op will not be counted twice - if _observed is None: - _observed = [] - elif op in _observed: - return cost - _observed.append(op) + Args: + tensor: tf.Tensor + A tensor where the last dimension represents channels which will be masked - # compute cost of computing inputs - for tensor in op.inputs: - cost = cost + computational_cost(tensor, _observed) + Returns: + `tf.Tensor` with same shape as `tensor` + """ + + tensor_shape = tensor.shape + + with tf.variable_scope(name, reuse=True): + # allocate masks and corresponding pruning signals + mask = tf.Variable( + tf.ones(tensor.shape.as_list()[-1]), trainable=False, name="mask" + ) + pruning_signal = tf.Variable( + tf.zeros_like(mask), trainable=False, name="signal" + ) + + # extending masks is a trick to get a separate gradient for each data point + mask_extended = extend_mask(mask, tensor) + + # store extended mask, pruning signal, and other vars for easy access later + mask.extended = mask_extended + mask.pruning_signal = pruning_signal + mask.tensor = tensor + + # mask tensor + tensor = tf.multiply(tensor, mask_extended) + tensor.set_shape(tensor_shape) + tensor._mask = mask + + tf.add_to_collection(MASK_COLLECTION, mask) + tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended) + tf.add_to_collection(OP_COLLECTION, tensor.op) + + return tensor + + +def extend_mask(mask: tf.Tensor, tensor: tf.Tensor) -> tf.Tensor: + """ + Repeats the mask for each data point stored in a tensor. + If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional + tensor with A copies or `mask`. + + Args: + mask: tf.Tensor + The mask which will be extended + tensor: tf.Tensor + The tensor to which the extended mask will be applied + + Returns: + The extended mask + """ + + batch_size = tf.shape(tensor)[:1] + ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype) + multiples = tf.concat([batch_size, ones], 0) + mask_shape = tf.concat([ones, [-1]], 0) + return tf.tile(tf.reshape(mask, mask_shape), multiples) + + +def find_input_mask(tensor: tf.Tensor) -> Optional[tf.Tensor]: + """ + Find ancestral mask affecting the number of pruned channels of a tensor. + + Args: + tensor: tf.Tensor + Tensor for which to identify relevant mask + + Returns: + A `tf.Tensor` or `None` + """ + + if hasattr(tensor, "_mask"): + return tensor._mask + if tensor.op.type in ["MatMul", "Conv1D", "Conv2D", "Conv3D", "Transpose"]: + # op produces a new number of channels, preceding mask therefore irrelevant + return None + if not tensor.op.inputs: + return None + for input in tensor.op.inputs: + mask = find_input_mask(input) + if mask is not None: + return mask + + +def find_output_mask(tensor: Union[tf.Tensor, tf.Variable]) -> Optional[tf.Tensor]: + """ + Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. 
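+    For example (illustrative): if ``hidden = apply_mask(tf.matmul(x, w))``,
+    then ``find_output_mask(w)`` returns the mask applied to ``hidden``, since
+    ``w`` is right-multiplied and pruned channels of ``hidden`` correspond to
+    pruned columns of ``w``.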
+
+    Args:
+        tensor: tf.Tensor or tf.Variable
+            Tensor for which to identify relevant mask
+
+    Returns:
+        A `tf.Tensor` or `None`
+    """
+
+    if isinstance(tensor, tf.Variable):
+        return find_output_mask(tensor.op.outputs[0])
+
+    if hasattr(tensor, "_mask"):
+        return tensor._mask
+    for op in tensor.consumers():
+        if len(op.outputs) != 1:
+            continue
+        if op.type in ["MatMul", "Conv1D", "Conv2D", "Conv3D"]:
+            # masks of descendants are only relevant if tensor is right-multiplied
+            if tensor == op.inputs[1]:
+                return find_output_mask(op.outputs[0])
+            return None
+        mask = find_output_mask(op.outputs[0])
+        if mask is not None:
+            return mask
+
+
+def find_mask(tensor: tf.Tensor) -> Optional[tf.Tensor]:
+    """
+    Returns masks indicating channels of the tensor that are effectively removed from the graph.
+
+    Args:
+        tensor: tf.Tensor
+            Tensor for which to compute a mask
+
+    Returns:
+        A `tf.Tensor` with binary entries indicating disabled channels, or
+        `None` if no mask affects `tensor`
+    """
+
+    input_mask = find_input_mask(tensor)
+    output_mask = find_output_mask(tensor)
+    if input_mask is None:
+        return output_mask
+    if output_mask is None:
+        return input_mask
+    if input_mask is output_mask:
+        return input_mask
+    return input_mask * output_mask
+
+
+def pruned_shape(tensor: tf.Tensor) -> tf.Tensor:
+    """
+    Computes the shape of a tensor after taking into account pruning of channels.
+
+    Note that the shape will only differ in the last dimension, even if other dimensions are also
+    effectively disabled by pruning masks.
+
+    Args:
+        tensor: tf.Tensor
+            Tensor for which to compute a pruned shape
+
+    Returns:
+        A `tf.Tensor[tf.float32]` representing the pruned shape
+    """
+
+    mask = find_mask(tensor)
+
+    if mask is None:
+        return tf.cast(tf.shape(tensor), tf.float32)
+
+    return tf.concat(
+        [
+            tf.cast(tf.shape(tensor)[:-1], mask.dtype),
+            tf.reduce_sum(mask, keepdims=True),
+        ],
+        0,
+    )
+
+
+def computational_cost(
+    op_or_tensor: Union[tf.Tensor, tf.Operation], _observed: Optional[List] = None
+) -> tf.Tensor:
+    """
+    Estimates the computational complexity of a pruned graph (number of floating point operations).
+
+    This function currently only supports sequential graphs such as those of MLPs and
+    simple CNNs with 2D convolutions in NHWC format.
+
+    Note that the computational cost returned by this function is proportional to batch size.
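+
+    Example (illustrative): a dense layer multiplying a [64, 128] batch by a
+    [128, 10] weight matrix is counted as 64 * 10 * (2 * 128 - 1) = 163,200
+    FLOPs; channels zeroed by masks are excluded via `pruned_shape`.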
+ + Args: + op_or_tensor: tf.Tensor or tf.Operation + Root node of graph for which to compute computational cost + _observed: Set + Internal parameter used to avoid counting the same operation twice + + Returns: + A `tf.Tensor` representing a number of floating point operations + """ + + cost = tf.constant(0.0) + + # exclude cost of computing extended pruning masks + masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] + if op_or_tensor in masks_extended: + return cost + + # convert tensor to op + op = ( + op_or_tensor.op + if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) + else op_or_tensor + ) + + # make sure cost of op will not be counted twice + if _observed is None: + _observed = [] + elif op in _observed: + return cost + _observed.append(op) + + # compute cost of computing inputs + for tensor in op.inputs: + cost = cost + computational_cost(tensor, _observed) + + # add cost of operation + if op.op_def is None or op in tf.get_collection(OP_COLLECTION): + # exclude cost of undefined ops and pruning ops + return cost + + elif op.op_def.name == "MatMul": + shape_a = pruned_shape(op.inputs[0]) + shape_b = pruned_shape(op.inputs[1]) + return cost + shape_a[0] * shape_b[1] * (2.0 * shape_a[1] - 1.0) + + elif op.op_def.name in ["Add", "Mul", "BiasAdd"]: + return cost + tf.cond( + tf.size(op.inputs[0]) > tf.size(op.inputs[1]), + lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), + lambda: tf.reduce_prod(pruned_shape(op.inputs[1])), + ) + + elif op.op_def.name in ["Conv2D"]: + output_shape = pruned_shape(op.outputs[0]) + input_shape = pruned_shape(op.inputs[0]) + kernel_shape = pruned_shape(op.inputs[1]) + inner_prod_cost = tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2.0 - 1.0 + return cost + tf.reduce_prod(output_shape) * inner_prod_cost - # add cost of operation - if op.op_def is None or op in tf.get_collection(OP_COLLECTION): - # exclude cost of undefined ops and pruning ops return cost - elif op.op_def.name == 'MatMul': - shape_a = pruned_shape(op.inputs[0]) - shape_b = pruned_shape(op.inputs[1]) - return cost + shape_a[0] * shape_b[1] * (2. * shape_a[1] - 1.) - - elif op.op_def.name in ['Add', 'Mul', 'BiasAdd']: - return cost + tf.cond( - tf.size(op.inputs[0]) > tf.size(op.inputs[1]), - lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), - lambda: tf.reduce_prod(pruned_shape(op.inputs[1]))) - - elif op.op_def.name in ['Conv2D']: - output_shape = pruned_shape(op.outputs[0]) - input_shape = pruned_shape(op.inputs[0]) - kernel_shape = pruned_shape(op.inputs[1]) - inner_prod_cost = (tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2. - 1.) - return cost + tf.reduce_prod(output_shape) * inner_prod_cost - - return cost - - -def update_pruning_signals(loss, decay=.96, masks=None, method='Fisher'): - """ - For each mask, computes corresponding pruning signals indicating the importance of a feature. 
-
-  Arguments:
-    loss: tf.Tensor
-      Any cross-entropy loss
-
-    decay: float
-      Controls exponential moving average of pruning signals
-
-    method: str
-      Method used to compute pruning signal (currently only supports 'Fisher')
-
-  Returns:
-    A `list[tf.Tensor]` of pruning signals corresponding to masks
-
-  References:
-    * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018
-  """
-
-  if masks is None:
-    masks = tf.get_collection(MASK_COLLECTION)
-
-  if method not in ['Fisher']:
-    raise ValueError('Pruning method \'{0}\' not supported.'.format(method))
-
-  if not masks:
-    return []
-
-  with tf.variable_scope('pruning_opt', reuse=True):
-    # compute gradients of extended masks (yields separate gradient for each data point)
-    grads = tf.gradients(loss, [m.extended for m in masks])
-
-    # estimate Fisher pruning signals from batch
-    signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads]
-
-    # update pruning signals
-    signals = [m.pruning_signal for m in masks]
-    signals = [tf.assign(s, decay * s + (1. - decay) * f, use_locking=True)
-               for s, f in zip(signals, signals_batch)]
-
-    return signals
-
-
-def prune(signals, masks=None):
-  """
-  Prunes a single feature by zeroing the mask entry with the smallest pruning signal.
-
-  Arguments:
-    signals: list[tf.Tensor]
-      A list of pruning signals
-
-    masks: list[tf.Tensor]
-      A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)`
-
-  Returns:
-    A `tf.Operation` which updates masks
-  """
-
-  if masks is None:
-    masks = tf.get_collection(MASK_COLLECTION)
-
-  with tf.variable_scope('pruning_opt', reuse=True):
-    # make sure we don't select already pruned units
-    signals = [tf.where(m > .5, s, tf.zeros_like(s) + np.inf) for m, s in zip(masks, signals)]
-
-    # find units with smallest pruning signal in each layer
-    min_idx = [tf.argmin(s) for s in signals]
-    min_signals = [s[i] for s, i in zip(signals, min_idx)]
-
-    # find layer with smallest pruning signal
-    l = tf.argmin(min_signals)
-
-    # construct pruning operations, one for each mask
-    updates = []
-    for k, i in enumerate(min_idx):
-      # set mask of layer l to 0 where pruning signal is smallest
-      updates.append(
-        tf.cond(
-          tf.equal(l, k),
-          lambda: tf.scatter_update(
-            masks[k], tf.Print(i, [i], message="Pruning layer [{0}] at index ".format(k)), 0.),
-          lambda: masks[k]))
-
-    updates = tf.group(updates, name='prune')
-    return updates
+def update_pruning_signals(
+    loss: tf.Tensor,
+    decay: float = 0.96,
+    masks: Optional[List[tf.Tensor]] = None,
+    method: str = "Fisher",
+) -> List[tf.Tensor]:
+    """
+    For each mask, computes corresponding pruning signals indicating the importance of a feature.
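+
+    Usage sketch (taken from the module docstring above; illustrative):
+
+        loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
+        signals = update_pruning_signals(loss)  # one signal tensor per mask
+        prune_op = prune(signals)               # zeroes one mask entry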
+ + Args: + loss: tf.Tensor + Any cross-entropy loss + decay: float + Controls exponential moving average of pruning signals + method: str + Method used to compute pruning signal (currently only supports 'Fisher') + + Returns: + A `list[tf.Tensor]` of pruning signals corresponding to masks + + References: + * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + """ + + if masks is None: + masks = tf.get_collection(MASK_COLLECTION) + + allowed_methods = ["Fisher"] + if method not in allowed_methods: + raise ValueError(f"Pruning method '{method}' not supported.") + + if not masks: + return [] + + with tf.variable_scope("pruning_opt", reuse=True): + # compute gradients of extended masks (yields separate gradient for each data point) + grads = tf.gradients(loss, [m.extended for m in masks]) + + # estimate Fisher pruning signals from batch + signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads] + + # update pruning signals + signals = [m.pruning_signal for m in masks] + signals = [ + tf.assign(s, decay * s + (1.0 - decay) * f, use_locking=True) + for s, f in zip(signals, signals_batch) + ] + + return signals + + +def prune( + signals: List[tf.Tensor], masks: Optional[List[tf.Tensor]] = None +) -> tf.Operation: + """ + Prunes a single feature by zeroing the mask entry with the smallest pruning signal. + + Args: + signals: list[tf.Tensor] + A list of pruning signals + masks: list[tf.Tensor] + A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)` + + Returns: + A `tf.Operation` which updates masks + """ + + if masks is None: + masks = tf.get_collection(MASK_COLLECTION) + + with tf.variable_scope("pruning_opt", reuse=True): + # make sure we don't select already pruned units + signals = [ + tf.where(m > 0.5, s, tf.zeros_like(s) + np.inf) + for m, s in zip(masks, signals) + ] + + # find units with smallest pruning signal in each layer + min_idx = [tf.argmin(s) for s in signals] + min_signals = [s[i] for s, i in zip(signals, min_idx)] + + # find layer with smallest pruning signal + l = tf.argmin(min_signals) + + # construct pruning operations, one for each mask + updates = [] + for index, id in enumerate(min_idx): + # set mask of layer l to 0 where pruning signal is smallest + updates.append( + tf.cond( + tf.equal(l, index), + lambda: tf.scatter_update( + masks[index], + tf.Print( + id, [id], message=f"Pruning layer [{index}] at index " + ), + 0.0, + ), + lambda: masks[index], + ) + ) + + updates = tf.group(updates, name="prune") + + return updates diff --git a/twml/twml/contrib/readers/batch_prediction_request.py b/twml/twml/contrib/readers/batch_prediction_request.py index 4408b33b4..3341cc851 100644 --- a/twml/twml/contrib/readers/batch_prediction_request.py +++ b/twml/twml/contrib/readers/batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.contrib.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) + BatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/contrib/readers/data_record.py b/twml/twml/contrib/readers/data_record.py index ae8cc0b68..84ca74f2e 100644 --- a/twml/twml/contrib/readers/data_record.py +++ b/twml/twml/contrib/readers/data_record.py @@ -4,7 +4,7 @@ The result of this subclass methods are dictionaries of Tensors and SparseTensors """ -from twitter.deepbird.io.legacy.contrib.readers.data_record import ( - SUPPORTED_DENSE_FEATURE_TYPES, # noqa: F401 - DataRecord, # noqa: F401 +from twitter.deepbird.io.legacy.contrib.readers.data_record 
import ( # noqa: F401 + SUPPORTED_DENSE_FEATURE_TYPES, + DataRecord, ) diff --git a/twml/twml/contrib/readers/hashed_batch_prediction_request.py b/twml/twml/contrib/readers/hashed_batch_prediction_request.py index 3454f8483..d97c47a2f 100644 --- a/twml/twml/contrib/readers/hashed_batch_prediction_request.py +++ b/twml/twml/contrib/readers/hashed_batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.contrib.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) + HashedBatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/contrib/trainers/__init__.py b/twml/twml/contrib/trainers/__init__.py index 3226cd805..cc9508628 100644 --- a/twml/twml/contrib/trainers/__init__.py +++ b/twml/twml/contrib/trainers/__init__.py @@ -1,5 +1,7 @@ # pylint: disable=wildcard-import """This module contains experimental trainer classes""" -from .batch_prediction_request_trainer import BatchPredictionRequestTrainer # noqa: F401 +from .batch_prediction_request_trainer import ( + BatchPredictionRequestTrainer, +) # noqa: F401 from .pruning_data_record_trainer import PruningDataRecordTrainer # noqa: F401 -from .trainer_utils import build_keras_trainer # noqa: F401 +from .trainer_utils import build_keras_trainer # noqa: F401 diff --git a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py index 2effa87ed..09d61aaf3 100644 --- a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py +++ b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py @@ -2,179 +2,212 @@ """ This file contains the DataRecordTrainer class. """ +import argparse import warnings +from typing import Callable, Optional import twml from twml.trainers import DataRecordTrainer -class BatchPredictionRequestTrainer(DataRecordTrainer): # pylint: disable=abstract-method - """ - The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases - that input is BatchPredictionRequest at Twitter and also where only the build_graph methods - needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): +class BatchPredictionRequestTrainer( + DataRecordTrainer +): # pylint: disable=abstract-method """ - The BatchPredictionRequestTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer and twml.DataRecordTrainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. 
But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. + The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases + that input is BatchPredictionRequest at Twitter and also where only the build_graph methods + needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods + assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. + + For use-cases that differ from this common Twitter use-case, + further Trainer methods can be overridden. + If that still doesn't provide enough flexibility, the user can always + use the tf.estimator.Esimator or tf.session.run directly. """ - # Check and update train_batch_size and eval_batch_size in params before initialization - # to print correct parameter logs and does not stop running - # This overwrites batch_size parameter constrains in twml.trainers.Trainer.check_params - updated_params = self.check_batch_size_params(params) - super(BatchPredictionRequestTrainer, self).__init__( - name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs) - - def check_batch_size_params(self, params): - """ Verify that params has the correct key,values """ - # updated_params is an instance of tensorflow.contrib.training.HParams - updated_params = twml.util.convert_to_hparams(params) - param_values = updated_params.values() - - # twml.trainers.Trainer.check_params already checks other constraints, - # such as being an integer - if 'train_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if param_values['train_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that train_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'train_batch_size is always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 --eval.batch_size 1') - # If the upper error warning, change/pass --train.batch_size 1 - # so that train_batch_size = 1 - updated_params.train_batch_size = 1 - - if 'eval_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError('Expecting params.eval_batch_size to be an integer.') - if param_values['eval_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that eval_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'eval_batch_size is also always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 
--eval.batch_size 1') - # If the upper warning raises, change/pass --eval.batch_size 1 - # so that eval_batch_size = 1 - updated_params.eval_batch_size = 1 - - if 'eval_batch_size' not in param_values: - updated_params.eval_batch_size = 1 - - if not updated_params.eval_batch_size: - updated_params.eval_batch_size = 1 - - return updated_params - - @staticmethod - def add_batch_prediction_request_arguments(): - """ - Add commandline args to parse typically for the BatchPredictionRequestTrainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = super(BatchPredictionRequestTrainer, - BatchPredictionRequestTrainer).add_parser_arguments() - - # mlp arguments - parser.add_argument( - '--model.use_existing_discretizer', action='store_true', - dest="model_use_existing_discretizer", - help='Load a pre-trained calibration or train a new one') - parser.add_argument( - '--model.use_binary_values', action='store_true', - dest='model_use_binary_values', - help='Use the use_binary_values optimization') - - # control hom many featues we keep in sparse tensors - # 12 is enough for learning-to-rank for now - parser.add_argument( - '--input_size_bits', type=int, default=12, - help='Number of bits allocated to the input size') - - parser.add_argument( - '--loss_function', type=str, default='ranknet', - dest='loss_function', - help='Options are pairwise: ranknet (default), lambdarank, ' - 'listnet, listmle, attrank, ' - 'pointwise') - - # whether convert sparse tensors to dense tensor - # in order to use dense normalization methods - parser.add_argument( - '--use_dense_tensor', action='store_true', - dest='use_dense_tensor', - default=False, - help='If use_dense_tensor is False, ' - 'sparse tensor and spare normalization are in use. ' - 'If use_dense_tensor is True, ' - 'dense tensor and dense normalization are in use.') - - parser.add_argument( - '--dense_normalization', type=str, default='mean_max_normalizaiton', - dest='dense_normalization', - help='Options are mean_max_normalizaiton (default), standard_normalizaiton') - - parser.add_argument( - '--sparse_normalization', type=str, default='SparseMaxNorm', - dest='sparse_normalization', - help='Options are SparseMaxNorm (default), SparseBatchNorm') - - # so far only used in pairwise learning-to-rank - parser.add_argument( - '--mask', type=str, default='full_mask', - dest='mask', - help='Options are full_mask (default), diag_mask') - - return parser + def __init__( + self, + name: str, + params: dict, + build_graph_fn: Callable, + feature_config: Optional[dict] = None, + **kwargs, + ): + """ + The BatchPredictionRequestTrainer constructor builds a + ``tf.estimator.Estimator`` and stores it in self.estimator. + For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. + It also accepts additional arguments to facilitate metric evaluation and multi-phase training + (init_from_dir, init_map). + + Args: + parent Args: + See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation + for a full list of arguments accepted by the parent class. 
+ name, params, build_graph_fn (and other parent class args): + see documentation for twml.Trainer and twml.DataRecordTrainer doc. + feature_config: + An object of type FeatureConfig describing what features to decode. + Defaults to None. But it is needed in the following cases: + - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` + - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. + **kwargs: + further kwargs can be specified and passed to the Estimator constructor. + """ + + # Check and update train_batch_size and eval_batch_size in params before initialization + # to print correct parameter logs and does not stop running + # This overwrites batch_size parameter constrains in twml.trainers.Trainer.check_params + updated_params = self.check_batch_size_params(params) + super(BatchPredictionRequestTrainer, self).__init__( + name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs + ) + + def check_batch_size_params(self, params: dict): + """Verify that params has the correct key,values""" + # updated_params is an instance of tensorflow.contrib.training.HParams + updated_params = twml.util.convert_to_hparams(params) + param_values = updated_params.values() + + # twml.trainers.Trainer.check_params already checks other constraints, + # such as being an integer + if "train_batch_size" in param_values: + if not isinstance(updated_params.train_batch_size, int): + raise ValueError("Expecting params.train_batch_size to be an integer.") + if param_values["train_batch_size"] != 1: + # This can be a bit annoying to force users to pass the batch sizes, + # but it is good to let them know what they actually use in the models + # Use warning instead of ValueError in there to continue the run + # and print out that train_batch_size is changed + warnings.warn( + "You are processing BatchPredictionRequest data, " + "train_batch_size is always 1.\n" + "The number of DataRecords in a batch is determined by the size " + "of each BatchPredictionRequest.\n" + "If you did not pass train.batch_size or eval.batch_size, and " + "the default batch_size 32 was in use,\n" + "please pass --train.batch_size 1 --eval.batch_size 1" + ) + # If the upper error warning, change/pass --train.batch_size 1 + # so that train_batch_size = 1 + updated_params.train_batch_size = 1 + + if "eval_batch_size" in param_values: + if not isinstance(updated_params.train_batch_size, int): + raise ValueError("Expecting params.eval_batch_size to be an integer.") + if param_values["eval_batch_size"] != 1: + # This can be a bit annoying to force users to pass the batch sizes, + # but it is good to let them know what they actually use in the models + # Use warning instead of ValueError in there to continue the run + # and print out that eval_batch_size is changed + warnings.warn( + "You are processing BatchPredictionRequest data, " + "eval_batch_size is also always 1.\n" + "The number of DataRecords in a batch is determined by the size " + "of each BatchPredictionRequest.\n" + "If you did not pass train.batch_size or eval.batch_size, and " + "the default batch_size 32 was in use,\n" + "please pass --train.batch_size 1 --eval.batch_size 1" + ) + # If the upper warning raises, change/pass --eval.batch_size 1 + # so that eval_batch_size = 1 + updated_params.eval_batch_size = 1 + + if "eval_batch_size" not in param_values: + updated_params.eval_batch_size = 1 + + if not updated_params.eval_batch_size: + updated_params.eval_batch_size = 1 + + return updated_params + + 
@staticmethod + def add_batch_prediction_request_arguments() -> argparse.ArgumentParser: + """ + Add commandline args to parse typically for the BatchPredictionRequestTrainer class. + Typically, the user calls this function and then parses cmd-line arguments + into an argparse.Namespace object which is then passed to the Trainer constructor + via the params argument. + + See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ + for a list and description of all cmd-line arguments. + + Returns: + argparse.ArgumentParser instance with some useful args already added. + """ + parser = super( + BatchPredictionRequestTrainer, BatchPredictionRequestTrainer + ).add_parser_arguments() + + # mlp arguments + parser.add_argument( + "--model.use_existing_discretizer", + action="store_true", + dest="model_use_existing_discretizer", + help="Load a pre-trained calibration or train a new one", + ) + parser.add_argument( + "--model.use_binary_values", + action="store_true", + dest="model_use_binary_values", + help="Use the use_binary_values optimization", + ) + + # control hom many featues we keep in sparse tensors + # 12 is enough for learning-to-rank for now + parser.add_argument( + "--input_size_bits", + type=int, + default=12, + help="Number of bits allocated to the input size", + ) + + parser.add_argument( + "--loss_function", + type=str, + default="ranknet", + dest="loss_function", + help="Options are pairwise: ranknet (default), lambdarank, " + "listnet, listmle, attrank, " + "pointwise", + ) + + # whether convert sparse tensors to dense tensor + # in order to use dense normalization methods + parser.add_argument( + "--use_dense_tensor", + action="store_true", + dest="use_dense_tensor", + default=False, + help="If use_dense_tensor is False, " + "sparse tensor and spare normalization are in use. 
" + "If use_dense_tensor is True, " + "dense tensor and dense normalization are in use.", + ) + + parser.add_argument( + "--dense_normalization", + type=str, + default="mean_max_normalizaiton", + dest="dense_normalization", + help="Options are mean_max_normalizaiton (default), standard_normalizaiton", + ) + + parser.add_argument( + "--sparse_normalization", + type=str, + default="SparseMaxNorm", + dest="sparse_normalization", + help="Options are SparseMaxNorm (default), SparseBatchNorm", + ) + + # so far only used in pairwise learning-to-rank + parser.add_argument( + "--mask", + type=str, + default="full_mask", + dest="mask", + help="Options are full_mask (default), diag_mask", + ) + + return parser diff --git a/twml/twml/contrib/trainers/pruning_data_record_trainer.py b/twml/twml/contrib/trainers/pruning_data_record_trainer.py index 4796e5390..3a3fead02 100644 --- a/twml/twml/contrib/trainers/pruning_data_record_trainer.py +++ b/twml/twml/contrib/trainers/pruning_data_record_trainer.py @@ -1,59 +1,87 @@ +import argparse +from typing import Optional + import tensorflow.compat.v1 as tf -from twml.trainers import DataRecordTrainer from twml.contrib.optimizers import PruningOptimizer +from twml.trainers import DataRecordTrainer class PruningDataRecordTrainer(DataRecordTrainer): - @staticmethod - def get_train_op(params, loss): - train_op = DataRecordTrainer.get_train_op(params, loss) - - optimizer = PruningOptimizer(learning_rate=params.get('learning_rate')) - - return optimizer.minimize( - loss=loss, - prune_every=params.get('pruning_iter', 5000), - burn_in=params.get('pruning_burn_in', 100000), - decay=params.get('pruning_decay', .9999), - flops_target=params.get('pruning_flops_target', 250000), - update_params=train_op, - global_step=tf.train.get_global_step()) - - def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs): - kwargs['optimize_loss_fn'] = self.get_train_op - - super(PruningDataRecordTrainer, self).__init__( - name=name, - params=params, - build_graph_fn=build_graph_fn, - feature_config=feature_config, - **kwargs) - - def export_model(self, *args, **kwargs): - # TODO: modify graph before exporting to take into account masks - return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs) - - @staticmethod - def add_parser_arguments(): - parser = DataRecordTrainer.add_parser_arguments() - parser.add_argument( - "--pruning.iter", "--pruning_iter", type=int, default=5000, - dest="pruning_iter", - help="A single feature or feature map is pruned every this many iterations") - parser.add_argument( - "--pruning.burn_in", "--pruning_burn_in", type=int, default=100000, - dest="pruning_burn_in", - help="Only start pruning after collecting statistics for this many training steps") - parser.add_argument( - "--pruning.flops_target", "--pruning_flops_target", type=int, default=250000, - dest="pruning_flops_target", - help="Stop pruning when estimated number of floating point operations reached this target. \ - For example, a small feed-forward network might require 250,000 FLOPs to run.") - parser.add_argument( - "--pruning.decay", "--pruning_decay", type=float, default=.9999, - dest="pruning_decay", - help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \ - signal statistics. 
A value of 0.9999 can be thought of as averaging statistics over 10,000 \ - steps.") - return parser + @staticmethod + def get_train_op(params: dict, loss: tf.Tensor) -> tf.Operation: + train_op = DataRecordTrainer.get_train_op(params, loss) + + optimizer = PruningOptimizer(learning_rate=params.get("learning_rate")) + + return optimizer.minimize( + loss=loss, + prune_every=params.get("pruning_iter", 5000), + burn_in=params.get("pruning_burn_in", 100000), + decay=params.get("pruning_decay", 0.9999), + flops_target=params.get("pruning_flops_target", 250000), + update_params=train_op, + global_step=tf.train.get_global_step(), + ) + + def __init__( + self, + name: str, + params: dict, + build_graph_fn: callable, + feature_config: Optional[dict] = None, + **kwargs, + ): + kwargs["optimize_loss_fn"] = self.get_train_op + + super(PruningDataRecordTrainer, self).__init__( + name=name, + params=params, + build_graph_fn=build_graph_fn, + feature_config=feature_config, + **kwargs, + ) + + def export_model(self, *args, **kwargs) -> str: + # TODO: modify graph before exporting to take into account masks + return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs) + + @staticmethod + def add_parser_arguments() -> argparse.ArgumentParser: + parser = DataRecordTrainer.add_parser_arguments() + parser.add_argument( + "--pruning.iter", + "--pruning_iter", + type=int, + default=5000, + dest="pruning_iter", + help="A single feature or feature map is pruned every this many iterations", + ) + parser.add_argument( + "--pruning.burn_in", + "--pruning_burn_in", + type=int, + default=100000, + dest="pruning_burn_in", + help="Only start pruning after collecting statistics for this many training steps", + ) + parser.add_argument( + "--pruning.flops_target", + "--pruning_flops_target", + type=int, + default=250000, + dest="pruning_flops_target", + help="Stop pruning when estimated number of floating point operations reached this target. \ + For example, a small feed-forward network might require 250,000 FLOPs to run.", + ) + parser.add_argument( + "--pruning.decay", + "--pruning_decay", + type=float, + default=0.9999, + dest="pruning_decay", + help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \ + signal statistics. A value of 0.9999 can be thought of as averaging statistics over 10,000 \ + steps.", + ) + return parser diff --git a/twml/twml/contrib/trainers/trainer_utils.py b/twml/twml/contrib/trainers/trainer_utils.py index f279571be..cb6ec1be6 100644 --- a/twml/twml/contrib/trainers/trainer_utils.py +++ b/twml/twml/contrib/trainers/trainer_utils.py @@ -4,14 +4,14 @@ As of now (Q4 2019), Keras model training using `model.fit()` has various issues, making it unfit for production training: - 1. `model.fit()` is slow in TF 1.14. This will be fixed with future TensorFlow updates. - 2. `model.fit()` crashes during model saving or in eager mode when the input has SparseTensor. - 3. Models saved using TF 2.0 API cannot be served by TensorFlow's Java API. + 1. `model.fit()` is slow in TF 1.14. This will be fixed with future TensorFlow updates. + 2. `model.fit()` crashes during model saving or in eager mode when the input has SparseTensor. + 3. Models saved using TF 2.0 API cannot be served by TensorFlow's Java API. Until MLCE team resolves the above issues, MLCE team recommends the following: - - Please feel free to use Keras models for experimentation and exploration. 
- - Please stick to twml Trainer for production training & exporting, - especially if you want to serve your model using Twitter's prediction servers. + - Please feel free to use Keras models for experimentation and exploration. + - Please stick to twml Trainer for production training & exporting, + especially if you want to serve your model using Twitter's prediction servers. This module provide tooling for easily training keras models using twml Trainer. @@ -22,90 +22,97 @@ This input function can be created from the tf.data.Dataset you used with your Keras model. .. note: this util handles the most common case. If you have cases not satisfied by this util, - consider writing your own build_graph to wrap your keras models. + consider writing your own build_graph to wrap your keras models. """ -from twitter.deepbird.hparam import HParams +from typing import Callable import tensorflow # noqa: F401 import tensorflow.compat.v2 as tf +from twitter.deepbird.hparam import HParams import twml def build_keras_trainer( - name, - model_factory, - save_dir, - loss_fn=None, - metrics_fn=None, - **kwargs): - """ - Compile the given model_factory into a twml Trainer. - - Args: - name: a string name for the returned twml Trainer. - - model_factory: a callable that returns a keras model when called. - This keras model is expected to solve a binary classification problem. - This keras model takes a dict of tensors as input, and outputs a logit or probability. - - save_dir: a directory where the trainer saves data. Can be an HDFS path. - - loss_fn: the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy. - - metrics_fn: metrics function used by TensorFlow estimators. - Defaults to twml.metrics.get_binary_class_metric_fn(). - - **kwargs: for people familiar with twml Trainer's options, they can be passed in here - as kwargs, and they will be forwarded to Trainer as opts. - See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args. - - Returns: - a twml.trainers.Trainer object which can be used for training and exporting models. - """ - build_graph = create_build_graph_fn(model_factory, loss_fn) - - if metrics_fn is None: - metrics_fn = twml.metrics.get_binary_class_metric_fn() - - opts = HParams(**kwargs) - opts.add_hparam('save_dir', save_dir) - - return twml.trainers.Trainer( - name, - opts, - build_graph_fn=build_graph, - save_dir=save_dir, - metric_fn=metrics_fn) - - -def create_build_graph_fn(model_factory, loss_fn=None): - """Create a build graph function from the given keras model.""" - - def build_graph(features, label, mode, params, config=None): - # create model from model factory. - model = model_factory() - - # create loss function if the user didn't specify one. 
-    if loss_fn is None:
-      build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
-    else:
-      build_graph_loss_fn = loss_fn
-
-    output = model(features)
-    if mode == 'infer':
-      loss = None
-    else:
-      weights = features.get('weights', None)
-      loss = build_graph_loss_fn(y_true=label, y_pred=output, sample_weight=weights)
-
-    if isinstance(output, dict):
-      if loss is None:
-        return output
-      else:
-        output['loss'] = loss
-        return output
-    else:
-      return {'output': output, 'loss': loss}
-
-  return build_graph
+    name: str,
+    model_factory: Callable[[], tf.keras.Model],
+    save_dir: str,
+    loss_fn: tf.keras.losses.Loss = None,
+    metrics_fn: Callable = twml.metrics.get_binary_class_metric_fn(),
+    **kwargs,
+) -> twml.trainers.Trainer:
+    """
+    Compile the given model_factory into a twml Trainer.
+
+    Args:
+        name:
+            a string name for the returned twml Trainer.
+        model_factory:
+            a callable that returns a keras model when called.
+            This keras model is expected to solve a binary classification problem.
+            This keras model takes a dict of tensors as input, and outputs a logit or probability.
+        save_dir:
+            a directory where the trainer saves data. Can be an HDFS path.
+        loss_fn:
+            the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy.
+        metrics_fn:
+            metrics function used by TensorFlow estimators.
+            Defaults to twml.metrics.get_binary_class_metric_fn().
+        **kwargs:
+            for people familiar with twml Trainer's options, they can be passed in here
+            as kwargs, and they will be forwarded to Trainer as opts.
+            See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args.
+
+    Returns:
+        a twml.trainers.Trainer object which can be used for training and exporting models.
+    """
+    build_graph = create_build_graph_fn(model_factory, loss_fn)
+
+    opts = HParams(**kwargs)
+    opts.add_hparam("save_dir", save_dir)
+
+    return twml.trainers.Trainer(
+        name,
+        opts,
+        build_graph_fn=build_graph,
+        save_dir=save_dir,
+        metric_fn=metrics_fn,
+    )
+
+
+def create_build_graph_fn(model_factory: Callable[[], tf.keras.Model], loss_fn=None):
+    """Create a build graph function from the given keras model."""
+
+    def build_graph(
+        features: dict,
+        label: tf.Tensor,
+        mode: str,
+        params: HParams,
+        config: dict,
+    ) -> dict:  # pylint: disable=unused-argument
+        # create model from model factory.
+        model = model_factory()
+
+        # create loss function if the user didn't specify one.
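+        # BinaryCrossentropy(from_logits=False) expects the model to output
+        # probabilities in [0, 1]; if the model emits raw logits, pass a
+        # loss_fn constructed with from_logits=True instead.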
+ if loss_fn is None: + build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False) + else: + build_graph_loss_fn = loss_fn + + output = model(features) + if mode == "infer": + loss = None + else: + weights = features.get("weights", None) + loss = build_graph_loss_fn( + y_true=label, y_pred=output, sample_weight=weights + ) + + if isinstance(output, dict): + if loss is None: + return output + output["loss"] = loss + return output + return {"output": output, "loss": loss} + + return build_graph diff --git a/twml/twml/contrib/utils/__init__.py b/twml/twml/contrib/utils/__init__.py index 56a083048..4b7dbdf87 100644 --- a/twml/twml/contrib/utils/__init__.py +++ b/twml/twml/contrib/utils/__init__.py @@ -1,18 +1,22 @@ # pylint: disable=wildcard-import """This module contains experimental util functions for contrib.""" -from .math_fns import safe_div, safe_log, cal_ndcg, cal_swapped_ndcg # noqa: F401 -from .masks import diag_mask, full_mask # noqa: F401 -from .normalizer import mean_max_normalizaiton, standard_normalizaiton # noqa: F401 -from .scores import get_pairwise_scores, get_pairwise_label_scores # noqa: F401 -# pointwise functions -from .loss_fns import get_pointwise_loss # noqa: F401 -# ranknet functions -from .loss_fns import get_pair_loss # noqa: F401 -# listwise functions -from .loss_fns import get_attrank_loss, get_listnet_loss, get_listmle_loss # noqa: F401 +from . import interp # noqa: F401 +from .device import get_gpu_list # noqa: F401 +from .device import get_device_map, get_gpu_count, is_gpu_available + # lambdarank functions +# listwise functions +# ranknet functions +# pointwise functions from .loss_fns import get_lambda_pair_loss # noqa: F401 -from .device import get_device_map, get_gpu_list, get_gpu_count, is_gpu_available # noqa: F401 +from .loss_fns import get_pair_loss # noqa: F401 +from .loss_fns import get_pointwise_loss # noqa: F401 +from .loss_fns import get_attrank_loss, get_listmle_loss, get_listnet_loss # noqa: F401 +from .masks import diag_mask, full_mask # noqa: F401 +from .math_fns import cal_ndcg, cal_swapped_ndcg, safe_div, safe_log # noqa: F401 +from .normalizer import mean_max_normalizaiton # noqa: F401 +from .normalizer import standard_normalizaiton +from .scores import get_pairwise_label_scores # noqa: F401 +from .scores import get_pairwise_scores from .similarities import cosine_similarity # noqa: F401 -from . import interp # noqa: F401 diff --git a/twml/twml/contrib/utils/datasets.py b/twml/twml/contrib/utils/datasets.py index d31ea3ae4..d6e203811 100644 --- a/twml/twml/contrib/utils/datasets.py +++ b/twml/twml/contrib/utils/datasets.py @@ -1,4 +1,6 @@ import random +from datetime import datetime +from typing import List, Optional, Tuple import twml @@ -6,88 +8,95 @@ def resolve_train_and_eval_files_overlap( - train_files, eval_files, fraction_kept_for_eval, seed=None -): - """Resolve any overlap between train and eval files. + train_files: List[str], + eval_files: List[str], + fraction_kept_for_eval: float, + seed: Optional[int] = None, +) -> Tuple[List[str], List[str]]: + """Resolve any overlap between train and eval files. - Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of - the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the - `eval_files`. + Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of + the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the + `eval_files`. 
- The following example demonstrates its usage: + The following example demonstrates its usage: - >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] - >>> orig_eval_files = ['f1', 'f2', 'f3'] - >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( - ... orig_train_files, orig_eval_files, 0.5 - ... ) - >>> set(resolved_train_files) & set(resolved_eval_files) == set() - True - >>> len(resolved_train_files) == 3 - True - >>> len(resolved_eval_files) == 2 - True + >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] + >>> orig_eval_files = ['f1', 'f2', 'f3'] + >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( + ... orig_train_files, orig_eval_files, 0.5 + ... ) + >>> set(resolved_train_files) & set(resolved_eval_files) == set() + True + >>> len(resolved_train_files) == 3 + True + >>> len(resolved_eval_files) == 2 + True - Args: - train_files: A list of the files used for training. - eval_files: A list of the files used for validation. - fraction_kept_for_eval: A fraction of files in the intersection between `train_files` and - `eval_files` exclusively kept for evaluation. - seed: A seed for generating random numbers. + Args: + train_files: + A list of the files used for training. + eval_files: + A list of the files used for validation. + fraction_kept_for_eval: + A fraction of files in the intersection between `train_files` and `eval_files` exclusively kept for evaluation. + seed: + A seed for generating random numbers. - Returns: - A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. - """ + Returns: + A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. + """ - rng = random.Random(seed) + rng = random.Random(seed) - train_files = set(train_files) - eval_files = set(eval_files) - overlapping_files = train_files & eval_files - train_files_selected_for_eval = set(rng.sample( - overlapping_files, - int(len(overlapping_files) * fraction_kept_for_eval) - )) - train_files = train_files - train_files_selected_for_eval - eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval - return list(train_files), list(eval_files) + train_files = set(train_files) + eval_files = set(eval_files) + overlapping_files = train_files & eval_files + train_files_selected_for_eval = set( + rng.sample( + overlapping_files, int(len(overlapping_files) * fraction_kept_for_eval) + ) + ) + train_files = train_files - train_files_selected_for_eval + eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval + return list(train_files), list(eval_files) def get_time_based_dataset_files_for_train_and_eval( - base_path, - train_start_datetime, - train_end_datetime, - eval_start_datetime, - eval_end_datetime, - fraction_kept_for_eval, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1 -): - """Get train/eval dataset files organized with a time-based prefix. + base_path: str, + train_start_datetime: datetime, + train_end_datetime: datetime, + eval_start_datetime: datetime, + eval_end_datetime: datetime, + fraction_kept_for_eval: float, + datetime_prefix_format: str = "%Y/%m/%d/%H", + extension: str = "lzo", + parallelism: int = 1, +) -> Tuple[List[str], List[str]]: + """ + Get train/eval dataset files organized with a time-based prefix. + This is just a convenience built around `get_dataset_files_prefixed_by_time` and + `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. 
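+
+    A sketch of typical usage, with a hypothetical base path and date ranges:
+
+    >>> from datetime import datetime
+    >>> train_files, eval_files = get_time_based_dataset_files_for_train_and_eval(
+    ...     base_path="hdfs:///path/to/dataset",
+    ...     train_start_datetime=datetime(2023, 1, 1),
+    ...     train_end_datetime=datetime(2023, 1, 7),
+    ...     eval_start_datetime=datetime(2023, 1, 7),
+    ...     eval_end_datetime=datetime(2023, 1, 8),
+    ...     fraction_kept_for_eval=0.5,
+    ... )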
+ """ - This is just a convenience built around `get_dataset_files_prefixed_by_time` and - `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. - """ - - train_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=train_start_datetime, - end_datetime=train_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - eval_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=eval_start_datetime, - end_datetime=eval_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - return resolve_train_and_eval_files_overlap( - train_files=train_files, - eval_files=eval_files, - fraction_kept_for_eval=fraction_kept_for_eval - ) + train_files = get_time_based_dataset_files( + base_path=base_path, + start_datetime=train_start_datetime, + end_datetime=train_end_datetime, + datetime_prefix_format=datetime_prefix_format, + extension=extension, + parallelism=parallelism, + ) + eval_files = get_time_based_dataset_files( + base_path=base_path, + start_datetime=eval_start_datetime, + end_datetime=eval_end_datetime, + datetime_prefix_format=datetime_prefix_format, + extension=extension, + parallelism=parallelism, + ) + return resolve_train_and_eval_files_overlap( + train_files=train_files, + eval_files=eval_files, + fraction_kept_for_eval=fraction_kept_for_eval, + ) diff --git a/twml/twml/contrib/utils/device.py b/twml/twml/contrib/utils/device.py index ab189c98a..d3f3cef42 100644 --- a/twml/twml/contrib/utils/device.py +++ b/twml/twml/contrib/utils/device.py @@ -2,26 +2,32 @@ Functions to query devices being used by tensorflow """ +from typing import Dict, List + from tensorflow.python.client import device_lib -def get_device_map(): - """Returns the map of device name to device type""" - local_device_protos = device_lib.list_local_devices() - return {x.name: x.device_type for x in local_device_protos} +def get_device_map() -> Dict[str, str]: + """Returns the map of device name to device type""" + + local_device_protos = device_lib.list_local_devices() + return {x.name: x.device_type for x in local_device_protos} + + +def get_gpu_list() -> List[str]: + """Returns the list of GPUs available""" + + device_map = get_device_map() + return [name for name in device_map if device_map[name] == "GPU"] -def get_gpu_list(): - """Returns the list of GPUs available""" - device_map = get_device_map() - return [name for name in device_map if device_map[name] == 'GPU'] +def get_gpu_count() -> int: + """Returns the count of GPUs available""" + return len(get_gpu_list()) -def get_gpu_count(): - """Returns the count of GPUs available""" - return len(get_gpu_list()) +def is_gpu_available() -> bool: + """Returns if GPUs are available""" -def is_gpu_available(): - """Returns if GPUs are available""" - return get_gpu_count() > 0 + return get_gpu_count() > 0 diff --git a/twml/twml/contrib/utils/interp.py b/twml/twml/contrib/utils/interp.py index 419d89030..5893352c6 100644 --- a/twml/twml/contrib/utils/interp.py +++ b/twml/twml/contrib/utils/interp.py @@ -4,91 +4,105 @@ import libtwml import tensorflow.compat.v1 as tf + import twml -def linear_interp1(inputs, ref_inputs, ref_outputs): - """ - Perform 1D linear interpolation. - Arguments: - inputs: - The query input values. - ref_inputs: - Reference grid points used for interpolation. - ref_outputs: - Reference output values used for interpolation. 
-
-  Returns:
-    The interpolated outputs for the requested input values.
-  """
-
-  inputs = tf.convert_to_tensor(inputs)
-  ref_inputs = tf.convert_to_tensor(ref_inputs)
-  ref_outputs = tf.convert_to_tensor(ref_outputs)
-
-  ndims = inputs.shape.ndims
-  ref_inputs_ndims = ref_inputs.shape.ndims
-  ref_outputs_ndims = ref_inputs.shape.ndims
-
-  if (ref_inputs_ndims != ndims):
-    raise ValueError("Dimension mismatch. inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims))
-
-  if (ref_outputs_ndims != ndims):
-    raise ValueError("Dimension mismatch. inputs: %d, ref_outputs: %d" % (ndims, ref_outputs_ndims))
-
-  if ndims > 2:
-    raise ValueError("Input dimensions should be < 2D. But got %d." % ndims)
-
-  original_input_shape = tf.shape(inputs)
-  # This is needed because isotonic_calibration expects:
-  # - inputs of size [num_samples, num_classes]
-  # - ref_inputs, ref_outputs of size [num_classes, num_bins]
-  inputs = tf.reshape(inputs, [-1, 1])
-  ref_inputs = tf.reshape(ref_inputs, [1, -1])
-  ref_outputs = tf.reshape(ref_outputs, [1, -1])
-
-  # isotonic_calibration is simply doing linear interpolation.
-  # This needs to be renamed in the future to make it consistent.
-  outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs)
-  return tf.reshape(outputs, original_input_shape)
-
-
-def linear_interp1_by_class(inputs, input_classes, ref_inputs, ref_outputs):
-  """
-  Perform 1D linear interpolation.
-  Arguments:
-    inputs:
-      The query input values.
-    input_classes:
-      The class index to use from the reference grid.
-    ref_inputs:
-      Reference 2D grid points used for interpolation.
-      Each row denotes the grid from a different class.
-    ref_outputs:
-      Reference 2D output values used for interpolation.
-      Each row denotes the grid from a different class.
-
-  Returns:
-    The interpolated outputs for the requested input values.
-  """
-
-  inputs = tf.convert_to_tensor(inputs)
-  input_classes = tf.convert_to_tensor(input_classes)
-  ref_inputs = tf.convert_to_tensor(ref_inputs)
-  ref_outputs = tf.convert_to_tensor(ref_outputs)
-
-  original_input_shape = tf.shape(inputs)
-
-  # pass through
-  def in_func(x):
-    return x
-
-  # indexed function
-  def cond_func(i, fn):
-    idx = input_classes[i]
-    x = tf.expand_dims(fn(), axis=0)
-    return linear_interp1(x, ref_inputs[idx], ref_outputs[idx])
-
-  # Use while loop for now, needs to be replace by a custom C++ op later.
-  outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func)
-  return tf.reshape(outputs, original_input_shape)
+def linear_interp1(
+    inputs: tf.Tensor, ref_inputs: tf.Tensor, ref_outputs: tf.Tensor
+) -> tf.Tensor:
+    """
+    Perform 1D linear interpolation.
+    Args:
+        inputs:
+            The query input values.
+        ref_inputs:
+            Reference grid points used for interpolation.
+        ref_outputs:
+            Reference output values used for interpolation.
+
+    Returns:
+        The interpolated outputs for the requested input values.
+    """
+
+    inputs = tf.convert_to_tensor(inputs)
+    ref_inputs = tf.convert_to_tensor(ref_inputs)
+    ref_outputs = tf.convert_to_tensor(ref_outputs)
+
+    ndims = inputs.shape.ndims
+    ref_inputs_ndims = ref_inputs.shape.ndims
+    ref_outputs_ndims = ref_outputs.shape.ndims
+
+    if ref_inputs_ndims != ndims:
+        raise ValueError(
+            "Dimension mismatch. inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims)
+        )
+
+    if ref_outputs_ndims != ndims:
+        raise ValueError(
+            "Dimension mismatch. inputs: %d, ref_outputs: %d"
+            % (ndims, ref_outputs_ndims)
+        )
+
+    if ndims > 2:
+        raise ValueError("Input dimensions should be < 2D. But got %d."
% ndims) + + original_input_shape = tf.shape(inputs) + # This is needed because isotonic_calibration expects: + # - inputs of size [num_samples, num_classes] + # - ref_inputs, ref_outputs of size [num_classes, num_bins] + inputs = tf.reshape(inputs, [-1, 1]) + ref_inputs = tf.reshape(ref_inputs, [1, -1]) + ref_outputs = tf.reshape(ref_outputs, [1, -1]) + + # isotonic_calibration is simply doing linear interpolation. + # This needs to be renamed in the future to make it consistent. + outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs) + return tf.reshape(outputs, original_input_shape) + + +def linear_interp1_by_class( + inputs: tf.Tensor, + input_classes: tf.Tensor, + ref_inputs: tf.Tensor, + ref_outputs: tf.Tensor, +) -> tf.Tensor: + """ + Perform 1D linear interpolation. + + Args: + inputs: + The query input values. + input_classes: + The class index to use from the reference grid. + ref_inputs: + Reference 2D grid points used for interpolation. + Each row denotes the grid from a different class. + ref_outputs: + Reference 2D output values used for interpolation. + Each row denotes the grid from a different class. + + Returns: + The interpolated outputs for the requested input values. + """ + + inputs = tf.convert_to_tensor(inputs) + input_classes = tf.convert_to_tensor(input_classes) + ref_inputs = tf.convert_to_tensor(ref_inputs) + ref_outputs = tf.convert_to_tensor(ref_outputs) + + original_input_shape = tf.shape(inputs) + + # pass through + def in_func(x): + return x + + # indexed function + def cond_func(i: int, fn: callable): + idx = input_classes[i] + x = tf.expand_dims(fn(), axis=0) + return linear_interp1(x, ref_inputs[idx], ref_outputs[idx]) + + # Use while loop for now, needs to be replace by a custom C++ op later. 
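+    # batch_apply calls cond_func once per element: for element i it looks up
+    # that element's class index and interpolates it against the reference
+    # grid of that class via linear_interp1.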
+ outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func) + return tf.reshape(outputs, original_input_shape) diff --git a/twml/twml/contrib/utils/loss_fns.py b/twml/twml/contrib/utils/loss_fns.py index eb25b430a..ce01f2de8 100644 --- a/twml/twml/contrib/utils/loss_fns.py +++ b/twml/twml/contrib/utils/loss_fns.py @@ -1,302 +1,387 @@ +from typing import Optional + import tensorflow.compat.v1 as tf + from twml.contrib.utils import masks, math_fns -def get_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params): - """ - Paiwise learning-to-rank ranknet loss - Check paper https://www.microsoft.com/en-us/research/publication/ - learning-to-rank-using-gradient-descent/ - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count)) - return loss - - -def get_lambda_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params, swapped_ndcg): - """ - Paiwise learning-to-rank lambdarank loss - faster than the previous gradient method - Note: this loss depends on ranknet cross-entropy - delta NDCG is applied to ranknet cross-entropy - Hence, it is still a gradient descent method - Check paper http://citeseerx.ist.psu.edu/viewdoc/ - download?doi=10.1.1.180.634&rep=rep1&type=pdf for more information - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - swapped_ndcg: swapped ndcg of shape [n_data, n_data] - ndcg values when swapping each pair in the prediction ranking order - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count, swapped_ndcg)) - return loss - - -def _get_average_cross_entropy_loss(pairwise_label_scores, pairwise_predicted_scores, - mask, pair_count, swapped_ndcg=None): - """ - Average the loss for a batchPredictionRequest based on a 
desired number of pairs - """ - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=pairwise_label_scores, - logits=pairwise_predicted_scores) - loss = mask * loss - if swapped_ndcg is not None: - loss = loss * swapped_ndcg - loss = tf.reduce_sum(loss) / pair_count - return loss - - -def get_listmle_loss(labels, predicted_scores): - r""" - listwise learning-to-rank listMLE loss - Note: Simplified MLE formula is used in here (omit the proof in here) - \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) - n is tf.shape(predicted_scores)[0] - Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - Returns: - average loss - """ - labels = tf.reshape(labels, [-1, 1]) - n_data = tf.shape(labels)[0] - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - - predicted_scores_ordered_by_labels = _get_ordered_predicted_scores(labels, - predicted_scores, n_data) - - loss = (-1) * tf.reduce_sum(predicted_scores) - # sum over 1 to n_data - 1 - temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) - temp = tf.reshape(temp, []) - loss = tf.add(loss, temp) - - exps = tf.exp(predicted_scores_ordered_by_labels) - exp_sum = tf.reduce_sum(exps) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - - iteration = tf.constant(0) - - def _cond(iteration, loss, exp_sum, exp): - return tf.less(iteration, n_data - 2) - - def _gen_loop_body(): - def loop_body(iteration, loss, exp_sum, exps): - temp = tf.gather(exps, [iteration]) - temp = tf.reshape(temp, []) - exp_sum = tf.subtract(exp_sum, temp) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - return tf.add(iteration, 1), loss, exp_sum, exps - return loop_body - - iteration, loss, exp_sum, exps = tf.while_loop(_cond, _gen_loop_body(), - (iteration, loss, exp_sum, exps)) - loss = loss / tf.cast(n_data, dtype=tf.float32) - return loss - - -def _get_ordered_predicted_scores(labels, predicted_scores, n_data): - """ - Order predicted_scores based on sorted labels - """ - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(labels), k=n_data) - ordered_labels_indices = tf.transpose(ordered_labels_indices) - predicted_scores_ordered_by_labels = tf.gather_nd(predicted_scores, - ordered_labels_indices) - return predicted_scores_ordered_by_labels - - -def get_attrank_loss(labels, predicted_scores, weights=None): - """ - Modified listwise learning-to-rank AttRank loss - Check paper https://arxiv.org/abs/1804.05936 for more information - Note: there is an inconsistency between the paper statement and - their public code - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # The authors immeplemented the following, which is basically listnet - # attention_labels = _get_attentions(labels) - # attention_labels = tf.reshape(attention_labels, [1, -1]) - # predicted_scores = tf.reshape(predicted_scores, [1, -1]) - # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=attention_labels, - # logits=predicted_scores)) - - # The paper proposed the following - # attention_labels = _get_attentions(labels) - # # 
However the following line is wrong based on their statement - # # as _get_attentions can give 0 results when input < 0 - # # and the result cannot be used in _get_attrank_cross_entropy - # # log(a_i^S) - # # attention_predicted_scores = _get_attentions(predicted_scores) - # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - # # the range of attention_predicted_scores is [0, 1) - # # this gives sigmoid [0.5, 0.732) - # # hence, it is not good to use in sigmoid_cross_entropy_with_logits either - - # Implemented the following instead - # _get_attentions is applied to labels - # softmax is applied to predicted_scores - reshaped_labels = tf.reshape(labels, [1, -1]) - attention_labels = _get_attentions(reshaped_labels) - reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1]) - attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores) - loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - return loss - - -def _get_attentions(raw_scores): - """ - Used in attention weights in AttRank loss - for a query/batch/batchPreidictionRequest - (a rectified softmax function) - """ - not_consider = tf.less_equal(raw_scores, 0) - mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - expon_labels = mask * tf.exp(raw_scores) - - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = math_fns.safe_div(expon_labels, expon_label_sum) - return attentions - - -def _get_attrank_cross_entropy(labels, logits): - # logits is not safe based on their satement - # do not use this function directly elsewhere - results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log(1 - logits) - results = (-1) * results - results = tf.reduce_mean(results) - return results - - -def get_listnet_loss(labels, predicted_scores, weights=None): - """ - Listwise learning-to-rank listet loss - Check paper https://www.microsoft.com/en-us/research/ - wp-content/uploads/2016/02/tr-2007-40.pdf - for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # top one probability is the same as softmax - labels_top_one_probs = _get_top_one_probs(labels) - predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) - - if weights is None: +def get_pair_loss( + pairwise_label_scores: tf.Tensor, pairwise_predicted_scores: tf.Tensor, params: dict +) -> tf.Tensor: + """ + Pairwise learning-to-rank ranknet loss. 
Check paper for more information: + https://www.microsoft.com/en-us/research/publication/learning-to-rank-using-gradient-descent/ + + Args: + pairwise_label_scores: + a dense tensor of shape [n_data, n_data] + pairwise_predicted_scores: + a dense tensor of shape [n_data, n_data] + n_data is the number of tweet candidates in a BatchPredictionRequest + params: + network parameters + mask: + full_mask or diag_mask + Returns: + average loss over pairs defined by the masks + """ + + n_data = tf.shape(pairwise_label_scores)[0] + if params.mask == "full_mask": + # full_mask that only covers pairs that have different labels + # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) + mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) + else: + # diag_mask that covers all pairs + # (only selfs/diags are 0s) + mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) + + # pairwise sigmoid_cross_entropy_with_logits loss + loss = tf.cond( + tf.equal(pair_count, 0), + lambda: 0.0, + lambda: _get_average_cross_entropy_loss( + pairwise_label_scores, pairwise_predicted_scores, mask, pair_count + ), + ) + return loss + + +def get_lambda_pair_loss( + pairwise_label_scores: tf.Tensor, + pairwise_predicted_scores: tf.Tensor, + params: dict, + swapped_ndcg: tf.Tensor, +) -> tf.Tensor: + """ + Pairwise learning-to-rank lambdarank loss faster than the previous gradient method + Note: this loss depends on ranknet cross-entropy delta NDCG is applied to ranknet cross-entropy + Hence, it is still a gradient descent method + + For more information, check paper: + http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.180.634&rep=rep1&type=pdf + + Args: + pairwise_label_scores: + a dense tensor of shape [n_data, n_data] + pairwise_predicted_scores: + a dense tensor of shape [n_data, n_data] + n_data is the number of tweet candidates in a BatchPredictionRequest + params: + network parameters + swapped_ndcg: + swapped ndcg of shape [n_data, n_data] + ndcg values when swapping each pair in the prediction ranking order + mask options: full_mask and diag_mask + + Returns: + average loss over pairs defined by the masks + """ + n_data = tf.shape(pairwise_label_scores)[0] + if params.mask == "full_mask": + # full_mask that only covers pairs that have different labels + # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) + mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) + else: + # diag_mask that covers all pairs + # (only selfs/diags are 0s) + mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) + + # pairwise sigmoid_cross_entropy_with_logits loss + loss = tf.cond( + tf.equal(pair_count, 0), + lambda: 0.0, + lambda: _get_average_cross_entropy_loss( + pairwise_label_scores, + pairwise_predicted_scores, + mask, + pair_count, + swapped_ndcg, + ), + ) + return loss + + +def _get_average_cross_entropy_loss( + pairwise_label_scores: tf.Tensor, + pairwise_predicted_scores: tf.Tensor, + mask: tf.Tensor, + pair_count: tf.Tensor, + swapped_ndcg: Optional[tf.Tensor] = None, +) -> tf.Tensor: + """Average the loss for a batchPredictionRequest based on a desired number of pairs""" + + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=pairwise_label_scores, + logits=pairwise_predicted_scores, + ) + loss = mask * loss + if swapped_ndcg is not None: + loss = loss * swapped_ndcg + loss = tf.reduce_sum(loss) / pair_count + return loss + + +def get_listmle_loss(labels: tf.Tensor, predicted_scores: tf.Tensor) -> tf.Tensor: + """ + listwise learning-to-rank 
listMLE loss + Note: Simplified MLE formula is used in here (omit the proof in here) + \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) + n is tf.shape(predicted_scores)[0] + + Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information + + Args: + labels: + a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: + a dense tensor of same shape and type as labels + + Returns: + average loss + """ + labels = tf.reshape(labels, [-1, 1]) + n_data = tf.shape(labels)[0] + predicted_scores = tf.reshape(predicted_scores, [-1, 1]) + + predicted_scores_ordered_by_labels = _get_ordered_predicted_scores( + labels, predicted_scores, n_data + ) + + loss = (-1) * tf.reduce_sum(predicted_scores) + # sum over 1 to n_data - 1 + temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) + temp = tf.reshape(temp, []) + loss = tf.add(loss, temp) + + exps = tf.exp(predicted_scores_ordered_by_labels) + exp_sum = tf.reduce_sum(exps) + # clip exp_sum for safer log + loss = tf.add(loss, math_fns.safe_log(exp_sum)) + + iteration = tf.constant(0) + + def _cond( + iteration: tf.Tensor, loss: tf.Tensor, exp_sum: tf.Tensor, exps: tf.Tensor + ) -> tf.Tensor: + return tf.less(iteration, n_data - 2) + + def _gen_loop_body() -> callable: + def loop_body( + iteration: tf.Tensor, loss: tf.Tensor, exp_sum: tf.Tensor, exps: tf.Tensor + ) -> tf.Tensor: + temp = tf.gather(exps, [iteration]) + temp = tf.reshape(temp, []) + exp_sum = tf.subtract(exp_sum, temp) + # clip exp_sum for safer log + loss = tf.add(loss, math_fns.safe_log(exp_sum)) + return tf.add(iteration, 1), loss, exp_sum, exps + + return loop_body + + iteration, loss, exp_sum, exps = tf.while_loop( + _cond, _gen_loop_body(), (iteration, loss, exp_sum, exps) + ) + loss = loss / tf.cast(n_data, dtype=tf.float32) + return loss + + +def _get_ordered_predicted_scores( + labels: tf.Tensor, predicted_scores: tf.Tensor, n_data: tf.Tensor +) -> tf.Tensor: + """Order predicted_scores based on sorted labels""" + + sorted_labels, ordered_labels_indices = tf.nn.top_k(tf.transpose(labels), k=n_data) + ordered_labels_indices = tf.transpose(ordered_labels_indices) + predicted_scores_ordered_by_labels = tf.gather_nd( + predicted_scores, ordered_labels_indices + ) + return predicted_scores_ordered_by_labels + + +def get_attrank_loss( + labels: tf.Tensor, predicted_scores: tf.Tensor, weights: Optional[tf.Tensor] = None +) -> tf.Tensor: + """ + Modified listwise learning-to-rank AttRank loss. 
For more info, check paper:
+    https://arxiv.org/abs/1804.05936
+
+    Note: there is an inconsistency between the paper statement and their public code
+
+    Args:
+        labels:
+            a dense tensor of shape [n_data, 1]
+            n_data is the number of tweet candidates in a BatchPredictionRequest
+        predicted_scores:
+            a dense tensor of same shape and type as labels
+        weights:
+            a dense tensor of the same shape as labels
+
+    Returns:
+        average loss
+    """
+    # The authors implemented the following, which is basically listnet
+    # attention_labels = _get_attentions(labels)
+    # attention_labels = tf.reshape(attention_labels, [1, -1])
+    # predicted_scores = tf.reshape(predicted_scores, [1, -1])
+    # loss = tf.reduce_mean(
+    #     tf.nn.softmax_cross_entropy_with_logits(
+    #         labels=attention_labels, logits=predicted_scores
+    #     )
+    # )
+
+    # The paper proposed the following
+    # attention_labels = _get_attentions(labels)
+    # # However the following line is wrong based on their statement
+    # # as _get_attentions can give 0 results when input < 0
+    # # and the result cannot be used in _get_attrank_cross_entropy
+    # # log(a_i^S)
+    # # attention_predicted_scores = _get_attentions(predicted_scores)
+    # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores)
+    # # the range of attention_predicted_scores is [0, 1)
+    # # this gives sigmoid [0.5, 0.732)
+    # # hence, it is not good to use in sigmoid_cross_entropy_with_logits either
+
+    # Implemented the following instead
+    # _get_attentions is applied to labels
+    # softmax is applied to predicted_scores
+    reshaped_labels = tf.reshape(labels, [1, -1])
+    attention_labels = _get_attentions(reshaped_labels)
+    reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1])
+    attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores)
+    loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores)
+    return loss
+
+
+def _get_attentions(raw_scores: tf.Tensor) -> tf.Tensor:
+    """
+    Used in attention weights in AttRank loss for a query/batch/batchPredictionRequest
+    (a rectified softmax function)
+    """
+
+    not_consider = tf.less_equal(raw_scores, 0)
+    mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32)
+    mask = tf.cast(mask, dtype=tf.float32)
+    expon_labels = mask * tf.exp(raw_scores)
+
+    expon_label_sum = tf.reduce_sum(expon_labels)
+    # expon_label_sum is safe as a denominator
+    attentions = math_fns.safe_div(expon_labels, expon_label_sum)
+    return attentions
+
+
+def _get_attrank_cross_entropy(labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
+    # logits is not safe based on their statement
+    # do not use this function directly elsewhere
+    results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log(
+        1 - logits
+    )
+    results = (-1) * results
+    results = tf.reduce_mean(results)
+    return results
+
+
+def get_listnet_loss(
+    labels: tf.Tensor,
+    predicted_scores: tf.Tensor,
+    weights: Optional[tf.Tensor] = None,
+) -> tf.Tensor:
+    """
+    Listwise learning-to-rank listnet loss.
For more information, check paper: + https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-2007-40.pdf + + Args: + labels: + a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: + a dense tensor of same shape and type as labels + weights: + a dense tensor of the same shape as labels + + Returns: + average loss + """ + # top one probability is the same as softmax + labels_top_one_probs = _get_top_one_probs(labels) + predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) + + if weights is None: + loss = tf.reduce_mean( + _get_listnet_cross_entropy( + labels=labels_top_one_probs, logits=predicted_scores_top_one_probs + ) + ) + return loss + loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs)) + _get_listnet_cross_entropy( + labels=labels_top_one_probs, logits=predicted_scores_top_one_probs + ) + * weights + ) / tf.reduce_mean(weights) return loss - loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs) * weights) / tf.reduce_mean(weights) - return loss - - -def _get_top_one_probs(labels): - """ - Used in listnet top-one probabilities - for a query/batch/batchPreidictionRequest - (essentially a softmax function) - """ - expon_labels = tf.exp(labels) - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = expon_labels / expon_label_sum - return attentions - - -def _get_listnet_cross_entropy(labels, logits): - """ - Used in listnet - cross entropy on top-one probabilities - between ideal/label top-one probabilities - and predicted/logits top-one probabilities - for a query/batch/batchPreidictionRequest - """ - # it is safe to use log on logits - # that come from _get_top_one_probs - # do not use this function directly elsewhere - results = (-1) * labels * math_fns.safe_log(logits) - return results - - -def get_pointwise_loss(labels, predicted_scores, weights=None): - """ - Pointwise learning-to-rank pointwise loss - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - if weights is None: + +def _get_top_one_probs(labels: tf.Tensor) -> tf.Tensor: + """ + Used in listnet top-one probabilities + for a query/batch/batchPredictionRequest + (essentially a softmax function) + """ + expon_labels = tf.exp(labels) + expon_label_sum = tf.reduce_sum(expon_labels) + # expon_label_sum is safe as a denominator + attentions = expon_labels / expon_label_sum + return attentions + + +def _get_listnet_cross_entropy(labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: + """ + Used in listnet + cross entropy on top-one probabilities + between ideal/label top-one probabilities + and predicted/logits top-one probabilities + for a query/batch/batchPredictionRequest + """ + # it is safe to use log on logits + # that come from _get_top_one_probs + # do not use this function directly elsewhere + results = (-1) * labels * math_fns.safe_log(logits) + return results + + +def get_pointwise_loss( + labels: tf.Tensor, + predicted_scores: tf.Tensor, + weights: Optional[tf.Tensor] = None, +) -> tf.Tensor: + """ + Pointwise learning-to-rank pointwise loss + + Args: + labels: + a dense tensor of shape [n_data, 1] + n_data 
is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: + a dense tensor of same shape and type as labels + weights: + a dense tensor of the same shape as labels + + Returns: + average loss + """ + + if weights is None: + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits( + labels=labels, logits=predicted_scores + ) + ) + return loss loss = tf.reduce_mean( - tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores)) + tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=predicted_scores) + * weights + ) / tf.reduce_mean(weights) return loss - loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores) * weights) / tf.reduce_mean(weights) - return loss diff --git a/twml/twml/contrib/utils/masks.py b/twml/twml/contrib/utils/masks.py index f3143dc52..3a76a7607 100644 --- a/twml/twml/contrib/utils/masks.py +++ b/twml/twml/contrib/utils/masks.py @@ -1,38 +1,53 @@ +from typing import Tuple + import tensorflow.compat.v1 as tf -def diag_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. - Returns: - values in pairwise_label_scores except the diagonal - each cell contains a paiwise score difference - only selfs/diags are 0s - """ - mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count - - -def full_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. - Returns: - values in pairwise_label_scores except pairs that have the same labels - each cell contains a paiwise score difference - all pairwise_label_scores = 0.5: selfs and same labels are 0s - """ - not_consider = tf.equal(pairwise_label_scores, 0.5) - mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.reduce_sum(mask) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count +def diag_mask( + n_data: tf.Tensor, pairwise_label_scores: tf.Tensor +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + This is so far only used in pairwise learning-to-rank + + Args: + n_data: + a int `Tensor`. + pairwise_label_scores: + a dense `Tensor` of shape [n_data, n_data]. + + Returns: + values in pairwise_label_scores except the diagonal each cell contains a + pairwise score difference only selfs/diags are 0s + """ + + mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) + mask = tf.cast(mask, dtype=tf.float32) + pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) + pair_count = tf.cast(pair_count, dtype=tf.float32) + return mask, pair_count + + +def full_mask( + n_data: tf.Tensor, pairwise_label_scores: tf.Tensor +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + This is so far only used in pairwise learning-to-rank + + Args: + n_data: + a int `Tensor`. + pairwise_label_scores: + a dense `Tensor` of shape [n_data, n_data]. 
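+
+    A sketch with hypothetical scores (0.5 marks self-pairs and equal labels):
+
+    >>> scores = tf.constant([[0.5, 1.0], [0.0, 0.5]])
+    >>> mask, pair_count = full_mask(2, scores)
+    >>> # mask zeroes the 0.5 cells, so pair_count evaluates to 2.0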
+ + Returns: + values in pairwise_label_scores except pairs that have the same labels + each cell contains a pairwise score difference + all pairwise_label_scores = 0.5: selfs and same labels are 0s + """ + + not_consider = tf.equal(pairwise_label_scores, 0.5) + mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) + mask = tf.cast(mask, dtype=tf.float32) + pair_count = tf.reduce_sum(mask) + pair_count = tf.cast(pair_count, dtype=tf.float32) + return mask, pair_count diff --git a/twml/twml/contrib/utils/math_fns.py b/twml/twml/contrib/utils/math_fns.py index 2d9e72282..924b8968e 100644 --- a/twml/twml/contrib/utils/math_fns.py +++ b/twml/twml/contrib/utils/math_fns.py @@ -1,171 +1,226 @@ +from typing import Optional, Union + import tensorflow.compat.v1 as tf from tensorflow.python.ops import array_ops, math_ops # Copied from metrics_impl.py # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/metrics_impl.py#L216 -def safe_div(numerator, denominator, name=None): - """ - Example usage: calculating NDCG = DCG / IDCG to handle cases when - IDCG = 0 returns 0 instead of Infinity - Do not use this dividing funciton unless it makes sense to your problem - Divides two tensors element-wise, returns 0 if the denominator is <= 0. - Args: - numerator: a real `Tensor`. - denominator: a real `Tensor`, with dtype matching `numerator`. - name: Name for the returned op. - Returns: - 0 if `denominator` <= 0, else `numerator` / `denominator` - """ - t = math_ops.truediv(numerator, denominator) - zero = array_ops.zeros_like(t, dtype=denominator.dtype) - condition = math_ops.greater(denominator, zero) - zero = math_ops.cast(zero, t.dtype) - return array_ops.where(condition, t, zero, name=name) - - -def cal_ndcg(label_scores, predicted_scores, top_k_int=1): - """ - Calculate NDCG score for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds DCG / IDCG. - """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - dcg = _dcg_idcg(predicted_relevance, cg_discount) - idcg = _dcg_idcg(sorted_relevance, cg_discount) - # the ndcg score of the batch - # idcg is 0 if label_scores are all 0 - ndcg = safe_div(dcg, idcg, 'one_ndcg') - return ndcg - - -def cal_swapped_ndcg(label_scores, predicted_scores, top_k_int): - """ - Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds swapped NDCG by . 
- """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - # cg_discount is safe as a denominator - dcg_k = predicted_relevance / cg_discount - dcg = tf.reduce_sum(dcg_k) - - idcg_k = sorted_relevance / cg_discount - idcg = tf.reduce_sum(idcg_k) - - ndcg = safe_div(dcg, idcg, 'ndcg_in_lambdarank_training') - - # remove the gain from label i then add the gain from label j - tiled_ij = tf.tile(dcg_k, [1, top_k_int]) - new_ij = (predicted_relevance / tf.transpose(cg_discount)) - - tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1]) - new_ji = tf.transpose(predicted_relevance) / cg_discount - - # if swap i and j, remove the stale cg for i, then add the new cg for i, - # remove the stale cg for j, and then add the new cg for j - new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji - - new_ndcg = safe_div(new_dcg, idcg, 'new_ndcg_in_lambdarank_training') - swapped_ndcg = tf.abs(ndcg - new_ndcg) - return swapped_ndcg - - -def _dcg_idcg(relevance_scores, cg_discount): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - relevance_scores: a real `Tensor`. - cg_discount: a real `Tensor`, with dtype matching relevance_scores - Returns: - a `Tensor` that holds \\sum_{i=1}^k \frac{relevance_scores_k}{cg_discount} - """ - # cg_discount is safe - dcg_k = relevance_scores / cg_discount - return tf.reduce_sum(dcg_k) - - -def _get_ranking_orders(label_scores, predicted_scores, top_k_int=1): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: an integer or an int `Tensor`. - Returns: - two `Tensors` that hold sorted_labels: the ground truth relevance socres - and predicted_order: relevance socres based on sorted predicted_scores - """ - # sort predictions_scores and label_scores - # size [batch_size/num of DataRecords, 1] - label_scores = tf.reshape(label_scores, [-1, 1]) - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - # sorted_labels contians the relevance scores of the correct order - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(label_scores), k=top_k_int) - sorted_labels = tf.transpose(sorted_labels) - # sort predicitons and use the indices to obtain the relevance scores of the predicted order - sorted_predictions, ordered_predictions_indices = tf.nn.top_k( - tf.transpose(predicted_scores), k=top_k_int) - ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices) - # predicted_order contians the relevance scores of the predicted order - predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels) - return sorted_labels, predicted_order - - -def _get_cg_discount(top_k_int=1): - r""" - Calculate discounted gain factor for ranking position till top_k_int - Args: - top_k_int: An int or an int `Tensor`. 
- Returns: - a `Tensor` that holds \log_{2}(i + 1), i \in [1, k] - """ - log_2 = tf.log(tf.constant(2.0, dtype=tf.float32)) - # top_k_range needs to start from 1 to top_k_int - top_k_range = tf.range(top_k_int) + 1 - top_k_range = tf.reshape(top_k_range, [-1, 1]) - # cast top_k_range to float - top_k_range = tf.cast(top_k_range, dtype=tf.float32) - cg_discount = tf.log(top_k_range + 1.0) / log_2 - return cg_discount - - -def _get_relevance_scores(scores): - return 2 ** scores - 1 - - -def safe_log(raw_scores, name=None): - """ - Calculate log of a tensor, handling cases that - raw_scores are close to 0s - Args: - raw_scores: An float `Tensor`. - Returns: - A float `Tensor` that hols the safe log base e of input - """ - epsilon = 1E-8 - clipped_raw_scores = tf.maximum(raw_scores, epsilon) - return tf.log(clipped_raw_scores) +def safe_div( + numerator: tf.Tensor, denominator: tf.Tensor, name: Optional[str] = None +) -> tf.Tensor: + """ + Example usage: calculating NDCG = DCG / IDCG to handle cases when + IDCG = 0 returns 0 instead of Infinity + Do not use this dividing function unless it makes sense to your problem + Divides two tensors element-wise, returns 0 if the denominator is <= 0. + + Args: + numerator: + a real `Tensor`. + denominator: + a real `Tensor`, with dtype matching `numerator`. + name: + Name for the returned op. + + Returns: + 0 if `denominator` <= 0, else `numerator` / `denominator` + """ + t = math_ops.truediv(numerator, denominator) + zero = array_ops.zeros_like(t, dtype=denominator.dtype) + condition = math_ops.greater(denominator, zero) + zero = math_ops.cast(zero, t.dtype) + return array_ops.where(condition, t, zero, name=name) + + +def cal_ndcg( + label_scores: tf.Tensor, + predicted_scores: tf.Tensor, + top_k_int: Union[int, tf.Tensor], +) -> tf.Tensor: + """ + Calculate NDCG score for top_k_int ranking positions + + Args: + label_scores: + a real `Tensor`. + predicted_scores: + a real `Tensor`, with dtype matching label_scores + top_k_int: + An int or an int `Tensor`. + + Returns: + a `Tensor` that holds DCG / IDCG. + """ + + sorted_labels, predicted_order = _get_ranking_orders( + label_scores, predicted_scores, top_k_int=top_k_int + ) + + predicted_relevance = _get_relevance_scores(predicted_order) + sorted_relevance = _get_relevance_scores(sorted_labels) + + cg_discount = _get_cg_discount(top_k_int) + + dcg = _dcg_idcg(predicted_relevance, cg_discount) + idcg = _dcg_idcg(sorted_relevance, cg_discount) + # the ndcg score of the batch + # idcg is 0 if label_scores are all 0 + ndcg = safe_div(dcg, idcg, "one_ndcg") + return ndcg + + +def cal_swapped_ndcg( + label_scores: tf.Tensor, + predicted_scores: tf.Tensor, + top_k_int: Union[int, tf.Tensor], +) -> tf.Tensor: + """ + Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions + + Args: + label_scores: + a real `Tensor`. + predicted_scores: + a real `Tensor`, with dtype matching label_scores + top_k_int: + An int or an int `Tensor`. + + Returns: + a `Tensor` that holds swapped NDCG by . 
+    """
+
+    sorted_labels, predicted_order = _get_ranking_orders(
+        label_scores, predicted_scores, top_k_int=top_k_int
+    )
+
+    predicted_relevance = _get_relevance_scores(predicted_order)
+    sorted_relevance = _get_relevance_scores(sorted_labels)
+
+    cg_discount = _get_cg_discount(top_k_int)
+
+    # cg_discount is safe as a denominator
+    dcg_k = predicted_relevance / cg_discount
+    dcg = tf.reduce_sum(dcg_k)
+
+    idcg_k = sorted_relevance / cg_discount
+    idcg = tf.reduce_sum(idcg_k)
+
+    ndcg = safe_div(dcg, idcg, "ndcg_in_lambdarank_training")
+
+    # remove the gain from label i then add the gain from label j
+    tiled_ij = tf.tile(dcg_k, [1, top_k_int])
+    new_ij = predicted_relevance / tf.transpose(cg_discount)
+
+    tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1])
+    new_ji = tf.transpose(predicted_relevance) / cg_discount
+
+    # if swap i and j, remove the stale cg for i, then add the new cg for i,
+    # remove the stale cg for j, and then add the new cg for j
+    new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji
+
+    new_ndcg = safe_div(new_dcg, idcg, "new_ndcg_in_lambdarank_training")
+    swapped_ndcg = tf.abs(ndcg - new_ndcg)
+    return swapped_ndcg
+
+
+def _dcg_idcg(relevance_scores: tf.Tensor, cg_discount: tf.Tensor) -> tf.Tensor:
+    """
+    Calculate DCG scores for top_k_int ranking positions
+
+    Args:
+        relevance_scores:
+            a real `Tensor`.
+        cg_discount:
+            a real `Tensor`, with dtype matching relevance_scores
+
+    Returns:
+        a `Tensor` that holds \\sum_{i=1}^k \\frac{relevance_scores_k}{cg_discount}
+    """
+
+    # cg_discount is safe
+    dcg_k = relevance_scores / cg_discount
+    return tf.reduce_sum(dcg_k)
+
+
+def _get_ranking_orders(
+    label_scores: tf.Tensor,
+    predicted_scores: tf.Tensor,
+    top_k_int: Union[int, tf.Tensor],
+) -> tf.Tensor:
+    """
+    Sort the labels by ground-truth score and by predicted score to obtain the
+    two ranking orders used for the top_k_int ranking positions
+
+    Args:
+        label_scores:
+            a real `Tensor`.
+        predicted_scores:
+            a real `Tensor`, with dtype matching label_scores
+        top_k_int:
+            an integer or an int `Tensor`.
+
+    Returns:
+        two `Tensors` that hold sorted_labels: the ground truth relevance scores
+        and predicted_order: relevance scores based on sorted predicted_scores
+    """
+
+    # sort predicted_scores and label_scores
+    # size [batch_size/num of DataRecords, 1]
+    label_scores = tf.reshape(label_scores, [-1, 1])
+    predicted_scores = tf.reshape(predicted_scores, [-1, 1])
+    # sorted_labels contains the relevance scores of the correct order
+    sorted_labels, ordered_labels_indices = tf.nn.top_k(
+        tf.transpose(label_scores), k=top_k_int
+    )
+    sorted_labels = tf.transpose(sorted_labels)
+    # sort predictions and use the indices to obtain the relevance scores of the predicted order
+    sorted_predictions, ordered_predictions_indices = tf.nn.top_k(
+        tf.transpose(predicted_scores), k=top_k_int
+    )
+    ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices)
+    # predicted_order contains the relevance scores of the predicted order
+    predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels)
+    return sorted_labels, predicted_order
+
+
+def _get_cg_discount(top_k_int: int = 1):
+    r"""
+    Calculate discounted gain factor for ranking position till top_k_int
+
+    Args:
+        top_k_int: An int or an int `Tensor`.
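The swap matrix computed above is the LambdaRank pair weighting: entry (i, j) is |NDCG - NDCG_after_swap| when the documents at ranks i and j trade places. A NumPy sketch of the same broadcasting arithmetic (illustrative; it assumes top_k_int equals the full list length, and the gains are made up):

.. code-block:: python

    import numpy as np

    gains = 2.0 ** np.array([[1.0], [3.0], [2.0]]) - 1.0  # [k, 1], predicted order
    disc = np.log2(np.arange(1, 4) + 1.0)[:, None]        # [k, 1] cg_discount

    dcg_k = gains / disc                                  # per-position contributions
    dcg = dcg_k.sum()
    idcg = (np.sort(gains, axis=0)[::-1] / disc).sum()

    # Swapping positions i and j removes both stale contributions and re-adds
    # each gain at the other position's discount (broadcasting replaces tf.tile).
    new_dcg = dcg - dcg_k - dcg_k.T + gains / disc.T + gains.T / disc
    swapped_ndcg = np.abs(dcg / idcg - new_dcg / idcg)    # [k, k], zero diagonal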
+
+    Returns:
+        a `Tensor` that holds \log_{2}(i + 1), i \in [1, k]
+    """
+
+    log_2 = tf.log(tf.constant(2.0, dtype=tf.float32))
+    # top_k_range needs to start from 1 to top_k_int
+    top_k_range = tf.range(top_k_int) + 1
+    top_k_range = tf.reshape(top_k_range, [-1, 1])
+    # cast top_k_range to float
+    top_k_range = tf.cast(top_k_range, dtype=tf.float32)
+    cg_discount = tf.log(top_k_range + 1.0) / log_2
+    return cg_discount
+
+
+def _get_relevance_scores(scores: tf.Tensor) -> tf.Tensor:
+    """Calculate relevance scores for top_k_int ranking positions"""
+
+    return 2**scores - 1
+
+
+def safe_log(raw_scores: tf.Tensor, name: Optional[str] = None) -> tf.Tensor:
+    """
+    Calculate log of a tensor, handling cases where raw_scores is close to 0
+
+    Args:
+        raw_scores: A float `Tensor`.
+
+    Returns:
+        A float `Tensor` that holds the safe log base e of the input
+    """
+
+    epsilon = 1e-8
+    clipped_raw_scores = tf.maximum(raw_scores, epsilon)
+    return tf.log(clipped_raw_scores)
diff --git a/twml/twml/contrib/utils/normalizer.py b/twml/twml/contrib/utils/normalizer.py
index a6a7035b8..0b9a94602 100644
--- a/twml/twml/contrib/utils/normalizer.py
+++ b/twml/twml/contrib/utils/normalizer.py
@@ -1,39 +1,48 @@
 import tensorflow.compat.v1 as tf
+
 from twml.contrib.utils import math_fns


-def mean_max_normalizaiton(dense_tensor):
-  """
-  In-batch normalization
-  Args:
-    dense_tensor: A dense `Tensor`.
-  Returns:
-    (dense_tensor - mean) / abs(max value)
-  Note:
-    when dense_tensor is of size [1, ?] it will give 0
-    If this is not what you want handle it outside the function
-  """
-  dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0])
-  dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0]))
-  dense_tensor = math_fns.safe_div(dense_tensor - dense_mean, dense_abs_max,
-                                   'mean_max_normalization_in_batch')
-  return dense_tensor
-
-
-def standard_normalizaiton(dense_tensor):
-  """
-  In-batch normalization
-  z-normalization or standard_normalization in batch
-  Args:
-    dense_tensor: A dense `Tensor`.
-  Returns:
-    (dense_tensor - mean) / variance
-  Note:
-    when dense_tensor is of size [1, ?] it will give 0
-    If this is not what you want handle it outside the function
-  """
-  epsilon = 1E-7
-  dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0)
-  # using epsilon is safer than math_fns.safe_div in here
-  dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon)
-  return dense_tensor
+def mean_max_normalizaiton(dense_tensor: tf.Tensor) -> tf.Tensor:
+    """
+    In-batch normalization
+
+    Args:
+        dense_tensor: A dense `Tensor`.
+
+    Returns:
+        (dense_tensor - mean) / abs(max value)
+
+    Note:
+        when dense_tensor is of size [1, ?] it will give 0
+        If this is not what you want, handle it outside the function
+    """
+
+    dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0])
+    dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0]))
+    dense_tensor = math_fns.safe_div(
+        dense_tensor - dense_mean, dense_abs_max, "mean_max_normalization_in_batch"
+    )
+    return dense_tensor
+
+
+def standard_normalizaiton(dense_tensor: tf.Tensor) -> tf.Tensor:
+    """
+    In-batch normalization
+    z-normalization or standard_normalization in batch
+
+    Args:
+        dense_tensor: A dense `Tensor`.
+
+    Returns:
+        (dense_tensor - mean) / variance
+
+    Note:
+        when dense_tensor is of size [1, ?] it will give 0
+        If this is not what you want, handle it outside the function
+    """
+    epsilon = 1e-7
+    dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0)
+    # using epsilon is safer than math_fns.safe_div in here
+    dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon)
+    return dense_tensor
diff --git a/twml/twml/contrib/utils/scores.py b/twml/twml/contrib/utils/scores.py
index 84e792c13..641b3fbc9 100644
--- a/twml/twml/contrib/utils/scores.py
+++ b/twml/twml/contrib/utils/scores.py
@@ -1,33 +1,35 @@
 import tensorflow.compat.v1 as tf


-def get_pairwise_scores(tensor_input):
-  """
-  This is so far used in pariwise learning-to-rank
-
-  Arguments:
-    tensor_input: a dense `Tensor` of shape [n_data, 1]
-      n_data is the number of teet candidates
-
-  Returns:
-    pairwise scores: a dense `Tensor` of shape [n_data, n_data].
-  """
-  return tensor_input - tf.transpose(tensor_input)
-
-
-def get_pairwise_label_scores(labels):
-  """
-  This is so far used in pariwise learning-to-rank
-  Args:
-    labels: a dense `Tensor` of shape [n_data, 1]
-      n_data is the number of teet candidates
-  Returns:
-    pairwise label scores: a dense `Tensor` of shape [n_data, n_data].
-      each value is within [0, 1]
-  """
-  # raw pairwise label scores/differences
-  pairwise_label_scores = get_pairwise_scores(labels)
-  # sanity check to make sure values in differences_ij are [-1, 1]
-  differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0)
-  # values in pairwise_label_scores are within [0, 1] for cross entropy
-  return (1.0 / 2.0) * (1.0 + differences_ij)
+def get_pairwise_scores(tensor_input: tf.Tensor) -> tf.Tensor:
+    """
+    This is so far used in pairwise learning-to-rank
+
+    Args:
+        tensor_input: a dense `Tensor` of shape [n_data, 1]
+            n_data is the number of tweet candidates
+
+    Returns:
+        pairwise scores: a dense `Tensor` of shape [n_data, n_data].
+    """
+
+    return tensor_input - tf.transpose(tensor_input)
+
+
+def get_pairwise_label_scores(labels: tf.Tensor) -> tf.Tensor:
+    """
+    This is so far used in pairwise learning-to-rank
+    Args:
+        labels: a dense `Tensor` of shape [n_data, 1]
+            n_data is the number of tweet candidates
+    Returns:
+        pairwise label scores: a dense `Tensor` of shape [n_data, n_data].
+            each value is within [0, 1]
+    """
+
+    # raw pairwise label scores/differences
+    pairwise_label_scores = get_pairwise_scores(labels)
+    # sanity check to make sure values in differences_ij are [-1, 1]
+    differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0)
+    # values in pairwise_label_scores are within [0, 1] for cross entropy
+    return (1.0 / 2.0) * (1.0 + differences_ij)
diff --git a/twml/twml/contrib/utils/similarities.py b/twml/twml/contrib/utils/similarities.py
index 212065f88..3107ab8f6 100644
--- a/twml/twml/contrib/utils/similarities.py
+++ b/twml/twml/contrib/utils/similarities.py
@@ -1,17 +1,21 @@
 import tensorflow.compat.v1 as tf


-def cosine_similarity(x1, x2, axis):
-  """
-  cosine similarity of two tensors.
+def cosine_similarity(x1: tf.Tensor, x2: tf.Tensor, axis: int = 1) -> tf.Tensor:
+    """
+    cosine similarity of two tensors.

-  Arguments:
-    x1:
-      A tf.Tensor
-    x2:
-      A tf.Tensor
-    axis: Dimension along which to normalize.
-  """
-  normalize_x1 = tf.nn.l2_normalize(x1, axis=axis)
-  normalize_x2 = tf.nn.l2_normalize(x2, axis=axis)
-  return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis)
+    Args:
+        x1:
+            A tf.Tensor
+        x2:
+            A tf.Tensor
+        axis:
+            Dimension along which to normalize.
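To see the mapping get_pairwise_label_scores performs, clipping the raw label difference to [-1, 1] and then affine-mapping it into [0, 1], here is the same computation in NumPy (a sketch; the labels are made up):

.. code-block:: python

    import numpy as np

    labels = np.array([[1.0], [0.0], [0.5]])  # n_data = 3 candidates
    diff = labels - labels.T                  # get_pairwise_scores
    diff = np.clip(diff, -1.0, 1.0)           # keep differences in [-1, 1]
    pairwise = 0.5 * (1.0 + diff)             # cross-entropy targets in [0, 1]
    # pairwise[i, j] > 0.5 when label i > label j, and == 0.5 on ties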
+ + Returns: + A tf.Tensor + """ + normalize_x1 = tf.nn.l2_normalize(x1, axis=axis) + normalize_x2 = tf.nn.l2_normalize(x2, axis=axis) + return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis) diff --git a/twml/twml/dataset.py b/twml/twml/dataset.py index 4356fdc7c..537caa316 100644 --- a/twml/twml/dataset.py +++ b/twml/twml/dataset.py @@ -2,371 +2,429 @@ This module implements custom tf.data.datasets for twml. """ import numbers +from typing import Callable, Generator, Iterator, List, Optional +import tensorflow.compat.v1 as tf from absl import logging from kazoo.client import KazooClient from libtwml import OPLIB -import tensorflow.compat.v1 as tf + from twml.constants import DEFAULT_ZOOKEEPER_BASE_ZNODE, DEFAULT_ZOOKEEPER_HOST class BlockFormatDataset(tf.data.Dataset): - """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" - - def __init__(self, filenames, compression_type="auto", buffer_size=1 << 20): + """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" + + def __init__( + self, + filenames: tf.Tensor, + compression_type: str = "auto", + buffer_size: int = 1 << 20, + ): + """ + Creates a ``BlockFormatDataset``. + + Args: + filenames: + A `tf.string` tensor containing one or more filenames. + compression_type: + A string specifying the compression type. + Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). + When compression_type == 'auto', it is inferred from file extension. + buffer_size: + Buffer size to be used during decompression. default: 1<<20. + """ + self._filenames = tf.convert_to_tensor( + filenames, dtype=tf.string, name="filenames" + ) + self._compression_type = tf.convert_to_tensor( + compression_type.lower(), name="compression_type" + ) + self._buffer_size = tf.convert_to_tensor( + buffer_size, dtype=tf.int64, name="buffer_size" + ) + # Parent class calls self._as_variant_tensor in init. So call this at the end. + super(BlockFormatDataset, self).__init__() + + def _as_variant_tensor(self) -> tf.Tensor: + """Create the resource handle for the dataset.""" + try: + block_format_dataset = __import__( + "libtwml_internal" + ).OPLIB.block_format_dataset + return block_format_dataset(self._filenames) + except ImportError: + block_format_dataset = OPLIB.block_format_dataset_v2 + return block_format_dataset( + self._filenames, self._compression_type, self._buffer_size + ) + + def _inputs(self) -> list: + return [] + + @property + def output_shapes(self) -> tf.TensorShape: + """Return output shapes""" + return tf.TensorShape([]) + + @property + def output_types(self) -> tf.DType: + """Return output types""" + return tf.string + + @property + def output_classes(self) -> tf.Tensor: + """Return output classes""" + return tf.Tensor + + +def downsample_dataset( + dataset: tf.data.Dataset, + sample_rate: numbers.Real, + rate_name: str = "rate", +) -> tf.data.Dataset: """ - Creates a ``BlockFormatDataset``. - - Args: - filenames: - A `tf.string` tensor containing one or more filenames. - compression_type: - A string specifying the compression type. - Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). - When compression_type == 'auto', it is inferred from file extension. - buffer_size: - Buffer size to be used during decompression. default: 1<<20. 
+ Downsample a tf.data.Dataset at sample_rate """ - self._filenames = tf.convert_to_tensor(filenames, dtype=tf.string, name="filenames") - self._compression_type = tf.convert_to_tensor(compression_type.lower(), name="compression_type") - self._buffer_size = tf.convert_to_tensor(buffer_size, dtype=tf.int64, name="buffer_size") - # Parent class calss self._as_variant_tensor in init. So call this at the end. - super(BlockFormatDataset, self).__init__() - - def _as_variant_tensor(self): + if sample_rate is None or sample_rate == 1.0: + return dataset + elif not isinstance(sample_rate, numbers.Real): + raise TypeError("dataset %s must be a real number" % rate_name) + elif sample_rate <= 0 or sample_rate > 1: + raise ValueError("dataset %s must be in range (0, 1])" % rate_name) + return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) + + +def _filenames_dataset( + files: List[str], + shards: Optional[int] = None, + shard_index: Optional[int] = None, +) -> tf.data.Dataset: """ - Create the resource handle for the dataset. + Get a tf.data.Dataset with file names from a list of files + Optionally shard the file list (see stream_block_format_dataset) """ - try: - block_format_dataset = __import__("libtwml_internal").OPLIB.block_format_dataset - return block_format_dataset(self._filenames) - except ImportError: - block_format_dataset = OPLIB.block_format_dataset_v2 - return block_format_dataset(self._filenames, self._compression_type, self._buffer_size) - - def _inputs(self): - return [] - - @property - def output_shapes(self): - """Return output shapes""" - return tf.TensorShape([]) - - @property - def output_types(self): - """Return output types""" - return tf.string - - @property - def output_classes(self): - """Return output classes""" - return tf.Tensor - - -def downsample_dataset(dataset, sample_rate, rate_name): - """ - Downsample a tf.data.Dataset at sample_rate - """ - if sample_rate is None or sample_rate == 1.0: - return dataset - elif not isinstance(sample_rate, numbers.Real): - raise TypeError("dataset %s must be a real number" % rate_name) - elif sample_rate <= 0 or sample_rate > 1: - raise ValueError("dataset %s must be in range (0, 1])" % rate_name) - return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) - + files = tf.data.Dataset.from_tensor_slices(files) -def _filenames_dataset(files, shards=None, shard_index=None): - """ - Get a tf.data.Dataset with file names from a list of files - Optionally shard the file list (see stream_block_format_dataset) - """ - files = tf.data.Dataset.from_tensor_slices(files) + if [shards, shard_index] != [None, None]: + logging.info( + "Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards) + ) + files = files.shard(num_shards=shards, index=shard_index) - if [shards, shard_index] != [None, None]: - logging.info("Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards)) - files = files.shard(num_shards=shards, index=shard_index) - - return files + return files def stream_block_format_dataset( - files, parse_fn, batch_size, num_threads, - shuffle=True, repeat=False, - block_length=None, part_file_parallelism=None, file_shuffle_size=None, - record_shuffle_size=None, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, prefetch_size=2, - shards=None, shard_index=None, shuffle_files=True, interleave=True): - """ - Helper function to stream a list of part files. - - Args: - files: - List of input files which will create a dataset. 
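The two helpers above compose as follows; a short usage sketch (the file names are hypothetical, and the calls assume this module's own namespace):

.. code-block:: python

    # Worker 1 of 4 sees every 4th file, then keeps ~20% of those files,
    # dropping each file independently (Bernoulli with keep probability 0.2).
    files = _filenames_dataset(
        ["part-%05d" % i for i in range(16)], shards=4, shard_index=1
    )
    files = downsample_dataset(files, 0.2, "parts_downsampling_rate")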
- parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - num_threads: - Number of threads working on the data in parallel. - shuffle: - Shuffle records within each file using ``record_shuffle_size``. Defaults to True. - repeat: - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset - (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). - block_length (optional): - Number of consecutive records to pull from a single part file. - Defaults to batch_size. - part_file_parallelism (optional): - Number of part files to read from in parallel. Once a part file is completely read, it will - be replaced by the next part file in the part file list. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - file_shuffle_size (optional): - the buffer_size used for shuffling of the list of files. - Defaults to 1000. For example, if you have 2000 files, the first - 1000 files are shuffled together, iterated through, then the next 1000 files are shuffled - and iterated through. - record_shuffle_size (optional): - the ``buffer_size`` used for shuffling records in each thread. - Defaults to ``batch_size * 8`` records. - dataset_fn (optional): - A function of that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - Note that this argument is only useful in conjunction with a [train,eval]_steps of -1 - (that is, when the entire dataset is used). Furthermore, note that even in this case, each - epoch will see a different set of part files. This is because new part files are re-sampled - every epoch. In other words, this argument is only provided for backwards compatibility with - DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate) - instead. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation and other - techniques that require each worker to train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. 
- - Returns: - tf.data.DataSet of batches of HashedDataRecord resource handles decoded and streamed online. - """ - # Creating a dataset from an input directory - - files = _filenames_dataset(files, shards=shards, shard_index=shard_index) - - file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000 - record_shuffle_size = record_shuffle_size if record_shuffle_size is not None else (batch_size * 8) - block_length = block_length if block_length is not None else batch_size - - logging.info("NUM_THREADS: %d", num_threads) - - if repeat: - files = files.repeat() - - if shuffle_files: - # Randomly shuffle the files list. - files = files.shuffle(buffer_size=file_shuffle_size) - - # Downsample parts files - files = downsample_dataset(files, parts_downsampling_rate, "parts_downsampling_rate") - - # Interleave the result from BlockFormatDataset - # block_length == batch_size results in batch_size records being read from a single file. - def map_fn(filenames): - '''function that maps each filename to a BlockFormatDataset''' - # reach each file using BlockFormatDataset - dataset = BlockFormatDataset(filenames) - - # early prefetching can sometimes improve performance (like on GCS) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - - # Shuffling before repeating ensures strong ordering. - if shuffle: - dataset = dataset.shuffle(buffer_size=record_shuffle_size) - - return dataset - - if interleave: - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - dataset = files.interleave( - map_fn, cycle_length=part_file_parallelism, block_length=block_length, num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, keep_rate, "keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).prefetch(prefetch_size) + files: List[str], + parse_fn: Callable[[tf.Tensor], tf.Tensor], + batch_size: int, + num_threads: int, + shuffle: bool = True, + repeat: bool = False, + block_length: Optional[int] = None, + part_file_parallelism: Optional[int] = None, + file_shuffle_size: Optional[int] = None, + record_shuffle_size: Optional[int] = None, + dataset_fn: Optional[Callable[[tf.data.Dataset], tf.data.Dataset]] = None, + keep_rate: Optional[float] = None, + parts_downsampling_rate: Optional[float] = None, + prefetch_size: int = 2, + shards: int = None, + shard_index: int = None, + shuffle_files: bool = True, + interleave: bool = True, +) -> tf.data.Dataset: + """ + Helper function to stream a list of part files. - return dataset_fn(dataset, parse_fn, batch_size) + Args: + files: + List of input files which will create a dataset. + parse_fn: + A function that takes a byte tensor containing a data record and decodes it. + batch_size: + The batch size for each step. + num_threads: + Number of threads working on the data in parallel. + shuffle: + Shuffle records within each file using ``record_shuffle_size``. Defaults to True. + repeat: + Repeat the dataset indefinitely. Defaults to False. + Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset + (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). + block_length (optional): + Number of consecutive records to pull from a single part file. + Defaults to batch_size. 
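Before the remaining arguments, a quick illustration of the block-reading semantics just described for ``block_length`` (an illustrative eager-mode sketch, not part of the diff; see also the ``part_file_parallelism`` notes that follow):

.. code-block:: python

    import tensorflow.compat.v1 as tf

    tf.enable_eager_execution()

    files = tf.data.Dataset.from_tensor_slices(
        [["a1", "a2", "a3", "a4"], ["b1", "b2", "b3", "b4"]]
    )
    # cycle_length=2 reads both "files" at once; block_length=2 takes two
    # consecutive records from one file before moving to the next.
    ds = files.interleave(
        tf.data.Dataset.from_tensor_slices, cycle_length=2, block_length=2
    )
    print([r.numpy() for r in ds])  # a1, a2, b1, b2, a3, a4, b3, b4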
+        part_file_parallelism (optional):
+            Number of part files to read from in parallel. Once a part file is completely read, it will
+            be replaced by the next part file in the part file list.
+            ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies
+            the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or
+            equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand,
+            if ``part_file_parallelism`` is smaller than ``num_threads``, it is very likely that the reader
+            thread pool will be underutilized, since it can never be the case that every reader thread has
+            a part file to read from.
+        file_shuffle_size (optional):
+            the buffer_size used for shuffling the list of files.
+            Defaults to 100000. When the buffer is smaller than the number of files,
+            e.g. a buffer of 1000 with 2000 files, the first 1000 files are shuffled
+            together, iterated through, then the next 1000 files are shuffled
+            and iterated through.
+        record_shuffle_size (optional):
+            the ``buffer_size`` used for shuffling records in each thread.
+            Defaults to ``batch_size * 8`` records.
+        dataset_fn (optional):
+            A function that modifies the dataset after it reads the different interleaved part files.
+            Defaults to:
+
+            .. code-block:: python
+
+                def dataset_fn(dataset, parse_fn, batch_size):
+                    return dataset.batch(batch_size).map(parse_fn, 1)
+
+        keep_rate (optional):
+            A float value in (0.0, 1.0] indicating that records are dropped according to the Bernoulli
+            distribution with p = 1 - keep_rate.
+            Defaults to None (no records dropped).
+        parts_downsampling_rate (optional):
+            A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files.
+            For example, a value of 0.2 means only 20 percent of part files become part of the dataset.
+            Note that this argument is only useful in conjunction with a [train,eval]_steps of -1
+            (that is, when the entire dataset is used). Furthermore, note that even in this case, each
+            epoch will see a different set of part files. This is because new part files are re-sampled
+            every epoch. In other words, this argument is only provided for backwards compatibility with
+            DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate)
+            instead.
+        shards (optional):
+            Number of partitions to shard the dataset into. This is useful for codistillation and other
+            techniques that require each worker to train on disjoint partitions of the dataset.
+            The dataset is not sharded by default.
+        shard_index (optional):
+            Which partition of the dataset to use if ``shards`` is set.
+        shuffle_files (optional):
+            Shuffle the list of files. Defaults to True.
+            When False, files are iterated in the order they are passed in.
+        interleave (optional):
+            Interleave records from multiple files in parallel. Defaults to True.
+
+    Returns:
+        tf.data.DataSet of batches of HashedDataRecord resource handles decoded and streamed online.
+    """
+    # Creating a dataset from an input directory
+
+    files = _filenames_dataset(files, shards=shards, shard_index=shard_index)
+
+    file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000
+    record_shuffle_size = (
+        record_shuffle_size if record_shuffle_size is not None else (batch_size * 8)
+    )
+    block_length = block_length if block_length is not None else batch_size
+
+    logging.info("NUM_THREADS: %d", num_threads)
+
+    if repeat:
+        files = files.repeat()
+
+    if shuffle_files:
+        # Randomly shuffle the files list.
+        files = files.shuffle(buffer_size=file_shuffle_size)
+
+    # Downsample parts files
+    files = downsample_dataset(
+        files, parts_downsampling_rate, "parts_downsampling_rate"
+    )
+
+    # Interleave the result from BlockFormatDataset
+    # block_length == batch_size results in batch_size records being read from a single file.
+    def map_fn(filenames):
+        """function that maps each filename to a BlockFormatDataset"""
+        # read each file using BlockFormatDataset
+        dataset = BlockFormatDataset(filenames)
+
+        # early prefetching can sometimes improve performance (like on GCS)
+        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+
+        # Shuffling before repeating ensures strong ordering.
+        if shuffle:
+            dataset = dataset.shuffle(buffer_size=record_shuffle_size)
+
+        return dataset
+
+    if interleave:
+        part_file_parallelism = (
+            num_threads if part_file_parallelism is None else part_file_parallelism
+        )
+        dataset = files.interleave(
+            map_fn,
+            cycle_length=part_file_parallelism,
+            block_length=block_length,
+            num_parallel_calls=num_threads,
+        )
+    else:
+        dataset = files.flat_map(map_fn)
+
+    # Downsample DataRecords
+    dataset = downsample_dataset(dataset, keep_rate, "keep_rate")
+
+    if dataset_fn is None:
+        # Create a batch of datarecords and decode them
+        return (
+            dataset.batch(batch_size)
+            .map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+            .prefetch(prefetch_size)
+        )
+
+    return dataset_fn(dataset, parse_fn, batch_size)
+
+
+def cx_zk_path(path: str) -> str:
+    if path is None:
+        raise ValueError(
+            "Path for zookeeper dataset pointer is None. You must specify a path."
+        )
+    return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path])
+    logging.info(f"Zookeeper path is: {return_path}")
+    return return_path


-def cx_zk_path(path):
-  if path is None:
-    raise ValueError("Path for zookeeper dataset pointer is None. You must specify a path.")
-  return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path])
-  logging.info("Zookeeper path is: {}".format(return_path))
-  return return_path


+def zookeeper_ordered_dataset(
+    files: List[str],
+    parse_fn: Callable[[tf.Tensor], tf.Tensor],
+    batch_size: int,
+    zk_counter_path: str,
+    repeat: bool = False,
+    num_threads: int = 2,
+    block_length: Optional[int] = None,
+    part_file_parallelism: Optional[int] = None,
+    batch_shuffle_size: Optional[int] = None,
+    file_keep_rate: Optional[float] = None,
+    record_keep_rate: Optional[float] = None,
+    prefetch_size: int = 2,
+    interleave: bool = False,
+    dataset_fn: Optional[
+        Callable[[tf.data.Dataset, Callable[[tf.Tensor], tf.Tensor], int], tf.data.Dataset]
+    ] = None,
+    verbose: bool = False,
+) -> tf.data.Dataset:
+    """
+    Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of
+    which file to read, and to coordinate multiple workers.
+
+    Args:
+        files:
+            ordered list of (typically HDFS) filenames. This must remain consistent
+            between different workers, and between worker restarts (e.g. in the case
+            of instance failure or preemption).
+            To ensure this remains consistent, consider using the --train.files_list
+            option from DataRecordTrainer.
+        parse_fn:
+            A function that takes a byte tensor containing a datarecord and decodes it.
+        batch_size:
+            The batch size for each step.
+        zk_counter_path:
+            Path under the root node for the underlying zookeeper shared counter that
+            is used to coordinate distributed iteration over the list of files.
+            Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`.
+        repeat:
+            Default False. Set True to repeat over the files forever.
+        num_threads:
+            Default 2. Number of threads working on the data in parallel.
+            Only used if interleave=True.
+        block_length:
+            Default None. Number of consecutive records to pull from a single part file.
+            If None, then block_length=batch_size will be used.
+            Only used if interleave=True.
+        part_file_parallelism:
+            Default None. Number of part files to read from in parallel. Once a part file is completely
+            read, it will be replaced by the next part file indicated by the zookeeper counter.
+            Only used if interleave=True.
+            ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies
+            the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or
+            equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand,
+            if ``part_file_parallelism`` is smaller than ``num_threads``, it is very likely that the reader
+            thread pool will be underutilized, since it can never be the case that every reader thread has
+            a part file to read from.
+        batch_shuffle_size:
+            Default None. Size of shuffle buffer, for shuffling that will be applied after batching.
+            If None, then batches will not be shuffled. Ignored if dataset_fn is provided.
+        file_keep_rate:
+            Default None. Fraction of files to keep, or None to keep all files.
+        record_keep_rate:
+            Default None. Fraction of records to keep, or None to keep all records.
+        prefetch_size:
+            Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided.
+        interleave:
+            Default False. Set True to use tf.data.Dataset.interleave rather than flat_map.
+        dataset_fn:
+            A function that is applied to the dataset of individual records, after
+            these have been read from the parts files.
+            If ``None`` (the default), the behavior will be as though dataset_fn were set to:
+
+            .. code-block:: python
+
+                def dataset_fn(dataset, parse_fn, batch_size):
+                    dataset = dataset.batch(batch_size)
+                    dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE)
+                    if batch_shuffle_size:
+                        dataset = dataset.shuffle(batch_shuffle_size)
+                    return dataset.prefetch(prefetch_size)
+
+        verbose:
+            Default False. Set True to log the names of files loaded by TF.
+    """
+    block_length = batch_size if block_length is None else block_length
+    part_file_parallelism = (
+        num_threads if part_file_parallelism is None else part_file_parallelism
+    )
+
+    def zk_index_generator(my_files: List[str] = files) -> Generator[str, None, None]:
+        zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST)
+        zk.start()
+        my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0)
+        while True:
+            my_counter += 1
+            counter_pre_value = my_counter.pre_value
+            if repeat:
+                counter_pre_value = counter_pre_value % len(my_files)
+            if counter_pre_value >= len(my_files):
+                break
+            else:
+                chosen_file = my_files[counter_pre_value]
+                if verbose:
+                    logging.info(f"{counter_pre_value}. yielding {chosen_file}")
+                yield chosen_file
+        zk.stop()
+
+    files = tf.data.Dataset.from_generator(zk_index_generator, tf.string)
+
+    # Downsample parts files
+    files = downsample_dataset(files, file_keep_rate, "file_keep_rate")
+
+    def map_fn(filenames: tf.Tensor) -> tf.data.Dataset:
+        return BlockFormatDataset(filenames).prefetch(20)
+
+    # Don't interleave for sequential training
+    if interleave:
+        dataset = files.interleave(
+            map_fn,
+            cycle_length=part_file_parallelism,
+            block_length=block_length,
+            num_parallel_calls=num_threads,
+        )
+    else:
+        dataset = files.flat_map(map_fn)
+
+    # Downsample DataRecords
+    dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate")
+
+    if dataset_fn is None:
+        # Create a batch of data records and decode them
+        dataset = dataset.batch(batch_size)
+        dataset = dataset.map(
+            parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE
+        )
+        # shuffle after batching and parsing for performance reasons
+        # faster b/c 1 random selection is made per batch rather than per record
+        if batch_shuffle_size:
+            dataset = dataset.shuffle(buffer_size=batch_shuffle_size)
+        dataset = dataset.prefetch(prefetch_size)
+
+    else:
+        dataset = dataset_fn(dataset, parse_fn, batch_size)

-def zookeeper_ordered_dataset(
-    files, parse_fn, batch_size, zk_counter_path, repeat=False,
-    num_threads=2, block_length=None, part_file_parallelism=None,
-    batch_shuffle_size=None, file_keep_rate=None, record_keep_rate=None,
-    prefetch_size=2, interleave=False, dataset_fn=None, verbose=False):
-  """
-  Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of
-  which file to read, and to coordinate multiple workers.
-
-  Args:
-    files:
-      ordered list of (typically HDFS) filenames. This must remain consistent
-      between different workers, and between worker restarts (e.g. in the case
-      of instance failure or preemption).
-      To ensure this remains consistent, consider using the --train.files_list
-      option from DataRecordTrainer.
-    parse_fn:
-      A function that takes a byte tensor containing a datarecord and decodes it.
-    batch_size:
-      The batch size for each step.
-    zk_counter_path:
-      Path under the root node for the underlying zookeeper shared counter that
-      is used to coordinate distributed iteration over the list of files.
-      Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`.
-    repeat:
-      Default False. Set True to repeat over the files forever.
-    num_threads:
-      Default 2. Number of threads working on the data in parallel.
-      Only used if interleave=True.
-    block_length:
-      Default None. Number of consecutive records to pull from a single part file.
-      If None, then block_length=batch_size will be used.
-      Only used if interleave=True.
-    part_file_parallelism:
-      Default None. Number of part files to read from in parallel. Once a part file is completely
-      read, it will be replaced by the next part file indicated by the zookeeper counter.
-      Only used if interleave=True.
-
-      ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies
-      the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or
-      equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand,
-      if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader
-      thread pool will be underutilized, since it can never be the case that every reader thread has
-      a part file to read from.
-
-    batch_shuffle_size:
-      Default None.
Size of shuffle buffer, for shuffling that will be applied after batching. - if None, then batches will not be shuffled. Ignored if dataset_fn is provided. - file_keep_rate: - Default None. Fraction of files to keep, or None to keep all files. - record_keep_rate: - Default None. Fraction of records to keep, or None to keep all records. - prefetch_size: - Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided. - interleave: - Default False. Set True to use tf.data.Dataset.interleave rather than flat_map. - dataset_fn: - A function that is applied to the dataset of individual records, after - these have been read from the parts files. - If ``None`` (the default), the behavior will be as though dataset_fn were set to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE) - if batch_shuffle_size: - dataset = dataset.shuffle(batch_shuffle_size) - return dataset.prefetch(prefetch_size) - - verbose: - Default False. Set True to log the names of files loaded by TF. - """ - block_length = batch_size if block_length is None else block_length - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - - def zk_index_generator(my_files=files): - zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST) - zk.start() - my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0) - while True: - my_counter += 1 - counter_pre_value = my_counter.pre_value - if repeat: - counter_pre_value = counter_pre_value % len(my_files) - if counter_pre_value >= len(my_files): - break - else: - chosen_file = my_files[counter_pre_value] - if verbose: - logging.info("{}. yielding {}".format(counter_pre_value, chosen_file)) - yield chosen_file - zk.stop() - - files = tf.data.Dataset.from_generator(zk_index_generator, tf.string) - - # Downsample parts files - files = downsample_dataset(files, file_keep_rate, "file_keep_rate") - - def map_fn(filenames): - return BlockFormatDataset(filenames).prefetch(20) - - # Dont interleave for sequential training - if interleave: - dataset = files.interleave( - map_fn, - cycle_length=part_file_parallelism, - block_length=block_length, - num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - # shuffle after batching and parsing for performance reasons - # faster b/c 1 random selection is made per batch rather than per record - if batch_shuffle_size: - dataset = dataset.shuffle(buffer_size=batch_shuffle_size) - dataset = dataset.prefetch(prefetch_size) - - else: - dataset = dataset_fn(dataset, parse_fn, batch_size) - - return dataset + return dataset diff --git a/twml/twml/errors.py b/twml/twml/errors.py index 9b50fcd79..ff39a5139 100644 --- a/twml/twml/errors.py +++ b/twml/twml/errors.py @@ -4,10 +4,12 @@ class EarlyStopError(Exception): - """Exception used to indicate evaluator needs to early stop.""" - pass + """Exception used to indicate evaluator needs to early stop.""" + + pass class CheckpointNotFoundError(Exception): - """Exception used to indicate a checkpoint hasnt been found.""" - pass + """Exception used to indicate a checkpoint hasn't been found.""" + + pass diff --git 
a/twml/twml/export_output_fns.py b/twml/twml/export_output_fns.py index f72e1d0fe..e80a73eeb 100644 --- a/twml/twml/export_output_fns.py +++ b/twml/twml/export_output_fns.py @@ -1,4 +1,4 @@ -''' +""" Contains implemenations of DataRecordTrainer.get_export_output_fns that specify how to export model graph outputs from build_graph to DataRecords for prediction servers. @@ -6,12 +6,12 @@ the DataRecordTrainer constructor to customize how to export their model outputs. Modelers may also provide a custom implementation of export_output_fn using these as reference. -''' +""" # pylint: disable=invalid-name -from twitter.deepbird.io.legacy.export_output_fns import ( - batch_prediction_continuous_output_fn, # noqa: F401 - batch_prediction_tensor_output_fn, # noqa: F401 - default_output_fn, # noqa: F401 - variable_length_continuous_output_fn, # noqa: F401 +from twitter.deepbird.io.legacy.export_output_fns import default_output_fn # noqa: F401 +from twitter.deepbird.io.legacy.export_output_fns import ( # noqa: F401 + batch_prediction_continuous_output_fn, + batch_prediction_tensor_output_fn, + variable_length_continuous_output_fn, ) diff --git a/twml/twml/feature_config.py b/twml/twml/feature_config.py index 37004f442..0efd437fc 100644 --- a/twml/twml/feature_config.py +++ b/twml/twml/feature_config.py @@ -10,45 +10,45 @@ class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - # Override the class in the spec. - doc["class"] = "twml.FeatureConfig" - return doc + def get_feature_spec(self): + """ + Generates a serialization-friendly dict representing this FeatureConfig. + """ + doc = super(FeatureConfig, self).get_feature_spec() + # Override the class in the spec. + doc["class"] = "twml.FeatureConfig" + return doc class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - def build(self): - # Overwrite self.build() to return twml.FeatureConfig instead - """ - Builds and returns FeatureConfig object. - """ - - ( - features, - tensor_types, - sparse_tensor_types, - feature_map, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() - - return FeatureConfig( - features=features, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=tensor_types, - sparse_tensor_types=sparse_tensor_types, - feature_types=feature_map, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=self._feature_name_to_feature_parser, - feature_in_bq_name=self._feature_in_bq_name, - ) + def build(self) -> FeatureConfig: + # Overwrite self.build() to return twml.FeatureConfig instead + """ + Builds and returns FeatureConfig object. 
+ """ + + ( + features, + tensor_types, + sparse_tensor_types, + feature_map, + feature_name_to_feature_parser, + feature_in_bq_name, + ) = self._build() + + return FeatureConfig( + features=features, + labels=self._labels, + weight=self._weight, + filters=self._filter_features, + tensor_types=tensor_types, + sparse_tensor_types=sparse_tensor_types, + feature_types=feature_map, + decode_mode=self._decode_mode, + legacy_sparse=self._legacy_sparse, + feature_name_to_feature_parser=self._feature_name_to_feature_parser, + feature_in_bq_name=self._feature_in_bq_name, + ) _name_to_id = feature_config._name_to_id diff --git a/twml/twml/filters.py b/twml/twml/filters.py index e48633808..d1376e45c 100644 --- a/twml/twml/filters.py +++ b/twml/twml/filters.py @@ -1,9 +1,10 @@ -''' +""" Includes functions to filter features dict build from data records. -''' +""" +from twitter.deepbird.io.legacy.filters import sparse_keep_feature_if # noqa: F401 +from twitter.deepbird.io.legacy.filters import sparse_keep_sample_if # noqa: F401 from twitter.deepbird.io.legacy.filters import ( - balance_binary_class_samples, # noqa: F401 - sparse_keep_feature_if, # noqa: F401 - sparse_keep_sample_if) # noqa: F401 + balance_binary_class_samples, +) # noqa: F401 diff --git a/twml/twml/hooks.py b/twml/twml/hooks.py index cdf733535..c2118166a 100644 --- a/twml/twml/hooks.py +++ b/twml/twml/hooks.py @@ -1,562 +1,616 @@ """ This file contains tf.train.SessionRunHooks defined by TWML """ -from datetime import datetime import json import operator import os +from datetime import datetime +from typing import Callable, Dict, Optional -from absl import logging import numpy as np import tensorflow.compat.v1 as tf -from tensorflow.python.training.basic_session_run_hooks import NeverTriggerTimer, SecondOrStepTimer +from absl import logging +from tensorflow.python.training.basic_session_run_hooks import ( + NeverTriggerTimer, + SecondOrStepTimer, +) + import twml class StepProgressHook(tf.train.SessionRunHook): - """Hook that displays a progress bar to monitor global step progress """ - - def __init__(self, max_step): - """ - Initializes a `StepProgressHook`. - This hook displays a progress bar for max_steps. - - Note that this hook only works for training and calibration. - - Args: - max_steps: - maximum steps to monitor in progress bar. - When this many steps is reached, the progress bar will be full. - """ - self._max_step = max_step - self._start_step = 0 - self._global_step_tensor = None - self._progress_bar = None - - def begin(self): - """ sets the global_step_tensor """ - self._global_step_tensor = tf.train.get_or_create_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use StepProgressHook.") - - def after_create_session(self, session, coord): - """ creates the progress bar and keeps track of the first global step upon session creation """ - global_step = session.run(self._global_step_tensor) - self._start_step = global_step - self._progress_bar = tf.keras.utils.Progbar(self._max_step) - - def before_run(self, run_context): # pylint: disable=unused-argument - """ invoked before calling session.run """ - return tf.train.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - """ invoked after run is called. Updates the progress bar. 
""" - step = run_context.session.run(self._global_step_tensor) - self._progress_bar.update(step - self._start_step) + """Hook that displays a progress bar to monitor global step progress""" + + def __init__(self, max_step): + """ + Initializes a `StepProgressHook`. + This hook displays a progress bar for max_steps. + + Note that this hook only works for training and calibration. + + Args: + max_steps: + maximum steps to monitor in progress bar. + When this many steps is reached, the progress bar will be full. + """ + self._max_step = max_step + self._start_step = 0 + self._global_step_tensor = None + self._progress_bar = None + + def begin(self) -> None: + """sets the global_step_tensor""" + self._global_step_tensor = tf.train.get_or_create_global_step() + if self._global_step_tensor is None: + raise RuntimeError("Global step should be created to use StepProgressHook.") + + def after_create_session( + self, session: tf.Session, coord: tf.train.Coordinator + ) -> None: # pylint: disable=unused-argument + """creates the progress bar and keeps track of the first global step upon session creation""" + global_step = session.run(self._global_step_tensor) + self._start_step = global_step + self._progress_bar = tf.keras.utils.Progbar(self._max_step) + + def before_run( + self, run_context: tf.train.SessionRunContext + ) -> None: # pylint: disable=unused-argument + """invoked before calling session.run""" + return tf.train.SessionRunArgs(self._global_step_tensor) + + def after_run( + self, + run_context: tf.train.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: # pylint: disable=unused-argument + """invoked after run is called. Updates the progress bar.""" + step = run_context.session.run(self._global_step_tensor) + self._progress_bar.update(step - self._start_step) class GetMetricsHook(tf.train.SessionRunHook): - """ - Hook used to obtain evaluation metrics. - Typically used for early-stopping by obtaining the value of a - metric at the end of an epoch. - Note that the metric tensor and its commensurate update Op - are responsible for aggregating the metric during the session - (one session per epoch). Used for evaluation. - """ - - def __init__(self, get_metrics_fn): - """GetMetricsHook constructor. - - Args: - get_metrics_fn: - Function that returns a dict mapping metric keys to - tensors as a tf.Tensor. - See Trainer.learn for an example use-case. + """ + Hook used to obtain evaluation metrics. + Typically used for early-stopping by obtaining the value of a + metric at the end of an epoch. + Note that the metric tensor and its commensurate update Op + are responsible for aggregating the metric during the session + (one session per epoch). Used for evaluation. """ - self._get_metrics_fn = get_metrics_fn - self._metric_tensors = None - self.metric_values = None + def __init__(self, get_metrics_fn: Callable[[], Dict[str, tf.Tensor]]): + """GetMetricsHook constructor. - def begin(self): - """ sets the global_step_tensor and metric tensor""" - self._metric_tensors = self._get_metrics_fn() - assert isinstance(self._metric_tensors, dict) + Args: + get_metrics_fn: + Function that returns a dict mapping metric keys to + tensors as a tf.Tensor. + See Trainer.learn for an example use-case. 
+ """ - def end(self, session): - self.metric_values = session.run(self._metric_tensors) + self._get_metrics_fn = get_metrics_fn + self._metric_tensors = None + self.metric_values = None + def begin(self) -> None: + """sets the global_step_tensor and metric tensor""" + self._metric_tensors = self._get_metrics_fn() + assert isinstance(self._metric_tensors, dict) -class EarlyStopHook(GetMetricsHook): - """ - A GetMetricsHook augmented with early-stopping logic for use - within the Trainer.learn method. - """ - - def __init__(self, - metric, - patience, - minimize, - get_estimator_spec_fn, - checkpoint_dir, - file_path=None, - exit_on_end=True, - start_epoch=0, - tolerance=0): - """ - Prepare early-stopping hook and variables. + def end(self, session: tf.Session) -> None: + self.metric_values = session.run(self._metric_tensors) - Args: - metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - minimize: - Set this to True for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - tolerance: - A non-negative tolerance for comparing early_stop_metric. - e.g. when maximizing the condition is current_metric > best_metric + tolerance." - Defaults to 0. - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - checkpoint_dir: - path to directory containing the Estimator checkpoints. - file_path: - path to file that is used by this hook to communicate early-stopping - to StopIfExistsHook. This hook would be used for evaluation, while - the StopIfExistsHooks (the listeners) would be used for training. - When the file is created, the StopIfExistsHooks detect and terminate training. - This argument is used by ``Trainer.train_and_evaluate``. - exit_on_end: - when the end() method is called to indicate that the session is terminating, - and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. - This is set to False by the trainer for non distributed jobs. - start_epoch: - Specifies the starting epoch number. This is used for logging purposes only. 
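As an aside, a sketch of how the hooks above attach to an Estimator run; `estimator`, the input functions, and the metric tensor name are hypothetical stand-ins, not part of this change:

.. code-block:: python

    import tensorflow.compat.v1 as tf
    # assuming: from twml.hooks import StepProgressHook, GetMetricsHook

    # Progress bar over a fixed training budget.
    estimator.train(
        input_fn=train_input_fn,
        hooks=[StepProgressHook(max_step=10000)],
        max_steps=10000,
    )

    # GetMetricsHook runs its tensors once, when the evaluation session ends.
    metrics_hook = GetMetricsHook(
        get_metrics_fn=lambda: {
            # hypothetical tensor name; any dict of metric tensors works
            "auc": tf.get_default_graph().get_tensor_by_name("auc/value:0")
        }
    )
    estimator.evaluate(input_fn=eval_input_fn, hooks=[metrics_hook])
    print(metrics_hook.metric_values)  # e.g. {"auc": 0.71}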
- """ - if not isinstance(metric, str): - raise ValueError("Expecting string for metric arg") - if not isinstance(patience, int): - raise ValueError("Expecting positive number for metric arg") - - self.should_stop = False - self._metric = metric - self._patience = patience - self._current_patience = patience - self._checkpoint_dir = checkpoint_dir - self._exit_on_end = exit_on_end - self._latest_checkpoint_path = None - # used for distributed training (tf.estimator.train_and_evaluate) - self._file_path = file_path - self._epoch = start_epoch - if self._file_path is not None: - # TODO try to read epoch from a file that we create - if tf.io.gfile.exists(self._file_path): - # delete the file if it exists (not sure this makes sense) - logging.info("EarlyStopHook: Removing existing file: %s.", self._file_path) - tf.io.gfile.remove(self._file_path) - - # best_checkpoint dir will contain the best checkpoint - self._best_checkpoint_path = os.path.join(checkpoint_dir, 'best_checkpoint') - self._eval_checkpoint_path = os.path.join(checkpoint_dir, 'eval_checkpoint') - self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) - - if tf.io.gfile.exists(self._best_metric_path): - with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: - best_metric_from_file = float(f.read()) - else: - best_metric_from_file = None - - if minimize: - # current < best : is better - self._is_better_than = operator.lt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = np.inf - else: - self._best_metric = best_metric_from_file - tolerance - # used for printing - self._early_stop_name = "minimum" - else: - # current > best : is better - self._is_better_than = operator.gt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = -np.inf - else: - self._best_metric = best_metric_from_file + tolerance - # used for printing - self._early_stop_name = "maximum" - - def get_metrics_fn(): - """ function to get metric tensors to early-stopping """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - if metric not in eval_metric_ops: - raise ValueError( - "Expecting early_stop_metric '%s' key in eval_metric_ops dict" - % (metric)) - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - - # initialize GetMetricsHook to get current value of metric from session - super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def early_stop(self, epoch): - """ - Looks at the current value of the early stopping metric. - Decrements current patience. If metric improves, patience is reset - and latest checkpoint is moved to checkpoint_dir/best_checkpoint. - If current patience reaches zero, returns True. - Args: - epoch: - The current epoch number. - - Returns: - True when early-stopped. False otherwise. 
- """ - # decrement patience - self._current_patience -= 1 - - # get the current metric value - current_metric = self.metric_values[self._metric] - - if self._is_better_than(current_metric, self._best_metric): - # save best version of model - self._best_metric = current_metric - logging.info( - "Found new %s %s=%f @ epoch %d", - self._early_stop_name, self._metric, self._best_metric, epoch) - # backup the file to checkpoint_dir/best_checkpoint - assert self._latest_checkpoint_path, "expecting latest checkpoint" - logging.info("Backing up " + self._latest_checkpoint_path) - - try: - eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) - twml.util.backup_checkpoint( - checkpoint_path_prefix=eval_checkpoint, - backup_path=self._best_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) - - tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) - with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: - # Write with enough precision - f.write("%.8f" % self._best_metric) - - # reset patience - self._current_patience = self._patience - - elif self._current_patience > 0: - logging.info("No new %s found after %d epochs", - self._early_stop_name, self._patience - self._current_patience) - elif self._current_patience == 0: - logging.info( - "No new %s found after %d epochs. Early-stopping experiment.", - self._early_stop_name, self._patience) - return True - - return False - - def cleanup_checkpoints(self): +class EarlyStopHook(GetMetricsHook): """ - makes it so that the best checkpoint is the only checkpoint - in checkpoint_dir. + A GetMetricsHook augmented with early-stopping logic for use + within the Trainer.learn method. """ - raise NotImplementedError("cleanup_checkpoints is no longer supported") - def end(self, session): - """ - This method is called at the end of an evaluation/epoch. - When file_path constructor argument is provided, this - will call ``early_stop()``. - When ``early_stop()`` returns True, it creates the file_path, - which will be detected by StopIfExistsHooks - and stop training for all workers and the chief. It will - also call ``cleanup_checkpoints()``. - """ - super(EarlyStopHook, self).end(session) - - # Checks for early stopping criteria and makes a backup - self.should_stop = self.early_stop(self._epoch) - - if self._file_path is not None: - if self.should_stop: - # create a file to inform workers - with tf.io.gfile.GFile(self._file_path, "wb") as gfile: - gfile.write("early-stop\n") - # makes the best checkpoint the only checkpoint in save_dir. - msg = "early-stopping evaluation at epoch %d" % self._epoch - logging.info(msg) - if self._exit_on_end: - raise twml.errors.EarlyStopError(msg) - else: + def __init__( + self, + metric: str, + patience: int, + minimize: bool, + get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec], + checkpoint_dir: str, + file_path: str = None, + exit_on_end: bool = True, + start_epoch: int = 0, + tolerance: float = 0.0, + ): + """ + Prepare early-stopping hook and variables. + + Args: + metric: + String specifying the metric to early-stop on. Required with positive + ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. + The string is used to extract the relevant tensor Op from the dict returned by + the get_eval_metric_ops method. For ``metrics`` pass to the constructor, + the string is one of those. 
For multi-class (that is, multi-metric) + metrics, the string may be appended with a ``_0``, ``_1``, etc. or one + of the ``multi_metric_names`` (one per class). + patience: + Maximum number of epochs to wait for an improvement in the early_stop_metric + before breaking off training. For example, a patience of 10 means that + training will have 10 epochs to improve the metric before it is killed. + Whenever the metric is improved before running out of patience, + patience is reset to ``early_stop_patience``. + minimize: + Set this to True for metrics that need to be minimized + (like ``loss``). Metrics like ``accuracy`` that need to be maximized + should set this to False. + get_estimator_spec_fn: + function that returns the current EstimatorSpec. + The EstimatorSpec is used to obtain the current eval_metric_ops. + checkpoint_dir: + path to directory containing the Estimator checkpoints. + file_path: + path to file that is used by this hook to communicate early-stopping + to StopIfExistsHook. This hook would be used for evaluation, while + the StopIfExistsHooks (the listeners) would be used for training. + When the file is created, the StopIfExistsHooks detect and terminate training. + This argument is used by ``Trainer.train_and_evaluate``. + exit_on_end: + when the end() method is called to indicate that the session is terminating, + and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. + This is set to False by the trainer for non distributed jobs. + start_epoch: + Specifies the starting epoch number. This is used for logging purposes only. + tolerance: + A non-negative tolerance for comparing early_stop_metric. + e.g. when maximizing the condition is current_metric > best_metric + tolerance." + Defaults to 0. + """ + if not isinstance(metric, str): + raise ValueError("Expecting string for metric arg") + if not isinstance(patience, int): + raise ValueError("Expecting positive number for metric arg") + + self.should_stop = False + self._metric = metric + self._patience = patience + self._current_patience = patience + self._checkpoint_dir = checkpoint_dir + self._exit_on_end = exit_on_end self._latest_checkpoint_path = None - - self._epoch += 1 - - def begin(self): - """ - Saves the latest_checkpoint in case it gets superseded by another checkpoint. - Remember that when used with train_and_evaluate, the chief saves checkpoints - continuouly. The chief could save a checkpoint after evaluation started. - So saving the checkpoint at the beginning of evaluation ensures that we - later save the correct best checkpoint. 
- """ - super(EarlyStopHook, self).begin() - self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) - - assert self._latest_checkpoint_path, "expecting latest checkpoint" - # Backup to temporary directory - try: - twml.util.backup_checkpoint( - checkpoint_path_prefix=self._latest_checkpoint_path, - backup_path=self._eval_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) + # used for distributed training (tf.estimator.train_and_evaluate) + self._file_path = file_path + self._epoch = start_epoch + if self._file_path is not None: + # TODO try to read epoch from a file that we create + if tf.io.gfile.exists(self._file_path): + # delete the file if it exists (not sure this makes sense) + logging.info( + "EarlyStopHook: Removing existing file: %s.", self._file_path + ) + tf.io.gfile.remove(self._file_path) + + # best_checkpoint dir will contain the best checkpoint + self._best_checkpoint_path = os.path.join(checkpoint_dir, "best_checkpoint") + self._eval_checkpoint_path = os.path.join(checkpoint_dir, "eval_checkpoint") + self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) + + if tf.io.gfile.exists(self._best_metric_path): + with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: + best_metric_from_file = float(f.read()) + else: + best_metric_from_file = None + + if minimize: + # current < best : is better + self._is_better_than = operator.lt + # worse metric possible + if best_metric_from_file is None: + self._best_metric = np.inf + else: + self._best_metric = best_metric_from_file - tolerance + # used for printing + self._early_stop_name = "minimum" + else: + # current > best : is better + self._is_better_than = operator.gt + # worse metric possible + if best_metric_from_file is None: + self._best_metric = -np.inf + else: + self._best_metric = best_metric_from_file + tolerance + # used for printing + self._early_stop_name = "maximum" + + def get_metrics_fn() -> Dict[str, tf.Tensor]: + """function to get metric tensors to early-stopping""" + estimator_spec = get_estimator_spec_fn() + eval_metric_ops = estimator_spec.eval_metric_ops + if metric not in eval_metric_ops: + raise ValueError( + "Expecting early_stop_metric '%s' key in eval_metric_ops dict" + % (metric) + ) + # get the value_op from the (value_op, update_op) value + return {k: v[0] for k, v in eval_metric_ops.items()} + + # initialize GetMetricsHook to get current value of metric from session + super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) + + def early_stop(self, epoch: int) -> bool: + """ + Looks at the current value of the early stopping metric. + Decrements current patience. If metric improves, patience is reset + and latest checkpoint is moved to checkpoint_dir/best_checkpoint. + If current patience reaches zero, returns True. + + Args: + epoch: The current epoch number. + + Returns: + True when early-stopped. False otherwise. 
+ """ + # decrement patience + self._current_patience -= 1 + + # get the current metric value + current_metric = self.metric_values[self._metric] + + if self._is_better_than(current_metric, self._best_metric): + # save best version of model + self._best_metric = current_metric + logging.info( + "Found new %s %s=%f @ epoch %d", + self._early_stop_name, + self._metric, + self._best_metric, + epoch, + ) + # backup the file to checkpoint_dir/best_checkpoint + assert self._latest_checkpoint_path, "expecting latest checkpoint" + logging.info("Backing up " + self._latest_checkpoint_path) + + try: + eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) + twml.util.backup_checkpoint( + checkpoint_path_prefix=eval_checkpoint, + backup_path=self._best_checkpoint_path, + ) + except twml.errors.CheckpointNotFoundError as ex: + msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" + raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) + + tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) + with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: + # Write with enough precision + f.write("%.8f" % self._best_metric) + + # reset patience + self._current_patience = self._patience + + elif self._current_patience > 0: + logging.info( + "No new %s found after %d epochs", + self._early_stop_name, + self._patience - self._current_patience, + ) + elif self._current_patience == 0: + logging.info( + "No new %s found after %d epochs. Early-stopping experiment.", + self._early_stop_name, + self._patience, + ) + return True + + return False + + def cleanup_checkpoints(self) -> None: + """ + makes it so that the best checkpoint is the only checkpoint + in checkpoint_dir. + """ + raise NotImplementedError("cleanup_checkpoints is no longer supported") + + def end(self, session: tf.Session) -> None: + """ + This method is called at the end of an evaluation/epoch. + When file_path constructor argument is provided, this + will call ``early_stop()``. + When ``early_stop()`` returns True, it creates the file_path, + which will be detected by StopIfExistsHooks + and stop training for all workers and the chief. It will + also call ``cleanup_checkpoints()``. + """ + super(EarlyStopHook, self).end(session) + + # Checks for early stopping criteria and makes a backup + self.should_stop = self.early_stop(self._epoch) + + if self._file_path is not None: + if self.should_stop: + # create a file to inform workers + with tf.io.gfile.GFile(self._file_path, "wb") as gfile: + gfile.write("early-stop\n") + # makes the best checkpoint the only checkpoint in save_dir. + msg = "early-stopping evaluation at epoch %d" % self._epoch + logging.info(msg) + if self._exit_on_end: + raise twml.errors.EarlyStopError(msg) + else: + self._latest_checkpoint_path = None + + self._epoch += 1 + + def begin(self) -> None: + """ + Saves the latest_checkpoint in case it gets superseded by another checkpoint. + Remember that when used with train_and_evaluate, the chief saves checkpoints + continuouly. The chief could save a checkpoint after evaluation started. + So saving the checkpoint at the beginning of evaluation ensures that we + later save the correct best checkpoint. 
+ """ + super(EarlyStopHook, self).begin() + self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) + + assert self._latest_checkpoint_path, "expecting latest checkpoint" + # Backup to temporary directory + try: + twml.util.backup_checkpoint( + checkpoint_path_prefix=self._latest_checkpoint_path, + backup_path=self._eval_checkpoint_path, + ) + except twml.errors.CheckpointNotFoundError as ex: + msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" + raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) class MetricsUpdateHook(GetMetricsHook): - """ - A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. - It is mainly used by `TrackRun` to persist model metrics via Model Repo. - """ - - def __init__(self, - get_estimator_spec_fn, - add_metrics_fn, - every_n_iter=None, - every_n_secs=None - ): - """ - Args: - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - add_metrics_fn: `function` callback used to report metrics, called automatically - at the end of every epoch. - every_n_iter: `int`, log the metrics once every N local - steps taken in the current epoch. - every_n_secs: `int` or `float`, log the metrics once every N - seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` - should be provided. - Raises: - ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and - `every_n_secs` is set when `add_progress_metrics_fn` is provided. - """ - only_log_at_end = (every_n_iter is None) and (every_n_secs is None) - - if (not only_log_at_end and every_n_iter and every_n_secs): - raise ValueError( - 'exactly one of every_n_iter and every_n_secs must be provided' - ) - - # TODO: should have a minimum to avoid too many calls to ModelRepo? - if every_n_iter is not None and every_n_iter <= 0: - raise ValueError("invalid every_n_iter=%s." % every_n_iter) - - self._timer = ( - NeverTriggerTimer() if only_log_at_end else - SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) - ) - - self._should_trigger = False - self._iter_count = 0 - - self._add_metrics_fn = add_metrics_fn - - def get_metrics_fn(): - """ - Function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def report_metrics(self): """ - Triggers a metrics report. + A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. + It is mainly used by `TrackRun` to persist model metrics via Model Repo. """ - self._timer.update_last_triggered_step(self._iter_count) - if self.metric_values is not None: - self._add_metrics_fn(self.metric_values) - def begin(self): - """ - Triggered before each epoch. - """ - self._timer.reset() - self._iter_count = 0 - return super(MetricsUpdateHook, self).begin() + def __init__( + self, + get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec], + add_metrics_fn: Callable[[Dict[str, float]], None], + every_n_iter: Optional[int] = None, + every_n_secs: Optional[float] = None, + ): + """ + Args: + get_estimator_spec_fn: + function that returns the current EstimatorSpec. 
+ The EstimatorSpec is used to obtain the current eval_metric_ops. + add_metrics_fn: `function` callback used to report metrics, called automatically + at the end of every epoch. + every_n_iter: `int`, log the metrics once every N local + steps taken in the current epoch. + every_n_secs: `int` or `float`, log the metrics once every N + seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` + should be provided. + Raises: + ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and + `every_n_secs` is set when `add_progress_metrics_fn` is provided. + """ + only_log_at_end = (every_n_iter is None) and (every_n_secs is None) + + if not only_log_at_end and every_n_iter and every_n_secs: + raise ValueError( + "exactly one of every_n_iter and every_n_secs must be provided" + ) + + # TODO: should have a minimum to avoid too many calls to ModelRepo? + if every_n_iter is not None and every_n_iter <= 0: + raise ValueError("invalid every_n_iter=%s." % every_n_iter) + + self._timer = ( + NeverTriggerTimer() + if only_log_at_end + else SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) + ) + + self._should_trigger = False + self._iter_count = 0 + + self._add_metrics_fn = add_metrics_fn + + def get_metrics_fn(): + """ + Function that returns the current EstimatorSpec. + The EstimatorSpec is used to obtain the current eval_metric_ops. + """ + estimator_spec = get_estimator_spec_fn() + eval_metric_ops = estimator_spec.eval_metric_ops + # get the value_op from the (value_op, update_op) value + return {k: v[0] for k, v in eval_metric_ops.items()} + + super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) + + def report_metrics(self) -> None: + """ + Triggers a metrics report. + """ + self._timer.update_last_triggered_step(self._iter_count) + if self.metric_values is not None: + self._add_metrics_fn(self.metric_values) + + def begin(self) -> None: + """ + Triggered before each epoch. + """ + self._timer.reset() + self._iter_count = 0 + return super(MetricsUpdateHook, self).begin() + + def before_run( + self, run_context: tf.estimator.SessionRunContext + ) -> tf.train.SessionRunArgs: + """ + Triggered before each step. + """ + self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) + return super(MetricsUpdateHook, self).before_run(run_context) + + def after_run( + self, + run_context: tf.estimator.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + """ + Triggered after each step. + """ + if self._should_trigger: + self.report_metrics() + self._iter_count += 1 + return super(MetricsUpdateHook, self).after_run(run_context, run_values) + + def end(self, session: tf.Session) -> None: + """ + Triggered after each epoch. + """ + self.report_metrics() + return super(MetricsUpdateHook, self).end(session) - def before_run(self, run_context): - """ - Triggered before each step. - """ - self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) - return super(MetricsUpdateHook, self).before_run(run_context) - def after_run(self, run_context, run_values): - """ - Triggered after each step. +class EarlyStopDuration(tf.train.SessionRunHook): """ - if self._should_trigger: - self.report_metrics() - self._iter_count += 1 - return super(MetricsUpdateHook, self).after_run(run_context, run_values) + Hook that can be used to terminate a job (training or validation) after a certain duration. 
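
The throttling in ``MetricsUpdateHook`` is driven by ``tf.compat.v1.train.SecondOrStepTimer`` (``NeverTriggerTimer`` is TF-internal and only used for the log-at-end case). A sketch of the trigger pattern with illustrative values:

```python
import tensorflow.compat.v1 as tf

timer = tf.train.SecondOrStepTimer(every_steps=100)  # or every_secs=30.0

iter_count = 0
for _ in range(250):
    if timer.should_trigger_for_step(iter_count):
        # In the hook, this is where add_metrics_fn(metric_values) runs.
        timer.update_last_triggered_step(iter_count)
        print("reporting at local step", iter_count)
    iter_count += 1
# With every_steps=100 this triggers at steps 0, 100 and 200.
```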
+ The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, + then it will only run for 15 minutes once restarted. - def end(self, session): - """ - Triggered after each epoch. + Args: + max_duration: + A float. When this argument is defined, the job will automatically terminate after + `max_duration` seconds if it has not already completed. + + overwrite: + A boolean. If set to True, this hook will overwrite the file containing the elapsed time + since the beginning of the job. In a distributed setting, this will be used so only one + job writes to the file while all others will have read access. In a distributed setting, + if all executors have this parameter set to False, then it just means that the hook will + not be fault tolerant. When restarted, the job will restart the clock from 0. + + save_dir: + String. A directory (located on a file system that is Tensorflow compatible) where + we can store the file which contains the record of the elapsed time. This file is what makes + the hook fault tolerant. + + exit_on_end: + when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. + This is usually set to True to kill a validation job in a distributed setting. """ - self.report_metrics() - return super(MetricsUpdateHook, self).end(session) - -class EarlyStopDuration(tf.train.SessionRunHook): - """ - Hook that can be used to terminate a job (training or validation) after a certain duration. - The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, - then it will only run for 15 minutes once restarted. - - Args: - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - overwrite: - A boolean. If set to True, this hook will overwrite the file containing the elapsed time - since the beginning of the job. In a distributed setting, this will be used so only one - job writes to the file while all others will have read access. In a distributed setting, - if all executors have this parameter set to False, then it just means that the hook will - not be fault tolerant. When restarted, the job will restart the clock from 0. - - save_dir: - String. A directory (located on a file system that is Tensorflow compatible) where - we can store the file which contains the record of the elapsed time. This file is what makes - the hook faul tolerant. - - exit_on_end: - when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. - This is usually set to True to kill a validation job in a distributed setting. 
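
The fault tolerance promised above reduces to persisting a running total of elapsed seconds. A condensed sketch of the accounting that the methods below implement (file location illustrative):

```python
import json
import os
from datetime import datetime

import tensorflow.compat.v1 as tf

def accumulate_elapsed(path, last_check, max_duration):
    """Adds wall time since last_check to the persisted total; returns both."""
    recorded = 0.0
    if tf.io.gfile.exists(path):
        with tf.io.gfile.GFile(path, mode="r") as f:
            recorded = json.loads(f.read())["elapsed_time"]
    now = datetime.now()
    elapsed = recorded + (now - last_check).total_seconds()
    tf.io.gfile.makedirs(os.path.dirname(path))
    with tf.io.gfile.GFile(path, mode="w") as f:
        f.write(json.dumps({"elapsed_time": elapsed, "max_duration": max_duration}))
    return elapsed, now

# A job allotted 3600s that crashed at second 2700 resumes with recorded=2700,
# so it stops roughly 900s after the restart.
```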
- """ - - def __init__(self, max_duration: float, exit_on_end: bool, save_dir: str, overwrite: bool): - self._overwrite = overwrite - self._save_dir = save_dir - self._exit_on_end = exit_on_end - self._max_duration = max_duration - self._last_time_check = datetime.now() - - # Initialize elapse time file - if overwrite: - self.elapsed_time() - - @property - def elapsed_file_path(self): - return os.path.join(self._save_dir, "early_stop_duration.txt") - - def early_stop(self) -> bool: - return self.elapsed_time() > self._max_duration - - def elapsed_time(self) -> float: - # Recorded elapsed time is 0 unless it's been recorded in a file already - recorded_elapsed_time = 0 - if tf.io.gfile.exists(self.elapsed_file_path): - with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: - recorded_elapsed_time = json.loads(file.read())["elapsed_time"] - - elapsed_time = recorded_elapsed_time + (datetime.now() - self._last_time_check).total_seconds() - self._last_time_check = datetime.now() - - if self._overwrite: - # Record the actualized new elapsed time to the file - tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) - with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: - record = { - "elapsed_time": elapsed_time, - "max_duration": self._max_duration - } - file.write(json.dumps(record, indent=2)) - - return elapsed_time - - def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: - if self.early_stop(): - message = f""" - Stopping job which now exceeded the maximum duration of {self._max_duration} seconds. + def __init__( + self, + max_duration: float, + exit_on_end: bool, + save_dir: str, + overwrite: bool, + ): + self._overwrite = overwrite + self._save_dir = save_dir + self._exit_on_end = exit_on_end + self._max_duration = max_duration + self._last_time_check = datetime.now() + + # Initialize elapse time file + if overwrite: + self.elapsed_time() + + @property + def elapsed_file_path(self): + return os.path.join(self._save_dir, "early_stop_duration.txt") + + def early_stop(self) -> bool: + return self.elapsed_time() > self._max_duration + + def elapsed_time(self) -> float: + # Recorded elapsed time is 0 unless it's been recorded in a file already + recorded_elapsed_time = 0 + if tf.io.gfile.exists(self.elapsed_file_path): + with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: + recorded_elapsed_time = json.loads(file.read())["elapsed_time"] + + elapsed_time = ( + recorded_elapsed_time + + (datetime.now() - self._last_time_check).total_seconds() + ) + self._last_time_check = datetime.now() + + if self._overwrite: + # Record the actualized new elapsed time to the file + tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) + with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: + record = { + "elapsed_time": elapsed_time, + "max_duration": self._max_duration, + } + file.write(json.dumps(record, indent=2)) + + return elapsed_time + + def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: + if self.early_stop(): + message = f""" + Stopping job which now exceeded the maximum duration of {self._max_duration} seconds. 
""" - logging.info(message) - run_context.request_stop() + logging.info(message) + run_context.request_stop() - if self._exit_on_end: - raise twml.errors.EarlyStopError(message) + if self._exit_on_end: + raise twml.errors.EarlyStopError(message) class StopAtStepHook(tf.train.StopAtStepHook): - """ - Overrides ``tf.train.StopAtStepHook`` so that - a ``stop_requested`` property can be accessed to determine - if this hook requested a stop. - """ + """ + Overrides ``tf.train.StopAtStepHook`` so that + a ``stop_requested`` property can be accessed to determine + if this hook requested a stop. + """ - def __init__(self, *args, **kwargs): - super(StopAtStepHook, self).__init__(*args, **kwargs) - self._stop_requested = False + def __init__(self, *args, **kwargs): + super(StopAtStepHook, self).__init__(*args, **kwargs) + self._stop_requested = False - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + @property + def stop_requested(self) -> bool: + """true if this hook requested a stop""" + return self._stop_requested - def after_run(self, run_context, run_values): - """ sets self.stop_requested to true when requesting a stop """ - super(StopAtStepHook, self).after_run(run_context, run_values) - self._stop_requested = run_context.stop_requested + def after_run( + self, + run_context: tf.estimator.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + """sets self.stop_requested to true when requesting a stop""" + super(StopAtStepHook, self).after_run(run_context, run_values) + self._stop_requested = run_context.stop_requested class StopIfExistsHook(tf.train.SessionRunHook): - """ - Hook that requests stop if a file exists. - This hook is used with the EarlyStopHook to implement - early-stopping for distributed training (tf.estimator.train_and_evaluate). - """ - - def __init__(self, file_path): """ - Arguments: - file_path: - path to file. When this hook detects that the file exists, - it requests a stop, which effectively kills this worker. + Hook that requests stop if a file exists. + This hook is used with the EarlyStopHook to implement + early-stopping for distributed training (tf.estimator.train_and_evaluate). """ - self._file_path = file_path - self._stop_requested = False - - def after_run(self, run_context, run_values): - if tf.io.gfile.exists(self._file_path): - logging.info("Early-stopping file detected; requesting stop") - run_context.request_stop() - self._stop_requested = True - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + + def __init__(self, file_path: str): + """ + Args: + file_path: + path to file. When this hook detects that the file exists, + it requests a stop, which effectively kills this worker. + """ + self._file_path = file_path + self._stop_requested = False + + def after_run( + self, + run_context: tf.estimator.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + if tf.io.gfile.exists(self._file_path): + logging.info("Early-stopping file detected; requesting stop") + run_context.request_stop() + self._stop_requested = True + + @property + def stop_requested(self) -> bool: + """true if this hook requested a stop""" + return self._stop_requested diff --git a/twml/twml/input_fns.py b/twml/twml/input_fns.py index 394fc8674..814823678 100644 --- a/twml/twml/input_fns.py +++ b/twml/twml/input_fns.py @@ -1,129 +1,131 @@ -''' +""" Contains implementations of functions to read input data. 
-''' -from .dataset import stream_block_format_dataset +""" +from typing import Callable, List, Optional import tensorflow.compat.v1 as tf +from .dataset import stream_block_format_dataset -def data_record_input_fn( - files, batch_size, parse_fn, - num_threads=2, repeat=False, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, - shards=None, shard_index=None, shuffle=True, shuffle_files=True, interleave=True, - initializable=False, log_tf_data_summaries=False, - **kwargs): - """ - Returns a nested structure of tf.Tensors containing the next element. - Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. - By default, works with DataRecord dataset for compressed partition files. - - Args: - files: - List of files that will be parsed. - batch_size: - number of samples per batch. - parse_fn: - function passed to data loading for parsing individual data records. - Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. - num_threads (optional): - number of threads used for loading data. Defaults to 2. - repeat (optional): - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use ``train_steps`` or ``eval_steps`` - greater than the size of the dataset - (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). - dataset_fn (optional): - A function that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation - (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to - train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle (optional): - Whether to shuffle the records. Defaults to True. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - initializable (optional): - A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value (false) - is used for most plain iterators. - - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - Iterator of elements of the dataset. 
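
A hedged usage sketch of how a trainer might bind this input function for an Estimator. The file list is a placeholder, and the parser would normally come from ``twml.parsers.get_sparse_parse_fn`` with a schema-specific feature config, so treat this as the shape of the call rather than a recipe:

```python
from functools import partial

from twml.input_fns import data_record_input_fn

def parse_fn(records):
    """Placeholder; normally built via twml.parsers.get_sparse_parse_fn."""
    raise NotImplementedError("schema-specific parser goes here")

train_input_fn = partial(
    data_record_input_fn,
    files=["hdfs://path/part-00000.lzo"],  # illustrative part files
    batch_size=512,
    parse_fn=parse_fn,
    repeat=True,   # lets train_steps exceed one pass over the data
    shards=4,      # each of 4 workers reads a disjoint partition
    shard_index=0,
)
# estimator.train(input_fn=train_input_fn, steps=10000)
```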
- """ - if not parse_fn: - raise ValueError("default_input_fn requires a parse_fn") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - dataset = stream_block_format_dataset( - files=files, - parse_fn=parse_fn, - batch_size=batch_size, - repeat=repeat, - num_threads=num_threads, - dataset_fn=dataset_fn, - keep_rate=keep_rate, - parts_downsampling_rate=parts_downsampling_rate, - shards=shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=shuffle_files, - interleave=interleave, +def data_record_input_fn( + files: List[str], + batch_size: int, + parse_fn: Callable, + num_threads: int = 2, + repeat: bool = False, + dataset_fn: Optional[Callable] = None, + keep_rate: Optional[float] = None, + parts_downsampling_rate: Optional[float] = None, + shards: Optional[int] = None, + shard_index: Optional[int] = None, + shuffle: bool = True, + shuffle_files: bool = True, + interleave: bool = True, + initializable: bool = False, + log_tf_data_summaries: bool = False, **kwargs - ) - - # Add a tf.data.experimental.StatsAggregator - # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator - if log_tf_data_summaries: - aggregator = tf.data.experimental.StatsAggregator() - options = tf.data.Options() - options.experimental_stats.aggregator = aggregator - dataset = dataset.with_options(options) - stats_summary = aggregator.get_summary() - tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) - - if initializable: - # when the data parsing dpends on some HashTable or Tensor, the iterator is initalizable and - # therefore we need to be run explicitly - iterator = dataset.make_initializable_iterator() - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) - else: - iterator = dataset.make_one_shot_iterator() - return iterator.get_next() +) -> tf.Tensor: + """ + Returns a nested structure of tf.Tensors containing the next element. + Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. + By default, works with DataRecord dataset for compressed partition files. + + Args: + files: + List of files that will be parsed. + batch_size: + number of samples per batch. + parse_fn: + function passed to data loading for parsing individual data records. + Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. + num_threads (optional): + number of threads used for loading data. Defaults to 2. + repeat (optional): + Repeat the dataset indefinitely. Defaults to False. + Useful when you want to use ``train_steps`` or ``eval_steps`` + greater than the size of the dataset + (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). + dataset_fn (optional): + A function that modifies the dataset after it reads different interleaved parts files. + Defaults to: + .. code-block:: python + def dataset_fn(dataset, parse_fn, batch_size): + return dataset.batch(batch_size).map(parse_fn, 1) + keep_rate (optional): + A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli + distribution with p = 1 - keep_rate. + Defaults to None (no records dropped). + parts_downsampling_rate (optional): + A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. + For example, a value of 0.2 means only 20 percent of part files become part of the dataset. + shards (optional): + Number of partitions to shard the dataset into. 
This is useful for codistillation + (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to + train on disjoint partitions of the dataset. + The dataset is not sharded by default. + shard_index (optional): + Which partition of the dataset to use if ``shards`` is set. + shuffle (optional): + Whether to shuffle the records. Defaults to True. + shuffle_files (optional): + Shuffle the list of files. Defaults to True. + When False, files are iterated in the order they are passed in. + interleave (optional): + Interleave records from multiple files in parallel. Defaults to True. + initializable (optional): + A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or + a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value (false) + is used for most plain iterators. + log_tf_data_summaries (optional): + A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the + tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output + events files. This requires that `initializable` is `True` above. + + Returns: + Iterator of elements of the dataset. + """ + if not parse_fn: + raise ValueError("default_input_fn requires a parse_fn") + + if log_tf_data_summaries and not initializable: + raise ValueError("Require `initializable` if `log_tf_data_summaries`.") + + dataset = stream_block_format_dataset( + files=files, + parse_fn=parse_fn, + batch_size=batch_size, + repeat=repeat, + num_threads=num_threads, + dataset_fn=dataset_fn, + keep_rate=keep_rate, + parts_downsampling_rate=parts_downsampling_rate, + shards=shards, + shard_index=shard_index, + shuffle=shuffle, + shuffle_files=shuffle_files, + interleave=interleave, + **kwargs + ) + + # Add a tf.data.experimental.StatsAggregator + # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator + if log_tf_data_summaries: + aggregator = tf.data.experimental.StatsAggregator() + options = tf.data.Options() + options.experimental_stats.aggregator = aggregator + dataset = dataset.with_options(options) + stats_summary = aggregator.get_summary() + tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) + + if initializable: + # when the data parsing dpends on some HashTable or Tensor, the iterator is initalizable and + # therefore we need to be run explicitly + iterator = dataset.make_initializable_iterator() + tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) + else: + iterator = dataset.make_one_shot_iterator() + return iterator.get_next() default_input_fn = data_record_input_fn # pylint: disable=invalid-name diff --git a/twml/twml/layers/__init__.py b/twml/twml/layers/__init__.py index 917c61867..ad7e798c5 100644 --- a/twml/twml/layers/__init__.py +++ b/twml/twml/layers/__init__.py @@ -9,13 +9,13 @@ from .batch_prediction_tensor_writer import BatchPredictionTensorWriter # noqa: F401 from .batch_prediction_writer import BatchPredictionWriter # noqa: F401 from .data_record_tensor_writer import DataRecordTensorWriter # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 -from .full_sparse import full_sparse, FullSparse # noqa: F401 +from .full_dense import FullDense, full_dense # noqa: F401 +from .full_sparse import FullSparse, full_sparse # noqa: F401 from .isotonic import Isotonic # noqa: F401 from .layer import Layer # noqa: F401 from .mdl import MDL # noqa: F401 from .partition import Partition # noqa: F401 from 
.percentile_discretizer import PercentileDiscretizer # noqa: F401 from .sequential import Sequential # noqa: F401 -from .sparse_max_norm import MaxNorm, sparse_max_norm, SparseMaxNorm # noqa: F401 +from .sparse_max_norm import MaxNorm, SparseMaxNorm, sparse_max_norm # noqa: F401 from .stitch import Stitch # noqa: F401 diff --git a/twml/twml/layers/batch_prediction_tensor_writer.py b/twml/twml/layers/batch_prediction_tensor_writer.py index 3f6633a8e..e5ca3c718 100644 --- a/twml/twml/layers/batch_prediction_tensor_writer.py +++ b/twml/twml/layers/batch_prediction_tensor_writer.py @@ -2,50 +2,58 @@ """ Implementing Writer Layer """ -from .layer import Layer +from typing import List, Tuple import libtwml +import tensorflow.compat.v1 as tf + +from .layer import Layer class BatchPredictionTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production) when model predictions are dense tensors. - - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionTensorWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """ + A layer that packages keys and dense tensors into a BatchPredictionResponse. + Typically used at the out of an exported model for use in a the PredictionEngine + (that is, in production) when model predictions are dense tensors. Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. + keys: keys to hashmap + Output: + output: + a BatchPredictionResponse serialized using Thrift into a uint8 tensor. """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. - Arguments: - values: - dense tensors corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_tensor_response_writer(self.keys, values) - return write_op + def __init__(self, keys: List[str], **kwargs): + super(BatchPredictionTensorWriter, self).__init__(**kwargs) + self.keys = keys + + def compute_output_shape( + self, input_shape: Tuple[tf.TensorShape] + ): # pylint: disable=unused-argument + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raise NotImplementedError. + """ + raise NotImplementedError + + def call( + self, values: List[tf.Tensor], **kwargs + ): # pylint: disable=unused-argument + """The logic of the layer lives here. 
+ + Args: + values: + dense tensors corresponding to keys in hashmap + + Returns: + The output from the layer + """ + write_op = libtwml.ops.batch_prediction_tensor_response_writer( + self.keys, values + ) + return write_op diff --git a/twml/twml/layers/batch_prediction_writer.py b/twml/twml/layers/batch_prediction_writer.py index 118d21921..15fd1379b 100644 --- a/twml/twml/layers/batch_prediction_writer.py +++ b/twml/twml/layers/batch_prediction_writer.py @@ -2,50 +2,57 @@ """ Implementing Writer Layer """ -from .layer import Layer +from typing import List, Tuple import libtwml +import tensorflow.compat.v1 as tf + +from .layer import Layer class BatchPredictionWriter(Layer): - """ - A layer that packages keys and values into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production). - - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """ + A layer that packages keys and values into a BatchPredictionResponse. + Typically used at the out of an exported model for use in a the PredictionEngine + (that is, in production). Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + keys: + keys to hashmap + Output: + output: + a BatchPredictionResponse serialized using Thrift into a uint8 tensor. + """ - Raise NotImplementedError. + def __init__( + self, keys: List[str], **kwargs + ): # pylint: disable=useless-super-delegation + super(BatchPredictionWriter, self).__init__(**kwargs) + self.keys = keys - """ - raise NotImplementedError + def compute_output_shape(self, input_shape: Tuple[tf.TensorShape]): + """Computes the output shape of the layer given the input shape. - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - Arguments: - values: - values corresponding to keys in hashmap + Raise NotImplementedError. - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values) - return write_op + """ + raise NotImplementedError + + def call( + self, values: List[tf.Tensor], **kwargs + ): # pylint: disable=unused-argument, arguments-differ + """The logic of the layer lives here. + + Args: + values: + values corresponding to keys in hashmap + + Returns: + The output from the layer + """ + write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values) + return write_op diff --git a/twml/twml/layers/data_record_tensor_writer.py b/twml/twml/layers/data_record_tensor_writer.py index 0f70186b4..7b80f9fa7 100644 --- a/twml/twml/layers/data_record_tensor_writer.py +++ b/twml/twml/layers/data_record_tensor_writer.py @@ -2,49 +2,53 @@ """ Implementing Writer Layer """ -from .layer import Layer +from typing import Tuple import libtwml +import tensorflow.compat.v1 as tf +from .layer import Layer -class DataRecordTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a DataRecord. 
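
All three writer layers in this file follow one template: hold ``keys`` at construction, refuse static shape inference, and emit a single serialized tensor from ``call``. A libtwml-free sketch of that pattern — the serialization below is a stand-in for the Thrift writer ops, not their actual behavior:

```python
from typing import List

import tensorflow.compat.v1 as tf

class KeyedResponseWriter(tf.keras.layers.Layer):
    """Illustrative stand-in: pairs fixed keys with per-batch tensors."""

    def __init__(self, keys: List[str], **kwargs):
        super(KeyedResponseWriter, self).__init__(**kwargs)
        self.keys = keys  # the real ops pass these to the Thrift writer

    def compute_output_shape(self, input_shape):
        raise NotImplementedError  # serialized output has no static shape

    def call(self, values):
        # Stand-in for libtwml.ops.batch_prediction_response_writer(keys, values):
        # stack the named tensors and serialize the whole batch to one string.
        return tf.io.serialize_tensor(tf.stack(values, axis=0))

writer = KeyedResponseWriter(keys=["output", "score"])
serialized = writer([tf.zeros([4]), tf.ones([4])])  # scalar string tensor
```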
- This layer was initially added to support exporting user embeddings as tensors. - Arguments: - keys: - keys to hashmap - Output: - output: - a DataRecord serialized using Thrift into a uint8 tensor - """ +class DataRecordTensorWriter(Layer): + """ + A layer that packages keys and dense tensors into a DataRecord. + This layer was initially added to support exporting user embeddings as tensors. - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(DataRecordTensorWriter, self).__init__(**kwargs) - self.keys = keys + Args: + keys: + keys to hashmap + Output: + output: + a DataRecord serialized using Thrift into a uint8 tensor + """ - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation + super(DataRecordTensorWriter, self).__init__(**kwargs) + self.keys = keys - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + def compute_output_shape(self, input_shape: Tuple[tf.TensorShape]): + """Computes the output shape of the layer given the input shape. - Raises NotImplementedError. + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - """ - raise NotImplementedError + Raises NotImplementedError. + """ + raise NotImplementedError - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. + def call( + self, values: Tuple[tf.Tensor], **kwargs + ) -> tf.Tensor: # pylint: disable=unused-argument, arguments-differ + """The logic of the layer lives here. - Arguments: - values: - dense tensors corresponding to keys in hashmap + Args: + values: + dense tensors corresponding to keys in hashmap - Returns: - The output from the layer - """ - write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) - return write_op + Returns: + The output from the layer + """ + write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) + return write_op diff --git a/twml/twml/layers/full_dense.py b/twml/twml/layers/full_dense.py index 9c354ad3e..d08559a6d 100644 --- a/twml/twml/layers/full_dense.py +++ b/twml/twml/layers/full_dense.py @@ -2,258 +2,264 @@ """ Implementing Full Dense Layer """ -from tensorflow.python.layers import core as core_layers -from tensorflow.python.ops import init_ops +from typing import Callable, Optional + +import tensorflow.compat.v1 as tf from tensorflow.python.framework import tensor_shape from tensorflow.python.keras.engine.base_layer import InputSpec -import tensorflow.compat.v1 as tf +from tensorflow.python.layers import core as core_layers +from tensorflow.python.ops import init_ops class FullDense(core_layers.Dense): - """ - Densely-connected layer class. - This is wrapping tensorflow.python.layers.core.Dense - This layer implements the operation: - - .. code-block:: python + """ + Densely-connected layer class. + This is wrapping tensorflow.python.layers.core.Dense + This layer implements the operation: - outputs = activation(inputs.weight + bias) + .. code-block:: python - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. 
+ outputs = activation(inputs.weight + bias) - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - weight_initializer: - Initializer function for the weight matrix. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + Where ``activation`` is the activation function passed as the ``activation`` + argument (if not ``None``), ``weight`` is a weights matrix created by the layer, + and ``bias`` is a bias vector created by the layer. - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weight: - Weight matrix (TensorFlow variable or tensor). (weight) - bias: - Bias vector, if applicable (TensorFlow variable or tensor). - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. + Args: + output_size (int): + Integer or Long, dimensionality of the output space. + activation (callable): + Activation function (callable). Set it to None to maintain a linear activation. + weight_initializer (callable): + Initializer function for the weight matrix. + bias_initializer (callable): + Initializer function for the bias. + weight_regularizer (callable): + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer (callable): + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer (callable): + Regularizer function for the output. + weight_constraint (callable): + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). 
The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint (callable): + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable (bool): + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name (str): + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - """ + Properties: + output_size (int): + Python integer, dimensionality of the output space. + activation (callable): + Activation function (callable). + weight_initializer (Initializer): + Initializer instance (or name) for the weight matrix. + bias_initializer (Initializer): + Initializer instance (or name) for the bias. + weight (TensorFlow variable or tensor): + Weight matrix (TensorFlow variable or tensor). (weight) + bias (TensorFlow variable or tensor): + Bias vector, if applicable (TensorFlow variable or tensor). + weight_regularizer (Regularizer): + Regularizer instance for the weight matrix (callable) + bias_regularizer (Regularizer): + Regularizer instance for the bias (callable). + activity_regularizer (Regularizer): + Regularizer instance for the output (callable) + weight_constraint (Constraint): + Constraint function for the weight matrix. + bias_constraint (Constraint): + Constraint function for the bias. + """ - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=None, - **kwargs): - super(FullDense, self).__init__(units=output_size, - kernel_initializer=weight_initializer, - kernel_regularizer=weight_regularizer, - kernel_constraint=weight_constraint, - **kwargs) - self._num_partitions = num_partitions + def __init__( + self, + output_size: int, + weight_initializer: Optional[Callable[[int], tf.Tensor]] = None, + weight_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + weight_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + bias_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + num_partitions: Optional[int] = None, + **kwargs + ): + super(FullDense, self).__init__( + units=output_size, + kernel_initializer=weight_initializer, + kernel_regularizer=weight_regularizer, + kernel_constraint=weight_constraint, + **kwargs + ) + self._num_partitions = num_partitions - def build(self, input_shape): - ''' - code adapted from TF 1.12 Keras Dense layer: - https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 - ''' - input_shape = tensor_shape.TensorShape(input_shape) - if input_shape[-1] is None: - raise ValueError('The last dimension of the inputs to `Dense` ' - 'should be defined. Found `None`.') - self.input_spec = InputSpec(min_ndim=2, - axes={-1: input_shape[-1]}) + def build(self, input_shape: tf.TensorShape): + """ + code adapted from TF 1.12 Keras Dense layer: + https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 + """ + input_shape = tensor_shape.TensorShape(input_shape) + if input_shape[-1] is None: + raise ValueError( + "The last dimension of the inputs to `Dense` " + "should be defined. Found `None`." 
+ ) + self.input_spec = InputSpec(min_ndim=2, axes={-1: input_shape[-1]}) - partitioner = None - if self._num_partitions: - partitioner = tf.fixed_size_partitioner(self._num_partitions) + partitioner = None + if self._num_partitions: + partitioner = tf.fixed_size_partitioner(self._num_partitions) - self.kernel = self.add_weight( - 'kernel', - shape=[input_shape[-1], self.units], - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - dtype=self.dtype, - partitioner=partitioner, - trainable=True) + self.kernel = self.add_weight( + "kernel", + shape=[input_shape[-1], self.units], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + dtype=self.dtype, + partitioner=partitioner, + trainable=True, + ) - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=[self.units, ], - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - self.built = True + if self.use_bias: + self.bias = self.add_weight( + "bias", + shape=[ + self.units, + ], + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + dtype=self.dtype, + trainable=True, + ) + else: + self.bias = None + self.built = True - @property - def output_size(self): - """ - Returns output_size - """ - return self.units + @property + def output_size(self) -> int: + """Returns output_size.""" + return self.units - @property - def weight(self): - """ - Returns weight - """ - return self.kernel + @property + def weight(self) -> tf.Tensor: + """Returns weight.""" + return self.kernel - @property - def weight_regularizer(self): - """ - Returns weight_regularizer - """ - return self.kernel_regularizer + @property + def weight_regularizer(self) -> Callable[[tf.Tensor], tf.Tensor]: + """Returns weight_regularizer.""" + return self.kernel_regularizer - @property - def weight_initializer(self): - """ - Returns weight_initializer - """ - return self.kernel_initializer + @property + def weight_initializer(self) -> Callable[[int], tf.Tensor]: + """Returns weight_initializer.""" + return self.kernel_initializer - @property - def weight_constraint(self): - """ - Returns weight_constraint - """ - return self.kernel_constraint + @property + def weight_constraint(self) -> Callable[[tf.Tensor], tf.Tensor]: + """Returns weight_constraint.""" + return self.kernel_constraint -def full_dense(inputs, output_size, - activation=None, - use_bias=True, - weight_initializer=None, - bias_initializer=init_ops.zeros_initializer(), - weight_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - weight_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - num_partitions=None, - reuse=None): - """Functional interface for the densely-connected layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). 
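
The ``num_partitions`` path in ``build`` relies on TF1 variable partitioning. A self-contained sketch of what ``tf.fixed_size_partitioner`` does to a weight variable (sizes illustrative):

```python
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # TF1-style variable partitioning is graph-mode

with tf.variable_scope("dense", partitioner=tf.fixed_size_partitioner(num_shards=4)):
    # A [1024, 256] kernel stored as four row-wise [256, 256] shards, which
    # spreads the variable across parameter servers in distributed training.
    kernel = tf.get_variable("kernel", shape=[1024, 256])

print(len(list(kernel)))  # 4 -- a PartitionedVariable is iterable over shards
```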
+def full_dense( + inputs: tf.Tensor, + output_size: int, + activation: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_bias: bool = True, + weight_initializer: Optional[Callable[[int], tf.Tensor]] = None, + bias_initializer: Callable[[int], tf.Tensor] = init_ops.zeros_initializer(), + weight_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + bias_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + activity_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + weight_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + bias_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + trainable: bool = True, + name: Optional[str] = None, + num_partitions: Optional[int] = None, + reuse: bool = False, +) -> tf.Tensor: + """ + Functional interface for the densely-connected layer. + This layer implements the operation: + `outputs = activation(inputs.weight + bias)` + Where `activation` is the activation function passed as the `activation` + argument (if not `None`), `weight` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). - Arguments: - inputs: Tensor input. - units: Integer or Long, dimensionality of the output space. - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. + Args: + inputs: Tensor input. + units: Integer or Long, dimensionality of the output space. + activation: Activation function (callable). Set it to None to maintain a linear activation. + use_bias: Boolean, whether the layer uses a bias. + weight_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.get_variable`. + bias_initializer: + Initializer function for the bias. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. 
+ Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: + Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: + String, the name of the layer. + reuse: + Boolean, whether to reuse the weights of a previous layer + by the same name. - Returns: - Output tensor the same shape as `inputs` except the last dimension is of - size `units`. + Returns: + Output tensor the same shape as `inputs` except the last dimension is of + size `units`. - Raises: - ValueError: if eager execution is enabled. - """ - layer = FullDense(output_size, - activation=activation, - use_bias=use_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - dtype=inputs.dtype.base_dtype, - num_partitions=num_partitions, - _scope=name, - _reuse=reuse) - return layer.apply(inputs) + Raises: + ValueError: if eager execution is enabled. + """ + layer = FullDense( + output_size, + activation=activation, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + weight_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + weight_constraint=weight_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + dtype=inputs.dtype.base_dtype, + num_partitions=num_partitions, + _scope=name, + _reuse=reuse, + ) + return layer.apply(inputs) diff --git a/twml/twml/layers/full_sparse.py b/twml/twml/layers/full_sparse.py index 4f0f21930..27701ac49 100644 --- a/twml/twml/layers/full_sparse.py +++ b/twml/twml/layers/full_sparse.py @@ -4,367 +4,400 @@ """ import math +from typing import Callable, List, Optional, Tuple, Union +import tensorflow.compat.v1 as tf from twitter.deepbird.sparse import sparse_dense_matmul -from .layer import Layer - -import tensorflow.compat.v1 as tf import twml +from .layer import Layer -class FullSparse(Layer): - """Fully-sparse layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Arguments: - output_size: - Long or Integer, dimensionality of the output space. - input_size: - The number of input units. (Deprecated) - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. 
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - This argument defaults to tf.constant_initializer(1/output_size) - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - use_binary_sparse_dense_matmul: - If binary sparse dense matmul op is to be used. It will only be enabled if - `use_binary_values` is set true. It only should be used for inference, best practice is - to set `use_binary_sparse_dense_matmul = not is_training`. - """ - - def __init__(self, - output_size, - input_size=None, - weight_initializer=None, - activation=None, - bias_initializer=None, - trainable=True, - name=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - bias_regularizer=None, - weight_regularizer=None, - use_compression=False, - use_binary_sparse_dense_matmul=False, - **kwargs): - super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs) - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - # The bias initialization and weights initialization is set to match v1's implementation. - if bias_initializer is None: - bias_initializer = tf.constant_initializer(1 / output_size) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. 
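Stepping back to the full_dense wrapper above, a short usage sketch; the shapes are hypothetical and it assumes twml.layers re-exports full_dense:

import numpy as np
import tensorflow.compat.v1 as tf
from twml.layers import full_dense

tf.disable_v2_behavior()

inputs = tf.placeholder(tf.float32, shape=[None, 16])
# Project 16 input features to 8 outputs with a ReLU non-linearity.
outputs = full_dense(inputs, output_size=8, activation=tf.nn.relu, name="proj")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(outputs, {inputs: np.ones([2, 16], np.float32)}).shape)  # (2, 8)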
- if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.bias_initializer = bias_initializer - self.output_size = output_size - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.num_partitions = num_partitions - if partition_axis != 0 and partition_axis != 1: - raise ValueError('partition_axis must be 0 or 1') - self.partition_axis = partition_axis - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.bias_regularizer = bias_regularizer - self._use_compression = use_compression - self._cast_indices_dtype = tf.int32 if self._use_compression else None - self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul - - def _make_weight_var(self, shape, partitioner): - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - partitioner=partitioner, - ) - def build(self, input_shapes): - """ - creates the ``bias`` and ``weight`` Variables - of shape ``[output_size]`` and ``[input_size, output_size]`` respectively. - """ +class FullSparse(Layer): + """Fully-sparse layer class. + This layer implements the operation: - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." % input_shapes) - else: - input_shape = input_shapes - - self.bias = self.add_variable( - 'bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - shape=[self.output_size, ], - dtype=self.dtype, - trainable=True - ) + .. code-block:: python - partitioner = None - shape = [input_shape[1], self.output_size] - - # There is a 2gb limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - num_partitions = 1 if self.num_partitions is None else self.num_partitions - in_shape = input_shape[1] - out_shape = self.output_size - - # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. - if isinstance(in_shape, tf.Dimension): - in_shape = in_shape.value - - if in_shape is None: - raise ValueError("Input tensor should have shape." 
- " You can set it using twml.util.limit_sparse_tensor_size") - - (split_dim, other_dim) = (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) - requested_size = math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor partitions cannot be larger than 2GB.\n" - "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" - "Possible solutions:\n" - "- reduce the params.output_size_bits\n" - "- reduce the output_size of the sparse_layer\n" - "- specify a larger num_partitions argument\n" - "- reduce input_size_bits" % - (in_shape, self.output_size, dtype.name, requested_size, num_partitions)) - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self._make_weight_var(shape, partitioner) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + outputs = activation(inputs.weight + bias) Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - + output_size: + Long or Integer, dimensionality of the output space. + weight_initializer: + Initializer function for the weight matrix. + This argument defaults to zeros_initializer(). + This is valid when the FullSparse is the first layer of + parameters but should be changed otherwise. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect + activation: + Activation function (callable). Set it to None to maintain a linear activation. + bias_initializer: + Initializer function for the bias. + This argument defaults to tf.constant_initializer(1/output_size) + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column) + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + This parameter can also be a list of binary values if `inputs` passed to `call` a list. 
+      use_compression:
+        Default False. Set True to enable data compression techniques for
+        optimization of network traffic for distributed training.
+      use_binary_sparse_dense_matmul:
+        If the binary sparse dense matmul op is to be used. It will only be enabled if
+        `use_binary_values` is set true. It should only be used for inference; best practice
+        is to set `use_binary_sparse_dense_matmul = not is_training`.
+    """
+
+    def __init__(
+        self,
+        output_size: int,
+        weight_initializer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        activation: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        bias_initializer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        trainable: bool = True,
+        name: Optional[str] = None,
+        use_sparse_grads: bool = True,
+        num_partitions: Optional[int] = None,
+        partition_axis: int = 0,
+        use_binary_values: bool = False,
+        bias_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        weight_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        use_compression: bool = False,
+        use_binary_sparse_dense_matmul: bool = False,
+        **kwargs
+    ):
+        super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs)
+
+        # The bias initialization and weights initialization are set to match v1's implementation.
+        if bias_initializer is None:
+            bias_initializer = tf.constant_initializer(1 / output_size)
+        # Weights initialization is set to 0s. This is safe for full sparse layers because
+        # you are supposed to learn your embedding from the label.
+        if weight_initializer is None:
+            weight_initializer = tf.zeros_initializer()
+        self.weight_initializer = weight_initializer
+        self.bias_initializer = bias_initializer
+        self.output_size = output_size
+        # Default to a linear activation (None), as documented above; callers that
+        # want a non-linearity pass it explicitly.
+        self.activation = activation
+        self.use_sparse_grads = use_sparse_grads
+        self.num_partitions = num_partitions
+        if partition_axis != 0 and partition_axis != 1:
+            raise ValueError("partition_axis must be 0 or 1")
+        self.partition_axis = partition_axis
+        self.use_binary_values = use_binary_values
+        self.weight_regularizer = weight_regularizer
+        self.bias_regularizer = bias_regularizer
+        self._use_compression = use_compression
+        self._cast_indices_dtype = tf.int32 if self._use_compression else None
+        self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul
+
+    # `partitioner` is a callable such as tf.fixed_size_partitioner (TF1 exposes no
+    # public tf.Partitioner class to annotate with), or None for no partitioning.
+    def _make_weight_var(
+        self,
+        shape: Optional[List[int]],
+        partitioner: Optional[Callable],
+    ) -> None:
+        self.weight = self.add_variable(
+            "weight",
+            initializer=self.weight_initializer,
+            regularizer=self.weight_regularizer,
+            shape=shape,
+            dtype=self.dtype,
+            trainable=True,
+            partitioner=partitioner,
+        )
+
+    def build(
+        self,
+        input_shapes: Union[tf.TensorShape, Tuple[tf.TensorShape, ...], List[tf.TensorShape]],
+    ) -> None:
+        """
+        Creates the ``bias`` and ``weight`` Variables
+        of shape ``[output_size]`` and ``[input_size, output_size]`` respectively.
+        """
+
+        if isinstance(input_shapes, (list, tuple)):
+            input_shape = input_shapes[0]
+            is_compatible = True
+            for other_shape in input_shapes[1:]:
+                is_compatible &= input_shape.is_compatible_with(other_shape)
+            if not is_compatible:
+                raise ValueError("Input shapes %s are not compatible."
% input_shapes) + else: + input_shape = input_shapes + + self.bias = self.add_variable( + "bias", + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + shape=[ + self.output_size, + ], + dtype=self.dtype, + trainable=True, + ) + + partitioner = None + shape = [input_shape[1], self.output_size] + + # There is a 2gb limitation for each tensor because of protobuf. + # 2**30 is 1GB. 2 * (2**30) is 2GB. + dtype = tf.as_dtype(self.dtype) + num_partitions = 1 if self.num_partitions is None else self.num_partitions + in_shape = input_shape[1] + out_shape = self.output_size + + # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. + if isinstance(in_shape, tf.Dimension): + in_shape = in_shape.value + + if in_shape is None: + raise ValueError( + "Input tensor should have shape." + " You can set it using twml.util.limit_sparse_tensor_size" + ) + + (split_dim, other_dim) = ( + (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) + ) + requested_size = ( + math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size + ) + if requested_size >= 1 << 31: + raise ValueError( + "Weight tensor partitions cannot be larger than 2GB.\n" + "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" + "Possible solutions:\n" + "- reduce the params.output_size_bits\n" + "- reduce the output_size of the sparse_layer\n" + "- specify a larger num_partitions argument\n" + "- reduce input_size_bits" + % ( + in_shape, + self.output_size, + dtype.name, + requested_size, + num_partitions, + ) + ) + + if self.num_partitions: + partition_axis = int(self.partition_axis) + partitioner = tf.fixed_size_partitioner( + self.num_partitions, axis=partition_axis + ) + else: + # Regular variables do not like it when you pass both constant tensors and shape + if not callable(self.weight_initializer): + shape = None + + self._make_weight_var(shape, partitioner) + + self.built = True + + def compute_output_shape(self, input_shape: tf.TensorShape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + """ + raise NotImplementedError + + def call( + self, inputs: Union[List[tf.SparseTensor], Tuple[tf.SparseTensor]], **kwargs + ) -> tf.Tensor: # pylint: disable=unused-argument + """The logic of the layer lives here. + + Args: + inputs: + A SparseTensor or a list of SparseTensors. + If `inputs` is a list, all tensors must have same `dense_shape`. + + Returns: + - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. + - If `inputs` is a `list[SparseTensor`, then returns + `bias + add_n([sp_a * dense_b for sp_a in inputs])`. 
+ + """ + if isinstance(inputs, (list, tuple)): + if isinstance(self.use_binary_values, (list, tuple)): + use_binary_values = self.use_binary_values + else: + use_binary_values = [self.use_binary_values] * len(inputs) + + num_inputs = len(inputs) + if num_inputs != len(use_binary_values): + raise ValueError( + "#inputs is %d while #use_binary_values is %d" + % (num_inputs, len(use_binary_values)) + ) + + outputs = [] + for n in range(num_inputs): + outputs.append( + sparse_dense_matmul( + inputs[n], + self.weight, + self.use_sparse_grads, + use_binary_values[n], + name="sparse_mm_" + str(n), + partition_axis=self.partition_axis, + num_partitions=self.num_partitions, + compress_ids=self._use_compression, + cast_indices_dtype=self._cast_indices_dtype, + use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul, + ) + ) + outputs = tf.accumulate_n(outputs) + else: + if isinstance(self.use_binary_values, (list, tuple)): + raise ValueError( + "use_binary_values can not be %s when inputs is %s" + % (type(self.use_binary_values), type(inputs)) + ) + + outputs = sparse_dense_matmul( + inputs, + self.weight, + self.use_sparse_grads, + self.use_binary_values, + name="sparse_mm", + partition_axis=self.partition_axis, + num_partitions=self.num_partitions, + compress_ids=self._use_compression, + cast_indices_dtype=self._cast_indices_dtype, + use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul, + ) + + if self.bias is not None: + outputs = tf.nn.bias_add(outputs, self.bias) + + if self.activation is not None: + return self.activation(outputs) # pylint: disable=not-callable + return outputs - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + add_n([sp_a * dense_b for sp_a in inputs])`. +def full_sparse( + inputs: tf.SparseTensor, + output_size: int, + activation: Optional[Callable] = None, + bias_regularizer: Optional[Callable] = None, + weight_regularizer: Optional[Callable] = None, + bias_initializer: Optional[Callable] = None, + weight_initializer: Optional[Callable] = None, + trainable: bool = True, + name: Optional[str] = None, + reuse: Optional[bool] = None, + use_sparse_grads: bool = True, + num_partitions: Optional[int] = None, + partition_axis: int = 0, + use_binary_values: bool = False, + use_compression: bool = False, +) -> FullSparse: """ - if isinstance(inputs, (list, tuple)): + Functional interface for the sparsely-connected layer. 
- if isinstance(self.use_binary_values, (list, tuple)): - use_binary_values = self.use_binary_values - else: - use_binary_values = [self.use_binary_values] * len(inputs) - - num_inputs = len(inputs) - if num_inputs != len(use_binary_values): - raise ValueError("#inputs is %d while #use_binary_values is %d" - % (num_inputs, len(use_binary_values))) - - outputs = [] - for n in range(num_inputs): - outputs.append(sparse_dense_matmul(inputs[n], self.weight, - self.use_sparse_grads, - use_binary_values[n], - name='sparse_mm_' + str(n), - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul)) - outputs = tf.accumulate_n(outputs) - else: - - if isinstance(self.use_binary_values, (list, tuple)): - raise ValueError("use_binary_values can not be %s when inputs is %s" % - (type(self.use_binary_values), type(inputs))) - - outputs = sparse_dense_matmul(inputs, self.weight, - self.use_sparse_grads, - self.use_binary_values, - name='sparse_mm', - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul) - - if self.bias is not None: - outputs = tf.nn.bias_add(outputs, self.bias) - - if self.activation is not None: - return self.activation(outputs) # pylint: disable=not-callable - return outputs + Args: + inputs: + A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) + output_size: + Long or Integer, dimensionality of the output space. + weight_initializer: + Initializer function for the weight matrix. + activation: + Activation function (callable). Set it to None to maintain a linear activation. + bias_initializer: + Initializer function for the bias. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column) + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + use_compression: + Default False. 
Set True to enable data compression techniques for + optimization of network traffic for distributed training. + Returns: + Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. + """ + dtype = None + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + dtype = inputs.dtype.base_dtype -def full_sparse( - inputs, output_size, - input_size=None, - activation=None, - bias_regularizer=None, - weight_regularizer=None, - bias_initializer=None, - weight_initializer=None, - trainable=True, - name=None, - reuse=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - use_compression=False): - """Functional interface for the sparsely-connected layer. - - Arguments: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - output_size: - Long or Integer, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - Returns: - Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. - """ - # TODO - remove input_size warning. 
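To make the 2 GB partition guard in FullSparse.build concrete, a worked example of the requested_size arithmetic with hypothetical dimensions:

import math

# Hypothetical: float32 weights (4 bytes), 2**28 input rows, 64 outputs,
# partitioned row-wise into 8 shards.
in_shape, out_shape, num_partitions, dtype_size = 2**28, 64, 8, 4
requested_size = math.ceil(in_shape / num_partitions) * out_shape * dtype_size
print(requested_size)              # 8589934592 bytes per shard
print(requested_size >= 1 << 31)   # True -> build() raises ValueError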
- if input_size: - raise ValueError('input_size is deprecated - it is now \ - automatically inferred from your input.') - - dtype = None - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - dtype = inputs.dtype.base_dtype - - if isinstance(inputs, (list, tuple)): - inputs = [inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs] - dtype = inputs[0].dtype.base_dtype - - layer = FullSparse(output_size=output_size, - activation=activation, - trainable=trainable, - name=name, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - dtype=dtype, - _scope=name, - _reuse=reuse, - use_sparse_grads=use_sparse_grads, - num_partitions=num_partitions, - partition_axis=partition_axis, - use_compression=use_compression, - use_binary_values=use_binary_values) - return layer(inputs) + if isinstance(inputs, (list, tuple)): + inputs = [ + inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs + ] + dtype = inputs[0].dtype.base_dtype + + layer = FullSparse( + output_size=output_size, + activation=activation, + trainable=trainable, + name=name, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + weight_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + dtype=dtype, + _scope=name, + _reuse=reuse, + use_sparse_grads=use_sparse_grads, + num_partitions=num_partitions, + partition_axis=partition_axis, + use_compression=use_compression, + use_binary_values=use_binary_values, + ) + return layer(inputs) diff --git a/twml/twml/layers/isotonic.py b/twml/twml/layers/isotonic.py index 7113f7af4..9c1bed62a 100644 --- a/twml/twml/layers/isotonic.py +++ b/twml/twml/layers/isotonic.py @@ -3,74 +3,93 @@ Contains the Isotonic Layer """ -from .layer import Layer +from typing import Optional import libtwml import numpy as np +import tensorflow.compat.v1 as tf + +from .layer import Layer class Isotonic(Layer): - """ - This layer is created by the IsotonicCalibrator. - Typically it is used intead of sigmoid activation on the output unit. - - Arguments: - n_unit: - number of input units to the layer (same as number of output units). - n_bin: - number of bins used for isotonic calibration. - More bins means a more precise isotonic function. - Less bins means a more regularized isotonic function. - xs_input: - A tensor containing the boundaries of the bins. - ys_input: - A tensor containing calibrated values for the corresponding bins. - - Output: - output: - A layer containing calibrated probabilities with same shape and size as input. - Expected Sizes: - xs_input, ys_input: - [n_unit, n_bin]. - Expected Types: - xs_input, ys_input: - same as input. - """ - - def __init__(self, n_unit, n_bin, xs_input=None, ys_input=None, **kwargs): - super(Isotonic, self).__init__(**kwargs) - - self._n_unit = n_unit - self._n_bin = n_bin - - self.xs_input = np.empty([n_unit, n_bin], dtype=np.float32) if xs_input is None else xs_input - self.ys_input = np.empty([n_unit, n_bin], dtype=np.float32) if ys_input is None else ys_input - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """ + This layer is created by the IsotonicCalibrator. + Typically it is used instead of sigmoid activation on the output unit. Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. 
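A usage sketch for the refactored full_sparse wrapper above; the tensors are toy values, and it assumes twml, libtwml, and the deepbird sparse ops are importable:

import tensorflow.compat.v1 as tf
from twml.layers import full_sparse

tf.disable_v2_behavior()

# A 2 x 4 sparse binary batch projected to 3 outputs.
sp = tf.SparseTensor(indices=[[0, 0], [1, 3]], values=[1.0, 1.0], dense_shape=[2, 4])
logits = full_sparse(sp, output_size=3, use_binary_values=True, name="sparse_proj")
# A list of SparseTensors sharing one dense_shape is also accepted; their
# projections are summed with tf.accumulate_n inside FullSparse.call.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(logits).shape)  # (2, 3)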
- + n_unit: + number of input units to the layer (same as number of output units). + n_bin: + number of bins used for isotonic calibration. + More bins means a more precise isotonic function. + Less bins means a more regularized isotonic function. + xs_input: + A tensor containing the boundaries of the bins. + ys_input: + A tensor containing calibrated values for the corresponding bins. + + Output: + output: + A layer containing calibrated probabilities with same shape and size as input. + Expected Sizes: + xs_input, ys_input: + [n_unit, n_bin]. + Expected Types: + xs_input, ys_input: + same as input. """ - raise NotImplementedError - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the variables of the layer.""" - - self.built = True - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: input tensor(s). - - Returns: - The output from the layer - """ - calibrate_op = libtwml.ops.isotonic_calibration(inputs, self.xs_input, self.ys_input) - return calibrate_op + def __init__( + self, + n_unit: int, + n_bin: int, + xs_input: Optional[np.ndarray] = None, + ys_input: Optional[np.ndarray] = None, + **kwargs, + ): + super(Isotonic, self).__init__(**kwargs) + + self._n_unit = n_unit + self._n_bin = n_bin + + self.xs_input = ( + np.empty([n_unit, n_bin], dtype=np.float32) + if xs_input is None + else xs_input + ) + self.ys_input = ( + np.empty([n_unit, n_bin], dtype=np.float32) + if ys_input is None + else ys_input + ) + + def compute_output_shape(self, input_shape: tf.TensorShape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def build(self, input_shape: tf.TensorShape): # pylint: disable=unused-argument + """Creates the variables of the layer.""" + self.built = True + + def call(self, inputs: tf.Tensor, **kwargs): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Args: + inputs: input tensor(s). + + Returns: + The output from the layer + """ + calibrate_op = libtwml.ops.isotonic_calibration( + inputs, self.xs_input, self.ys_input + ) + return calibrate_op diff --git a/twml/twml/layers/layer.py b/twml/twml/layers/layer.py index c1b00eb13..d798c9e5a 100644 --- a/twml/twml/layers/layer.py +++ b/twml/twml/layers/layer.py @@ -2,49 +2,52 @@ """ Implementing a base layer for twml """ +from typing import List, Union + import tensorflow.compat.v1 as tf from tensorflow.python.layers import base class Layer(base.Layer): - """ - Base Layer implementation for twml. - Overloads `twml.layers.Layer - `_ - from tensorflow and adds a couple of custom methods. - """ - - @property - def init(self): - """ - Return initializer ops. By default returns tf.no_op(). - This method is overwritten by classes like twml.layers.MDL, which - uses a HashTable internally, that must be initialized with its own op. """ - return tf.no_op() - - def call(self, inputs, **kwargs): - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). - **kwargs: - additional keyword arguments. - - Returns: - Output tensor(s). + Base Layer implementation for twml. + Overloads `twml.layers.Layer + `_ + from tensorflow and adds a couple of custom methods. 
""" - raise NotImplementedError - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError + @property + def init(self) -> tf.Operation: + """ + Return initializer ops. By default returns tf.no_op(). + This method is overwritten by classes like twml.layers.MDL, which + uses a HashTable internally, that must be initialized with its own op. + """ + return tf.no_op() + + def call( + self, inputs: Union[tf.Tensor, List[tf.Tensor]], **kwargs + ) -> tf.Tensor: # pylint: disable=arguments-differ + """The logic of the layer lives here. + + Args: + inputs: + input tensor(s). + **kwargs: + additional keyword arguments. + + Returns: + Output tensor(s). + """ + raise NotImplementedError + + def compute_output_shape(self, input_shape: tf.TensorShape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raise NotImplementedError. + """ + raise NotImplementedError diff --git a/twml/twml/layers/mdl.py b/twml/twml/layers/mdl.py index cf4018afa..7202c0a45 100644 --- a/twml/twml/layers/mdl.py +++ b/twml/twml/layers/mdl.py @@ -4,253 +4,279 @@ """ -from .layer import Layer -from .partition import Partition -from .stitch import Stitch +from typing import Optional import libtwml import numpy as np import tensorflow.compat.v1 as tf + import twml +from .layer import Layer +from .partition import Partition +from .stitch import Stitch + class MDL(Layer): # noqa: T000 - """ - MDL layer is constructed by MDLCalibrator after accumulating data - and performing minimum description length (MDL) calibration. - - MDL takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an MDL bin. - Each MDL input feature is converted to n_bin bins. - Each MDL calibration tries to find bin delimiters such that the number of features values - per bin is roughly equal (for each given MDL feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, **kwargs): """ - Creates a non-initialized `MDL` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during MDL calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of MDL bins used for MDL calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that MDL discretizes and knows about. 
- The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: MDL vs non-MDL - 2. transate the MDL features into a hash_feature ID that MDL understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for MDL. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the MDL features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). + MDL layer is constructed by MDLCalibrator after accumulating data + and performing minimum description length (MDL) calibration. + + MDL takes sparse continuous features and converts then to sparse + binary features. Each binary output feature is associated to an MDL bin. + Each MDL input feature is converted to n_bin bins. + Each MDL calibration tries to find bin delimiters such that the number of features values + per bin is roughly equal (for each given MDL feature). + Note that if an input feature is rarely used, so will its associated output bin/features. """ - super(MDL, self).__init__(**kwargs) - tf.logging.warning("MDL will be deprecated. Please use PercentileDiscretizer instead") - - max_mdl_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - self._hash_keys_initializer = tf.constant_initializer( - hash_keys if hash_keys is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._hash_values_initializer = tf.constant_initializer( - hash_values if hash_values is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_ids_initializer = tf.constant_initializer( - bin_ids if bin_ids is not None - else np.empty(max_mdl_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_values_initializer = tf.constant_initializer( - bin_values if bin_values is not None - else np.empty(max_mdl_feature, dtype=np.float32), - dtype=np.float32 - ) - self._feature_offsets_initializer = tf.constant_initializer( - feature_offsets if feature_offsets is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - - # note that calling build here is an exception as typically __call__ would call build(). - # We call it here because we need to initialize hash_map. - # Also note that the variable_scope is set by add_variable in build() - if not self.built: - self.build(input_shape=None) - - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. 
- """ - - # build layers - self.partition = Partition() - self.stitch = Stitch() - - # build variables - - hash_keys = self.add_variable( - 'hash_keys', - initializer=self._hash_keys_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - hash_values = self.add_variable( - 'hash_values', - initializer=self._hash_values_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # hashmap converts known features into range [0, n_feature) - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - - self.bin_ids = self.add_variable( - 'bin_ids', - initializer=self._bin_ids_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.int64, - trainable=False) - - self.bin_values = self.add_variable( - 'bin_values', - initializer=self._bin_values_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.float32, - trainable=False) - - self.feature_offsets = self.add_variable( - 'feature_offsets', - initializer=self._feature_offsets_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements MDL inference where inputs are intersected with a hash_map. - Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. - This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to MDL for discretization. - It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
- """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - # get intersect(keys, hash_map) - hashed_keys = self.hash_map.lookup(keys) - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_mdl_keys, mdl_in_keys = key - non_mdl_vals, mdl_in_vals = vals - - self.non_mdl_keys = non_mdl_keys - - # run MDL on the keys/values it knows about - mdl_keys, mdl_vals = libtwml.ops.mdl(mdl_in_keys, mdl_in_vals, self.bin_ids, self.bin_values, - self.feature_offsets) - - # handle output ID conflicts - mdl_size = tf.size(self.bin_ids, out_type=tf.int64) - non_mdl_size = tf.subtract(self.output_size, mdl_size) - non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size) - - # Stitch the keys and values from mdl and non mdl indices back, with help - # of the Stitch Layer - - # out for inference checking - self.mdl_out_keys = mdl_keys - - concat_data = self.stitch([non_mdl_vals, mdl_vals], - [non_mdl_keys, mdl_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError + def __init__( + self, + n_feature: int, + n_bin: int, + out_bits: int, + bin_values: Optional[np.ndarray] = None, + hash_keys: Optional[np.ndarray] = None, + hash_values: Optional[np.ndarray] = None, + bin_ids: Optional[np.ndarray] = None, + feature_offsets: Optional[np.ndarray] = None, + **kwargs + ): + """ + Creates a non-initialized `MDL` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during MDL calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of MDL bins used for MDL calibration. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + hash_keys: + contains the features ID that MDL discretizes and knows about. + The hash map (hash_keys->hash_values) is used for two reasons: + 1. divide inputs into two feature spaces: MDL vs non-MDL + 2. transate the MDL features into a hash_feature ID that MDL understands. + The hash_map is expected to contain n_feature items. + hash_values: + translates the feature IDs into hash_feature IDs for MDL. 
+ bin_ids: + a 1D Tensor of size n_feature * n_bin + 1 which contains + unique IDs to which the MDL features will be translated to. + For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce + the most efficient output space. + bin_values: + a 1D Tensor aligned with bin_ids. + For a given hash_feature ID j, it's value bin's are indexed between + `j*n_bin` and `j*n_bin + n_bin-1`. + As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j + and a inputs value between + `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. + feature_offsets: + a 1D Tensor specifying the starting location of bins for a given feature id. + For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). + """ + super(MDL, self).__init__(**kwargs) + tf.logging.warning( + "MDL will be deprecated. Please use PercentileDiscretizer instead" + ) + + max_mdl_feature = n_feature * (n_bin + 1) + self._n_feature = n_feature + self._n_bin = n_bin + + self._hash_keys_initializer = tf.constant_initializer( + hash_keys if hash_keys is not None else np.empty(n_feature, dtype=np.int64), + dtype=np.int64, + ) + self._hash_values_initializer = tf.constant_initializer( + hash_values + if hash_values is not None + else np.empty(n_feature, dtype=np.int64), + dtype=np.int64, + ) + self._bin_ids_initializer = tf.constant_initializer( + bin_ids + if bin_ids is not None + else np.empty(max_mdl_feature, dtype=np.int64), + dtype=np.int64, + ) + self._bin_values_initializer = tf.constant_initializer( + bin_values + if bin_values is not None + else np.empty(max_mdl_feature, dtype=np.float32), + dtype=np.float32, + ) + self._feature_offsets_initializer = tf.constant_initializer( + feature_offsets + if feature_offsets is not None + else np.empty(n_feature, dtype=np.int64), + dtype=np.int64, + ) + + # note that calling build here is an exception as typically __call__ would call build(). + # We call it here because we need to initialize hash_map. + # Also note that the variable_scope is set by add_variable in build() + if not self.built: + self.build(input_shape=None) + + self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) + + def build( + self, input_shape: Optional[tf.TensorShape] = None + ): # pylint: disable=unused-argument + """ + Creates the variables of the layer: + hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. 
+ """ + + # build layers + self.partition = Partition() + self.stitch = Stitch() + + # build variables + + hash_keys = self.add_variable( + "hash_keys", + initializer=self._hash_keys_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + hash_values = self.add_variable( + "hash_values", + initializer=self._hash_values_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + # hashmap converts known features into range [0, n_feature) + initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) + self.hash_map = tf.lookup.StaticHashTable(initializer, -1) + + self.bin_ids = self.add_variable( + "bin_ids", + initializer=self._bin_ids_initializer, + shape=[self._n_feature * (self._n_bin + 1)], + dtype=tf.int64, + trainable=False, + ) + + self.bin_values = self.add_variable( + "bin_values", + initializer=self._bin_values_initializer, + shape=[self._n_feature * (self._n_bin + 1)], + dtype=tf.float32, + trainable=False, + ) + + self.feature_offsets = self.add_variable( + "feature_offsets", + initializer=self._feature_offsets_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + # make sure this is last + self.built = True + + def call(self, inputs: twml.SparseTensor, **kwargs) -> twml.SparseTensor: + """Looks up `keys` in a table, outputs the corresponding values. + + Implements MDL inference where inputs are intersected with a hash_map. + Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. + This SparseTensor is then joined with the original inputs SparseTensor, + but only for the inputs keys that did not get discretized. + + Args: + inputs: A 2D SparseTensor that is input to MDL for discretization. + It has a dense_shape of [batch_size, input_size] + name: A name for the operation (optional). + + Returns: + A `SparseTensor` of the same type as `inputs`. + Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
+        """
+        # Accept tf.SparseTensor inputs by converting them first, then validate.
+        if isinstance(inputs, tf.SparseTensor):
+            inputs = twml.SparseTensor.from_tf(inputs)
+
+        assert isinstance(inputs, twml.SparseTensor)
+
+        # sparse column indices
+        ids = inputs.ids
+        # sparse row indices
+        keys = inputs.indices
+        # sparse values
+        vals = inputs.values
+
+        # get intersect(keys, hash_map)
+        hashed_keys = self.hash_map.lookup(keys)
+        found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64))
+        partition_ids = tf.cast(found, tf.int32)
+
+        vals, key, indices = self.partition(
+            partition_ids, vals, tf.where(found, hashed_keys, keys)
+        )
+        non_mdl_keys, mdl_in_keys = key
+        non_mdl_vals, mdl_in_vals = vals
+
+        self.non_mdl_keys = non_mdl_keys
+
+        # run MDL on the keys/values it knows about
+        mdl_keys, mdl_vals = libtwml.ops.mdl(
+            mdl_in_keys,
+            mdl_in_vals,
+            self.bin_ids,
+            self.bin_values,
+            self.feature_offsets,
+        )
+
+        # handle output ID conflicts
+        mdl_size = tf.size(self.bin_ids, out_type=tf.int64)
+        non_mdl_size = tf.subtract(self.output_size, mdl_size)
+        non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size)
+
+        # Stitch the keys and values from mdl and non mdl indices back, with help
+        # of the Stitch Layer
+
+        # out for inference checking
+        self.mdl_out_keys = mdl_keys
+
+        concat_data = self.stitch(
+            [non_mdl_vals, mdl_vals], [non_mdl_keys, mdl_keys], indices
+        )
+
+        concat_vals, concat_keys = concat_data
+
+        # Generate output shape using _compute_output_shape
+        batch_size = tf.to_int64(inputs.dense_shape[0])
+        output_shape = [batch_size, self.output_size]
+        return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf()
+
+    def compute_output_shape(self, input_shape: tf.TensorShape):
+        """Computes the output shape of the layer given the input shape.
+
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
+
+        Raises NotImplementedError.
+        """
+        raise NotImplementedError
diff --git a/twml/twml/layers/partition.py b/twml/twml/layers/partition.py
index 0e7c85f18..fb2c2f846 100644
--- a/twml/twml/layers/partition.py
+++ b/twml/twml/layers/partition.py
@@ -3,72 +3,86 @@
 """
 
 
-from .layer import Layer
+from typing import List, Union
 
 import tensorflow.compat.v1 as tf
 
+from .layer import Layer
 
-class Partition(Layer):
-  """
-  This layer implements:
-
-  .. code-block:: python
-
-    tf.dynamic_partition(input_vals, partition_ids, self.partitions)
-
-  Input:
-    partitions:
-      the number of partitions which we will divide the hashmap keys/bvalues
-
-  Output:
-    A layer that performs partitioning
-  """
 
-  def __init__(self, partitions=2, **kwargs):
-    self.partitions = partitions
-    super(Partition, self).__init__(**kwargs)
+class Partition(Layer):
+    """
+    This layer implements:
 
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
+    .. code-block:: python
 
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`. It need not
-        be fully defined (e.g. the batch size may be unknown).
+      tf.dynamic_partition(input_vals, partition_ids, self.partitions)
 
-    Raises NotImplementedError.
+    Input:
+      partitions:
+        the number of partitions into which we will divide the hashmap keys/values
 
+    Output:
+      A layer that performs partitioning
     """
-    raise NotImplementedError
-
-  def call(self, partition_ids, input_vals, input_keys, **kwargs):
-    """This layer is responsible for partitioning the values/keys of a hashmap
-
-    Arguments:
-      partition_ids:
-        Tensor that is equivalent to boolean (int32).
-      input_vals:
-        Tensor that represents the values of the hashmap(float).
-      input_keys:
-        Tensor that represents the keys of the hashmap(float)
-
-    Returns:
-      The output of the partition layer, which is a list of lists which looks
-      something like:
-
-      .. code-block:: python
-
-        [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]]
-
-      where:
-        vals_x:
-          values of the hashmap for partition x
-        keys_x:
-          keys of the hashmap for partition x
-        indices_x:
-          indices of the hashmap for partition x
-    """
-    partioned_val = tf.dynamic_partition(input_vals, partition_ids, self.partitions)
-    partioned_keys = tf.dynamic_partition(input_keys, partition_ids, self.partitions)
-    partioned_indices = tf.dynamic_partition(tf.range(tf.shape(partition_ids)[0]),
-                                             tf.cast(partition_ids, tf.int32), self.partitions)
-    return [partioned_val, partioned_keys, partioned_indices]
+
+    def __init__(self, partitions: int = 2, **kwargs):
+        self.partitions = partitions
+        super(Partition, self).__init__(**kwargs)
+
+    def compute_output_shape(
+        self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]
+    ):
+        """Computes the output shape of the layer given the input shape.
+
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
+
+        Raises NotImplementedError.
+        """
+        raise NotImplementedError
+
+    def call(
+        self,
+        partition_ids: tf.Tensor,
+        input_vals: tf.Tensor,
+        input_keys: tf.Tensor,
+        **kwargs
+    ) -> List[List[tf.Tensor]]:
+        """This layer is responsible for partitioning the values/keys of a hashmap.
+
+        Args:
+            partition_ids:
+                Tensor that is equivalent to boolean (int32).
+            input_vals:
+                Tensor that represents the values of the hashmap (float).
+            input_keys:
+                Tensor that represents the keys of the hashmap (float).
+
+        Returns:
+            The output of the partition layer, which is a list of lists which looks
+            something like:
+
+            .. code-block:: python
+
+                [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]]
+
+            where:
+                vals_x:
+                    values of the hashmap for partition x
+                keys_x:
+                    keys of the hashmap for partition x
+                indices_x:
+                    indices of the hashmap for partition x
+        """
+        partitioned_vals = tf.dynamic_partition(
+            input_vals, partition_ids, self.partitions
+        )
+        partitioned_keys = tf.dynamic_partition(
+            input_keys, partition_ids, self.partitions
+        )
+        partitioned_indices = tf.dynamic_partition(
+            tf.range(tf.shape(partition_ids)[0]),
+            tf.cast(partition_ids, tf.int32),
+            self.partitions,
+        )
+        return [partitioned_vals, partitioned_keys, partitioned_indices]
diff --git a/twml/twml/layers/percentile_discretizer.py b/twml/twml/layers/percentile_discretizer.py
index 55bb4de8c..ab1734aec 100644
--- a/twml/twml/layers/percentile_discretizer.py
+++ b/twml/twml/layers/percentile_discretizer.py
@@ -4,206 +4,240 @@
 """
 
+from typing import Optional, Tuple, Union
+
 import libtwml
 import numpy as np
 import tensorflow.compat.v1 as tf
+
 import twml
 from twml.layers import Layer
 
 
 class PercentileDiscretizer(Layer):
-  """
-  PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after
-  accumulating data and performing percentile bucket calibration.
-
-  PercentileDiscretizer takes sparse continuous features and converts then to sparse
-  binary features. Each binary output feature is associated to an PercentileDiscretizer bin.
-  Each PercentileDiscretizer input feature is converted to n_bin bins.
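A standalone sketch of the tf.dynamic_partition split that Partition.call above performs, with toy values (partition 1 holds the keys found in the hash map):

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

partition_ids = tf.constant([1, 0, 1], tf.int32)  # found-mask from the hash map
input_vals = tf.constant([0.5, 1.5, 2.5])
parts = tf.dynamic_partition(input_vals, partition_ids, num_partitions=2)

with tf.Session() as sess:
    print(sess.run(parts))  # [array([1.5], dtype=float32), array([0.5, 2.5], dtype=float32)]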
- Each PercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values per bin is roughly equal (for - each given PercentileDiscretizer feature). In other words, bins are calibrated to be approx. - equiprobable, according to the given calibration data. - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs): - """ - Creates a non-initialized `PercentileDiscretizer` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - If there are no calibrated features, then the discretizer will only apply - twml.util.limit_bits to the the feature keys (aka "feature_ids"). Essentially, - the discretizer will be a "no-operation", other than obeying `out_bits` - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during PercentileDiscretizer calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of PercentileDiscretizer bins used for PercentileDiscretizer calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that PercentileDiscretizer discretizes and knows about. - The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: - PercentileDiscretizer vs non-PercentileDiscretizer - 2. transate the PercentileDiscretizer features into a hash_feature ID that - PercentileDiscretizer understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for PercentileDiscretizer. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the PercentileDiscretizer features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). 
- """ - - super(PercentileDiscretizer, self).__init__(**kwargs) - - if not self.built: - self.build(input_shape=None) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - # build variables - self._out_bits = out_bits - self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._hash_keys = (hash_keys if hash_keys is not None else - np.empty(n_feature, dtype=np.int64)) - self._hash_values = (hash_values if hash_values is not None else - np.empty(n_feature, dtype=np.int64)) - self._bin_ids = (bin_ids if bin_ids is not None else - np.empty(max_discretizer_feature, dtype=np.int64)) - self._bin_values = (bin_values if bin_values is not None else - np.empty(max_discretizer_feature, dtype=np.float32)) - self._feature_offsets = (feature_offsets if feature_offsets is not None else - np.empty(n_feature, dtype=np.int64)) - self.num_parts = num_parts - self.cost_per_unit = cost_per_unit - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer """ - self.built = True - - def call(self, inputs, keep_inputs=False, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements PercentileDiscretizer inference where inputs are intersected with a hash_map. - Input features that were not calibrated have their feature IDs truncated, so as - to be less than 1< 0: - discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2( - input_ids=keys, # inc key assigned to feature_id, or -1 - input_vals=vals, # the observed feature values - bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange - bin_vals=self._bin_values, # bin boundaries - feature_offsets=self._feature_offsets, # 0 : nbin_1 : max_feat - output_bits=self._out_bits, - feature_ids=tf.make_tensor_proto(self._hash_keys), # feature ids to build internal hash map - feature_indices=tf.make_tensor_proto(self._hash_values), # keys associated w/ feat. indices - start_compute=tf.constant(0, shape=[], dtype=tf.int64), - end_compute=tf.constant(-1, shape=[], dtype=tf.int64), - cost_per_unit=self.cost_per_unit - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - # don't 2x the input. - keep_inputs = False - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self._output_size] - - output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() - - if keep_inputs: - # Note the non-discretized features will end up doubled, - # since these are already in `output` - # handle output ID conflicts - mdl_size = self._n_feature * (self._n_bin + 1) - non_mdl_size = tf.subtract(self._output_size, mdl_size) - input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size) - - new_input = twml.SparseTensor( - ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf() - - # concatenate discretizer output with original input - sparse_add = tf.sparse_add(new_input, output) - output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape) - - return output - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. 
- - """ - raise NotImplementedError + def __init__( + self, + n_feature: int, + n_bin: int, + out_bits: int, + bin_values: Optional[tf.Tensor] = None, + hash_keys: Optional[tf.Tensor] = None, + hash_values: Optional[tf.Tensor] = None, + bin_ids: Optional[tf.Tensor] = None, + feature_offsets: Optional[tf.Tensor] = None, + num_parts: int = 1, + cost_per_unit: int = 100, + **kwargs + ): + """ + Creates a non-initialized `PercentileDiscretizer` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + If there are no calibrated features, then the discretizer will only apply + twml.util.limit_bits to the the feature keys (aka "feature_ids"). Essentially, + the discretizer will be a "no-operation", other than obeying `out_bits` + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during PercentileDiscretizer calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of PercentileDiscretizer bins used for PercentileDiscretizer calibration. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + hash_keys: + contains the features ID that PercentileDiscretizer discretizes and knows about. + The hash map (hash_keys->hash_values) is used for two reasons: + 1. divide inputs into two feature spaces: + PercentileDiscretizer vs non-PercentileDiscretizer + 2. transate the PercentileDiscretizer features into a hash_feature ID that + PercentileDiscretizer understands. + The hash_map is expected to contain n_feature items. + hash_values: + translates the feature IDs into hash_feature IDs for PercentileDiscretizer. + bin_ids: + a 1D Tensor of size n_feature * n_bin + 1 which contains + unique IDs to which the PercentileDiscretizer features will be translated to. + For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce + the most efficient output space. + bin_values: + a 1D Tensor aligned with bin_ids. + For a given hash_feature ID j, it's value bin's are indexed between + `j*n_bin` and `j*n_bin + n_bin-1`. + As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j + and a inputs value between + `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. + feature_offsets: + a 1D Tensor specifying the starting location of bins for a given feature id. + For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). 
+ """ + + super(PercentileDiscretizer, self).__init__(**kwargs) + + if not self.built: + self.build(input_shape=None) + + max_discretizer_feature = n_feature * (n_bin + 1) + self._n_feature = n_feature + self._n_bin = n_bin + + # build variables + self._out_bits = out_bits + self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) + self._hash_keys = ( + hash_keys if hash_keys is not None else np.empty(n_feature, dtype=np.int64) + ) + self._hash_values = ( + hash_values + if hash_values is not None + else np.empty(n_feature, dtype=np.int64) + ) + self._bin_ids = ( + bin_ids + if bin_ids is not None + else np.empty(max_discretizer_feature, dtype=np.int64) + ) + self._bin_values = ( + bin_values + if bin_values is not None + else np.empty(max_discretizer_feature, dtype=np.float32) + ) + self._feature_offsets = ( + feature_offsets + if feature_offsets is not None + else np.empty(n_feature, dtype=np.int64) + ) + self.num_parts = num_parts + self.cost_per_unit = cost_per_unit + + def build( + self, input_shape: Optional[tf.TensorShape] = None + ): # pylint: disable=unused-argument + """Creates the variables of the layer""" + self.built = True + + def call( + self, inputs: tf.SparseTensor, keep_inputs: bool = False, **kwargs + ) -> tf.SparseTensor: + """Looks up `keys` in a table, outputs the corresponding values. + + Implements PercentileDiscretizer inference where inputs are intersected with a hash_map. + Input features that were not calibrated have their feature IDs truncated, so as + to be less than 1< 0: + discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2( + input_ids=keys, # inc key assigned to feature_id, or -1 + input_vals=vals, # the observed feature values + bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange + bin_vals=self._bin_values, # bin boundaries + feature_offsets=self._feature_offsets, # 0 : nbin_1 : max_feat + output_bits=self._out_bits, + feature_ids=tf.make_tensor_proto( + self._hash_keys + ), # feature ids to build internal hash map + feature_indices=tf.make_tensor_proto( + self._hash_values + ), # keys associated w/ feat. indices + start_compute=tf.constant(0, shape=[], dtype=tf.int64), + end_compute=tf.constant(-1, shape=[], dtype=tf.int64), + cost_per_unit=self.cost_per_unit, + ) + else: + discretizer_keys = twml.util.limit_bits(keys, self._out_bits) + discretizer_vals = vals + # don't 2x the input. + keep_inputs = False + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_shape = [batch_size, self._output_size] + + output = twml.SparseTensor( + ids, discretizer_keys, discretizer_vals, output_shape + ).to_tf() + + if keep_inputs: + # Note the non-discretized features will end up doubled, + # since these are already in `output` + # handle output ID conflicts + mdl_size = self._n_feature * (self._n_bin + 1) + non_mdl_size = tf.subtract(self._output_size, mdl_size) + input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size) + + new_input = twml.SparseTensor( + ids=ids, indices=input_keys, values=vals, dense_shape=output_shape + ).to_tf() + + # concatenate discretizer output with original input + sparse_add = tf.sparse_add(new_input, output) + output = tf.SparseTensor( + sparse_add.indices, sparse_add.values, output_shape + ) + + return output + + def compute_output_shape(self, input_shape: Union[tf.TensorShape, Tuple[int, ...]]): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. 
the batch size may be unknown). + + Raises NotImplementedError. + """ + raise NotImplementedError diff --git a/twml/twml/layers/sequential.py b/twml/twml/layers/sequential.py index c0d4b92cc..7b0f83d34 100644 --- a/twml/twml/layers/sequential.py +++ b/twml/twml/layers/sequential.py @@ -3,158 +3,168 @@ """ -from .layer import Layer +from typing import List, Optional, Union +import tensorflow.compat.v1 as tf from tensorflow import keras from tensorflow.python.layers import base +from .layer import Layer -class Sequential(Layer): - """ - A sequential stack of layers. - - Arguments: - layers: list of layers to add to the model. - - Output: - the output of the sequential layers - """ - - def __init__(self, layers=None, **kwargs): - self._layers = [] # Stack of layers. - self._layer_names = [] # Stack of layers names - self._layer_outputs = [] - # Add to the model any layers passed to the constructor. - if layers: - for layer in layers: - self.add(layer) - super(Sequential, self).__init__(**kwargs) - - def add(self, layer): - """Adds a layer instance on top of the layer stack. - - Arguments: - layer: - layer instance. - - Raises: - TypeError: - if the layer argument is not instance of base.Layer - """ - if not isinstance(layer, base.Layer) and not isinstance(layer, keras.layers.Layer): - raise TypeError('The added layer must be an instance of class Layer') - - if layer.name in self._layer_names: - raise ValueError('Layer with name %s already exists in sequential layer' % layer.name) - - self._layers.append(layer) - self._layer_names.append(layer.name) - - def pop(self): - """Removes the last layer in the model. - - Raises: - TypeError: - if there are no layers in the model. - """ - if not self._layers or not self._layer_names: - raise TypeError('There are no layers in the model.') - self._layers.pop() - self._layer_names.pop() - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). - - Returns: - The output of the sequential layers - """ - self._layer_outputs = [] - for layer in self._layers: - # don't use layer.call because you want to build individual layers - inputs = layer(inputs) # overwrites the current input after it has been processed - self._layer_outputs.append(inputs) - return inputs - - @property - def layers(self): - """ Return the layers in the sequential layer """ - return self._layers - - @property - def layer_names(self): - """ Return the layer names in the sequential layer """ - return self._layer_names - - @property - def layer_outputs(self): - """ Return the layer outputs in the sequential layer """ - return self._layer_outputs - - def get(self, key): - """Retrieves the n-th layer. - - Arguments: - key: - index of the layer - - Output: - The n-th layer where n is equal to the key. - """ - return self._layers[key] - - def get_output(self, key): - """Retrieves the n-th layer output. - - Arguments: - key: - index of the layer - - Output: - The intermediary output equivalent to the nth layer, where n is equal to the key. - """ - return self._layer_outputs[key] - - def get_layer_by_name(self, name): - """Retrieves the layer corresponding to the name. - - Arguments: - name: - name of the layer - Output: - list of layers that have the name desired +class Sequential(Layer): """ - return self._layers[self._layer_names.index(name)] + A sequential stack of layers. - def get_layer_output_by_name(self, name): - """Retrieves the layer output corresponding to the name. 
- - Arguments: - name: - name of the layer + Args: + layers: list of layers to add to the model. Output: - list of the output of the layers that have the desired name + the output of the sequential layers """ - return self._layer_outputs[self._layer_names.index(name)] - - @property - def init(self): - """ returns a list of initialization ops (one per layer) """ - return [layer.init for layer in self._layers] - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError + def __init__(self, layers: Optional[List[Layer]] = None, **kwargs): + self._layers = [] # Stack of layers. + self._layer_names = [] # Stack of layers names + self._layer_outputs = [] + # Add to the model any layers passed to the constructor. + if layers: + for layer in layers: + self.add(layer) + super(Sequential, self).__init__(**kwargs) + + def add(self, layer: Layer) -> None: + """Adds a layer instance on top of the layer stack. + + Args: + layer: + layer instance. + + Raises: + TypeError: + if the layer argument is not instance of base.Layer + """ + if not isinstance(layer, base.Layer) and not isinstance( + layer, keras.layers.Layer + ): + raise TypeError("The added layer must be an instance of class Layer") + + if layer.name in self._layer_names: + raise ValueError( + "Layer with name %s already exists in sequential layer" % layer.name + ) + + self._layers.append(layer) + self._layer_names.append(layer.name) + + def pop(self) -> None: + """Removes the last layer in the model. + + Raises: + TypeError: + if there are no layers in the model. + """ + if not self._layers or not self._layer_names: + raise TypeError("There are no layers in the model.") + self._layers.pop() + self._layer_names.pop() + + def call(self, inputs: Layer, **kwargs) -> Layer: # pylint: disable=unused-argument + """The logic of the layer lives here. + + Args: + inputs: + input tensor(s). + + Returns: + The output of the sequential layers + """ + self._layer_outputs = [] + for layer in self._layers: + # don't use layer.call because you want to build individual layers + inputs = layer( + inputs + ) # overwrites the current input after it has been processed + self._layer_outputs.append(inputs) + return inputs + + @property + def layers(self) -> List[Layer]: + """Return the layers in the sequential layer""" + return self._layers + + @property + def layer_names(self) -> List[str]: + """Return the layer names in the sequential layer""" + return self._layer_names + + @property + def layer_outputs(self) -> List[Layer]: + """Return the layer outputs in the sequential layer""" + return self._layer_outputs + + def get(self, key: int) -> Layer: + """Retrieves the n-th layer. + + Args: + key: + index of the layer + + Output: + The n-th layer where n is equal to the key. + """ + return self._layers[key] + + def get_output(self, key: int) -> Layer: + """Retrieves the n-th layer output. + + Args: + key: + index of the layer + + Output: + The intermediary output equivalent to the nth layer, where n is equal to the key. + """ + return self._layer_outputs[key] + + def get_layer_by_name(self, name: str) -> Layer: + """Retrieves the layer corresponding to the name. 
+ + Args: + name: + name of the layer + + Output: + list of layers that have the name desired + """ + return self._layers[self._layer_names.index(name)] + + def get_layer_output_by_name(self, name: str) -> Layer: + """Retrieves the layer output corresponding to the name. + + Args: + name: + name of the layer + + Output: + list of the output of the layers that have the desired name + """ + return self._layer_outputs[self._layer_names.index(name)] + + @property + def init(self) -> List[tf.Operation]: + """returns a list of initialization ops (one per layer)""" + return [layer.init for layer in self._layers] + + def compute_output_shape( + self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]] + ) -> tf.TensorShape: + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raise NotImplementedError. + """ + raise NotImplementedError diff --git a/twml/twml/layers/sparse_max_norm.py b/twml/twml/layers/sparse_max_norm.py index e1f423fe0..04fc6b68d 100644 --- a/twml/twml/layers/sparse_max_norm.py +++ b/twml/twml/layers/sparse_max_norm.py @@ -2,220 +2,237 @@ """ Contains the twml.layers.SparseMaxNorm layer. """ -from .layer import Layer +from typing import Optional, Union -from libtwml import OPLIB import tensorflow.compat.v1 as tf -import twml - +from libtwml import OPLIB -class SparseMaxNorm(Layer): - """ - Computes a max-normalization and adds bias to the sparse_input, - forwards that through a sparse affine transform followed - by an non-linear activation on the resulting dense representation. - - This layer has two parameters, one of which learns through gradient descent: - bias_x (optional): - vector of shape [input_size]. Learned through gradient descent. - max_x: - vector of shape [input_size]. Holds the maximas of input ``x`` for normalization. - Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both. - - The pseudo-code for this layer looks like: - - .. code-block:: python - - abs_x = abs(x) - normed_x = clip_by_value(x / max_x, -1, 1) - biased_x = normed_x + bias_x - return biased - - - Args: - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - A layer representing the output of the sparse_max_norm transformation. 
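Stepping back to the Sequential layer whose diff ended just above, a hypothetical usage sketch follows; twml.layers.FullDense and the placeholder shape are assumptions for illustration and are not part of this hunk.

.. code-block:: python

    import tensorflow.compat.v1 as tf
    import twml

    tf.disable_eager_execution()

    features = tf.placeholder(tf.float32, [None, 128])

    stack = twml.layers.Sequential(
        [
            twml.layers.FullDense(64, activation=tf.nn.relu, name="hidden"),  # assumed layer
            twml.layers.FullDense(1, name="logits"),  # assumed layer
        ]
    )

    logits = stack(features)  # calls each layer in order, recording its output
    hidden = stack.get_layer_output_by_name("hidden")  # intermediate activation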
- """ - - def __init__( - self, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - **kwargs): - - super(SparseMaxNorm, self).__init__(**kwargs) - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - if max_x_initializer is None: - max_x_initializer = tf.zeros_initializer() - self.max_x_initializer = max_x_initializer - - self._use_bias = use_bias - if use_bias: - if bias_x_initializer is None: - bias_x_initializer = tf.zeros_initializer() - self.bias_x_initializer = bias_x_initializer - - self.epsilon = epsilon - self.is_training = is_training - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the max_x and bias_x tf.Variables of the layer.""" - - self.max_x = self.add_variable( - 'max_x', - initializer=self.max_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=False) - - if self._use_bias: - self.bias_x = self.add_variable( - 'bias_x', - initializer=self.bias_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=True) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. +import twml - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). +from .layer import Layer - Raises NotImplementedError. +class SparseMaxNorm(Layer): """ - raise NotImplementedError + Computes a max-normalization and adds bias to the sparse_input, + forwards that through a sparse affine transform followed + by an non-linear activation on the resulting dense representation. - def _call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. + This layer has two parameters, one of which learns through gradient descent: + bias_x (optional): + vector of shape [input_size]. Learned through gradient descent. + max_x: + vector of shape [input_size]. Holds the maximas of input ``x`` for normalization. + Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both. - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. - """ + The pseudo-code for this layer looks like: - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - elif not isinstance(inputs, tf.SparseTensor): - raise TypeError("The inputs must be of type tf.SparseTensor or twml.SparseTensor") + .. code-block:: python - indices_x = inputs.indices[:, 1] - values_x = inputs.values + abs_x = abs(x) + normed_x = clip_by_value(x / max_x, -1, 1) + biased_x = normed_x + bias_x + return biased - if self.is_training is False: - normalized_x = OPLIB.sparse_max_norm_inference(self.max_x, - indices_x, - values_x, - self.epsilon) - update_op = tf.no_op() - else: - max_x, normalized_x = OPLIB.sparse_max_norm_training(self.max_x, - indices_x, - values_x, - self.epsilon) + Args: + max_x_initializer: + initializer vector of shape [input_size] used by variable `max_x` + bias_x_initializer: + initializer vector of shape [input_size] used by parameter `bias_x` + is_training: + Are we training the layer to learn the normalization maximas. + If set to True, max_x will be able to learn. 
This is independent of bias_x + epsilon: + The minimum value used for max_x. Defaults to 1E-5. + use_bias: + Default True. Set to False to not use a bias term. - update_op = tf.assign(self.max_x, max_x) + Returns: + A layer representing the output of the sparse_max_norm transformation. + """ - with tf.control_dependencies([update_op]): - normalized_x = tf.stop_gradient(normalized_x) + def __init__( + self, + input_size: Optional[int] = None, + max_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + bias_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + is_training: bool = True, + epsilon: float = 1e-5, + use_bias: bool = True, + **kwargs + ): + super(SparseMaxNorm, self).__init__(**kwargs) + if input_size: + raise ValueError( + "input_size is deprecated - it is now automatically \ + inferred from your input." + ) + if max_x_initializer is None: + max_x_initializer = tf.zeros_initializer() + self.max_x_initializer = max_x_initializer + + self._use_bias = use_bias + if use_bias: + if bias_x_initializer is None: + bias_x_initializer = tf.zeros_initializer() + self.bias_x_initializer = bias_x_initializer + + self.epsilon = epsilon + self.is_training = is_training + + def build(self, input_shape: tf.TensorShape): + """Creates the max_x and bias_x tf.Variables of the layer.""" + + self.max_x = self.add_variable( + "max_x", + initializer=self.max_x_initializer, + shape=[input_shape[1]], + dtype=tf.float32, + trainable=False, + ) + + if self._use_bias: + self.bias_x = self.add_variable( + "bias_x", + initializer=self.bias_x_initializer, + shape=[input_shape[1]], + dtype=tf.float32, + trainable=True, + ) + + self.built = True + + def compute_output_shape( + self, input_shape: tf.TensorShape + ): # pylint: disable=unused-argument + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + """ + raise NotImplementedError + + def _call( + self, inputs: tf.SparseTensor, **kwargs + ) -> tf.SparseTensor: # pylint: disable=unused-argument + """ + The forward propagation logic of the layer lives here. + + Args: + sparse_input: + A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` + Returns: + A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can + be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. 
+ """ + + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + elif not isinstance(inputs, tf.SparseTensor): + raise TypeError( + "The inputs must be of type tf.SparseTensor or twml.SparseTensor" + ) + + indices_x = inputs.indices[:, 1] + values_x = inputs.values + + if self.is_training is False: + normalized_x = OPLIB.sparse_max_norm_inference( + self.max_x, indices_x, values_x, self.epsilon + ) + + update_op = tf.no_op() + else: + max_x, normalized_x = OPLIB.sparse_max_norm_training( + self.max_x, indices_x, values_x, self.epsilon + ) + + update_op = tf.assign(self.max_x, max_x) + + with tf.control_dependencies([update_op]): + normalized_x = tf.stop_gradient(normalized_x) + + # add input bias + if self._use_bias: + normalized_x = normalized_x + tf.gather(self.bias_x, indices_x) + + # convert back to sparse tensor + return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape) + + def call(self, inputs: tf.SparseTensor, **kwargs) -> tf.SparseTensor: + """ + The forward propagation logic of the layer lives here. + + Args: + sparse_input: + A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` + Returns: + A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can + be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. + """ + with tf.device(self.max_x.device): + return self._call(inputs, **kwargs) - # add input bias - if self._use_bias: - normalized_x = normalized_x + tf.gather(self.bias_x, indices_x) - # convert back to sparse tensor - return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape) +# For backwards compatiblity and also because I don't want to change all the tests. +MaxNorm = SparseMaxNorm - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. +def sparse_max_norm( + inputs: Union[tf.SparseTensor, twml.SparseTensor], + input_size: Optional[int] = None, + max_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + bias_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + is_training: bool = True, + epsilon: float = 1e-5, + use_bias: bool = True, + name: Optional[str] = None, + reuse: Optional[bool] = None, +) -> tf.SparseTensor: """ - with tf.device(self.max_x.device): - return self._call(inputs, **kwargs) + Functional inteface to SparseMaxNorm. -# For backwards compatiblity and also because I don't want to change all the tests. -MaxNorm = SparseMaxNorm + Args: + inputs: + A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) + input_size: + number of input units + max_x_initializer: + initializer vector of shape [input_size] used by variable `max_x` + bias_x_initializer: + initializer vector of shape [input_size] used by parameter `bias_x` + is_training: + Are we training the layer to learn the normalization maximas. + If set to True, max_x will be able to learn. This is independent of bias_x + epsilon: + The minimum value used for max_x. Defaults to 1E-5. + use_bias: + Default True. Set to False to not use a bias term. + Returns: + Output after normalizing with the max value. 
+ """ + if input_size: + raise ValueError( + "input_size is deprecated - it is now automatically \ + inferred from your input." + ) -def sparse_max_norm(inputs, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - name=None, - reuse=None): - """ - Functional inteface to SparseMaxNorm. - - Args: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - input_size: - number of input units - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - Output after normalizing with the max value. - """ - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - - layer = SparseMaxNorm(max_x_initializer=max_x_initializer, - bias_x_initializer=bias_x_initializer, - is_training=is_training, - epsilon=epsilon, - use_bias=use_bias, - name=name, - _scope=name, - _reuse=reuse) - return layer(inputs) + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + + layer = SparseMaxNorm( + max_x_initializer=max_x_initializer, + bias_x_initializer=bias_x_initializer, + is_training=is_training, + epsilon=epsilon, + use_bias=use_bias, + name=name, + _scope=name, + _reuse=reuse, + ) + return layer(inputs) diff --git a/twml/twml/layers/stitch.py b/twml/twml/layers/stitch.py index 51dffdb8e..eba9bc318 100644 --- a/twml/twml/layers/stitch.py +++ b/twml/twml/layers/stitch.py @@ -4,51 +4,61 @@ """ -from .layer import Layer +from typing import List, Union import tensorflow.compat.v1 as tf +from .layer import Layer + class Stitch(Layer): - """ - This layer is responsible for stitching a partioned layer together. + """ + This layer is responsible for stitching a partitioned layer together. + + Output: + A layer that performs stitching + """ - Output: - A layer that performs stitching - """ + def compute_output_shape( + self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]] + ): + """Computes the output shape of the layer given the input shape. - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + Raises NotImplementedError. - Raises NotImplementedError. + """ + raise NotImplementedError - """ - raise NotImplementedError + def call( + self, + partitioned_val: List[tf.Tensor], + partitioned_keys: List[tf.Tensor], + partitioned_indices: List[tf.Tensor], + **kwargs, + ) -> List[tf.Tensor]: + """ + This layer is responsible for stitching a partitioned layer together. - def call(self, partioned_val, partioned_keys, - partioned_indices, **kwargs): # pylint: disable=unused-argument, arguments-differ - """ - This layer is responsible for stitching a partioned layer together. 
- - Input: - partioned_val: - a list of partioned Tensors which represent the vals of the hashmap - partioned_keys: - a list of partioned Tensors which represent the keys of the hashmap - partioned_indices: - a list of partioned Tensors which represent the indices of the hashmap - Output: - List which contains: [output_vals, output_keys] - output_vals: - Values of the HashMap (float) - output_keys: - Keys of HashMap (float) - """ - indices = [tf.to_int32(index) for index in partioned_indices] - concat_keys = tf.dynamic_stitch(indices, partioned_keys) - concat_vals = tf.dynamic_stitch(indices, partioned_val) - return [concat_vals, concat_keys] + Input: + partitioned_val: + a list of partitioned Tensors which represent the vals of the hashmap + partitioned_keys: + a list of partitioned Tensors which represent the keys of the hashmap + partitioned_indices: + a list of partitioned Tensors which represent the indices of the hashmap + + Output: + List which contains: [output_vals, output_keys] + output_vals: + Values of the HashMap (float) + output_keys: + Keys of HashMap (float) + """ + indices = [tf.to_int32(index) for index in partitioned_indices] + concat_keys = tf.dynamic_stitch(indices, partitioned_keys) + concat_vals = tf.dynamic_stitch(indices, partitioned_val) + return [concat_vals, concat_keys] diff --git a/twml/twml/learning_rate_decay.py b/twml/twml/learning_rate_decay.py index be522d75b..9a012012c 100644 --- a/twml/twml/learning_rate_decay.py +++ b/twml/twml/learning_rate_decay.py @@ -3,166 +3,213 @@ import tensorflow.compat.v1 as tf -def get_learning_rate_decay_fn(params): - """ - Returns a learning rate decay function that takes the initial - learning_rate and global_step - as arguments and returns the current learning rate. - - Currently supports params.learning_rate_decay values of: - exponential | polynomial | piecewise_constant | cosine | cosine restarts. - See `Decaying the Leanring Rate - `_ for details. - - Arguments: - params: - a tensorflow.contrib.train.HParams object containing the relevant hyperparameters. 
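Reviewer note on Stitch: it is the inverse of Partition. A minimal graph-mode sketch of what its call computes, using the example tensors from the Partition sketch earlier (not twml code):

.. code-block:: python

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    # outputs of a 2-way Partition; indices record where each element came from
    indices = [tf.constant([0, 2]), tf.constant([1, 3])]
    keys = [tf.constant([10, 30], tf.int64), tf.constant([20, 40], tf.int64)]
    vals = [tf.constant([1.0, 3.0]), tf.constant([2.0, 4.0])]

    concat_keys = tf.dynamic_stitch(indices, keys)
    concat_vals = tf.dynamic_stitch(indices, vals)

    with tf.Session() as sess:
        # [1., 2., 3., 4.] and [10, 20, 30, 40]: original order restored
        print(sess.run([concat_vals, concat_keys]))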
- """ - paramsv = params.values() - if 'learning_rate_decay' not in paramsv or params.learning_rate_decay == 'no_learning_rate_decay': - return None - elif params.learning_rate_decay == 'exponential_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'exponential'") - if 'exponential_decay_rate' not in paramsv: - raise ValueError("Expecting params.exponential_decay_rate for " - "params.learning_rate_decay == 'exponential'") - - def exponential_decay_fn(learning_rate, global_step): - """ exponential decay function to be passed to optimize_loss """ - return tf.train.exponential_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.exponential_decay_rate - ) - return exponential_decay_fn - elif params.learning_rate_decay == 'piecewise_constant_learning_rate_decay': - if 'piecewise_constant_boundaries' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_boundaries for " - "params.learning_rate_decay == 'piecewise_constant'") - if 'piecewise_constant_values' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_values for " - "params.learning_rate_decay == 'piecewise_constant'") - # pylint: disable=unused-argument - - def piecewise_constant_fn(learning_rate, global_step): - """ piecewise_constant decay function to be passed to optimize_loss """ - return tf.train.piecewise_constant( - x=global_step, - boundaries=params.piecewise_constant_boundaries, - values=params.piecewise_constant_values - ) - return piecewise_constant_fn - elif params.learning_rate_decay == 'polynomial_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'polynomial'") - if 'end_learning_rate' not in paramsv: - raise ValueError("Expecting params.end_learning_rate for " - "params.learning_rate_decay == 'polynomial'") - - def polynomial_decay_fn(learning_rate, global_step): - """ polynomial decay function to be passed to optimize_loss """ - return tf.train.polynomial_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - end_learning_rate=params.end_learning_rate, - power=params.polynomial_power if 'polynomial_power' in paramsv else 1.0, - ) - return polynomial_decay_fn - - elif params.learning_rate_decay == 'inverse_learning_rate_decay': - if 'min_learning_rate' not in paramsv: - raise ValueError("Expecting params.min_learning_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_rate' not in paramsv: - raise ValueError("Expecting params.decay_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'inverse'") - - def bounded_inverse_time_decay_fn(learning_rate, global_step): - ''' - Returns the decayed learning_rate by applying the function: - decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), - min_learning_rate) - Arguments: - learning_rate: - A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: - A scalar `int32` or `int64` `Tensor` or a Python number. - Global step to use for the decay computation. Must not be negative. - min_learning_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Minimum possible learning_rate. 
The decayed learning_rate will not be - smaller than the min_learning_rate - decay_steps: - How often to apply decay. In dbv1, this should be 1. - decay_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Rate in which we decay the learning rate. - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - ''' - decayed_rate = tf.train.inverse_time_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.decay_rate) - # Getting dtype of returned Tensor - dtype = decayed_rate.dtype - # Casting the min_learning rate the same dtype as decayes rate - min_learning_rate = tf.cast(params.min_learning_rate, dtype) - # Returning the maximum between the two - return tf.maximum(decayed_rate, min_learning_rate) - - return bounded_inverse_time_decay_fn - - elif params.learning_rate_decay == 'cosine_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'cosine_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_decay'") - def cosine_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - alpha=params.alpha - ) - return cosine_decay_fn - elif params.learning_rate_decay == 'cosine_restarts_learning_rate_decay': - if 'first_decay_steps' not in paramsv: - raise ValueError("Expecting params.first_decay_steps for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 't_mul' not in paramsv: - raise ValueError("Expecting params.t_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 'm_mul' not in paramsv: - raise ValueError("Expecting params.m_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - def cosine_restart_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay_restarts( - learning_rate=learning_rate, - global_step=global_step, - first_decay_steps=params.first_decay_steps, - t_mul=params.t_mul, - m_mul=params.m_mul, - alpha=params.alpha - ) - return cosine_restart_decay_fn - - raise ValueError("Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay) +def get_learning_rate_decay_fn(params: tf.contrib.training.HParams) -> tf.Tensor: + """ + Returns a learning rate decay function that takes the initial + learning_rate and global_step + as arguments and returns the current learning rate. + + Currently supports params.learning_rate_decay values of: + exponential | polynomial | piecewise_constant | cosine | cosine restarts. + See `Decaying the Learning Rate + `_ for details. + + Args: + params: + a tensorflow.contrib.train.HParams object containing the relevant hyper parameters. 
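The bounded inverse-time decay formula quoted in this docstring is worth making concrete. A back-of-envelope sketch with illustrative numbers:

.. code-block:: python

    import numpy as np

    lr, decay_rate, decay_steps, min_lr = 0.1, 0.5, 1, 0.01

    for step in (0, 1, 2, 10, 100):
        decayed = max(lr / (1 + decay_rate * np.floor(step / decay_steps)), min_lr)
        print(step, decayed)
    # 0 -> 0.1, 1 -> 0.0667, 2 -> 0.05, 10 -> 0.0167, 100 -> 0.01 (floored at min_lr)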
+ """ + paramsv = params.values() + if ( + "learning_rate_decay" not in paramsv + or params.learning_rate_decay == "no_learning_rate_decay" + ): + return None + elif params.learning_rate_decay == "exponential_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'exponential'" + ) + if "exponential_decay_rate" not in paramsv: + raise ValueError( + "Expecting params.exponential_decay_rate for " + "params.learning_rate_decay == 'exponential'" + ) + + def exponential_decay_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """exponential decay function to be passed to optimize_loss""" + return tf.train.exponential_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + decay_rate=params.exponential_decay_rate, + ) + + return exponential_decay_fn + elif params.learning_rate_decay == "piecewise_constant_learning_rate_decay": + if "piecewise_constant_boundaries" not in paramsv: + raise ValueError( + "Expecting params.piecewise_constant_boundaries for " + "params.learning_rate_decay == 'piecewise_constant'" + ) + if "piecewise_constant_values" not in paramsv: + raise ValueError( + "Expecting params.piecewise_constant_values for " + "params.learning_rate_decay == 'piecewise_constant'" + ) + # pylint: disable=unused-argument + + def piecewise_constant_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """piecewise_constant decay function to be passed to optimize_loss""" + return tf.train.piecewise_constant( + x=global_step, + boundaries=params.piecewise_constant_boundaries, + values=params.piecewise_constant_values, + ) + + return piecewise_constant_fn + elif params.learning_rate_decay == "polynomial_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'polynomial'" + ) + if "end_learning_rate" not in paramsv: + raise ValueError( + "Expecting params.end_learning_rate for " + "params.learning_rate_decay == 'polynomial'" + ) + + def polynomial_decay_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """polynomial decay function to be passed to optimize_loss""" + return tf.train.polynomial_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + end_learning_rate=params.end_learning_rate, + power=params.polynomial_power if "polynomial_power" in paramsv else 1.0, + ) + + return polynomial_decay_fn + + elif params.learning_rate_decay == "inverse_learning_rate_decay": + if "min_learning_rate" not in paramsv: + raise ValueError( + "Expecting params.min_learning_rate for " + "params.learning_rate_decay == 'inverse'" + ) + if "decay_rate" not in paramsv: + raise ValueError( + "Expecting params.decay_rate for " + "params.learning_rate_decay == 'inverse'" + ) + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'inverse'" + ) + + def bounded_inverse_time_decay_fn( + learning_rate: float, global_step: int + ) -> tf.Tensor: + """ + Returns the decayed learning_rate by applying the function: + decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), + min_learning_rate) + Args: + learning_rate: + A scalar `float32` or `float64` `Tensor` or a Python number. + The initial learning rate. + global_step: + A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. Must not be negative. 
+ min_learning_rate: + A scalar `int32` or `int64` `Tensor` or a Python number. + Minimum possible learning_rate. The decayed learning_rate will not be + smaller than the min_learning_rate + decay_steps: + How often to apply decay. In dbv1, this should be 1. + decay_rate: + A scalar `int32` or `int64` `Tensor` or a Python number. + Rate in which we decay the learning rate. + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + """ + decayed_rate = tf.train.inverse_time_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + decay_rate=params.decay_rate, + ) + # Getting dtype of returned Tensor + dtype = decayed_rate.dtype + # Casting the min_learning rate the same dtype as decayes rate + min_learning_rate = tf.cast(params.min_learning_rate, dtype) + # Returning the maximum between the two + return tf.maximum(decayed_rate, min_learning_rate) + + return bounded_inverse_time_decay_fn + + elif params.learning_rate_decay == "cosine_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'cosine_decay'" + ) + if "alpha" not in paramsv: + raise ValueError( + "Expecting params.alpha for " + "params.learning_rate_decay == 'cosine_decay'" + ) + + def cosine_decay_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """cosine decay function to be passed to optimize_loss""" + return tf.train.cosine_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + alpha=params.alpha, + ) + + return cosine_decay_fn + elif params.learning_rate_decay == "cosine_restarts_learning_rate_decay": + if "first_decay_steps" not in paramsv: + raise ValueError( + "Expecting params.first_decay_steps for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "t_mul" not in paramsv: + raise ValueError( + "Expecting params.t_mul for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "m_mul" not in paramsv: + raise ValueError( + "Expecting params.m_mul for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "alpha" not in paramsv: + raise ValueError( + "Expecting params.alpha for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + + def cosine_restart_decay_fn( + learning_rate: float, global_step: int + ) -> tf.Tensor: + """cosine decay function to be passed to optimize_loss""" + return tf.train.cosine_decay_restarts( + learning_rate=learning_rate, + global_step=global_step, + first_decay_steps=params.first_decay_steps, + t_mul=params.t_mul, + m_mul=params.m_mul, + alpha=params.alpha, + ) + + return cosine_restart_decay_fn + + raise ValueError( + "Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay + ) diff --git a/twml/twml/lookup/__init__.py b/twml/twml/lookup/__init__.py index 87392d719..2695fa53c 100644 --- a/twml/twml/lookup/__init__.py +++ b/twml/twml/lookup/__init__.py @@ -1,9 +1,8 @@ -from tensorflow.python.ops.lookup_ops import ( - index_table_from_file, - index_table_from_tensor, - index_to_string_table_from_file -) # noqa: F401 - +from tensorflow.python.ops.lookup_ops import ( # noqa: F401 + index_table_from_file, + index_table_from_tensor, + index_to_string_table_from_file, +) """ NOTE: Using `from tensorflow.python.ops.lookup_ops import index_table_from_tensor` in the code works. 
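End to end, the decay selector above is used roughly as follows. This is a TF 1.x-only sketch: HParams lives in tf.contrib, which does not exist in TF 2, and the module path twml.learning_rate_decay is inferred from the file path in this diff.

.. code-block:: python

    import tensorflow.compat.v1 as tf

    from twml.learning_rate_decay import get_learning_rate_decay_fn  # path assumed

    params = tf.contrib.training.HParams(
        learning_rate_decay="exponential_learning_rate_decay",
        decay_steps=1000,
        exponential_decay_rate=0.96,
    )

    decay_fn = get_learning_rate_decay_fn(params)
    global_step = tf.train.get_or_create_global_step()

    # a tensor evaluating to 0.01 * 0.96 ** (global_step / 1000)
    learning_rate = decay_fn(0.01, global_step)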
diff --git a/twml/twml/metrics.py b/twml/twml/metrics.py index ee2f82b74..663997416 100644 --- a/twml/twml/metrics.py +++ b/twml/twml/metrics.py @@ -4,1377 +4,1638 @@ """ -from collections import OrderedDict from functools import partial +from typing import Callable, Collection, Dict, List, Optional, Sequence, Tuple, Union import numpy as np import tensorboard as tb import tensorflow.compat.v1 as tf - CLAMP_EPSILON = 0.00001 def total_weight_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'total_weight', (labels, predictions, weights)): - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float64) - - if weights is None: - weights = tf.cast(tf.size(labels), total_weight.dtype, name="default_weight") - else: - weights = tf.cast(weights, total_weight.dtype) + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + with tf.variable_scope(name, "total_weight", (labels, predictions, weights)): + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float64) + + if weights is None: + weights = tf.cast( + tf.size(labels), total_weight.dtype, name="default_weight" + ) + else: + weights = tf.cast(weights, total_weight.dtype) - # add up the weights to get total weight of the eval set - update_total_weight = tf.assign_add(total_weight, tf.reduce_sum(weights), name="update_op") + # add up the weights to get total weight of the eval set + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="update_op" + ) - value_op = tf.identity(total_weight) - update_op = tf.identity(update_total_weight) + value_op = tf.identity(total_weight) + update_op = tf.identity(update_total_weight) - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) + if metrics_collections: + tf.add_to_collections(metrics_collections, value_op) - if updates_collections: - tf.add_to_collections(updates_collections, update_op) + if updates_collections: + tf.add_to_collections(updates_collections, update_op) - return value_op, update_op + return value_op, update_op def num_samples_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'num_samples', (labels, predictions, weights)): - num_samples = _metric_variable(name='num_samples', shape=[], dtype=tf.float64) - update_num_samples = tf.assign_add(num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op") - - value_op = tf.identity(num_samples) - update_op = tf.identity(update_num_samples) - - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return value_op, update_op - - -def ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive sample ratio based on labels - (i.e. weighted average percentage of positive labels). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. 
- weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - ctr: A `Tensor` representing positive sample ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=labels, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def predicted_ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive ratio based on predictions, - (i.e. weighted averaged predicted positive probability). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - predicted_ctr: A `Tensor` representing the predicted positive ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=predictions, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def prediction_std_dev(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the weighted standard deviation of the predictions. - Note - this is not a confidence interval metric. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. 
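Both ctr and predicted_ctr above reduce to tf.metrics.mean, and every metric in this file returns the same (value_op, update_op) pair. A minimal graph-mode sketch of that streaming contract:

.. code-block:: python

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    labels = tf.placeholder(tf.float32, [None])
    value_op, update_op = tf.metrics.mean(values=labels)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())  # metric state is local
        sess.run(update_op, {labels: [1.0, 0.0, 0.0]})
        sess.run(update_op, {labels: [1.0, 1.0]})
        print(sess.run(value_op))  # 0.6, the positive rate over both batches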
- """ - with tf.variable_scope(name, 'pred_std_dev', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - # State kept during streaming of examples - total_weighted_preds = _metric_variable( - name='total_weighted_preds', shape=[], dtype=tf.float64) - total_weighted_preds_sq = _metric_variable( - name='total_weighted_preds_sq', shape=[], dtype=tf.float64) - total_weights = _metric_variable( - name='total_weights', shape=[], dtype=tf.float64) - - # Update state - update_total_weighted_preds = tf.assign_add(total_weighted_preds, tf.reduce_sum(weights * predictions)) - update_total_weighted_preds_sq = tf.assign_add(total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions)) - update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) - - # Compute output - def compute_output(tot_w, tot_wp, tot_wpp): - return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) - std_dev_est = compute_output(total_weights, total_weighted_preds, total_weighted_preds_sq) - update_std_dev_est = compute_output(update_total_weights, update_total_weighted_preds, update_total_weighted_preds_sq) - - if metrics_collections: - tf.add_to_collections(metrics_collections, std_dev_est) - - if updates_collections: - tf.add_to_collections(updates_collections, update_std_dev_est) - - return std_dev_est, update_std_dev_est - - -def _get_arce_predictions(predictions, weights, label_weighted, labels, - up_weight, deprecated_rce, - total_positive, update_total_positive): - """ - Returns the ARCE predictions, total_positive, update_total_positive and weights - used by the rest of the twml.metrics.rce metric computation. 
- """ - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") - label_weighted_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(label_weighted)) - pred_weight_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted)) - normalizer_comp = label_weighted_comp / pred_weight_comp - - if up_weight is False: - total_positive_unweighted = _metric_variable( - name='total_positive_unweighted', shape=[], dtype=tf.float32) - - update_total_positive_unweighted = tf.assign_add( - total_positive_unweighted, tf.reduce_sum(labels), - name="total_positive_unweighted_update") - - if deprecated_rce: - normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) - else: - # sum of labels / sum of weighted labels - normalizer = update_total_positive_unweighted / update_total_positive - - label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) - normalizer_comp = label_comp / label_weighted_comp - - # note that up_weight=True changes these for the rest of the twml.metric.rce computation - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - total_positive = total_positive_unweighted - update_total_positive = update_total_positive_unweighted - else: - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - # normalizer used for NRCE (and ARCE with up_weight=True) - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - - pred_comp = tf.subtract(tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions) - pred_comp_norm = tf.multiply(pred_comp, normalizer_comp, name="normalized_predictions_comp") - pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") - pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") - predictions = pred_num / pred_denom - - return predictions, total_positive, update_total_positive, weights - - -def rce(labels, predictions, - weights=None, - normalize=False, - arce=False, - up_weight=True, - metrics_collections=None, - updates_collections=None, - name=None, - deprecated_rce=False): - """ - Compute the relative cross entropy (RCE). - The RCE is a relative measurement compared to the baseline model's performance. - The baseline model always predicts average click-through-rate (CTR). - The RCE measures, in percentage, how much better the predictions are, compared - to the baseline model, in terms of cross entropy loss. - - y = label; p = prediction; - binary cross entropy = y * log(p) + (1-y) * log(1-p) - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - normalize: - if set to true, produce NRCEs used at Twitter. (normalize preds by weights first) - NOTE: if you don't understand what NRCE is, please don't use it. - arce: - if set to true, produces `ARCE `_. - This can only be activated if `normalize=True`. 
- up_weight: - if set to true, produces arce in the up_weighted space (considers CTR after up_weighting - data), while False gives arce in the original space (only considers CTR before up_weighting). - In the actual version, this flag can only be activated if arce is True. - Notice that the actual version of NRCE corresponds to up_weight=True. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - deprecated_rce: - enables the previous NRCE/ARCE calculations which calculated some label metrics - on the batch instead of on all batches seen so far. Note that the older metric - calculation is less stable, especially for smaller batch sizes. You should probably - never have to set this to True. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - .. note:: Must have at least 1 positive and 1 negative sample accumulated, - or RCE will come out as NaN. - """ - with tf.variable_scope(name, 'rce', (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + with tf.variable_scope(name, "num_samples", (labels, predictions, weights)): + num_samples = _metric_variable(name="num_samples", shape=[], dtype=tf.float64) + update_num_samples = tf.assign_add( + num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op" + ) + + value_op = tf.identity(num_samples) + update_op = tf.identity(update_num_samples) + + if metrics_collections: + tf.add_to_collections(metrics_collections, value_op) + + if updates_collections: + tf.add_to_collections(updates_collections, update_op) + + return value_op, update_op + + +def ctr( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: # pylint: disable=unused-argument + """ + Compute the weighted average positive sample ratio based on labels + (i.e. weighted average percentage of positive labels). + The name `ctr` (click-through-rate) is from legacy. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + ctr: A `Tensor` representing positive sample ratio. + update_op: A update operation used to accumulate data into this metric. 
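+
+    Example:
+
+      An illustrative sketch (the tensors below are made up for this docstring):
+
+      .. code-block:: python
+
+        labels = tf.constant([1.0, 0.0, 1.0, 0.0])
+        weights = tf.constant([1.0, 1.0, 2.0, 4.0])
+        # weighted positive ratio = (1*1 + 2*1) / (1 + 1 + 2 + 4) = 3/8 = 0.375
+        ctr_value, ctr_update = ctr(labels, tf.zeros_like(labels), weights=weights)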
+ """ + return tf.metrics.mean( + values=labels, + weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, + name=name, + ) - total_positive = _metric_variable(name='total_positive', shape=[], dtype=tf.float32) - total_loss = _metric_variable(name='total_loss', shape=[], dtype=tf.float32) - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float32) - label_weighted = tf.multiply(labels, weights, name="weighted_label") +def predicted_ctr( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: tf.Tensor = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + # pylint: disable=unused-argument + """ + Compute the weighted average positive ratio based on predictions, + (i.e. weighted averaged predicted positive probability). + The name `ctr` (click-through-rate) is from legacy. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + predicted_ctr: A `Tensor` representing the predicted positive ratio. + update_op: A update operation used to accumulate data into this metric. + """ + return tf.metrics.mean( + values=predictions, + weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, + name=name, + ) - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(label_weighted), name="total_pos_update") - if arce: - if normalize is False: - raise ValueError('This configuration of parameters is not actually allowed') +def prediction_std_dev( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: tf.Tensor = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Compute the weighted standard deviation of the predictions. + Note - this is not a confidence interval metric. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + metric value: A `Tensor` representing the value of the metric on the data accumulated so far. + update_op: A update operation used to accumulate data into this metric. 
+ """ + with tf.variable_scope(name, "pred_std_dev", (labels, predictions, weights)): + labels = tf.cast(labels, tf.float64) + predictions = tf.cast(predictions, tf.float64) + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float64, name="default_weight" + ) + else: + weights = tf.cast(weights, tf.float64) + + # State kept during streaming of examples + total_weighted_preds = _metric_variable( + name="total_weighted_preds", shape=[], dtype=tf.float64 + ) + total_weighted_preds_sq = _metric_variable( + name="total_weighted_preds_sq", shape=[], dtype=tf.float64 + ) + total_weights = _metric_variable( + name="total_weights", shape=[], dtype=tf.float64 + ) + + # Update state + update_total_weighted_preds = tf.assign_add( + total_weighted_preds, tf.reduce_sum(weights * predictions) + ) + update_total_weighted_preds_sq = tf.assign_add( + total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions) + ) + update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) + + # Compute output + def compute_output( + tot_w: tf.Tensor, tot_wp: tf.Tensor, tot_wpp: tf.Tensor + ) -> tf.Tensor: + return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) + + std_dev_est = compute_output( + total_weights, total_weighted_preds, total_weighted_preds_sq + ) + update_std_dev_est = compute_output( + update_total_weights, + update_total_weighted_preds, + update_total_weighted_preds_sq, + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, std_dev_est) + + if updates_collections: + tf.add_to_collections(updates_collections, update_std_dev_est) + + return std_dev_est, update_std_dev_est + + +def _get_arce_predictions( + predictions: tf.Tensor, + weights: tf.Tensor, + label_weighted: tf.Tensor, + labels: tf.Tensor, + up_weight: bool, + deprecated_rce: bool, + total_positive: tf.Tensor, + update_total_positive: tf.Tensor, +) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """ + Returns the ARCE predictions, total_positive, update_total_positive and weights + used by the rest of the twml.metrics.rce metric computation. 
+ """ + predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") + label_weighted_comp = tf.subtract( + tf.reduce_sum(weights), tf.reduce_sum(label_weighted) + ) + pred_weight_comp = tf.subtract( + tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted) + ) + normalizer_comp = label_weighted_comp / pred_weight_comp - predictions, total_positive, update_total_positive, weights = _get_arce_predictions( - predictions=predictions, weights=weights, deprecated_rce=deprecated_rce, - label_weighted=label_weighted, labels=labels, up_weight=up_weight, - total_positive=total_positive, update_total_positive=update_total_positive) + if up_weight is False: + total_positive_unweighted = _metric_variable( + name="total_positive_unweighted", shape=[], dtype=tf.float32 + ) - elif normalize: - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") + update_total_positive_unweighted = tf.assign_add( + total_positive_unweighted, + tf.reduce_sum(labels), + name="total_positive_unweighted_update", + ) - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) + if deprecated_rce: + normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) + else: + # sum of labels / sum of weighted labels + normalizer = update_total_positive_unweighted / update_total_positive + + label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) + normalizer_comp = label_comp / label_weighted_comp + + # note that up_weight=True changes these for the rest of the twml.metric.rce computation + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + total_positive = total_positive_unweighted + update_total_positive = update_total_positive_unweighted + else: + if deprecated_rce: + normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum( + predictions_weighted + ) + else: + # normalizer used for NRCE (and ARCE with up_weight=True) + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + # update the variable holding the sum of weighted predictions + update_total_prediction = tf.assign_add( + total_prediction, + tf.reduce_sum(predictions_weighted), + name="total_prediction_update", + ) + + # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) + # but it measure normalizer over batch was too flawed an approximation. + normalizer = update_total_positive / update_total_prediction + + pred_comp = tf.subtract( + tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions + ) + pred_comp_norm = tf.multiply( + pred_comp, normalizer_comp, name="normalized_predictions_comp" + ) + pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") + pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") + predictions = pred_num / pred_denom + + return predictions, total_positive, update_total_positive, weights + + +def rce( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: tf.Tensor = None, + normalize: bool = False, + arce: bool = False, + up_weight: bool = True, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, + deprecated_rce: bool = False, +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Compute the relative cross entropy (RCE). 
+    The RCE is a relative measurement compared to the baseline model's performance.
+    The baseline model always predicts average click-through-rate (CTR).
+    The RCE measures, in percentage, how much better the predictions are, compared
+    to the baseline model, in terms of cross entropy loss.
+
+    y = label; p = prediction;
+    binary cross entropy = -(y * log(p) + (1-y) * log(1-p))
+
+    Args:
+      labels:
+        the ground truth value.
+      predictions:
+        the predicted values, whose shape must match labels.
+      weights:
+        optional weights, whose shape must match labels. Weight is 1 if not set.
+      normalize:
+        if set to true, produce NRCEs used at Twitter. (normalize preds by weights first)
+        NOTE: if you don't understand what NRCE is, please don't use it.
+      arce:
+        if set to true, produces `ARCE `_.
+        This can only be activated if `normalize=True`.
+      up_weight:
+        if set to true, produces arce in the up_weighted space (considers CTR after up_weighting
+        data), while False gives arce in the original space (only considers CTR before up_weighting).
+        In the current version, this flag can only be activated if arce is True.
+        Note that the current version of NRCE corresponds to up_weight=True.
+      metrics_collections:
+        optional list of collections to add this metric into.
+      updates_collections:
+        optional list of collections to add the associated update_op into.
+      name:
+        an optional variable_scope name.
+      deprecated_rce:
+        enables the previous NRCE/ARCE calculations which calculated some label metrics
+        on the batch instead of on all batches seen so far. Note that the older metric
+        calculation is less stable, especially for smaller batch sizes. You should probably
+        never have to set this to True.
+
+    Return:
+      rce_value:
+        A ``Tensor`` representing the RCE.
+      update_op:
+        An update operation used to accumulate data into this metric.
+
+    .. note:: Must have at least 1 positive and 1 negative sample accumulated,
+        or RCE will come out as NaN.
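+
+    Example:
+
+      An illustrative sketch (the numbers are made up for this docstring):
+
+      .. code-block:: python
+
+        labels = tf.constant([1.0, 0.0, 0.0, 0.0])
+        predictions = tf.constant([0.9, 0.1, 0.1, 0.1])
+        # the baseline predicts ctr = 0.25 for every sample; since the model's
+        # cross entropy is lower than the baseline's, rce_value comes out positive
+        rce_value, update_op = rce(labels, predictions)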
+ """ + with tf.variable_scope(name, "rce", (labels, predictions, weights)): + labels = tf.to_float(labels, name="label_to_float") + predictions = tf.to_float(predictions, name="predictions_to_float") + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + else: + weights = tf.to_float(weights, name="weight_to_float") + + total_positive = _metric_variable( + name="total_positive", shape=[], dtype=tf.float32 + ) + total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) + + label_weighted = tf.multiply(labels, weights, name="weighted_label") + + update_total_positive = tf.assign_add( + total_positive, tf.reduce_sum(label_weighted), name="total_pos_update" + ) + + if arce: + if normalize is False: + raise ValueError( + "This configuration of parameters is not actually allowed" + ) + + ( + predictions, + total_positive, + update_total_positive, + weights, + ) = _get_arce_predictions( + predictions=predictions, + weights=weights, + deprecated_rce=deprecated_rce, + label_weighted=label_weighted, + labels=labels, + up_weight=up_weight, + total_positive=total_positive, + update_total_positive=update_total_positive, + ) + + elif normalize: + predictions_weighted = tf.multiply( + predictions, weights, name="weighted_preds" + ) + + if deprecated_rce: + normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum( + predictions_weighted + ) + else: + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + # update the variable holding the sum of weighted predictions + update_total_prediction = tf.assign_add( + total_prediction, + tf.reduce_sum(predictions_weighted), + name="total_prediction_update", + ) + + # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) + # but it measure normalizer over batch was too flawed an approximation. + normalizer = update_total_positive / update_total_prediction + + # NRCE + predictions = tf.multiply( + predictions, normalizer, name="normalized_predictions" + ) + + # clamp predictions to keep log(p) stable + clip_p = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p" + ) + logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") + + logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") + + update_total_loss = tf.assign_add( + total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update" + ) + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="total_weight_update" + ) + + # metric value retrieval subgraph + ctr1 = tf.truediv(total_positive, total_weight, name="ctr") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. + baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") + pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") + + rce_t = tf.multiply(1.0 - tf.truediv(pred_ce, baseline_ce), 100, name="rce") + + # metric update subgraph + ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. 
+        baseline_ce2 = _binary_cross_entropy(
+            pred=ctr2, target=ctr2, name="baseline_ce_update"
+        )
+        pred_ce2 = tf.truediv(
+            update_total_loss, update_total_weight, name="pred_ce_update"
+        )
+
+        update_op = tf.multiply(
+            1.0 - tf.truediv(pred_ce2, baseline_ce2), 100, name="update_op"
+        )
+
+        if metrics_collections:
+            tf.add_to_collections(metrics_collections, rce_t)
+
+        if updates_collections:
+            tf.add_to_collections(updates_collections, update_op)
+
+        return rce_t, update_op
+
+
+def ce(p_true: tf.Tensor, p_est: Optional[tf.Tensor] = None) -> tf.Tensor:
+    if p_est is None:
+        p_est = p_true
+    return _binary_cross_entropy(pred=p_est, target=p_true, name=None)
+
+
+def rce_transform(outputs: tf.Tensor, labels: tf.Tensor, weights: tf.Tensor) -> dict:
+    """
+    Construct a dict of quantities to aggregate over eval batches.
+    outputs, labels, weights are TensorFlow tensors, and are assumed to
+    be of shape [N] for batch_size = N.
+    Each entry in the output dict should also be of shape [N].
+    """
+    out_vals = dict()
+    out_vals["weighted_loss"] = weights * ce(p_true=labels, p_est=outputs)
+    out_vals["weighted_labels"] = labels * weights
+    out_vals["weight"] = weights
+    return out_vals
+
+
+def rce_metric(aggregates: dict) -> tf.Tensor:
+    """
+    input ``aggregates`` is a dict with the same keys as those created
+    by rce_transform(). The dict values are the aggregates (reduce_sum)
+    of the values produced by rce_transform(), and should be scalars.
+    output is the value of RCE
+    """
+    # cumulative weighted loss of model predictions
+    total_weighted_loss = aggregates["weighted_loss"]
+    total_weighted_labels = aggregates["weighted_labels"]
+    total_weight = aggregates["weight"]
+
+    model_average_loss = total_weighted_loss / total_weight
+    baseline_average_loss = ce(total_weighted_labels / total_weight)
+    return 100.0 * (1 - model_average_loss / baseline_average_loss)
+
+
+def metric_std_err(
+    labels: tf.Tensor,
+    predictions: tf.Tensor,
+    weights: Optional[tf.Tensor] = None,
+    transform: Callable[[tf.Tensor, tf.Tensor, tf.Tensor], dict] = rce_transform,
+    metric: Callable[[dict], tf.Tensor] = rce_metric,
+    metrics_collections: Optional[Collection[tf.Variable]] = None,
+    updates_collections: Optional[Collection[tf.Variable]] = None,
+    name: str = "rce_std_err",
+) -> Tuple[tf.Tensor, tf.Tensor]:
+    """
+    Compute the weighted standard error of the RCE metric on this eval set.
+    This can be used for confidence intervals and unpaired hypothesis tests.
+
+    Args:
+      labels: the ground truth value.
+      predictions: the predicted values, whose shape must match labels.
+      weights: optional weights, whose shape must match labels. Weight is 1 if not set.
+      transform: a function of the following form:
+
+        .. code-block:: python
+
+          def transform(outputs, labels, weights):
+            out_vals = dict()
+            ...
+            return out_vals
+
+        where outputs, labels, and weights are all tensors of shape [eval_batch_size].
+        The returned dict() should have values that are tensors of shape [eval_batch_size].
+        These will be aggregated across many batches in the eval dataset, to produce
+        one scalar value per key of out_vals.
+      metric: a function of the following form
+
+        .. code-block:: python
+
+          def metric(aggregates):
+            ...
+            return metric_value
+
+        where aggregates is a dict() having the same keys created by transform().
+        Each of the corresponding dict values is the reduce_sum of the values produced by
+        transform(), and is a TF scalar. The return value should be a scalar representing
+        the value of the desired metric.
+ metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + metric value: A `Tensor` representing the value of the metric on the data accumulated so far. + update_op: A update operation used to accumulate data into this metric. + """ + with tf.variable_scope(name, "metric_std_err", (labels, predictions, weights)): + labels = tf.cast(labels, tf.float64) + predictions = tf.cast(predictions, tf.float64) - # update the variable holding the sum of weighted predictions + if weights is None: + weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") + else: + weights = tf.cast(weights, tf.float64) + + labels = tf.reshape(labels, [-1]) + predictions = tf.reshape(predictions, [-1]) + predictions = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p" + ) + weights = tf.reshape(weights, [-1]) + + # first apply the supplied transform function to the output, label, weight data + # returns an dict of 1xN tensors for N input samples + # for each sample, compute f = transform(pred, l, w) + transformed = transform(predictions, labels, weights) + + # we track 3 types of aggregate information + # 1. total number of samples + # 2. aggregated transformed samples (moment1), i.e. sum(f) + # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) + + # count total number of samples + sample_count = _metric_variable(name="sample_count", shape=[], dtype=tf.int64) + update_sample_count = tf.assign_add( + sample_count, tf.size(labels, out_type=sample_count.dtype) + ) + + # compose the ordered dict into a single vector + # so f can be treated as a single column vector rather than a collection of scalars + N = len(transformed) + transformed_vec = tf.stack(list(transformed.values()), axis=1) + + # compute and update transformed samples (1st order statistics) + # i.e. accumulate f into F as F += sum(f) + aggregates_1 = _metric_variable( + name="aggregates_1", shape=[N], dtype=tf.float64 + ) + update_aggregates_1 = tf.assign_add( + aggregates_1, tf.reduce_sum(transformed_vec, axis=0) + ) + + # compute and update crossed transformed samples (2nd order statistics) + # i.e. accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) + aggregates_2 = _metric_variable( + name="aggregates_2", shape=[N, N], dtype=tf.float64 + ) + moment_2_temp = tf.reshape(transformed_vec, shape=[-1, N, 1]) * tf.reshape( + transformed_vec, shape=[-1, 1, N] + ) + update_aggregates_2 = tf.assign_add( + aggregates_2, tf.reduce_sum(moment_2_temp, axis=0) + ) + + def compute_output( + agg_1: tf.Tensor, agg_2: tf.Tensor, samp_cnt: tf.Tensor + ) -> tf.Tensor: + """Compute the metric value and its standard error.""" + # decompose the aggregates back into a dict to pass to the user-supplied metric fn + aggregates_dict = dict() + for i, key in enumerate(transformed.keys()): + aggregates_dict[key] = agg_1[i] + + metric_value = metric(aggregates_dict) + + # derivative of metric with respect to the 1st order aggregates + # i.e. 
d M(agg1) / d agg1 + metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) + + # estimated covariance of agg_1 + # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N + # = agg_2 - (agg_1 * agg_1^T) / N + N_covariance_estimate = agg_2 - ( + tf.reshape(agg_1, shape=[-1, 1]) + @ tf.reshape(agg_1, shape=[1, -1]) + / tf.cast(samp_cnt, dtype=tf.float64) + ) + + # push N_covariance_estimate through a linearization of metric around agg_1 + # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) + metric_variance = ( + tf.reshape(metric_prime, shape=[1, -1]) + @ N_covariance_estimate + @ tf.reshape(metric_prime, shape=[-1, 1]) + ) + # result should be a single element, but the matmul is 2D + metric_variance = metric_variance[0][0] + metric_stderr = tf.sqrt(metric_variance) + return metric_stderr + + metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) + update_metric_stderr = compute_output( + update_aggregates_1, update_aggregates_2, update_sample_count + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, metric_stderr) + + if updates_collections: + tf.add_to_collections(updates_collections, update_metric_stderr) + + return metric_stderr, update_metric_stderr + + +def lolly_nrce( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Compute the Lolly NRCE. + + Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, + especially when the adjusted ctr goes above 1.0. + + Calculation: + + :: + + NRCE: lolly NRCE + BCE: baseline cross entropy + NCE: normalized cross entropy + CE: cross entropy + y_i: label of example i + p_i: prediction of example i + y: ctr + p: average prediction + a: normalizer + + Assumes any p_i and a * p_i is within [0, 1) + NRCE = (1 - NCE / BCE) * 100 + BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y)) + = - (y * log(y) + (1 - y) * log(1 - y)) + a = y / p + CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) + NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i)) + = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) + - sum_i(y_i * log(a)) + + sum_i((1 - y_i) * log(1 - p_i)) + - sum_i((1 - y_i) * log(1 - a * p_i)) + ~= CE - sum_i(y_i) * log(a) + + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j))) + - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j))) + # Takes 5 items from the Taylor expansion, can be increased if needed + # Error for each example is O(p_i^6) + = CE - sum_i(y_i) * log(a) + - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j) + + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j) + = CE - sum_i(y_i) * log(a) + + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j) + + Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5. + We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that + we can get a at the end, which leads to this NRCE. + + NRCE uses ctr and average pctr to normalize the pctrs. + It removes the impact of prediction error from RCE. + Usually NRCE is higher as the prediction error impact on RCE is negative. + Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE. + + In Lolly NRCE we use ctr and average pctr of the whole dataset. + We thus remove the dataset level error in NRCE calculation. 
+ In this case, when we want to improve RCE to the level of NRCE, + it is achievable as dataset level prediction error is easy to remove by calibration. + Lolly NRCE is thus a good estimate about the potential gain by adding calibration. + + In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch level error. + This error is difficult to remove by modeling improvement, + at least not by simple calibration. + It thus cannot indicate the same opportunity as the Lolly NRCE does. + + Args: + labels: + the ground true value. + predictions: + the predicted values, whose shape must match labels. + weights: + optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: + optional list of collections to add this metric into. + updates_collections: + optional list of collections to add the associated update_op into. + name: + an optional variable_scope name. + + Return: + rce_value: + A ``Tensor`` representing the RCE. + update_op: + A update operation used to accumulate data into this metric. + + Note: Must have at least 1 positive and 1 negative sample accumulated, + or NRCE will come out as NaN. + """ + with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): + labels = tf.to_float(labels, name="label_to_float") + predictions = tf.to_float(predictions, name="predictions_to_float") + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + else: + weights = tf.to_float(weights, name="weight_to_float") + + positive_weights = tf.multiply(labels, weights, name="positive_weights") + + # clamp predictions to keep log(p) stable + clip_predictions = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_predictions" + ) + weighted_predictions = tf.multiply( + predictions, weights, name="weighted_predictions" + ) + + logloss = _binary_cross_entropy( + pred=clip_predictions, target=labels, name="logloss" + ) + weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") + + negatives = tf.subtract( + tf.ones(shape=tf.shape(labels), dtype=tf.float32), labels, name="negatives" + ) + negative_predictions = tf.multiply( + predictions, negatives, name="negative_predictions" + ) + weighted_negative_predictions = tf.multiply( + negative_predictions, weights, name="weighted_negative_predictions" + ) + negative_squared_predictions = tf.multiply( + negative_predictions, + negative_predictions, + name="negative_squared_predictions", + ) + weighted_negative_squared_predictions = tf.multiply( + negative_squared_predictions, + weights, + name="weighted_negative_squared_predictions", + ) + negative_cubed_predictions = tf.multiply( + negative_squared_predictions, + negative_predictions, + name="negative_cubed_predictions", + ) + weighted_negative_cubed_predictions = tf.multiply( + negative_cubed_predictions, + weights, + name="weighted_negative_cubed_predictions", + ) + negative_quartic_predictions = tf.multiply( + negative_cubed_predictions, + negative_predictions, + name="negative_quartic_predictions", + ) + weighted_negative_quartic_predictions = tf.multiply( + negative_quartic_predictions, + weights, + name="weighted_negative_quartic_predictions", + ) + negative_quintic_predictions = tf.multiply( + negative_quartic_predictions, + negative_predictions, + name="negative_quintic_predictions", + ) + weighted_negative_quintic_predictions = tf.multiply( + negative_quintic_predictions, + weights, + name="weighted_negative_quintic_predictions", + ) + + # 
Tracked stats + total_positive = _metric_variable( + name="total_positive", shape=[], dtype=tf.float32 + ) + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) + + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + total_negative_prediction = _metric_variable( + name="total_negative_prediction", shape=[], dtype=tf.float32 + ) + total_negative_squared_prediction = _metric_variable( + name="total_negative_squared_prediction", shape=[], dtype=tf.float32 + ) + total_negative_cubed_prediction = _metric_variable( + name="total_negative_cubed_prediction", shape=[], dtype=tf.float32 + ) + total_negative_quartic_prediction = _metric_variable( + name="total_negative_quartic_prediction", shape=[], dtype=tf.float32 + ) + total_negative_quintic_prediction = _metric_variable( + name="total_negative_quintic_prediction", shape=[], dtype=tf.float32 + ) + + total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) + + # Update tracked stats + update_total_positive = tf.assign_add( + total_positive, + tf.reduce_sum(positive_weights), + name="total_positive_update", + ) + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="total_weight_update" + ) update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - - # NRCE - predictions = tf.multiply(predictions, normalizer, name="normalized_predictions") - - # clamp predictions to keep log(p) stable - clip_p = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") - - logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") - - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") - - # metric value retrieval subgraph - ctr1 = tf.truediv(total_positive, total_weight, name="ctr") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") - pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") - - rce_t = tf.multiply( - 1.0 - tf.truediv(pred_ce, baseline_ce), - 100, - name="rce") - - # metric update subgraph - ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
- baseline_ce2 = _binary_cross_entropy(pred=ctr2, target=ctr2, name="baseline_ce_update") - pred_ce2 = tf.truediv(update_total_loss, update_total_weight, name="pred_ce_update") - - update_op = tf.multiply( - 1.0 - tf.truediv(pred_ce2, baseline_ce2), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, rce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return rce_t, update_op - - -def ce(p_true, p_est=None): - if p_est is None: - p_est = p_true - return _binary_cross_entropy(pred=p_est, target=p_true, name=None) - - -def rce_transform(outputs, labels, weights): - ''' - Construct an OrderedDict of quantities to aggregate over eval batches - outputs, labels, weights are TensorFlow tensors, and are assumed to - be of shape [N] for batch_size = N - Each entry in the output OrderedDict should also be of shape [N] - ''' - out_vals = OrderedDict() - out_vals['weighted_loss'] = weights * ce(p_true=labels, p_est=outputs) - out_vals['weighted_labels'] = labels * weights - out_vals['weight'] = weights - return out_vals - - -def rce_metric(aggregates): - ''' - input ``aggregates`` is an OrderedDict with the same keys as those created - by rce_transform(). The dict values are the aggregates (reduce_sum) - of the values produced by rce_transform(), and should be scalars. - output is the value of RCE - ''' - # cummulative weighted loss of model predictions - total_weighted_loss = aggregates['weighted_loss'] - total_weighted_labels = aggregates['weighted_labels'] - total_weight = aggregates['weight'] - - model_average_loss = total_weighted_loss / total_weight - baseline_average_loss = ce(total_weighted_labels / total_weight) - return 100.0 * (1 - model_average_loss / baseline_average_loss) - - -def metric_std_err(labels, predictions, - weights=None, - transform=rce_transform, metric=rce_metric, - metrics_collections=None, - updates_collections=None, - name='rce_std_err'): - """ - Compute the weighted standard error of the RCE metric on this eval set. - This can be used for confidence intervals and unpaired hypothesis tests. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - transform: a function of the following form: - - .. code-block:: python - - def transform(outputs, labels, weights): - out_vals = OrderedDict() - ... - return out_vals - - where outputs, labels, and weights are all tensors of shape [eval_batch_size]. - The returned OrderedDict() should have values that are tensors of shape [eval_batch_size]. - These will be aggregated across many batches in the eval dataset, to produce - one scalar value per key of out_vals. - metric: a function of the following form - - .. code-block:: python - - def metric(aggregates): - ... - return metric_value - - where aggregates is an OrderedDict() having the same keys created by transform(). - Each of the corresponding dict values is the reduce_sum of the values produced by - transform(), and is a TF scalar. The return value should be a scalar representing - the value of the desired metric. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. 
- update_op: A update operation used to accumulate data into this metric. - """ - with tf.variable_scope(name, 'metric_std_err', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - labels = tf.reshape(labels, [-1]) - predictions = tf.reshape(predictions, [-1]) - predictions = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - weights = tf.reshape(weights, [-1]) - - # first apply the supplied transform function to the output, label, weight data - # returns an OrderedDict of 1xN tensors for N input samples - # for each sample, compute f = transform(pred, l, w) - transformed = transform(predictions, labels, weights) - - # we track 3 types of aggregate information - # 1. total number of samples - # 2. aggregated transformed samples (moment1), i.e. sum(f) - # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) - - # count total number of samples - sample_count = _metric_variable( - name='sample_count', shape=[], dtype=tf.int64) - update_sample_count = tf.assign_add(sample_count, tf.size(labels, out_type=sample_count.dtype)) - - # compose the ordered dict into a single vector - # so f can be treated as a single column vector rather than a collection of scalars - N = len(transformed) - transformed_vec = tf.stack(list(transformed.values()), axis=1) - - # compute and update transformed samples (1st order statistics) - # i.e. accumulate f into F as F += sum(f) - aggregates_1 = _metric_variable( - name='aggregates_1', shape=[N], dtype=tf.float64) - update_aggregates_1 = tf.assign_add(aggregates_1, tf.reduce_sum(transformed_vec, axis=0)) - - # compute and update crossed transformed samples (2nd order statistics) - # i.e. 
accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) - aggregates_2 = _metric_variable( - name='aggregates_2', shape=[N, N], dtype=tf.float64) - moment_2_temp = ( - tf.reshape(transformed_vec, shape=[-1, N, 1]) - * tf.reshape(transformed_vec, shape=[-1, 1, N]) + total_prediction, + tf.reduce_sum(weighted_predictions), + name="total_prediction_update", + ) + update_total_negative_prediction = tf.assign_add( + total_negative_prediction, + tf.reduce_sum(weighted_negative_predictions), + name="total_negative_prediction_update", + ) + update_total_negative_squared_prediction = tf.assign_add( + total_negative_squared_prediction, + tf.reduce_sum(weighted_negative_squared_predictions), + name="total_negative_squared_prediction_update", + ) + update_total_negative_cubed_prediction = tf.assign_add( + total_negative_cubed_prediction, + tf.reduce_sum(weighted_negative_cubed_predictions), + name="total_negative_cubed_prediction_update", + ) + update_total_negative_quartic_prediction = tf.assign_add( + total_negative_quartic_prediction, + tf.reduce_sum(weighted_negative_quartic_predictions), + name="total_negative_quartic_prediction_update", + ) + update_total_negative_quintic_prediction = tf.assign_add( + total_negative_quintic_prediction, + tf.reduce_sum(weighted_negative_quintic_predictions), + name="total_negative_quintic_prediction_update", + ) + update_total_loss = tf.assign_add( + total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update" + ) + + # metric value retrieval subgraph + # ctr of this batch + positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. + baseline_loss = _binary_cross_entropy( + pred=positive_rate, target=positive_rate, name="baseline_loss" + ) + + # normalizing ratio for nrce + # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr + normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") + # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) + # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) + # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) + normalized_loss = ( + total_loss + - total_positive * tf.log(normalizer) + + total_negative_prediction * (normalizer - 1) + + total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + + total_negative_cubed_prediction + * (normalizer * normalizer * normalizer - 1) + / 3 + + total_negative_quartic_prediction + * (normalizer * normalizer * normalizer * normalizer - 1) + / 4 + + total_negative_quintic_prediction + * (normalizer * normalizer * normalizer * normalizer * normalizer - 1) + / 5 + ) + + # average normalized loss + avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") + + nrce_t = tf.multiply( + 1.0 - tf.truediv(avg_loss, baseline_loss), 100, name="lolly_nrce" + ) + + # metric update subgraph + update_positive_rate = tf.truediv( + update_total_positive, update_total_weight, name="update_positive_rate" + ) + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. 
+ update_baseline_loss = _binary_cross_entropy( + pred=update_positive_rate, + target=update_positive_rate, + name="update_baseline_loss", + ) + + update_normalizer = tf.truediv( + update_total_positive, update_total_prediction, name="update_normalizer" + ) + update_normalized_loss = ( + update_total_loss + - update_total_positive * tf.log(update_normalizer) + + update_total_negative_prediction * (update_normalizer - 1) + + update_total_negative_squared_prediction + * (update_normalizer * update_normalizer - 1) + / 2 + + update_total_negative_cubed_prediction + * (update_normalizer * update_normalizer * update_normalizer - 1) + / 3 + + update_total_negative_quartic_prediction + * ( + update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + - 1 + ) + / 4 + + update_total_negative_quintic_prediction + * ( + update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + - 1 + ) + / 5 + ) + + update_avg_loss = tf.truediv( + update_normalized_loss, update_total_weight, name="update_avg_loss" + ) + + update_op = tf.multiply( + 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), + 100, + name="update_op", + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, nrce_t) + + if updates_collections: + tf.add_to_collections(updates_collections, update_op) + + return nrce_t, update_op + + +def _binary_cross_entropy( + pred: tf.Tensor, + target: tf.Tensor, + name: Optional[str] = None, +) -> tf.Tensor: + return -tf.add( + target * tf.log(pred), (1.0 - target) * tf.log(1.0 - pred), name=name ) - update_aggregates_2 = tf.assign_add(aggregates_2, tf.reduce_sum(moment_2_temp, axis=0)) - - def compute_output(agg_1, agg_2, samp_cnt): - # decompose the aggregates back into a dict to pass to the user-supplied metric fn - aggregates_dict = OrderedDict() - for i, key in enumerate(transformed.keys()): - aggregates_dict[key] = agg_1[i] - - metric_value = metric(aggregates_dict) - - # derivative of metric with respect to the 1st order aggregates - # i.e. d M(agg1) / d agg1 - metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) - - # estimated covariance of agg_1 - # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N - # = agg_2 - (agg_1 * agg_1^T) / N - N_covariance_estimate = agg_2 - ( - tf.reshape(agg_1, shape=[-1, 1]) - @ tf.reshape(agg_1, shape=[1, -1]) - / tf.cast(samp_cnt, dtype=tf.float64) - ) - - # push N_covariance_estimate through a linearization of metric around agg_1 - # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) - metric_variance = ( - tf.reshape(metric_prime, shape=[1, -1]) - @ N_covariance_estimate - @ tf.reshape(metric_prime, shape=[-1, 1]) - ) - # result should be a single element, but the matmul is 2D - metric_variance = metric_variance[0][0] - metric_stderr = tf.sqrt(metric_variance) - return metric_stderr - - metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) - update_metric_stderr = compute_output(update_aggregates_1, update_aggregates_2, update_sample_count) - - if metrics_collections: - tf.add_to_collections(metrics_collections, metric_stderr) - - if updates_collections: - tf.add_to_collections(updates_collections, update_metric_stderr) - - return metric_stderr, update_metric_stderr - - -def lolly_nrce(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the Lolly NRCE. 
- - Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, - especially when the adjusted ctr goes above 1.0. - - Calculation: - - :: - - NRCE: lolly NRCE - BCE: baseline cross entropy - NCE: normalized cross entropy - CE: cross entropy - y_i: label of example i - p_i: prediction of example i - y: ctr - p: average prediction - a: normalizer - - Assumes any p_i and a * p_i is within [0, 1) - NRCE = (1 - NCE / BCE) * 100 - BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y)) - = - (y * log(y) + (1 - y) * log(1 - y)) - a = y / p - CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i)) - = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - - sum_i(y_i * log(a)) - + sum_i((1 - y_i) * log(1 - p_i)) - - sum_i((1 - y_i) * log(1 - a * p_i)) - ~= CE - sum_i(y_i) * log(a) - + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j))) - - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j))) - # Takes 5 items from the Taylor expansion, can be increased if needed - # Error for each example is O(p_i^6) - = CE - sum_i(y_i) * log(a) - - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j) - = CE - sum_i(y_i) * log(a) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j) - - Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5. - We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that - we can get a at the end, which leads to this NRCE. - - NRCE uses ctr and average pctr to normalize the pctrs. - It removes the impact of prediction error from RCE. - Usually NRCE is higher as the prediction error impact on RCE is negative. - Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE. - - In Lolly NRCE we use ctr and average pctr of the whole dataset. - We thus remove the dataset level error in NRCE calculation. - In this case, when we want to improve RCE to the level of NRCE, - it is achievable as dataset level prediction error is easy to remove by calibration. - Lolly NRCE is thus a good estimate about the potential gain by adding calibration. - - In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch level error. - This error is difficult to remove by modeling improvement, - at least not by simple calibration. - It thus cannot indicate the same opportunity as the Lolly NRCE does. - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - Note: Must have at least 1 positive and 1 negative sample accumulated, - or NRCE will come out as NaN. 
- """ - with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") - - positive_weights = tf.multiply(labels, weights, name="positive_weights") - - # clamp predictions to keep log(p) stable - clip_predictions = tf.clip_by_value( - predictions, - CLAMP_EPSILON, - 1.0 - CLAMP_EPSILON, - name="clip_predictions") - weighted_predictions = tf.multiply( - predictions, weights, - name="weighted_predictions") - - logloss = _binary_cross_entropy(pred=clip_predictions, target=labels, name="logloss") - weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") - - negatives = tf.subtract( - tf.ones(shape=tf.shape(labels), dtype=tf.float32), - labels, - name="negatives") - negative_predictions = tf.multiply( - predictions, - negatives, - name="negative_predictions") - weighted_negative_predictions = tf.multiply( - negative_predictions, weights, - name="weighted_negative_predictions") - negative_squared_predictions = tf.multiply( - negative_predictions, - negative_predictions, - name="negative_squared_predictions") - weighted_negative_squared_predictions = tf.multiply( - negative_squared_predictions, weights, - name="weighted_negative_squared_predictions") - negative_cubed_predictions = tf.multiply( - negative_squared_predictions, - negative_predictions, - name="negative_cubed_predictions") - weighted_negative_cubed_predictions = tf.multiply( - negative_cubed_predictions, weights, - name="weighted_negative_cubed_predictions") - negative_quartic_predictions = tf.multiply( - negative_cubed_predictions, - negative_predictions, - name="negative_quartic_predictions") - weighted_negative_quartic_predictions = tf.multiply( - negative_quartic_predictions, weights, - name="weighted_negative_quartic_predictions") - negative_quintic_predictions = tf.multiply( - negative_quartic_predictions, - negative_predictions, - name="negative_quintic_predictions") - weighted_negative_quintic_predictions = tf.multiply( - negative_quintic_predictions, weights, - name="weighted_negative_quintic_predictions") - - # Tracked stats - total_positive = _metric_variable(name="total_positive", shape=[], dtype=tf.float32) - total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) - - total_prediction = _metric_variable(name="total_prediction", shape=[], dtype=tf.float32) - - total_negative_prediction = _metric_variable( - name="total_negative_prediction", - shape=[], dtype=tf.float32) - total_negative_squared_prediction = _metric_variable( - name="total_negative_squared_prediction", - shape=[], dtype=tf.float32) - total_negative_cubed_prediction = _metric_variable( - name="total_negative_cubed_prediction", - shape=[], dtype=tf.float32) - total_negative_quartic_prediction = _metric_variable( - name="total_negative_quartic_prediction", - shape=[], dtype=tf.float32) - total_negative_quintic_prediction = _metric_variable( - name="total_negative_quintic_prediction", - shape=[], dtype=tf.float32) - - total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) - - # Update tracked stats - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(positive_weights), name="total_positive_update") - update_total_weight = tf.assign_add( - total_weight, 
tf.reduce_sum(weights), name="total_weight_update") - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(weighted_predictions), name="total_prediction_update") - update_total_negative_prediction = tf.assign_add( - total_negative_prediction, - tf.reduce_sum(weighted_negative_predictions), name="total_negative_prediction_update") - update_total_negative_squared_prediction = tf.assign_add( - total_negative_squared_prediction, - tf.reduce_sum(weighted_negative_squared_predictions), - name="total_negative_squared_prediction_update") - update_total_negative_cubed_prediction = tf.assign_add( - total_negative_cubed_prediction, - tf.reduce_sum(weighted_negative_cubed_predictions), - name="total_negative_cubed_prediction_update") - update_total_negative_quartic_prediction = tf.assign_add( - total_negative_quartic_prediction, - tf.reduce_sum(weighted_negative_quartic_predictions), - name="total_negative_quartic_prediction_update") - update_total_negative_quintic_prediction = tf.assign_add( - total_negative_quintic_prediction, - tf.reduce_sum(weighted_negative_quintic_predictions), - name="total_negative_quintic_prediction_update") - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update") - - # metric value retrieval subgraph - # ctr of this batch - positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_loss = _binary_cross_entropy( - pred=positive_rate, - target=positive_rate, - name="baseline_loss") - - # normalizing ratio for nrce - # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr - normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") - # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) - # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) - # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) - normalized_loss = ( - total_loss - - total_positive * tf.log(normalizer) + - total_negative_prediction * (normalizer - 1) + - total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + - total_negative_cubed_prediction * - (normalizer * normalizer * normalizer - 1) / 3 + - total_negative_quartic_prediction * - (normalizer * normalizer * normalizer * normalizer - 1) / 4 + - total_negative_quintic_prediction * - (normalizer * normalizer * normalizer * normalizer * normalizer - 1) / 5) - - # average normalized loss - avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") - - nrce_t = tf.multiply( - 1.0 - tf.truediv(avg_loss, baseline_loss), - 100, - name="lolly_nrce") - - # metric update subgraph - update_positive_rate = tf.truediv( - update_total_positive, - update_total_weight, - name="update_positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
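The five-term expansion used in `normalized_loss` can be sanity-checked numerically; as the docstring warns, the per-example error is O(p^6), so accuracy degrades as a * p approaches 1. A small standalone check (illustrative only):

import numpy as np

def log1m_taylor(x, terms=5):
    # log(1 - x) ~= -sum_{j=1..terms} x^j / j, accurate only for small x
    return -sum(x ** j / j for j in range(1, terms + 1))

for ap in (0.05, 0.3, 0.7, 0.95):
    exact = np.log(1.0 - ap)
    approx = log1m_taylor(ap)
    print(f"a*p={ap:.2f}  exact={exact:+.4f}  taylor={approx:+.4f}  err={abs(exact - approx):.2e}")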
- update_baseline_loss = _binary_cross_entropy( - pred=update_positive_rate, - target=update_positive_rate, - name="update_baseline_loss") - - update_normalizer = tf.truediv( - update_total_positive, - update_total_prediction, - name="update_normalizer") - update_normalized_loss = ( - update_total_loss - - update_total_positive * tf.log(update_normalizer) + - update_total_negative_prediction * - (update_normalizer - 1) + - update_total_negative_squared_prediction * - (update_normalizer * update_normalizer - 1) / 2 + - update_total_negative_cubed_prediction * - (update_normalizer * update_normalizer * update_normalizer - 1) / 3 + - update_total_negative_quartic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer - 1) / 4 + - update_total_negative_quintic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer * update_normalizer - 1) / 5) - - update_avg_loss = tf.truediv( - update_normalized_loss, - update_total_weight, - name="update_avg_loss") - - update_op = tf.multiply( - 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, nrce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return nrce_t, update_op - - -def _binary_cross_entropy(pred, target, name): - return - tf.add( - target * tf.log(pred), - (1.0 - target) * tf.log(1.0 - pred), - name=name) # Copied from metrics_impl.py with minor modifications. # https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 -def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" +def _metric_variable( + shape: Sequence[int], + dtype: tf.dtypes.DType, + validate_shape: bool = True, + name: Optional[str] = None, +) -> tf.Variable: + """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" + + return tf.Variable( + lambda: tf.zeros(shape, dtype), + trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], + validate_shape=validate_shape, + name=name, + ) - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) PERCENTILES = np.linspace(0, 1, 101, dtype=np.float32) # metric_name: (metric, requires thresholded output) SUPPORTED_BINARY_CLASS_METRICS = { - # TWML metrics - 'total_weight': (total_weight_metric, False), - 'num_samples': (num_samples_metric, False), - 'rce': (rce, False), - 'rce_std_err': (partial(metric_std_err, transform=rce_transform, metric=rce_metric, name='rce_std_err'), False), - 'nrce': (partial(rce, normalize=True), False), - 'lolly_nrce': (lolly_nrce, False), - 'arce': (partial(rce, normalize=True, arce=True), False), - 'arce_original': (partial(rce, normalize=True, arce=True, up_weight=False), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (ctr, False), - # predicted CTR measures predicted positive ratio. 
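The accumulator pattern generalizes: a streaming metric is a (value_op, update_op) pair over `_metric_variable` state. A minimal sketch of the same pattern, assuming the `_metric_variable` helper defined in this module and TF1 graph mode (`streaming_weighted_mean` is a hypothetical name):

import tensorflow.compat.v1 as tf

def streaming_weighted_mean(values, weights, name="weighted_mean"):
    # State lives in local metric variables; value_op reads the running
    # totals, update_op folds in the current batch and returns the result.
    with tf.variable_scope(name):
        total = _metric_variable(name="total", shape=[], dtype=tf.float32)
        count = _metric_variable(name="count", shape=[], dtype=tf.float32)

        update_total = tf.assign_add(total, tf.reduce_sum(values * weights))
        update_count = tf.assign_add(count, tf.reduce_sum(weights))

        value_op = tf.truediv(total, count, name="value")
        update_op = tf.truediv(update_total, update_count, name="update")
        return value_op, update_op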
- 'predicted_ctr': (predicted_ctr, False), - 'pred_std_dev': (prediction_std_dev, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - - 'false_positives': (tf.metrics.false_positives, True), - 'false_negatives': (tf.metrics.false_negatives, True), - 'true_positives': (tf.metrics.true_positives, True), - 'true_negatives': (tf.metrics.true_negatives, True), - - 'precision_at_percentiles': (partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), False), - 'recall_at_percentiles': (partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), False), - 'false_positives_at_percentiles': (partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), False), - 'false_negatives_at_percentiles': (partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), False), - 'true_positives_at_percentiles': (partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), False), - 'true_negatives_at_percentiles': (partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), False), - - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC', - summation_method='careful_interpolation'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR', - summation_method='careful_interpolation'), False), - - # tensorboard curves - 'pr_curve': (tb.summary.v1.pr_curve_streaming_op, False), - - # deprecated metrics - 'deprecated_nrce': (partial(rce, normalize=True, deprecated_rce=True), False), - 'deprecated_arce': (partial(rce, normalize=True, arce=True, deprecated_rce=True), False), - 'deprecated_arce_original': (partial(rce, normalize=True, arce=True, - up_weight=False, deprecated_rce=True), False) + # TWML metrics + "total_weight": (total_weight_metric, False), + "num_samples": (num_samples_metric, False), + "rce": (rce, False), + "rce_std_err": ( + partial( + metric_std_err, + transform=rce_transform, + metric=rce_metric, + name="rce_std_err", + ), + False, + ), + "nrce": (partial(rce, normalize=True), False), + "lolly_nrce": (lolly_nrce, False), + "arce": (partial(rce, normalize=True, arce=True), False), + "arce_original": (partial(rce, normalize=True, arce=True, up_weight=False), False), + # CTR measures positive sample ratio. This terminology is inherited from Ads. + "ctr": (ctr, False), + # predicted CTR measures predicted positive ratio. 
+ "predicted_ctr": (predicted_ctr, False), + "pred_std_dev": (prediction_std_dev, False), + # thresholded metrics + "accuracy": (tf.metrics.accuracy, True), + "precision": (tf.metrics.precision, True), + "recall": (tf.metrics.recall, True), + "false_positives": (tf.metrics.false_positives, True), + "false_negatives": (tf.metrics.false_negatives, True), + "true_positives": (tf.metrics.true_positives, True), + "true_negatives": (tf.metrics.true_negatives, True), + "precision_at_percentiles": ( + partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), + False, + ), + "recall_at_percentiles": ( + partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), + False, + ), + "false_positives_at_percentiles": ( + partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "false_negatives_at_percentiles": ( + partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "true_positives_at_percentiles": ( + partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "true_negatives_at_percentiles": ( + partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), + False, + ), + # tensorflow metrics + "roc_auc": ( + partial(tf.metrics.auc, curve="ROC", summation_method="careful_interpolation"), + False, + ), + "pr_auc": ( + partial(tf.metrics.auc, curve="PR", summation_method="careful_interpolation"), + False, + ), + # tensorboard curves + "pr_curve": (tb.summary.v1.pr_curve_streaming_op, False), + # deprecated metrics + "deprecated_nrce": (partial(rce, normalize=True, deprecated_rce=True), False), + "deprecated_arce": ( + partial(rce, normalize=True, arce=True, deprecated_rce=True), + False, + ), + "deprecated_arce_original": ( + partial(rce, normalize=True, arce=True, up_weight=False, deprecated_rce=True), + False, + ), } # default metrics provided by get_binary_class_metric_fn -DEFAULT_BINARY_CLASS_METRICS = ['total_weight', 'num_samples', 'rce', 'rce_std_err', - 'nrce', 'arce', 'ctr', 'predicted_ctr', 'pred_std_dev', - 'accuracy', 'precision', 'recall', 'roc_auc', 'pr_auc'] - - -def get_binary_class_metric_fn(metrics=None): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for binary classification. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) 
- - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - `arce `_ (a more recent proposed improvment over NRCE) - - arce_original - - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - deprecated_arce (ARCE as it was calculated before a stability fix) - - deprecated_nrce (NRCE as it was calculated before a stability fix) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When predicting rare events roc_auc can be underestimated. Increasing num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - """ - # pylint: disable=dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): +DEFAULT_BINARY_CLASS_METRICS = [ + "total_weight", + "num_samples", + "rce", + "rce_std_err", + "nrce", + "arce", + "ctr", + "predicted_ctr", + "pred_std_dev", + "accuracy", + "precision", + "recall", + "roc_auc", + "pr_auc", +] + + +def get_binary_class_metric_fn(metrics: Optional[List[str]] = None) -> Callable: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + Returns a function having signature: + + .. code-block:: python + + def get_eval_metric_ops(graph_output, labels, weights): + ... + return eval_metric_ops + + where the returned eval_metric_ops is a dict of common evaluation metric + Ops for binary classification. See `tf.estimator.EstimatorSpec + `_ + for a description of eval_metric_ops. The graph_output is a the result + dict returned by build_graph. Labels and weights are tf.Tensors. + + The following graph_output keys are recognized: + output: + the raw predictions between 0 and 1. Required. + threshold: + A value between 0 and 1 used to threshold the output into a hard_output. + Defaults to 0.5 when threshold and hard_output are missing. + Either threshold or hard_output can be provided, but not both. + hard_output: + A thresholded output. Either threshold or hard_output can be provided, but not both. + + Args: + metrics (list of String): + a list of metrics of interest. E.g. 
['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from the following supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
+
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        Supported metrics:
+
+        - ctr (same as positive sample ratio.)
+        - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+        - nrce (normalized rce, do not use this one if you do not understand what it is)
+        - `arce `_ (a more recently proposed improvement over NRCE)
+        - arce_original
+        - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion)
+        - pr_auc
+        - roc_auc
+        - accuracy (percentage of predictions that are correct)
+        - precision (true positives) / (true positives + false positives)
+        - recall (true positives) / (true positives + false negatives)
+        - pr_curve (precision-recall curve)
+        - deprecated_arce (ARCE as it was calculated before a stability fix)
+        - deprecated_nrce (NRCE as it was calculated before a stability fix)
+
+        Example of metrics list with mixture of string and tuple:
+        metrics = [
+          'rce','nrce',
+          'roc_auc',  # default roc_auc metric
+          (
+            'roc_auc_500',  # give this metric a name
+            partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
+            False,  # whether the metric requires thresholded output
+          )]
+
+        NOTE: When predicting rare events, roc_auc can be underestimated. Increasing num_thresholds
+        can reduce the underestimation. See go/roc-auc-pitfall for more details.
+
+        NOTE: accuracy / precision / recall apply to binary classification problems only.
+        I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+        is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+        precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+        your predictions into hard 0/1 labels.
+
+        When metrics is None (the default), it defaults to:
+        [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
    """
-
-    eval_metric_ops = OrderedDict()
-
-    preds = graph_output['output']
-
-    threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
-
-    hard_preds = graph_output.get('hard_output')
-    if hard_preds is None:
-      hard_preds = tf.greater_equal(preds, threshold)
-
-    # add metrics to eval_metric_ops dict
-    for metric in metrics:
-      if isinstance(metric, tuple) and len(metric) == 3:
-        metric_name, metric_factory, requires_threshold = metric
-        metric_name = metric_name.lower()
-      elif isinstance(metric, str):
-        metric_name = metric.lower()  # metric names are case-insensitive.
-        metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
-      else:
-        raise ValueError("Metric should be either string or tuple of length 3.")
-
-      if metric_name in eval_metric_ops:
-        # avoid adding duplicate metrics.
-        continue
-
-      if metric_factory:
-        value_op, update_op = metric_factory(
-          labels=labels,
-          predictions=(hard_preds if requires_threshold else preds),
-          weights=weights, name=metric_name)
-        eval_metric_ops[metric_name] = (value_op, update_op)
-      else:
-        raise ValueError('Cannot find the metric named ' + metric_name)
-
-    return eval_metric_ops
-
-  return get_eval_metric_ops
-
-
-def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1):
-  """
-  Returns a function having signature:
-
-  .. code-block:: python
-
-    def get_eval_metric_ops(graph_output, labels, weights):
-      ...
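A hedged usage sketch mirroring the docstring's mixed string/tuple example; `roc_auc_500` is just a display name chosen for illustration:

from functools import partial
import tensorflow.compat.v1 as tf

metric_fn = get_binary_class_metric_fn(metrics=[
    "rce",
    "ctr",
    (
        "roc_auc_500",  # display name for tensorboard
        partial(tf.metrics.auc, curve="ROC",
                summation_method="careful_interpolation", num_thresholds=500),
        False,  # consumes soft predictions, not thresholded output
    ),
])
# metric_fn(graph_output, labels, weights) returns {name: (value_op, update_op)}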
- return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - In multiple binary classification problems, the - ``predictions`` (that is, ``graph_output['output']``) - are expected to have shape ``batch_size x n_classes``, - where ``n_classes`` is the number of binary classification. - Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1) - and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output`` - with binary values (0 or 1). The weights can be of size ``batch_size`` or - ``batch_size x n_classes``. The ``class_dim`` contain separate probabilities, - and need to have separate metrics. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of Metrics): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When prediction on rare events, roc_auc can be underestimated. Increase num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. 
- If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): + # pylint: disable=dict-keys-not-iterating + if metrics is None: + # remove expensive metrics by default for faster eval + metrics = list(DEFAULT_BINARY_CLASS_METRICS) + + def get_eval_metric_ops( + graph_output: Dict[str, tf.Tensor], + labels: tf.Tensor, + weights: tf.Tensor, + ) -> Dict[str, tf.Tensor]: + """ + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples.. + """ + + eval_metric_ops = dict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + if hard_preds is None: + hard_preds = tf.greater_equal(preds, threshold) + + # add metrics to eval_metric_ops dict + for metric in metrics: + if isinstance(metric, tuple) and len(metric) == 3: + metric_name, metric_factory, requires_threshold = metric + metric_name = metric_name.lower() + elif isinstance(metric, str): + metric_name = metric.lower() # metric name are case insensitive. + metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get( + metric_name + ) + else: + raise ValueError("Metric should be either string or tuple of length 3.") + + if metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + if metric_factory: + value_op, update_op = metric_factory( + labels=labels, + predictions=(hard_preds if requires_threshold else preds), + weights=weights, + name=metric_name, + ) + eval_metric_ops[metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + return eval_metric_ops + + return get_eval_metric_ops + + +def get_multi_binary_class_metric_fn( + metrics: List[str], + classes: Optional[List[str]] = None, + class_dim: int = 1, +) -> Callable: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + Returns a function having signature: + + .. code-block:: python + + def get_eval_metric_ops(graph_output, labels, weights): + ... + return eval_metric_ops + + where the returned eval_metric_ops is a dict of common evaluation metric + Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec + `_ + for a description of eval_metric_ops. The graph_output is a the result + dict returned by build_graph. Labels and weights are tf.Tensors. + + In multiple binary classification problems, the + ``predictions`` (that is, ``graph_output['output']``) + are expected to have shape ``batch_size x n_classes``, + where ``n_classes`` is the number of binary classification. + Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1) + and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output`` + with binary values (0 or 1). The weights can be of size ``batch_size`` or + ``batch_size x n_classes``. 
The entries along ``class_dim`` contain separate probabilities,
+    and each needs its own metrics.
+
+    The following graph_output keys are recognized:
+      output:
+        the raw predictions between 0 and 1. Required.
+      threshold:
+        A value between 0 and 1 used to threshold the output into a hard_output.
+        Defaults to 0.5 when threshold and hard_output are missing.
+        Either threshold or hard_output can be provided, but not both.
+      hard_output:
+        A thresholded output. Either threshold or hard_output can be provided, but not both.
+
+    Args:
+      metrics (list of Metrics):
+        a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from the following supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
+
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        Supported metrics:
+
+        - ctr (same as positive sample ratio.)
+        - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+        - nrce (normalized rce, do not use this one if you do not understand what it is)
+        - pr_auc
+        - roc_auc
+        - accuracy (percentage of predictions that are correct)
+        - precision (true positives) / (true positives + false positives)
+        - recall (true positives) / (true positives + false negatives)
+        - pr_curve (precision-recall curve)
+
+        Example of metrics list with mixture of string and tuple:
+        metrics = [
+          'rce','nrce',
+          'roc_auc',  # default roc_auc metric
+          (
+            'roc_auc_500',  # give this metric a name
+            partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
+            False,  # whether the metric requires thresholded output
+          )]
+
+        NOTE: When predicting rare events, roc_auc can be underestimated. Increasing num_thresholds
+        can reduce the underestimation. See go/roc-auc-pitfall for more details.
+
+        NOTE: accuracy / precision / recall apply to binary classification problems only.
+        I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+        is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+        precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+        your predictions into hard 0/1 labels.
+
+        When metrics is None (the default), it defaults to:
+        [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
+
+      classes (list of strings):
+        In case of multiple binary class models, the names for each class or label.
+        These are used to display metrics on tensorboard.
+        If these are not specified, the index in the class or label dimension is used, and you'll
+        get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
+
+      class_dim (number):
+        Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
    """
-
-    eval_metric_ops = OrderedDict()
-
-    preds = graph_output['output']
-
-    threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
-
-    hard_preds = graph_output.get('hard_output')
-    if hard_preds is None:
-      hard_preds = tf.greater_equal(preds, threshold)
-
-    shape = labels.get_shape()
-    # basic sanity check: multi_metric dimension must exist
-    assert len(shape) > class_dim, "Dimension specified by class_dim does not exist."
-
-    num_labels = shape[class_dim]
-    # If we are doing multi-class / multi-label metric, the number of classes / labels must
-    # be known at graph construction time.
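As a hedged illustration of the slicing the per-class loop below performs, `tf.gather` with a one-element index list keeps the class axis while selecting class i (TF1 graph mode assumed; the shapes are illustrative):

import tensorflow.compat.v1 as tf

labels = tf.placeholder(tf.float32, shape=[None, 3])   # batch_size x n_classes
class_labels = tf.gather(labels, indices=[1], axis=1)  # shape: [batch_size, 1]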
- assert num_labels is not None, "The multi-metric dimension cannot be None." - assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric in metrics: - if isinstance(metric, tuple) and len(metric) == 3: - metric_name, metric_factory, requires_threshold = metric - metric_name = metric_name.lower() - elif isinstance(metric, str): - metric_name = metric.lower() # metric name are case insensitive. - metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + # pylint: disable=invalid-name,dict-keys-not-iterating + if metrics is None: + # remove expensive metrics by default for faster eval + metrics = list(DEFAULT_BINARY_CLASS_METRICS) + + def get_eval_metric_ops( + graph_output: Dict[str, tf.Tensor], + labels: tf.Tensor, + weights: tf.Tensor, + ) -> Dict: + """ + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples.. + """ + + eval_metric_ops = dict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + if hard_preds is None: + hard_preds = tf.greater_equal(preds, threshold) + + shape = labels.get_shape() + # basic sanity check: multi_metric dimension must exist + assert ( + len(shape) > class_dim + ), "Dimension specified by class_dim does not exist." + + num_labels = shape[class_dim] + # If we are doing multi-class / multi-label metric, the number of classes / labels must + # be know at graph construction time. This dimension cannot have size None. + assert num_labels is not None, "The multi-metric dimension cannot be None." + assert ( + classes is None or len(classes) == num_labels + ), "Number of classes must match the number of labels" + + weights_shape = weights.get_shape() if weights is not None else None + if weights_shape is None: + num_weights = None + elif len(weights_shape) > 1: + num_weights = weights_shape[class_dim] else: - raise ValueError("Metric should be either string or tuple of length 3.") - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. 
- continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - + num_weights = 1 + + for i in range(num_labels): + # add metrics to eval_metric_ops dict + for metric in metrics: + if isinstance(metric, tuple) and len(metric) == 3: + metric_name, metric_factory, requires_threshold = metric + metric_name = metric_name.lower() + elif isinstance(metric, str): + metric_name = metric.lower() # metric name are case insensitive. + ( + metric_factory, + requires_threshold, + ) = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + else: + raise ValueError( + "Metric should be either string or tuple of length 3." + ) + + class_metric_name = ( + metric_name + "_" + (classes[i] if classes is not None else str(i)) + ) + + if class_metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + class_labels = tf.gather(labels, indices=[i], axis=class_dim) + class_preds = tf.gather(preds, indices=[i], axis=class_dim) + class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) + + if num_weights is None: + class_weights = None + elif num_weights == num_labels: + class_weights = tf.gather(weights, indices=[i], axis=class_dim) + elif num_weights == 1: + class_weights = weights + else: + raise ValueError( + "num_weights (%d) and num_labels (%d) do not match" + % (num_weights, num_labels) + ) + + if metric_factory: + value_op, update_op = metric_factory( + labels=class_labels, + predictions=( + class_hard_preds if requires_threshold else class_preds + ), + weights=class_weights, + name=class_metric_name, + ) + eval_metric_ops[class_metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + return eval_metric_ops + + return get_eval_metric_ops + + +def _get_uncalibrated_metric_fn( + calibrated_metric_fn: Callable, keep_weight: bool = True +) -> Callable: + """ + Returns a function having signature: + + .. code-block:: python + + def get_eval_metric_ops(graph_output, labels, weights): + ... + return eval_metric_ops + + where the returned eval_metric_ops is a dict of common evaluation metric + Ops with uncalibrated output. + + The following graph_output keys are recognized: + uncalibrated_output: + the uncalibrated raw predictions between 0 and 1. Required. + output: + the calibrated predictions between 0 and 1. + threshold: + A value between 0 and 1 used to threshold the output into a hard_output. + Defaults to 0.5 when threshold and hard_output are missing. + Either threshold or hard_output can be provided, but not both. + hard_output: + A thresholded output. Either threshold or hard_output can be provided, but not both. 
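A hedged usage sketch for the multi-class variant; the class names are hypothetical and only chosen to show how they appear in the resulting op names:

metric_fn = get_multi_binary_class_metric_fn(
    metrics=["rce", "roc_auc"],
    classes=["fav", "reply", "retweet"],
)
# Yields per-class ops such as rce_fav and roc_auc_reply, matching
# predictions of shape batch_size x 3.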
+
+    Args:
+      calibrated_metric_fn: metrics function with calibration and weight.
+      keep_weight: Bool indicating whether we keep weight.
+    """
+    metric_scope = "uncalibrated" if keep_weight else "unweighted"
-def _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=True):
-  """
-  Returns a function having signature:
+    def get_eval_metric_ops(graph_output, labels, weights):
+        """
+        graph_output:
+          dict that is returned by build_graph given input features.
+        labels:
+          target labels associated to batch.
+        weights:
+          weights of the samples.
+        """
+        with tf.variable_scope(metric_scope):
+            if "uncalibrated_output" not in graph_output:
+                raise Exception("Missing uncalibrated_output in graph_output!")
+            un_calibrated_weights = weights if keep_weight else tf.ones_like(weights)
+            uncalibrated_output = {
+                "output": graph_output["uncalibrated_output"],
+                "threshold": graph_output.get("threshold", 0.5),
+                "hard_output": graph_output.get("hard_output"),
+                **{
+                    k: v
+                    for k, v in graph_output.items()
+                    if k not in ["output", "threshold", "hard_output"]
+                },
+            }
+
+            eval_metrics_ops = calibrated_metric_fn(
+                uncalibrated_output, labels, un_calibrated_weights
+            )
+
+            renamed_metrics_ops = {
+                f"{metric_scope}_{k}": v for k, v in eval_metrics_ops.items()
+            }
+            return renamed_metrics_ops
+
+    return get_eval_metric_ops
-
-  .. code-block:: python
-
-    def get_eval_metric_ops(graph_output, labels, weights):
-      ...
-      return eval_metric_ops
-
-  where the returned eval_metric_ops is a dict of common evaluation metric
-  Ops with uncalibrated output.
-
-  The following graph_output keys are recognized:
-    uncalibrated_output:
-      the uncalibrated raw predictions between 0 and 1. Required.
-    output:
-      the calibrated predictions between 0 and 1.
-    threshold:
-      A value between 0 and 1 used to threshold the output into a hard_output.
-      Defaults to 0.5 when threshold and hard_output are missing.
-      Either threshold or hard_output can be provided, but not both.
-    hard_output:
-      A thresholded output. Either threshold or hard_output can be provided, but not both.
-
-  Args:
-    calibrated_metric_fn: metrics function with calibration and weight.
-    keep_weight: Bool indicating whether we keep weight.
-  """
-  metric_scope = 'uncalibrated' if keep_weight else 'unweighted'
-
-  def get_eval_metric_ops(graph_output, labels, weights):
+def get_multi_binary_class_uncalibrated_metric_fn(
+    metrics: List[Union[str, Tuple[str, Callable, bool]]],
+    classes: Optional[List[str]] = None,
+    class_dim: int = 1,
+    keep_weight: bool = True,
+) -> Callable:
    """
-    graph_output:
-      dict that is returned by build_graph given input features.
-    labels:
-      target labels associated to batch.
-    weights:
-      weights of the samples.
+    Returns a function having signature:
+
+    .. code-block:: python
+
+      def get_eval_metric_ops(graph_output, labels, weights):
+        ...
+        return eval_metric_ops
+
+    where the returned eval_metric_ops is a dict of common evaluation metric
+    Ops for concatenated binary classifications without calibration.
+
+    Note: 'uncalibrated_output' is a required key in graph_output.
+
+    The main use cases for this function are:
+
+    1) To calculate roc-auc for rare events.
+    Calibrated prediction scores for rare events will be concentrated near zero. As a result,
+    the roc-auc can be seriously underestimated with the current implementation in tf.metrics.auc.
+    Since roc-auc is invariant under calibration, we can directly use the uncalibrated score for roc-auc.
+    For more details, please refer to: go/roc-auc-invariance.
+
+    2) To set keep_weight=False and get unweighted and uncalibrated metrics.
+    This is useful for evaluating how well the model fits its actual training data, since
+    the model is often trained without weights.
+
+    Args:
+      metrics (list of String):
+        a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        When metrics is None (the default), it defaults to:
+        [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
+      classes (list of strings):
+        In case of multiple binary class models, the names for each class or label.
+        These are used to display metrics on tensorboard.
+        If these are not specified, the index in the class or label dimension is used, and you'll
+        get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
+      class_dim (number):
+        Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
+      keep_weight (bool):
+        Whether to keep weights for the metric.
+    """
-    with tf.variable_scope(metric_scope):
-      if 'uncalibrated_output' not in graph_output:
-        raise Exception("Missing uncalibrated_output in graph_output!")
-      un_calibrated_weights = weights if keep_weight else tf.ones_like(weights)
-      uncalibrated_output = {
-        'output': graph_output['uncalibrated_output'],
-        'threshold': graph_output.get('threshold', 0.5),
-        'hard_output': graph_output.get('hard_output'),
-        **{k: v for k, v in graph_output.items() if k not in ['output', 'threshold', 'hard_output']}
-      }
+    calibrated_metric_fn = get_multi_binary_class_metric_fn(
+        metrics, classes=classes, class_dim=class_dim
+    )
+    return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight)
-      eval_metrics_ops = calibrated_metric_fn(uncalibrated_output, labels, un_calibrated_weights)
-
-      renamed_metrics_ops = {f'{metric_scope}_{k}': v for k, v in eval_metrics_ops.items()}
-      return renamed_metrics_ops
-
-  return get_eval_metric_ops
+def combine_metric_fns(*fn_list) -> Callable:
+    """
+    Combine multiple metric functions.
+    For example, we can combine metric functions generated by
+    get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn.
+    Args:
+      *fn_list: Multiple metric functions to be combined
-def get_multi_binary_class_uncalibrated_metric_fn(
-    metrics, classes=None, class_dim=1, keep_weight=True):
-  """
-  Returns a function having signature:
+    Returns:
+      Combined metric function.
+    """
-
-  .. code-block:: python
+    def combined_metric_ops(*args, **kwargs) -> dict:
+        eval_metric_ops = dict()
+        for fn in fn_list:
+            eval_metric_ops.update(fn(*args, **kwargs))
+        return eval_metric_ops
-
-    def get_eval_metric_ops(graph_output, labels, weights):
-      ...
-      return eval_metric_ops
-
-  where the returned eval_metric_ops is a dict of common evaluation metric
-  Ops for concatenated binary classifications without calibration.
-
-  Note: 'uncalibrated_output' is a required key in graph_output.
-
-  The main use cases for this function are:
-
-  1) To calculate roc-auc for rare events.
-  Calibrated prediction scores for rare events will be concentrated near zero. As a result,
-  the roc-auc can be seriously underestimated with the current implementation in tf.metrics.auc.
-  Since roc-auc is invariant under calibration, we can directly use the uncalibrated score for roc-auc.
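A hedged sketch combining calibrated and uncalibrated metric functions via combine_metric_fns; the class names are hypothetical:

metric_fn = combine_metric_fns(
    get_multi_binary_class_metric_fn(
        metrics=["rce", "roc_auc"], classes=["fav", "reply"]),
    get_multi_binary_class_uncalibrated_metric_fn(
        metrics=["roc_auc"], classes=["fav", "reply"], keep_weight=False),
)
# The uncalibrated, unweighted ops come back prefixed, e.g. unweighted_roc_auc_fav.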
- For more details, please refer to: go/roc-auc-invariance. - - 2) To set keep_weight=False and get unweighted and uncalibrated metrics. - This is useful to eval how the model is fitted to its actual training data, since - often time the model is trained without weight. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. - If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - - keep_weight (bool): - Whether to keep weights for the metric. - """ - - calibrated_metric_fn = get_multi_binary_class_metric_fn( - metrics, classes=classes, class_dim=class_dim) - return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight) - - -def combine_metric_fns(*fn_list): - """ - Combine multiple metric functions. - For example, we can combine metrics function generated by - get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn. - - Args: - *fn_list: Multiple metric functions to be combined - - Returns: - Combined metric function. - """ - def combined_metric_ops(*args, **kwargs): - eval_metric_ops = OrderedDict() - for fn in fn_list: - eval_metric_ops.update(fn(*args, **kwargs)) - return eval_metric_ops - return combined_metric_ops + return combined_metric_ops diff --git a/twml/twml/optimizers/__init__.py b/twml/twml/optimizers/__init__.py index eaa29883c..e96cadfdf 100644 --- a/twml/twml/optimizers/__init__.py +++ b/twml/twml/optimizers/__init__.py @@ -1,4 +1,2 @@ -from twitter.deepbird.compat.v1.optimizers import ( - LazyAdamOptimizer, - optimize_loss, - OPTIMIZER_SUMMARIES) # noqa: F401 +from twitter.deepbird.compat.v1.optimizers import OPTIMIZER_SUMMARIES # noqa: F401 +from twitter.deepbird.compat.v1.optimizers import LazyAdamOptimizer, optimize_loss diff --git a/twml/twml/parsers.py b/twml/twml/parsers.py index eac60083a..d0cb09011 100644 --- a/twml/twml/parsers.py +++ b/twml/twml/parsers.py @@ -1,20 +1,20 @@ -''' +""" Contains implementations of functions to parse training and evaluation data. Modelers can use the functions in this module as the the train/eval_parse_fn of the DataRecordTrainer constructor to customize how to parse their datasets. Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. 
-''' +""" -from twitter.deepbird.io.legacy.parsers import ( - convert_to_supervised_input_receiver_fn, # noqa: F401 - get_continuous_parse_fn, # noqa: F401 - get_default_parse_fn, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_labels_in_features_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_sparse_parse_fn, # noqa: F401 - get_sparse_serving_input_receiver_fn, # noqa: F401 - get_tensor_parse_fn, # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_continuous_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_default_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_features_as_tensor_dict # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_sparse_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_tensor_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import ( # noqa: F401 + convert_to_supervised_input_receiver_fn, + get_labels_in_features_parse_fn, + get_serving_input_receiver_fn_feature_dict, + get_sparse_serving_input_receiver_fn, ) diff --git a/twml/twml/readers/__init__.py b/twml/twml/readers/__init__.py index 06a6d79f5..2578ab0da 100644 --- a/twml/twml/readers/__init__.py +++ b/twml/twml/readers/__init__.py @@ -2,6 +2,7 @@ """ This module contains data readers """ from .batch_prediction_request import BatchPredictionRequest # noqa: F401 -from .data_record import DataRecord, SPARSE_DATA_RECORD_FEATURE_FIELDS # noqa: F401 +from .data_record import SPARSE_DATA_RECORD_FEATURE_FIELDS # noqa: F401 +from .data_record import DataRecord from .hashed_batch_prediction_request import HashedBatchPredictionRequest # noqa: F401 -from .hashed_data_record import HashedDataRecord # noqa: F401 \ No newline at end of file +from .hashed_data_record import HashedDataRecord # noqa: F401 diff --git a/twml/twml/readers/batch_prediction_request.py b/twml/twml/readers/batch_prediction_request.py index 512a8c514..f0c233d35 100644 --- a/twml/twml/readers/batch_prediction_request.py +++ b/twml/twml/readers/batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) + BatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/readers/data_record.py b/twml/twml/readers/data_record.py index d1c377afd..d5d773aa0 100644 --- a/twml/twml/readers/data_record.py +++ b/twml/twml/readers/data_record.py @@ -3,13 +3,13 @@ This module includes facilities for manipulating data records. 
""" -from twitter.deepbird.io.legacy.readers.data_record import ( - _SPEC_TO_TF, # noqa: F401 - SPARSE_DATA_RECORD_FEATURE_FIELDS, # noqa: F401 - _FeaturesBase, # noqa: F401 - _Features, # noqa: F401 - _DiscreteFeatures, # noqa: F401 - _StringFeatures, # noqa: F401 - _BaseDataRecord, # noqa: F401 - DataRecord, # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _SPEC_TO_TF # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import DataRecord # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _BaseDataRecord # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _Features # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _FeaturesBase # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _StringFeatures # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import ( # noqa: F401 + SPARSE_DATA_RECORD_FEATURE_FIELDS, + _DiscreteFeatures, ) diff --git a/twml/twml/readers/hashed_batch_prediction_request.py b/twml/twml/readers/hashed_batch_prediction_request.py index 5850c4497..213dee734 100644 --- a/twml/twml/readers/hashed_batch_prediction_request.py +++ b/twml/twml/readers/hashed_batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) + HashedBatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/readers/hashed_data_record.py b/twml/twml/readers/hashed_data_record.py index 1ff9ce816..9f8c5bd8f 100644 --- a/twml/twml/readers/hashed_data_record.py +++ b/twml/twml/readers/hashed_data_record.py @@ -5,8 +5,8 @@ """ from twitter.deepbird.io.legacy.readers.hashed_data_record import ( - _HASHED_FIELDS, - _FEATURE_NAMES, - _FEATURE_TYPES, - HashedDataRecord, + _FEATURE_NAMES, + _FEATURE_TYPES, + _HASHED_FIELDS, + HashedDataRecord, ) diff --git a/twml/twml/saved_model_cli/__main__.py b/twml/twml/saved_model_cli/__main__.py index ad5326431..96d4409e0 100644 --- a/twml/twml/saved_model_cli/__main__.py +++ b/twml/twml/saved_model_cli/__main__.py @@ -5,5 +5,5 @@ from tensorflow.python.tools import saved_model_cli -if __name__ == '__main__': - sys.exit(saved_model_cli.main()) +if __name__ == "__main__": + sys.exit(saved_model_cli.main()) diff --git a/twml/twml/tensorboard/__main__.py b/twml/twml/tensorboard/__main__.py index c426060d1..75557b5f0 100644 --- a/twml/twml/tensorboard/__main__.py +++ b/twml/twml/tensorboard/__main__.py @@ -7,10 +7,9 @@ from tensorboard.main import run_main - -if __name__ == '__main__': - # Tensorboard relies on werkzeug for its HTTP server which logs at info level - # by default - logging.getLogger('werkzeug').setLevel(logging.WARNING) - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(run_main()) +if __name__ == "__main__": + # Tensorboard relies on werkzeug for its HTTP server which logs at info level + # by default + logging.getLogger("werkzeug").setLevel(logging.WARNING) + sys.argv[0] = re.sub(r"(-script\.pyw?|\.exe)?$", "", sys.argv[0]) + sys.exit(run_main()) diff --git a/twml/twml/tensorio.py b/twml/twml/tensorio.py index bc551ac56..9802d028d 100644 --- a/twml/twml/tensorio.py +++ b/twml/twml/tensorio.py @@ -4,11 +4,11 @@ # too-few-public-methods import os +from typing import List import numpy as np import yaml - """ Utility to load tensors serialized by Deepbird V1. @@ -19,143 +19,153 @@ # helper class used to assist hierarchical key access by remembering intermediate keys. 
class _KeyRecorder(object): - def __init__(self, tensorio, keys=[]): - self.tensorio = tensorio - self.keys = keys + def __init__( + self, + tensorio: "TensorIO", + keys: List[str] = [], + ): + self.tensorio = tensorio + self.keys = keys - def __getitem__(self, k): - new_keys = self.keys + [str(k)] - prefix = ".".join(new_keys) + def __getitem__(self, k: str): + new_keys = self.keys + [str(k)] + prefix = ".".join(new_keys) - key_list = self.tensorio.list_tensors() + key_list = self.tensorio.list_tensors() - # if we have a complete key, load the tensor. - if prefix in key_list: - return self.tensorio._load(prefix) + # if we have a complete key, load the tensor. + if prefix in key_list: + return self.tensorio._load(prefix) - # we don't have a complete key yet, but at least one tensor should start with this prefix. - for k_value in key_list: - if k_value.startswith(prefix): - return _KeyRecorder(self.tensorio, new_keys) + # we don't have a complete key yet, but at least one tensor should start with this prefix. + for k_value in key_list: + if k_value.startswith(prefix): + return _KeyRecorder(self.tensorio, new_keys) - # if no key starts with the prefix, this _key_recorder is not valid. - raise ValueError("Key not found: " + prefix) + # if no key starts with the prefix, this _key_recorder is not valid. + raise ValueError("Key not found: " + prefix) # convert tensorio tensor type to numpy data type. # also returns element size in bytes. -def _get_data_type(data_type): - if data_type == 'Double': - return (np.float64, 8) +def _get_data_type(data_type: str): + if data_type == "Double": + return (np.float64, 8) - if data_type == 'Float': - return (np.float32, 4) + if data_type == "Float": + return (np.float32, 4) - if data_type == 'Int': - return (np.int32, 4) + if data_type == "Int": + return (np.int32, 4) - if data_type == 'Long': - return (np.int64, 8) + if data_type == "Long": + return (np.int64, 8) - if data_type == 'Byte': - return (np.int8, 1) + if data_type == "Byte": + return (np.int8, 1) - raise ValueError('Unexpected tensorio data type: ' + data_type) + raise ValueError("Unexpected tensorio data type: " + data_type) class TensorIO(object): - """ - Construct a TensorIO class. - tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. - mmap_tensor: - By default, loaded tensors use mmap storage. - Set this to false to not use mmap. Useful when loading multiple tensors. - """ - - def __init__(self, tensorio_path, mmap_tensor=True): - self._tensorio_path = tensorio_path - self._mmap_tensor = mmap_tensor - - # Make sure we can locate spec.yaml. - yaml_file = os.path.join(tensorio_path, 'spec.yaml') - if not os.path.exists(yaml_file): - raise ValueError('Invalid tensorio path: no spec.yaml found.') - - # load spec.yaml. - with open(yaml_file, 'r') as file_open: - # Note that tensor names in the yaml are like this: \"weight\".\'1\' - # For user-friendliness, we remove the quotes. - _spec = yaml.safe_load(file_open) - self._spec = {k.replace("'", '').replace('"', ''): v for (k, v) in _spec.items()} - - def list_tensors(self): - """ - Returns a list of tensors saved in the given path. - """ - return self._spec.keys() - - def _load_tensor(self, name): """ - Load Tensor with the given name. - Raise value error if the named tensor is not found. - Returns a numpy array if the named tensor is found. + Construct a TensorIO class. + tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. 
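A hedged usage sketch of the hierarchical access that _KeyRecorder enables; the directory path is hypothetical and must contain a spec.yaml written by the Deepbird V1 tensorio serializer:

tio = TensorIO("/path/to/tensorio_dir")
print(list(tio.list_tensors()))  # e.g. ["weight.1", "bias.1"]
w = tio["weight"]["1"]   # hierarchical access resolved via _KeyRecorder
w2 = tio["weight.1"]     # equivalent direct lookup of the full key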
+ mmap_tensor: + By default, loaded tensors use mmap storage. + Set this to false to not use mmap. Useful when loading multiple tensors. """ - tensor_info = self._spec[name] - if tensor_info['type'] != 'tensor': - raise ValueError('Trying to load a tensor of unknown type: ' + tensor_info['type']) - - filename = os.path.join(self._tensorio_path, tensor_info['filename']) - (data_type, element_size) = _get_data_type(tensor_info['tensorType']) - - np_array = np.memmap( - filename, - dtype=data_type, - mode='r', - # -1 because lua offset is 1 based. - offset=(tensor_info['offset'] - 1) * element_size, - shape=tuple(tensor_info['size']), - order='C', - ) - - return np_array if self._mmap_tensor else np_array[:].copy() - - def _load_nontensor_data(self, name): - """ - Load non-tensor data with the given name. - Returns a python string. - """ - tensor_info = self._spec[name] - return tensor_info['data'] - def _load(self, name): - """ - Load data serialized under the given name, it could be a tensor or regular data. - """ - if name not in self._spec: - raise ValueError('The specified key {} is not found in {}'.format(name, self._tensorio_path)) - - data_type = self._spec[name]['type'] - if data_type == 'tensor': - return self._load_tensor(name) - else: - return self._load_nontensor_data(name) - - def load_all(self): - """ - Load all tensors stored in the tensorio directory. - Returns a dictionary from tensor name to numpy arrays. - """ - return {k: self._load(k) for k in self._spec} - - ########################################### - # The below are utilities for convenience # - ########################################### - def __getitem__(self, k): - """ - Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] - """ - if k in self._spec: - # We have a full tensor name, directly load it. - return self._load_tensor(k) - else: - return _KeyRecorder(self)[k] + def __init__(self, tensorio_path, mmap_tensor=True): + self._tensorio_path = tensorio_path + self._mmap_tensor = mmap_tensor + + # Make sure we can locate spec.yaml. + yaml_file = os.path.join(tensorio_path, "spec.yaml") + if not os.path.exists(yaml_file): + raise ValueError("Invalid tensorio path: no spec.yaml found.") + + # load spec.yaml. + with open(yaml_file, "r") as file_open: + # Note that tensor names in the yaml are like this: \"weight\".\'1\' + # For user-friendliness, we remove the quotes. + _spec = yaml.safe_load(file_open) + self._spec = { + k.replace("'", "").replace('"', ""): v for (k, v) in _spec.items() + } + + def list_tensors(self) -> List[str]: + """ + Returns a list of tensors saved in the given path. + """ + return self._spec.keys() + + def _load_tensor(self, name: str) -> np.ndarray: + """ + Load Tensor with the given name. + Raise value error if the named tensor is not found. + Returns a numpy array if the named tensor is found. + """ + tensor_info = self._spec[name] + if tensor_info["type"] != "tensor": + raise ValueError( + "Trying to load a tensor of unknown type: " + tensor_info["type"] + ) + + filename = os.path.join(self._tensorio_path, tensor_info["filename"]) + (data_type, element_size) = _get_data_type(tensor_info["tensorType"]) + + np_array = np.memmap( + filename, + dtype=data_type, + mode="r", + # -1 because lua offset is 1 based. 
+ offset=(tensor_info["offset"] - 1) * element_size, + shape=tuple(tensor_info["size"]), + order="C", + ) + + return np_array if self._mmap_tensor else np_array[:].copy() + + def _load_nontensor_data(self, name: str) -> str: + """ + Load non-tensor data with the given name. + Returns a python string. + """ + tensor_info = self._spec[name] + return tensor_info["data"] + + def _load(self, name: str) -> np.ndarray: + """ + Load data serialized under the given name, it could be a tensor or regular data. + """ + if name not in self._spec: + raise ValueError( + f"The specified key {name} is not found in {self._tensorio_path}" + ) + + data_type = self._spec[name]["type"] + if data_type == "tensor": + return self._load_tensor(name) + else: + return self._load_nontensor_data(name) + + def load_all(self): + """ + Load all tensors stored in the tensorio directory. + Returns a dictionary from tensor name to numpy arrays. + """ + return {k: self._load(k) for k in self._spec} + + ########################################### + # The below are utilities for convenience # + ########################################### + def __getitem__(self, k: str) -> np.ndarray: + """ + Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] + """ + if k in self._spec: + # We have a full tensor name, directly load it. + return self._load_tensor(k) + else: + return _KeyRecorder(self)[k] diff --git a/twml/twml/tracking/experiment_tracker.py b/twml/twml/tracking/experiment_tracker.py index 4f275ba4b..12bacd111 100644 --- a/twml/twml/tracking/experiment_tracker.py +++ b/twml/twml/tracking/experiment_tracker.py @@ -1,543 +1,644 @@ """ This module contains the experiment tracker for tracking training in ML Metastore """ -from contextlib import contextmanager -from datetime import datetime import getpass import hashlib import os import re import sys import time +from contextlib import contextmanager +from datetime import datetime +from typing import Any, Callable, Dict, List, Optional, Type, Union -from absl import logging import tensorflow.compat.v1 as tf -from twml.hooks import MetricsUpdateHook +from absl import logging +from twml.hooks import MetricsUpdateHook try: - from urllib import quote as encode_url + from urllib import quote as encode_url except ImportError: - from urllib.parse import quote as encode_url + from urllib.parse import quote as encode_url try: - # ML Metastore packages might not be available on GCP. - # If they are not found, tracking is disabled - import requests - from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient - from com.twitter.mlmetastore.modelrepo.core.path import ( - check_valid_id, get_components_from_id, generate_id) - from com.twitter.mlmetastore.modelrepo.core import ( - DeepbirdRun, Experiment, FeatureConfig, FeatureConfigFeature, Model, ProgressReport, Project, StatusUpdate) + # ML Metastore packages might not be available on GCP. + # If they are not found, tracking is disabled + import requests + from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient + from com.twitter.mlmetastore.modelrepo.core import ( + DeepbirdRun, + Experiment, + FeatureConfig, + FeatureConfigFeature, + Model, + ProgressReport, + Project, + StatusUpdate, + ) + from com.twitter.mlmetastore.modelrepo.core.path import ( + check_valid_id, + generate_id, + get_components_from_id, + ) except ImportError: - ModelRepoClient = None + ModelRepoClient = None class ExperimentTracker(object): - """ - A tracker that records twml runs in ML Metastore. 
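Taken together, `list_tensors`, `_load`, and the `_KeyRecorder` chaining give `TensorIO` a small dict-like surface. A usage sketch (the directory path and tensor names are hypothetical, and the import depends on where this module is packaged):

    import numpy as np

    tio = TensorIO("/path/to/tensorio_dir")        # directory containing spec.yaml
    print(list(tio.list_tensors()))                # e.g. ["weight.1", "bias.1"]

    w = tio["weight.1"]         # full key: returns an np.memmap-backed array
    w_alt = tio["weight"]["1"]  # partial keys chain through _KeyRecorder
    assert isinstance(w, np.ndarray)

    everything = tio.load_all()  # {tensor_name: np.ndarray}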
- """ - - def __init__(self, params, run_config, save_dir): - """ - - Args: - params (python dict): - The trainer params. ExperimentTracker uses `params.experiment_tracking_path` (String) and - `params.disable_experiment_tracking`. - If `experiment_tracking_path` is set to None, the tracker tries to guess a path with - save_dir. - If `disable_experiment_tracking` is True, the tracker is disabled. - run_config (tf.estimator.RunConfig): - The run config used by the estimator. - save_dir (str): - save_dir of the trainer """ - if isinstance(params, dict): - self._params = params - else: - # preserving backward compatibility for people still using HParams - logging.warning("Please stop using HParams and use python dicts. HParams are removed in TF 2") - self._params = dict((k, v) for k, v in params.values().items() if v != 'null') - self._run_config = run_config - self._graceful_shutdown_port = self._params.get('health_port') - - self.tracking_path = self._params.get('experiment_tracking_path') - is_tracking_path_too_long = self.tracking_path is not None and len(self.tracking_path) > 256 - - if is_tracking_path_too_long: - raise ValueError("Experiment Tracking Path longer than 256 characters") - - self.disabled = ( - self._params.get('disable_experiment_tracking', False) or - not self._is_env_eligible_for_tracking() or - ModelRepoClient is None - ) - - self._is_hogwild = bool(os.environ.get('TWML_HOGWILD_PORTS')) + A tracker that records twml runs in ML Metastore. + """ + + def __init__(self, params: dict, run_config: tf.estimator.RunConfig, save_dir: str): + """ + Args: + params (python dict): + The trainer params. ExperimentTracker uses `params.experiment_tracking_path` (String) and + `params.disable_experiment_tracking`. + If `experiment_tracking_path` is set to None, the tracker tries to guess a path with + save_dir. + If `disable_experiment_tracking` is True, the tracker is disabled. + run_config (tf.estimator.RunConfig): + The run config used by the estimator. + save_dir (str): + save_dir of the trainer + """ + if isinstance(params, dict): + self._params = params + else: + # preserving backward compatibility for people still using HParams + logging.warning( + "Please stop using HParams and use python dicts. 
HParams are removed in TF 2" + ) + self._params = dict( + (k, v) for k, v in params.values().items() if v != "null" + ) + self._run_config = run_config + self._graceful_shutdown_port = self._params.get("health_port") + + self.tracking_path = self._params.get("experiment_tracking_path") + is_tracking_path_too_long = ( + self.tracking_path is not None and len(self.tracking_path) > 256 + ) - self._is_distributed = bool(os.environ.get('TF_CONFIG')) + if is_tracking_path_too_long: + raise ValueError("Experiment Tracking Path longer than 256 characters") - self._client = None if self.disabled else ModelRepoClient() + self.disabled = ( + self._params.get("disable_experiment_tracking", False) + or not self._is_env_eligible_for_tracking() + or ModelRepoClient is None + ) - run_name_from_environ = self.run_name_from_environ() - run_name_can_be_inferred = ( - self.tracking_path is not None or run_name_from_environ is not None) + self._is_hogwild = bool(os.environ.get("TWML_HOGWILD_PORTS")) + self._is_distributed = bool(os.environ.get("TF_CONFIG")) + self._client = None if self.disabled else ModelRepoClient() - # Turn the flags off as needed in hogwild / distributed - if self._is_hogwild or self._is_distributed: - self._env_eligible_for_recording_experiment = ( - self._run_config.task_type == "evaluator") - if run_name_can_be_inferred: - self._env_eligible_for_recording_export_metadata = ( - self._run_config.task_type == "chief") - else: - logging.info( - 'experiment_tracking_path is not set and can not be inferred. ' - 'Recording export metadata is disabled because the chief node and eval node ' - 'are setting different experiment tracking paths.') - self._env_eligible_for_recording_export_metadata = False - else: - # Defaults to True - self._env_eligible_for_recording_experiment = True - self._env_eligible_for_recording_export_metadata = True - - if not self.disabled: - # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name - # -> own:proj:exp:Run_Name - if self.tracking_path: - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error(f'Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}') - self.tracking_path = generate_id( - owner=self.path['owner'], - project_name=self.path['project_name'], - experiment_name=self.path['experiment_name'], - run_name=self.path['run_name'] - ) - logging.error(f'Generated sanitized experiment tracking path: {self.tracking_path}') - else: - logging.info( - 'No experiment_tracking_path set. Experiment Tracker will try to guess a path') - self.tracking_path = self.guess_path(save_dir, run_name_from_environ) - logging.info('Guessed path: %s', self.tracking_path) - - # additional check to see if generated path is valid - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error( - 'Could not generate valid experiment tracking path. Disabling tracking. 
' + - 'Error:\n{}'.format(err) + run_name_from_environ = self.run_name_from_environ() + run_name_can_be_inferred = ( + self.tracking_path is not None or run_name_from_environ is not None ) - self.disabled = True - self.project_id = None if self.disabled else '{}:{}'.format( - self.path['owner'], self.path['project_name']) - self.base_run_id = None if self.disabled else self.tracking_path - self._current_run_name_suffix = None + # Turn the flags off as needed in hogwild / distributed + if self._is_hogwild or self._is_distributed: + self._env_eligible_for_recording_experiment = ( + self._run_config.task_type == "evaluator" + ) + if run_name_can_be_inferred: + self._env_eligible_for_recording_export_metadata = ( + self._run_config.task_type == "chief" + ) + else: + logging.info( + "experiment_tracking_path is not set and can not be inferred. " + "Recording export metadata is disabled because the chief node and eval node " + "are setting different experiment tracking paths." + ) + self._env_eligible_for_recording_export_metadata = False + else: + # Defaults to True + self._env_eligible_for_recording_experiment = True + self._env_eligible_for_recording_export_metadata = True + + if not self.disabled: + # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name + # -> own:proj:exp:Run_Name + if self.tracking_path: + try: + check_valid_id(self.tracking_path) + except ValueError as err: + logging.error( + f"Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}" + ) + self.tracking_path = generate_id( + owner=self.path["owner"], + project_name=self.path["project_name"], + experiment_name=self.path["experiment_name"], + run_name=self.path["run_name"], + ) + logging.error( + f"Generated sanitized experiment tracking path: {self.tracking_path}" + ) + else: + logging.info( + "No experiment_tracking_path set. Experiment Tracker will try to guess a path" + ) + self.tracking_path = self.guess_path(save_dir, run_name_from_environ) + logging.info("Guessed path: %s", self.tracking_path) + + # additional check to see if generated path is valid + try: + check_valid_id(self.tracking_path) + except ValueError as err: + logging.error( + "Could not generate valid experiment tracking path. Disabling tracking. " + + f"Error:\n{err}" + ) + self.disabled = True + + self.project_id = ( + None + if self.disabled + else f'{self.path["owner"]}:{self.path["project_name"]}' + ) + self.base_run_id = None if self.disabled else self.tracking_path + self._current_run_name_suffix = None + self._current_tracker_hook = None - self._current_tracker_hook = None + if self.disabled: + logging.info("Experiment Tracker is disabled") + else: + logging.info( + "Experiment Tracker initialized with base run id: %s", self.base_run_id + ) + + @contextmanager + def track_experiment( + self, + eval_hooks: List[tf.estimator.SessionRunHook], + get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec], + name: Optional[str] = None, + ) -> tf.estimator.SessionRunHook: + """ + A context manager for tracking experiment. It should wrap the training loop. + An experiment tracker eval hook is appended to eval_hooks to collect metrics. + + Args: + eval_hooks (list): + The list of eval_hooks to be used. When it's not None, and does not contain any , + MetricsUpdateHook an experiment tracker eval hook is appended to it. When it contains + any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo + tracker (`TrackRun`). 
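In miniature, the eval_hooks contract just described looks like the following sketch (schematic: the estimator, estimator spec, and input_fn wiring are assumed, not taken from this diff):

    tracker = ExperimentTracker(params, run_config, save_dir)
    eval_hooks = []  # contains no MetricsUpdateHook, so the tracker stays enabled

    with tracker.track_experiment(eval_hooks, lambda: estimator_spec) as hook:
        # hook is the MetricsUpdateHook the tracker appended to eval_hooks,
        # or None when tracking is disabled or eval_hooks was None.
        estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)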
+ get_estimator_spec_fn (func): + A function to get the current EstimatorSpec of the trainer, used by the eval hook. + name (str); + Name of this training or evaluation. Used as a suffix of the run_id. + + Returns: + The tracker's eval hook which is appended to eval_hooks. + """ + + # disable this tracker if legacy TrackRun hook is present + # TODO: remove this once we completely deprecate the old TrackRun interface + if eval_hooks is not None: + self.disabled = self.disabled or any( + isinstance(x, MetricsUpdateHook) for x in eval_hooks + ) - if self.disabled: - logging.info('Experiment Tracker is disabled') - else: - logging.info('Experiment Tracker initialized with base run id: %s', self.base_run_id) + logging.info( + "Is environment eligible for recording experiment: %s", + self._env_eligible_for_recording_experiment, + ) - @contextmanager - def track_experiment(self, eval_hooks, get_estimator_spec_fn, name=None): - """ - A context manager for tracking experiment. It should wrap the training loop. - An experiment tracker eval hook is appended to eval_hooks to collect metrics. - - Args: - eval_hooks (list): - The list of eval_hooks to be used. When it's not None, and does not contain any , - MetricsUpdateHook an experiment tracker eval hook is appended to it. When it contains - any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo - tracker (`TrackRun`). - get_estimator_spec_fn (func): - A function to get the current EstimatorSpec of the trainer, used by the eval hook. - name (str); - Name of this training or evaluation. Used as a suffix of the run_id. - - Returns: - The tracker's eval hook which is appended to eval_hooks. - """ + if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port: + requests.post( + f"http://localhost:{self._graceful_shutdown_port}/track_training_start" + ) + + if self.disabled or eval_hooks is None: + yield None + else: + assert ( + self._current_tracker_hook is None + ), "experiment tracking has been started already" + + if name is not None: + self._current_run_name_suffix = "_" + name + + logging.info("Starting experiment tracking. Path: %s", self._current_run_id) + logging.info( + "Is environment eligible for recording export metadata: %s", + self._env_eligible_for_recording_export_metadata, + ) + logging.info( + "This run will be available at: http://go/mldash/experiments/%s", + encode_url(self.experiment_id), + ) + + try: + self._record_run() + self._add_run_status( + StatusUpdate(self._current_run_id, status="RUNNING") + ) + self._register_for_graceful_shutdown() + + self._current_tracker_hook = self.create_eval_hook( + get_estimator_spec_fn + ) + except Exception as err: + logging.error( + "Failed to record run. This experiment will not be tracked. Error: %s", + str(err), + ) + self._current_tracker_hook = None + + if self._current_tracker_hook is None: + yield None + else: + try: + eval_hooks.append(self._current_tracker_hook) + yield self._current_tracker_hook + except Exception as err: + self._add_run_status( + StatusUpdate( + self._current_run_id, status="FAILED", description=str(err) + ) + ) + self._deregister_for_graceful_shutdown() + self._current_tracker_hook = None + self._current_run_name_suffix = None + logging.error("Experiment tracking done. 
Experiment failed.")
+                    raise
+
+            try:
+                if self._current_tracker_hook.metric_values:
+                    self._record_update(self._current_tracker_hook.metric_values)
+                self._add_run_status(
+                    StatusUpdate(self._current_run_id, status="SUCCESS")
+                )
+                logging.info("Experiment tracking done. Experiment succeeded.")
+            except Exception as err:
+                logging.error(
+                    "Failed to mark run as successful. Error: %s", str(err)
+                )
+            finally:
+                self._deregister_for_graceful_shutdown()
+                self._current_tracker_hook = None
+                self._current_run_name_suffix = None
+
+    def create_eval_hook(
+        self, get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec]
+    ) -> tf.estimator.SessionRunHook:
+        """
+        Create an eval_hook to track eval metrics.
+
+        Args:
+            get_estimator_spec_fn (func):
+                A function that returns the current EstimatorSpec of the trainer.
+
+        Returns:
+            The tracker's eval hook.
+        """
+        return MetricsUpdateHook(
+            get_estimator_spec_fn=get_estimator_spec_fn,
+            add_metrics_fn=self._record_update,
+        )

-    # disable this tracker if legacy TrackRun hook is present
-    # TODO: remove this once we completely deprecate the old TrackRun interface
-    if eval_hooks is not None:
-      self.disabled = self.disabled or any(isinstance(x, MetricsUpdateHook) for x in eval_hooks)
-
-    logging.info('Is environment eligible for recording experiment: %s',
-                 self._env_eligible_for_recording_experiment)
-
-    if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port:
-      requests.post('http://localhost:{}/track_training_start'.format(
-          self._graceful_shutdown_port
-      ))
-
-    if self.disabled or eval_hooks is None:
-      yield None
-    else:
-      assert self._current_tracker_hook is None, 'experiment tracking has been started already'
-
-      if name is not None:
-        self._current_run_name_suffix = '_' + name
-
-      logging.info('Starting experiment tracking. Path: %s', self._current_run_id)
-      logging.info('Is environment eligible for recording export metadata: %s',
-                   self._env_eligible_for_recording_export_metadata)
-      logging.info('This run will be available at: http://go/mldash/experiments/%s',
-                   encode_url(self.experiment_id))
-
-      try:
-        self._record_run()
-        self._add_run_status(StatusUpdate(self._current_run_id, status='RUNNING'))
-        self._register_for_graceful_shutdown()
-
-        self._current_tracker_hook = self.create_eval_hook(get_estimator_spec_fn)
-      except Exception as err:
-        logging.error(
-          'Failed to record run. This experiment will not be tracked. Error: %s', str(err))
-        self._current_tracker_hook = None
+    def register_model(self, export_path: str) -> None:
+        """
+        Record the exported model.

-      if self._current_tracker_hook is None:
-        yield None
-      else:
-        try:
-          eval_hooks.append(self._current_tracker_hook)
-          yield self._current_tracker_hook
-        except Exception as err:
-          self._add_run_status(
-            StatusUpdate(self._current_run_id, status='FAILED', description=str(err)))
-          self._deregister_for_graceful_shutdown()
-          self._current_tracker_hook = None
-          self._current_run_name_suffix = None
-          logging.error('Experiment tracking done. Experiment failed.')
-          raise
+        Args:
+            export_path (str):
+                The path to the exported model.
+        """
+        if self.disabled:
+            return None

        try:
-      if self._current_tracker_hook.metric_values:
-        self._record_update(self._current_tracker_hook.metric_values)
-        self._add_run_status(StatusUpdate(self._current_run_id, status='SUCCESS'))
-        logging.info('Experiment tracking done. Experiment succeeded.')
+            logging.info(
+                "Model is exported to %s.
Computing hash of the model.", export_path + ) + model_hash = self.compute_model_hash(export_path) + logging.info("Model hash: %s. Registering it in ML Metastore.", model_hash) + self._client.register_model( + Model(model_hash, self.path["owner"], self.base_run_id) + ) except Exception as err: - logging.error( - 'Failed to update mark run as successful. Error: %s', str(err)) - finally: - self._deregister_for_graceful_shutdown() - self._current_tracker_hook = None - self._current_run_name_suffix = None - - def create_eval_hook(self, get_estimator_spec_fn): - """ - Create an eval_hook to track eval metrics + logging.error("Failed to register model. Error: %s", str(err)) - Args: - get_estimator_spec_fn (func): - A function that returns the current EstimatorSpec of the trainer. - """ - return MetricsUpdateHook( - get_estimator_spec_fn=get_estimator_spec_fn, - add_metrics_fn=self._record_update) + def export_feature_spec(self, feature_spec_dict: Dict[str, Any]) -> None: + """ + Export feature spec to ML Metastore (go/ml-metastore). - def register_model(self, export_path): - """ - Record the exported model. + Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due + to the 1mb upper limit for values in manhattan, and more specific information (feature type, + feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset. - Args: - export_path (str): - The path to the exported model. - """ - if self.disabled: - return None - - try: - logging.info('Model is exported to %s. Computing hash of the model.', export_path) - model_hash = self.compute_model_hash(export_path) - logging.info('Model hash: %s. Registering it in ML Metastore.', model_hash) - self._client.register_model(Model(model_hash, self.path['owner'], self.base_run_id)) - except Exception as err: - logging.error('Failed to register model. Error: %s', str(err)) - - def export_feature_spec(self, feature_spec_dict): - """ - Export feature spec to ML Metastore (go/ml-metastore). + Args: + feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() + """ + if self.disabled or not self._env_eligible_for_recording_export_metadata: + return None - Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due - to the 1mb upper limit for values in manhattan, and more specific information (feature type, - feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset. 
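For reference, the dictionary consumed here has roughly the shape below (inferred from the accessors in the new body that follows; the hash ids and names are invented):

    feature_spec_dict = {
        "features": {
            12345: {"featureName": "user.account_age", "featureType": "CONTINUOUS"},
        },
        "labels": {
            67890: {"featureName": "engagement.is_clicked"},
        },
        "weight": {
            13579: {"featureName": "meta.record_weight", "featureType": "CONTINUOUS"},
        },
    }
    tracker.export_feature_spec(feature_spec_dict)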
+ try: + logging.info("Exporting feature spec to ML Metastore.") + feature_list = feature_spec_dict["features"] + label_list = feature_spec_dict["labels"] + weight_list = feature_spec_dict["weight"] + self._client.add_feature_config( + FeatureConfig( + self._current_run_id, + list(feature_list.keys()), + list(label_list.keys()), + list(weight_list.keys()), + ) + ) + + feature_config_features = [ + FeatureConfigFeature( + hash_id=_feature_hash_id, + feature_name=_feature["featureName"], + feature_type=_feature["featureType"], + ) + for _feature_hash_id, _feature in zip( + feature_list.keys(), feature_list.values() + ) + ] + self._client.add_feature_config_features( + list(feature_list.keys()), feature_config_features + ) + + feature_config_labels = [ + FeatureConfigFeature( + hash_id=_label_hash_id, feature_name=_label["featureName"] + ) + for _label_hash_id, _label in zip( + label_list.keys(), label_list.values() + ) + ] + self._client.add_feature_config_features( + list(label_list.keys()), feature_config_labels + ) + + feature_config_weights = [ + FeatureConfigFeature( + hash_id=_weight_hash_id, + feature_name=_weight["featureName"], + feature_type=_weight["featureType"], + ) + for _weight_hash_id, _weight in zip( + weight_list.keys(), weight_list.values() + ) + ] + self._client.add_feature_config_features( + list(weight_list.keys()), feature_config_weights + ) - Args: - feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() - """ - if self.disabled or not self._env_eligible_for_recording_export_metadata: - return None - - try: - logging.info('Exporting feature spec to ML Metastore.') - feature_list = feature_spec_dict['features'] - label_list = feature_spec_dict['labels'] - weight_list = feature_spec_dict['weight'] - self._client.add_feature_config(FeatureConfig(self._current_run_id, list(feature_list.keys()), - list(label_list.keys()), list(weight_list.keys()))) - - feature_config_features = [ - FeatureConfigFeature( - hash_id=_feature_hash_id, - feature_name=_feature['featureName'], - feature_type=_feature['featureType'] - ) - for _feature_hash_id, _feature in zip(feature_list.keys(), feature_list.values()) - ] - self._client.add_feature_config_features(list(feature_list.keys()), feature_config_features) - - feature_config_labels = [ - FeatureConfigFeature( - hash_id=_label_hash_id, - feature_name=_label['featureName'] - ) - for _label_hash_id, _label in zip(label_list.keys(), label_list.values()) - ] - self._client.add_feature_config_features(list(label_list.keys()), feature_config_labels) - - feature_config_weights = [ - FeatureConfigFeature( - hash_id=_weight_hash_id, - feature_name=_weight['featureName'], - feature_type=_weight['featureType'] + except Exception as err: + logging.error("Failed to export feature spec. 
Error: %s", str(err)) + + @property + def path(self) -> Optional[Dict[str, str]]: + if self.disabled: + return None + return get_components_from_id(self.tracking_path, ensure_valid_id=False) + + @property + def experiment_id(self) -> Optional[str]: + """Return the experiment id.""" + if self.disabled: + return None + return f"{self.path['owner']}:{self.path['project_name']}:{self.path['experiment_name']}" + + @property + def _current_run_name(self) -> str: + """Return the current run name.""" + if self._current_run_name_suffix is not None: + return self.path["run_name"] + self._current_run_name_suffix + return self.path["run_name"] + + @property + def _current_run_id(self) -> str: + """Return the current run id.""" + if self._current_run_name_suffix is not None: + return self.base_run_id + self._current_run_name_suffix + return self.base_run_id + + def get_run_status(self) -> Union[StatusUpdate, None]: + """Get the current run status.""" + if not self.disabled: + return self._client.get_latest_dbv2_status(self._current_run_id) + return None + + def _add_run_status(self, status: StatusUpdate) -> None: + """ + Add run status with underlying client. + + Args: + status (StatusUpdate): + The status update to add. + """ + if not self.disabled and self._env_eligible_for_recording_experiment: + self._client.add_run_status(status) + + def _record_run(self) -> None: + """Record the run in ML Metastore.""" + if self.disabled or not self._env_eligible_for_recording_experiment: + return None + + if not self._client.project_exists(self.project_id): + self._client.add_project( + Project(self.path["project_name"], self.path["owner"]) + ) + time.sleep(1) + + if not self._client.experiment_exists(self.experiment_id): + self._client.add_experiment( + Experiment( + self.path["experiment_name"], + self.path["owner"], + self.project_id, + "", + ) + ) + time.sleep(1) + + run = DeepbirdRun( + self.experiment_id, + self._current_run_name, + "", + {"raw_command": " ".join(sys.argv)}, + self._params, ) - for _weight_hash_id, _weight in zip(weight_list.keys(), weight_list.values()) - ] - self._client.add_feature_config_features(list(weight_list.keys()), feature_config_weights) - - except Exception as err: - logging.error('Failed to export feature spec. Error: %s', str(err)) - - @property - def path(self): - if self.disabled: - return None - return get_components_from_id(self.tracking_path, ensure_valid_id=False) - - @property - def experiment_id(self): - if self.disabled: - return None - return '%s:%s:%s' % (self.path['owner'], self.path['project_name'], - self.path['experiment_name']) - - @property - def _current_run_name(self): - """ - Return the current run name. - """ - if self._current_run_name_suffix is not None: - return self.path['run_name'] + self._current_run_name_suffix - else: - return self.path['run_name'] - - @property - def _current_run_id(self): - """ - Return the current run id. - """ - if self._current_run_name_suffix is not None: - return self.base_run_id + self._current_run_name_suffix - else: - return self.base_run_id - - def get_run_status(self) -> str: - if not self.disabled: - return self._client.get_latest_dbv2_status(self._current_run_id) - - def _add_run_status(self, status): - """ - Add run status with underlying client. - - Args: - status (StatusUpdate): - The status update to add. - """ - if not self.disabled and self._env_eligible_for_recording_experiment: - self._client.add_run_status(status) - - def _record_run(self): - """ - Record the run in ML Metastore. 
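The id composition implemented by these properties, as a worked example (values hypothetical):

    tracking_path = "jane:timelines:ranker_v2:run_1"  # owner:project:experiment:run

    # get_components_from_id(tracking_path) yields the path dict, from which:
    #   project_id      == "jane:timelines"
    #   experiment_id   == "jane:timelines:ranker_v2"
    #   base_run_id     == "jane:timelines:ranker_v2:run_1"
    # and inside track_experiment(..., name="eval"):
    #   _current_run_id == "jane:timelines:ranker_v2:run_1_eval"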
- """ - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - if not self._client.project_exists(self.project_id): - self._client.add_project(Project(self.path['project_name'], self.path['owner'])) - time.sleep(1) - - if not self._client.experiment_exists(self.experiment_id): - self._client.add_experiment(Experiment( - self.path['experiment_name'], self.path['owner'], self.project_id, '')) - time.sleep(1) - - run = DeepbirdRun(self.experiment_id, self._current_run_name, '', - {'raw_command': ' '.join(sys.argv)}, self._params) - self._client.add_deepbird_run(run, force=True) - time.sleep(1) - - def _record_update(self, metrics): - """ - Record metrics update in ML Metastore. - - Args: - metrics (dict): - The dict of the metrics and their values. - """ - - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - reported_metrics = {} - for k, v in metrics.items(): + self._client.add_deepbird_run(run, force=True) + time.sleep(1) - if hasattr(v, 'item'): - reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist()) - else: - logging.warning("Ignoring %s because the value (%s) is not valid" % (k, str(v))) + def _record_update(self, metrics: Dict[str, Any]) -> None: + """ + Record metrics update in ML Metastore. - report = ProgressReport(self._current_run_id, reported_metrics) - - try: - self._client.add_progress_report(report) - except Exception as err: - logging.error('Failed to record metrics in ML Metastore. Error: {}'.format(err)) - logging.error('Run ID: {}'.format(self._current_run_id)) - logging.error('Progress Report: {}'.format(report.to_json_string())) - - def _register_for_graceful_shutdown(self): - """ - Register the tracker with the health server, enabling graceful shutdown. + Args: + metrics (dict): + The dict of the metrics and their values. + """ - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/register_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) + if self.disabled or not self._env_eligible_for_recording_experiment: + return None - def _deregister_for_graceful_shutdown(self): - """ - Deregister the tracker with the health server, disabling graceful shutdown. + reported_metrics = {} + for k, v in metrics.items(): + if hasattr(v, "item"): + reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist()) + else: + logging.warning( + "Ignoring %s because the value (%s) is not valid" % (k, str(v)) + ) - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/deregister_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) + report = ProgressReport(self._current_run_id, reported_metrics) - def _is_env_eligible_for_tracking(self): - """ - Determine if experiment tracking should run in the env. - """ - is_unit_test = ( - os.environ.get('PYTEST_CURRENT_TEST') is not None and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - is_running_on_ci = ( - getpass.getuser() == 'scoot-service' and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - return ( - not is_unit_test and - not is_running_on_ci - ) - - @classmethod - def run_name_from_environ(cls): - """ - Create run id from environment if possible. 
- """ - job_name = os.environ.get("TWML_JOB_NAME") - job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME") - - if not job_name or not job_launch_time: - return None - - try: - # job_launch_time should be in isoformat - # python2 doesnt support datetime.fromisoformat, so use hardcoded format string. - job_launch_time_formatted = datetime.strptime(job_launch_time, - "%Y-%m-%dT%H:%M:%S.%f") - except ValueError: - # Fallback in case aurora config is generating datetime in a different format. - job_launch_time_formatted = (job_launch_time - .replace("-", "_").replace("T", "_") - .replace(":", "_").replace(".", "_")) - - return '{}_{}'.format( - job_name, job_launch_time_formatted.strftime('%m_%d_%Y_%I_%M_%p')) - - @classmethod - def guess_path(cls, save_dir, run_name=None): - """ - Guess an experiment tracking path based on save_dir. - - Returns: - (str) guessed path - """ - if not run_name: - run_name = 'Unnamed_{}'.format(datetime.now().strftime('%m_%d_%Y_%I_%M_%p')) - - if save_dir.startswith('hdfs://'): - path_match = re.search(r'/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)', save_dir) - - if path_match: - groups = path_match.groups() - user = groups[0] - project_name = groups[1] - - return generate_id(user, 'default', project_name, run_name) - - user = getpass.getuser() - project_name = re.sub(r'^[a-z0-9\-_]', os.path.basename(save_dir), '') - if not project_name: - project_name = 'unnamed' - - return generate_id(user, 'default', project_name, run_name) - - @classmethod - def compute_model_hash(cls, export_path): - """ - Computes the hash of an exported model. This is a gfile version of - twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate - the same hash when given the same model. + try: + self._client.add_progress_report(report) + except Exception as err: + logging.error(f"Failed to record metrics in ML Metastore. Error: {err}") + logging.error(f"Run ID: {self._current_run_id}") + logging.error(f"Progress Report: {report.to_json_string()}") + + def _register_for_graceful_shutdown(self) -> Optional[requests.Response]: + """ + Register the tracker with the health server, enabling graceful shutdown. + + Returns: + (Response) health server response + """ + if ( + self._graceful_shutdown_port + and not self.disabled + and self._env_eligible_for_recording_experiment + ): + return requests.post( + f"http://localhost:{self._graceful_shutdown_port}/register_id/{self._current_run_id}" + ) + return None + + def _deregister_for_graceful_shutdown(self) -> Optional[requests.Response]: + """ + Deregister the tracker with the health server, disabling graceful shutdown. + + Returns: + (Response) health server response + """ + if ( + self._graceful_shutdown_port + and not self.disabled + and self._env_eligible_for_recording_experiment + ): + return requests.post( + f"http://localhost:{self._graceful_shutdown_port}/deregister_id/{self._current_run_id}" + ) + + def _is_env_eligible_for_tracking(self) -> bool: + """Determine if experiment tracking should run in the env.""" + is_unit_test = ( + os.environ.get("PYTEST_CURRENT_TEST") is not None + and os.environ.get("TEST_EXP_TRACKER") is None + ) - Args: - export_path (str): - The path to the exported model. 
+        is_running_on_ci = (
+            getpass.getuser() == "scoot-service"
+            and os.environ.get("TEST_EXP_TRACKER") is None
+        )

-    Returns:
-      (str) hash of the exported model
-    """
-    paths = []
-    for path, subdirs, files in tf.io.gfile.walk(export_path):
-      for name in sorted(files):
-        paths.append(os.path.join(path, name))
+        return (not is_unit_test) and (not is_running_on_ci)

-    paths.sort()
-    hash_object = hashlib.new('sha1')
+    @classmethod
+    def run_name_from_environ(cls: Type["ExperimentTracker"]) -> Optional[str]:
+        """
+        Create run id from environment if possible.
+        """
+        job_name = os.environ.get("TWML_JOB_NAME")
+        job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME")

-    for path in paths:
-      with tf.io.gfile.GFile(path, "rb") as file:
-        hash_object.update(file.read())
+        if not job_name or not job_launch_time:
+            return None

-    return hash_object.hexdigest()
+        try:
+            # job_launch_time should be in isoformat
+            # python2 doesn't support datetime.fromisoformat, so use hardcoded format string.
+            job_launch_time_formatted = datetime.strptime(
+                job_launch_time, "%Y-%m-%dT%H:%M:%S.%f"
+            )
+        except ValueError:
+            # Fallback in case aurora config generates the datetime in a different format.
+            # The raw string cannot be strftime'd, so sanitize it and return directly.
+            sanitized_launch_time = (
+                job_launch_time.replace("-", "_")
+                .replace("T", "_")
+                .replace(":", "_")
+                .replace(".", "_")
+            )
+            return f"{job_name}_{sanitized_launch_time}"
+        return f"{job_name}_{job_launch_time_formatted.strftime('%m_%d_%Y_%I_%M_%p')}"
+
+    @classmethod
+    def guess_path(
+        cls: Type["ExperimentTracker"],
+        save_dir: str,
+        run_name: Optional[str] = None,
+    ) -> str:
+        """
+        Guess an experiment tracking path based on save_dir.
+
+        Args:
+            save_dir (str): save directory
+            run_name (str): run name
+
+        Returns:
+            (str) guessed path
+        """
+        if not run_name:
+            run_name = f'Unnamed_{datetime.now().strftime("%m_%d_%Y_%I_%M_%p")}'
+
+        if save_dir.startswith("hdfs://"):
+            path_match = re.search(r"/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)", save_dir)
+
+            if path_match:
+                groups = path_match.groups()
+                user = groups[0]
+                project_name = groups[1]
+
+                return generate_id(user, "default", project_name, run_name)
+
+        user = getpass.getuser()
+        # Strip characters that are not allowed in project names. Note: the original
+        # call passed the re.sub arguments in the wrong order, which always produced
+        # an empty string and fell through to "unnamed".
+        project_name = re.sub(r"[^a-z0-9\-_]", "", os.path.basename(save_dir))
+        if not project_name:
+            project_name = "unnamed"
+
+        return generate_id(user, "default", project_name, run_name)
+
+    @classmethod
+    def compute_model_hash(cls, export_path: str) -> str:
+        """
+        Computes the hash of an exported model. This is a gfile version of
+        twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate
+        the same hash when given the same model.
+
+        Args:
+            export_path (str): The path to the exported model.
+
+        Returns:
+            (str) hash of the exported model
+        """
+        paths = []
+        for path, subdirs, files in tf.io.gfile.walk(export_path):
+            for name in sorted(files):
+                paths.append(os.path.join(path, name))
+
+        paths.sort()
+        hash_object = hashlib.new("sha1")
+
+        for path in paths:
+            with tf.io.gfile.GFile(path, "rb") as file:
+                hash_object.update(file.read())
+
+        return hash_object.hexdigest()
diff --git a/twml/twml/trainers/__init__.py b/twml/twml/trainers/__init__.py
index e6664d9a6..9dbaf3cf4 100644
--- a/twml/twml/trainers/__init__.py
+++ b/twml/twml/trainers/__init__.py
@@ -6,5 +6,5 @@
`_.
""" -from .trainer import Trainer # noqa: F401 from .data_record_trainer import DataRecordTrainer # noqa: F401 +from .trainer import Trainer # noqa: F401 diff --git a/twml/twml/trainers/data_record_trainer.py b/twml/twml/trainers/data_record_trainer.py index 76dd16f80..30c3f9684 100644 --- a/twml/twml/trainers/data_record_trainer.py +++ b/twml/twml/trainers/data_record_trainer.py @@ -56,766 +56,970 @@ """ import datetime +from typing import Any, Callable, Dict, List, Optional import tensorflow.compat.v1 as tf +from absl import logging from twitter.deepbird.io.dal import dal_to_hdfs_path, is_dal_path + import twml -from twml.trainers import Trainer from twml.contrib.feature_importances.feature_importances import ( - compute_feature_importances, - TREE, - write_feature_importances_to_hdfs, - write_feature_importances_to_ml_dash) -from absl import logging + TREE, + compute_feature_importances, + write_feature_importances_to_hdfs, + write_feature_importances_to_ml_dash, +) +from twml.trainers import Trainer class DataRecordTrainer(Trainer): # pylint: disable=abstract-method - """ - The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases - at Twitter where only the build_graph methods needs to be overridden. - For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): """ - The DataRecordTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, DataRecordTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. + The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases + at Twitter where only the build_graph methods needs to be overridden. + For this reason, ``Trainer.[train,eval]_input_fn`` methods + assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. + + For use-cases that differ from this common Twitter use-case, + further Trainer methods can be overridden. + If that still doesn't provide enough flexibility, the user can always + use the tf.estimator.Esimator or tf.session.run directly. """ - # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL. 
- super(DataRecordTrainer, self).__init__( - name=name, params=params, build_graph_fn=build_graph_fn, **kwargs) - - self._feature_config = feature_config - - # date range parameters common to both training and evaluation data: - hour_resolution = self.params.get("hour_resolution", 1) - data_threads = self.params.get("data_threads", 4) - datetime_format = self.params.get("datetime_format", "%Y/%m/%d") - - # retrieve the desired training dataset files - self._train_files = self.build_files_list( - files_list_path=self.params.get("train_files_list", None), - data_dir=self.params.get("train_data_dir", None), - start_datetime=self.params.get("train_start_datetime", None), - end_datetime=self.params.get("train_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("train_overwrite_files_list", False), - ) - - # retrieve the desired evaluation dataset files - eval_name = self.params.get("eval_name", None) - - if eval_name == "train": - self._eval_files = self._train_files - else: - self._eval_files = self.build_files_list( - files_list_path=self.params.get("eval_files_list", None), - data_dir=self.params.get("eval_data_dir", None), - start_datetime=self.params.get("eval_start_datetime", None), - end_datetime=self.params.get("eval_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("eval_overwrite_files_list", False), - ) - - if not self.params.get("allow_train_eval_overlap"): - # if there is overlap between train and eval, error out! - if self._train_files and self._eval_files: - overlap_files = set(self._train_files) & set(self._eval_files) + def __init__( + self, + name: str, + params: Dict[str, Any], + build_graph_fn: Callable, + feature_config: twml.FeatureConfig = None, + **kwargs, + ): + """ + The DataRecordTrainer constructor builds a + ``tf.estimator.Estimator`` and stores it in self.estimator. + For this reason, DataRecordTrainer accepts the same Estimator constructor arguments. + It also accepts additional arguments to facilitate metric evaluation and multi-phase training + (init_from_dir, init_map). + + Args: + parent Args: + See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation + for a full list of arguments accepted by the parent class. + name, params, build_graph_fn (and other parent class args): + see documentation for twml.Trainer doc. + feature_config: + An object of type FeatureConfig describing what features to decode. + Defaults to None. But it is needed in the following cases: + - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` + - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. + + **kwargs: + further kwargs can be specified and passed to the Estimator constructor. + """ + + # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL. 
+ super(DataRecordTrainer, self).__init__( + name=name, params=params, build_graph_fn=build_graph_fn, **kwargs + ) + + self._feature_config = feature_config + + # date range parameters common to both training and evaluation data: + hour_resolution = self.params.get("hour_resolution", 1) + data_threads = self.params.get("data_threads", 4) + datetime_format = self.params.get("datetime_format", "%Y/%m/%d") + + # retrieve the desired training dataset files + self._train_files = self.build_files_list( + files_list_path=self.params.get("train_files_list", None), + data_dir=self.params.get("train_data_dir", None), + start_datetime=self.params.get("train_start_datetime", None), + end_datetime=self.params.get("train_end_datetime", None), + datetime_format=datetime_format, + data_threads=data_threads, + hour_resolution=hour_resolution, + maybe_save=self.is_chief(), + overwrite=self.params.get("train_overwrite_files_list", False), + ) + + # retrieve the desired evaluation dataset files + eval_name = self.params.get("eval_name", None) + + if eval_name == "train": + self._eval_files = self._train_files else: - overlap_files = set() - if overlap_files: - raise ValueError("There is an overlap between train and eval files:\n %s" % - (overlap_files)) - - @staticmethod - def build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - if files_list_path: - files_list_path = twml.util.preprocess_path(files_list_path) - - if isinstance(start_datetime, datetime.datetime): - start_datetime = start_datetime.strftime(datetime_format) - if isinstance(end_datetime, datetime.datetime): - end_datetime = end_datetime.strftime(datetime_format) - - list_files_by_datetime_args = { - "base_path": data_dir, - "start_datetime": start_datetime, - "end_datetime": end_datetime, - "datetime_prefix_format": datetime_format, - "extension": "lzo", - "parallelism": data_threads, - "hour_resolution": hour_resolution, - "sort": True, - } - - # no cache of data file paths, just get the list by scraping the directory - if not files_list_path or not tf.io.gfile.exists(files_list_path): - # twml.util.list_files_by_datetime returns None if data_dir is None. - # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # the cached data file paths file exists. - files_info = twml.util.read_file(files_list_path, decode="json") - # use the cached list if data params match current params, - # or if current params are None - # Not including None checks for datetime_format and hour_resolution, - # since those are shared between eval and training. 
- if (all(param is None for param in [data_dir, start_datetime, end_datetime]) or - (files_info["data_dir"] == data_dir and - files_info["start_datetime"] == start_datetime and - files_info["end_datetime"] == end_datetime and - files_info["datetime_format"] == datetime_format and - files_info["hour_resolution"] == hour_resolution)): - files_list = files_info["files"] - elif overwrite: - # current params are not none and don't match saved params - # `overwrite` indicates we should thus update the list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # dont update the cached list - raise ValueError("Information in files_list is inconsistent with provided args.\n" - "Did you intend to overwrite files_list using " - "--train.overwrite_files_list or --eval.overwrite_files_list?\n" - "If you instead want to use the paths in files_list, ensure that " - "data_dir, start_datetime, and end_datetime are None.") - - if maybe_save and files_list_path and (overwrite or not tf.io.gfile.exists(files_list_path)): - save_dict = {} - save_dict["files"] = files_list - save_dict["data_dir"] = data_dir - save_dict["start_datetime"] = start_datetime - save_dict["end_datetime"] = end_datetime - save_dict["datetime_format"] = datetime_format - save_dict["hour_resolution"] = hour_resolution - twml.util.write_file(files_list_path, save_dict, encode="json") - - return files_list - - @staticmethod - def build_files_list(files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - ''' - When specifying DAL datasets, only data_dir, start_dateime, and end_datetime - should be given with the format: - - dal://{cluster}/{role}/{dataset_name}/{env} - - ''' - if not data_dir or not is_dal_path(data_dir): - logging.warn(f"Please consider specifying a dal:// dataset rather than passing a physical hdfs path.") - return DataRecordTrainer.build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite) - - del datetime_format - del data_threads - del hour_resolution - del maybe_save - del overwrite - - return dal_to_hdfs_path( - path=data_dir, - start_datetime=start_datetime, - end_datetime=end_datetime, - ) - - @property - def train_files(self): - return self._train_files - - @property - def eval_files(self): - return self._eval_files - - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_ - and `DataRecordTrainer code - <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. 
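The cached files-list JSON written above mirrors `save_dict`; a hypothetical instance of what lands at `files_list_path`:

    save_dict = {
        "files": ["hdfs://default/user/foo/bar/2019/01/15/part-00000.lzo"],  # invented
        "data_dir": "hdfs://default/user/foo/bar",
        "start_datetime": "2019/01/15",
        "end_datetime": "2019/01/15",
        "datetime_format": "%Y/%m/%d",
        "hour_resolution": 1,
    }
    # twml.util.write_file(files_list_path, save_dict, encode="json")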
- """ - parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments() - parser.add_argument( - "--train.files_list", "--train_files_list", type=str, default=None, - dest="train_files_list", - help="Path for a json file storing information on training data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "training files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if train_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--train.overwrite_files_list", "--train_overwrite_files_list", action="store_true", default=False, - dest="train_overwrite_files_list", - help="When the --train.files_list param is used, indicates whether to " - "overwrite the existing --train.files_list when there are differences " - "between the current and saved dataset args. Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.files_list", "--eval_files_list", type=str, default=None, - dest="eval_files_list", - help="Path for a json file storing information on evaluation data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "evaluation files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if eval_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--eval.overwrite_files_list", "--eval_overwrite_files_list", action="store_true", default=False, - dest="eval_overwrite_files_list", - help="When the --eval.files_list param is used, indicates whether to " - "overwrite the existing --eval.files_list when there are differences " - "between the current and saved dataset args. 
Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--eval.data_dir", "--eval_data_dir", type=str, default=None, - dest="eval_data_dir", - help="Path to the cross-validation data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--eval.start_date", "--eval_start_datetime", - type=str, default=None, - dest="eval_start_datetime", - help="Starting date for evaluating inside the eval data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.end_date", "--eval_end_datetime", type=str, default=None, - dest="eval_end_datetime", - help="Ending date for evaluating inside the eval data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %%Y/%%m/%%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") - parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") - parser.add_argument( - "--data_spec", type=str, required=True, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument( - "--train.keep_rate", "--train_keep_rate", type=float, default=None, - dest="train_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--eval.keep_rate", "--eval_keep_rate", type=float, default=None, - dest="eval_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--train.parts_downsampling_rate", "--train_parts_downsampling_rate", - dest="train_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--eval.parts_downsampling_rate", "--eval_parts_downsampling_rate", - dest="eval_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--allow_train_eval_overlap", - dest="allow_train_eval_overlap", - action="store_true", - help="Allow overlap between train and eval datasets." - ) - parser.add_argument( - "--eval_name", type=str, default=None, - help="String denoting what we want to name the eval. If this is `train`, then we eval on \ - the training dataset." - ) - return parser - - def contrib_run_feature_importances(self, feature_importances_parse_fn=None, write_to_hdfs=True, extra_groups=None, datarecord_filter_fn=None, datarecord_filter_run_name=None): - """Compute feature importances on a trained model (this is a contrib feature) - Args: - feature_importances_parse_fn (fn): The same parse_fn that we use for training/evaluation. 
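Putting the flags above together, a hypothetical invocation (the argv values are invented; only `--data_spec` is marked required):

    argv = [
        "--data_spec", "/path/to/data_spec.json",
        "--train.data_dir", "dal://smf1-west/my-role/engagement_dataset/prod",
        "--train.start_date", "2019/01/15",
        "--train.end_date", "2019/01/20",
        "--eval.data_dir", "dal://smf1-west/my-role/engagement_dataset/prod",
        "--eval.start_date", "2019/01/21",
        "--eval.end_date", "2019/01/21",
        "--train.parts_downsampling_rate", "0.2",
        "--eval_name", "holdout",
    ]
    parser = DataRecordTrainer.add_parser_arguments()
    params = parser.parse_args(argv)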
- Defaults to feature_config.get_parse_fn() - write_to_hdfs (bool): Setting this to True writes the feature importance metrics to HDFS - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - logging.info("Computing feature importance") - algorithm = self._params.feature_importance_algorithm - - kwargs = {} - if algorithm == TREE: - kwargs["split_feature_group_on_period"] = self._params.split_feature_group_on_period - kwargs["stopping_metric"] = self._params.feature_importance_metric - kwargs["sensitivity"] = self._params.feature_importance_sensitivity - kwargs["dont_build_tree"] = self._params.dont_build_tree - kwargs["extra_groups"] = extra_groups - if self._params.feature_importance_is_metric_larger_the_better: - # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC) - kwargs["is_metric_larger_the_better"] = True - elif self._params.feature_importance_is_metric_smaller_the_better: - # The user has specified that the stopping metric is one where smaller values are better (e.g. LOSS) - kwargs["is_metric_larger_the_better"] = False - else: - # The user has not specified which direction is better for the stopping metric - kwargs["is_metric_larger_the_better"] = None - logging.info("Using the tree algorithm with kwargs {}".format(kwargs)) - - feature_importances = compute_feature_importances( - trainer=self, - data_dir=self._params.get('feature_importance_data_dir'), - feature_config=self._feature_config, - algorithm=algorithm, - record_count=self._params.feature_importance_example_count, - parse_fn=feature_importances_parse_fn, - datarecord_filter_fn=datarecord_filter_fn, - **kwargs) - - if not feature_importances: - logging.info("Feature importances returned None") - else: - if write_to_hdfs: - logging.info("Writing feature importance to HDFS") - write_feature_importances_to_hdfs( - trainer=self, - feature_importances=feature_importances, - output_path=datarecord_filter_run_name, - metric=self._params.get('feature_importance_metric')) - else: - logging.info("Not writing feature importance to HDFS") - - logging.info("Writing feature importance to ML Metastore") - write_feature_importances_to_ml_dash( - trainer=self, feature_importances=feature_importances) - return feature_importances - - def export_model(self, serving_input_receiver_fn=None, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICT graph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Args: - serving_input_receiver_fn (Function): - function preparing the model for inference requests. - If not set; defaults to the the serving input receiver fn set by the FeatureConfig. - export_output_fn (Function): - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``. - export_dir: - directory to export a SavedModel for prediction servers. 
- Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory ``save_dir`` is chosen. - - Returns: - The export directory where the PREDICT graph is saved. - """ - if serving_input_receiver_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - serving_input_receiver_fn = self._feature_config.get_serving_input_receiver_fn() - - if feature_spec is None: - if self._feature_config is None: - raise ValueError("feature_spec can not be inferred." - "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method") - else: - feature_spec = self._feature_config.get_feature_spec() - - if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig): - raise ValueError("Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn") - elif not callable(serving_input_receiver_fn): - raise ValueError("Expecting Function for serving_input_receiver_fn") - - if export_output_fn is None: - export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - - return super(DataRecordTrainer, self).export_model( - export_dir=export_dir, - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path, - export_output_fn=export_output_fn, - feature_spec=feature_spec, - ) - - def get_train_input_fn( - self, parse_fn=None, repeat=None, shuffle=True, interleave=True, shuffle_files=None, - initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.train(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to the parser returned by the FeatureConfig selected - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`. - This ensures the training is run for atleast `params.train_steps`. - Toggling this to `False` results in training finishing when one of the following happens: - - The entire dataset has been trained upon once. - - `params.train_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `True`, files are shuffled, and records of each files are shuffled. - When `False`, files are read in alpha-numerical order. Also when `False` - the dataset is sharded among workers for Hogwild and distributed training - if no sharding configuration is provided in `params.train_dataset_shards`. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffle the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.train()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.train_steps > 0 or self.params.get('distributed', False) - - if not shuffle and self.num_workers > 1 and self.params.train_dataset_shards is None: - num_shards = self.num_workers - shard_index = self.worker_index - else: - num_shards = self.params.train_dataset_shards - shard_index = self.params.train_dataset_shard_index - - return lambda: twml.input_fns.default_input_fn( - files=self._train_files, - batch_size=self.params.train_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.train_keep_rate, - parts_downsampling_rate=self.params.train_parts_downsampling_rate, - shards=num_shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs) - - def get_eval_input_fn( - self, parse_fn=None, repeat=None, - shuffle=True, interleave=True, - shuffle_files=None, initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.eval(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to twml.parsers.get_sparse_parse_fn(feature_config). - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`. - This ensures the evaluation is run for atleast `params.eval_steps`. - Toggling this to `False` results in evaluation finishing when one of the following happens: - - The entire dataset has been evaled upon once. - - `params.eval_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `False`, files are read in alpha-numerical order. - When `True`, files are shuffled, and records of each files are shuffled. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffles the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.eval()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not self._eval_files: - raise ValueError("`eval_files` was not present in `params` passed to `DataRecordTrainer`") - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.eval_steps > 0 - - return lambda: twml.input_fns.default_input_fn( - files=self._eval_files, - batch_size=self.params.eval_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.eval_keep_rate, - parts_downsampling_rate=self.params.eval_parts_downsampling_rate, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs - ) - - def _assert_train_files(self): - if not self._train_files: - raise ValueError("train.data_dir was not set in params passed to DataRecordTrainer.") - - def _assert_eval_files(self): - if not self._eval_files: - raise ValueError("eval.data_dir was not set in params passed to DataRecordTrainer.") - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Makes input functions optional. input_fn defaults to self.get_train_input_fn(). - See Trainer for more detailed documentation documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).train(input_fn=input_fn, steps=steps, hooks=hooks) - - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Makes input functions optional. input_fn defaults to self.get_eval_input_fn(). - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_eval_files() - input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False) - return super(DataRecordTrainer, self).evaluate( - input_fn=input_fn, - steps=steps, - hooks=hooks, - name=name - ) - - def learn(self, train_input_fn=None, eval_input_fn=None, **kwargs): - """ - Overrides ``Trainer.learn`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.learn`` for more detailed documentation. - """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).learn( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def train_and_evaluate(self, - train_input_fn=None, eval_input_fn=None, - **kwargs): - """ - Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.train_and_evaluate`` for detailed documentation. 
- """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).train_and_evaluate( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def _model_fn(self, features, labels, mode, params, config=None): - """ - Overrides the _model_fn to correct for the features shape of the sparse features - extracted with the contrib.FeatureConfig - """ - if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig): - # Fix the shape of the features. The features dictionary will be modified to - # contain the shape changes. - twml.util.fix_shape_sparse(features, self._feature_config) - return super(DataRecordTrainer, self)._model_fn( - features=features, - labels=labels, - mode=mode, - params=params, - config=config - ) - - def calibrate(self, - calibrator, - input_fn=None, - steps=None, - save_calibrator=True, - hooks=None): - """ - Makes input functions optional. input_fn defaults to self.train_input_fn. - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=steps, - save_calibrator=save_calibrator, - hooks=hooks) - - def save_checkpoints_and_export_model(self, - serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, - checkpoint_path=None, - input_fn=None): - """ - Exports saved module after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See export_model for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.export_model(serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path) - - def save_checkpoints_and_evaluate(self, - input_fn=None, - steps=None, - hooks=None, - name=None): - """ - Evaluates model after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See evaluate for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.evaluate(input_fn, steps, hooks, name) + self._eval_files = self.build_files_list( + files_list_path=self.params.get("eval_files_list", None), + data_dir=self.params.get("eval_data_dir", None), + start_datetime=self.params.get("eval_start_datetime", None), + end_datetime=self.params.get("eval_end_datetime", None), + datetime_format=datetime_format, + data_threads=data_threads, + hour_resolution=hour_resolution, + maybe_save=self.is_chief(), + overwrite=self.params.get("eval_overwrite_files_list", False), + ) + + if not self.params.get("allow_train_eval_overlap"): + # if there is overlap between train and eval, error out! 
+            if self._train_files and self._eval_files:
+                overlap_files = set(self._train_files) & set(self._eval_files)
+            else:
+                overlap_files = set()
+            if overlap_files:
+                raise ValueError(
+                    "There is an overlap between train and eval files:\n %s"
+                    % overlap_files
+                )
+
+    @staticmethod
+    def build_hdfs_files_list(
+        files_list_path: Optional[str],
+        data_dir: Optional[str],
+        start_datetime: Optional[datetime.datetime],
+        end_datetime: Optional[datetime.datetime],
+        datetime_format: str,
+        data_threads: int,
+        hour_resolution: Optional[int],
+        maybe_save: bool,
+        overwrite: bool,
+    ) -> List[str]:
+        if files_list_path:
+            files_list_path = twml.util.preprocess_path(files_list_path)
+
+        if isinstance(start_datetime, datetime.datetime):
+            start_datetime = start_datetime.strftime(datetime_format)
+        if isinstance(end_datetime, datetime.datetime):
+            end_datetime = end_datetime.strftime(datetime_format)
+
+        list_files_by_datetime_args = {
+            "base_path": data_dir,
+            "start_datetime": start_datetime,
+            "end_datetime": end_datetime,
+            "datetime_prefix_format": datetime_format,
+            "extension": "lzo",
+            "parallelism": data_threads,
+            "hour_resolution": hour_resolution,
+            "sort": True,
+        }
+
+        # no cache of data file paths, just get the list by scraping the directory
+        if not files_list_path or not tf.io.gfile.exists(files_list_path):
+            # twml.util.list_files_by_datetime returns None if data_dir is None.
+            # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list
+            files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args)
+        else:
+            # a cached list of data file paths exists.
+            files_info = twml.util.read_file(files_list_path, decode="json")
+            # use the cached list if data params match current params,
+            # or if current params are None
+            # Not including None checks for datetime_format and hour_resolution,
+            # since those are shared between eval and training.
+            if all(
+                param is None for param in [data_dir, start_datetime, end_datetime]
+            ) or (
+                files_info["data_dir"] == data_dir
+                and files_info["start_datetime"] == start_datetime
+                and files_info["end_datetime"] == end_datetime
+                and files_info["datetime_format"] == datetime_format
+                and files_info["hour_resolution"] == hour_resolution
+            ):
+                files_list = files_info["files"]
+            elif overwrite:
+                # current params are not None and don't match saved params
+                # `overwrite` indicates we should thus update the list
+                files_list = twml.util.list_files_by_datetime(
+                    **list_files_by_datetime_args
+                )
+            else:
+                # don't update the cached list
+                raise ValueError(
+                    "Information in files_list is inconsistent with provided args.\n"
+                    "Did you intend to overwrite files_list using "
+                    "--train.overwrite_files_list or --eval.overwrite_files_list?\n"
+                    "If you instead want to use the paths in files_list, ensure that "
+                    "data_dir, start_datetime, and end_datetime are None."
+                )
+
+        if (
+            maybe_save
+            and files_list_path
+            and (overwrite or not tf.io.gfile.exists(files_list_path))
+        ):
+            save_dict = {}
+            save_dict["files"] = files_list
+            save_dict["data_dir"] = data_dir
+            save_dict["start_datetime"] = start_datetime
+            save_dict["end_datetime"] = end_datetime
+            save_dict["datetime_format"] = datetime_format
+            save_dict["hour_resolution"] = hour_resolution
+            twml.util.write_file(files_list_path, save_dict, encode="json")
+
+        return files_list
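+    # For reference: the files_list JSON written above (and read back on later
+    # runs) has roughly the following shape; the keys mirror save_dict, while the
+    # paths and dates here are hypothetical placeholders:
+    #
+    #   {
+    #     "files": ["hdfs://default/user/example/2019/01/15/part-00000.lzo"],
+    #     "data_dir": "hdfs://default/user/example",
+    #     "start_datetime": "2019/01/15",
+    #     "end_datetime": "2019/01/20",
+    #     "datetime_format": "%Y/%m/%d",
+    #     "hour_resolution": null
+    #   }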
+    @staticmethod
+    def build_files_list(
+        files_list_path: Optional[str],
+        data_dir: Optional[str],
+        start_datetime: Optional[datetime.datetime],
+        end_datetime: Optional[datetime.datetime],
+        datetime_format: str,
+        data_threads: int,
+        hour_resolution: Optional[int],
+        maybe_save: bool,
+        overwrite: bool,
+    ):
+        """
+        When specifying DAL datasets, only data_dir, start_datetime, and end_datetime
+        should be given, with data_dir in the format:
+
+        dal://{cluster}/{role}/{dataset_name}/{env}
+
+        """
+        if not data_dir or not is_dal_path(data_dir):
+            logging.warning(
+                "Please consider specifying a dal:// dataset rather than passing a physical HDFS path."
+            )
+            return DataRecordTrainer.build_hdfs_files_list(
+                files_list_path,
+                data_dir,
+                start_datetime,
+                end_datetime,
+                datetime_format,
+                data_threads,
+                hour_resolution,
+                maybe_save,
+                overwrite,
+            )
+
+        del datetime_format
+        del data_threads
+        del hour_resolution
+        del maybe_save
+        del overwrite
+
+        return dal_to_hdfs_path(
+            path=data_dir,
+            start_datetime=start_datetime,
+            end_datetime=end_datetime,
+        )
+
+    @property
+    def train_files(self) -> List[str]:
+        return self._train_files
+
+    @property
+    def eval_files(self) -> List[str]:
+        return self._eval_files
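+    # Illustrative sketch of how build_files_list is typically invoked; the dal://
+    # path below is a hypothetical placeholder, not a real dataset:
+    #
+    #   files = DataRecordTrainer.build_files_list(
+    #       files_list_path=None,
+    #       data_dir="dal://{cluster}/{role}/{dataset_name}/{env}",
+    #       start_datetime="2019/01/15",
+    #       end_datetime="2019/01/20",
+    #       datetime_format="%Y/%m/%d",
+    #       data_threads=4,
+    #       hour_resolution=None,
+    #       maybe_save=False,
+    #       overwrite=False,
+    #   )
+    #
+    # For dal:// paths, the datetime_format/data_threads/caching arguments are
+    # ignored and the path is resolved to HDFS paths via dal_to_hdfs_path; for
+    # plain directories the call falls through to build_hdfs_files_list.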
+    @staticmethod
+    def add_parser_arguments():
+        """
+        Add common commandline args to parse for the Trainer class.
+        Typically, the user calls this function and then parses cmd-line arguments
+        into an argparse.Namespace object which is then passed to the Trainer constructor
+        via the params argument.
+
+        See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_
+        and `DataRecordTrainer code
+        <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_
+        for a list and description of all cmd-line arguments.
+
+        Args:
+          learning_rate_decay:
+            Defaults to False. When True, parses learning rate decay arguments.
+
+        Returns:
+          argparse.ArgumentParser instance with some useful args already added.
+        """
+        parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments()
+        parser.add_argument(
+            "--train.files_list",
+            "--train_files_list",
+            type=str,
+            default=None,
+            dest="train_files_list",
+            help="Path for a json file storing information on training data.\n"
+            "Specifically, the file at files_list should contain the dataset parameters "
+            "for constructing the list of data files, and the list of data file paths.\n"
+            "If the json file does not exist, other args are used to construct the "
+            "training files list, and that list will be saved to the indicated json file.\n"
+            "If the json file does exist, and current args are consistent with "
+            "saved args, or are all None, then the saved files list will be used.\n"
+            "If current args are not consistent with the saved args, then error out "
+            "if train_overwrite_files_list==False, else overwrite files_list with "
+            "a newly constructed list.",
+        )
+        parser.add_argument(
+            "--train.overwrite_files_list",
+            "--train_overwrite_files_list",
+            action="store_true",
+            default=False,
+            dest="train_overwrite_files_list",
+            help="When the --train.files_list param is used, indicates whether to "
+            "overwrite the existing --train.files_list when there are differences "
+            "between the current and saved dataset args. Default (False) is to "
+            "error out if files_list exists and differs from current params.",
+        )
+        parser.add_argument(
+            "--train.data_dir",
+            "--train_data_dir",
+            type=str,
+            default=None,
+            dest="train_data_dir",
+            help="Path to the training data directory. "
+            "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, "
+            "and HDFS (hdfs://default/ ) paths.",
+        )
+        parser.add_argument(
+            "--train.start_date",
+            "--train_start_datetime",
+            type=str,
+            default=None,
+            dest="train_start_datetime",
+            help="Starting date for training inside the train data dir. "
+            "The start datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--train.end_date",
+            "--train_end_datetime",
+            type=str,
+            default=None,
+            dest="train_end_datetime",
+            help="Ending date for training inside the train data dir. "
+            "The end datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--eval.files_list",
+            "--eval_files_list",
+            type=str,
+            default=None,
+            dest="eval_files_list",
+            help="Path for a json file storing information on evaluation data.\n"
+            "Specifically, the file at files_list should contain the dataset parameters "
+            "for constructing the list of data files, and the list of data file paths.\n"
+            "If the json file does not exist, other args are used to construct the "
+            "evaluation files list, and that list will be saved to the indicated json file.\n"
+            "If the json file does exist, and current args are consistent with "
+            "saved args, or are all None, then the saved files list will be used.\n"
+            "If current args are not consistent with the saved args, then error out "
+            "if eval_overwrite_files_list==False, else overwrite files_list with "
+            "a newly constructed list.",
+        )
+        parser.add_argument(
+            "--eval.overwrite_files_list",
+            "--eval_overwrite_files_list",
+            action="store_true",
+            default=False,
+            dest="eval_overwrite_files_list",
+            help="When the --eval.files_list param is used, indicates whether to "
+            "overwrite the existing --eval.files_list when there are differences "
+            "between the current and saved dataset args. Default (False) is to "
+            "error out if files_list exists and differs from current params.",
+        )
+        parser.add_argument(
+            "--eval.data_dir",
+            "--eval_data_dir",
+            type=str,
+            default=None,
+            dest="eval_data_dir",
+            help="Path to the cross-validation data directory. "
+            "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, "
+            "and HDFS (hdfs://default/ ) paths.",
+        )
+        parser.add_argument(
+            "--eval.start_date",
+            "--eval_start_datetime",
+            type=str,
+            default=None,
+            dest="eval_start_datetime",
+            help="Starting date for evaluating inside the eval data dir. "
+            "The start datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--eval.end_date",
+            "--eval_end_datetime",
+            type=str,
+            default=None,
+            dest="eval_end_datetime",
+            help="Ending date for evaluating inside the eval data dir. "
+            "The end datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--datetime_format",
+            type=str,
+            default="%Y/%m/%d",
+            help="Date format for training and evaluation datasets. "
+            "Has to be a format that is understood by python datetime. "
+            "e.g. %%Y/%%m/%%d for 2019/01/15. "
+            "Used only if {train/eval}.{start/end}_date are provided.",
+        )
+        parser.add_argument(
+            "--hour_resolution",
+            type=int,
+            default=None,
+            help="Specify the hourly resolution of the stored data.",
+        )
+        parser.add_argument(
+            "--data_spec",
+            type=str,
+            required=True,
+            help="Path to data specification JSON file. This file is used to decode DataRecords.",
+        )
+        parser.add_argument(
+            "--train.keep_rate",
+            "--train_keep_rate",
+            type=float,
+            default=None,
+            dest="train_keep_rate",
+            help="A float value in (0.0, 1.0] that indicates the rate at which records are kept; \
+            records are dropped according to the Bernoulli distribution with p = 1 - keep_rate.",
+        )
+        parser.add_argument(
+            "--eval.keep_rate",
+            "--eval_keep_rate",
+            type=float,
+            default=None,
+            dest="eval_keep_rate",
+            help="A float value in (0.0, 1.0] that indicates the rate at which records are kept; \
+            records are dropped according to the Bernoulli distribution with p = 1 - keep_rate.",
+        )
+        parser.add_argument(
+            "--train.parts_downsampling_rate",
+            "--train_parts_downsampling_rate",
+            dest="train_parts_downsampling_rate",
+            type=float,
+            default=None,
+            help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \
+            files. For example, a value of 0.2 means only 20 percent of part files become part of the \
+            dataset.",
+        )
+        parser.add_argument(
+            "--eval.parts_downsampling_rate",
+            "--eval_parts_downsampling_rate",
+            dest="eval_parts_downsampling_rate",
+            type=float,
+            default=None,
+            help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \
+            files. For example, a value of 0.2 means only 20 percent of part files become part of the \
+            dataset.",
+        )
+        parser.add_argument(
+            "--allow_train_eval_overlap",
+            dest="allow_train_eval_overlap",
+            action="store_true",
+            help="Allow overlap between train and eval datasets.",
+        )
+        parser.add_argument(
+            "--eval_name",
+            type=str,
+            default=None,
+            help="String denoting what we want to name the eval. If this is `train`, then we eval on \
+            the training dataset.",
+        )
+        return parser
+    def contrib_run_feature_importances(
+        self,
+        feature_importances_parse_fn: Optional[Callable] = None,
+        write_to_hdfs: bool = True,
+        extra_groups: Optional[Dict[str, List[str]]] = None,
+        datarecord_filter_fn: Optional[Callable] = None,
+        datarecord_filter_run_name: Optional[str] = None,
+    ):
+        """
+        Compute feature importances on a trained model (this is a contrib feature).
+
+        Args:
+          feature_importances_parse_fn (fn):
+            The same parse_fn that we use for training/evaluation.
+            Defaults to feature_config.get_parse_fn().
+          write_to_hdfs (bool):
+            Setting this to True writes the feature importance metrics to HDFS.
+          extra_groups (Dict[str, List[str]]):
+            A dictionary mapping the name of extra feature groups to the list of
+            the names of the features in the group.
+          datarecord_filter_fn (function):
+            a function that takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format
+            and returns a boolean indicating whether this data record should be kept in the feature
+            importance module or not.
+        """
+        logging.info("Computing feature importance")
+        algorithm = self._params.feature_importance_algorithm
+
+        kwargs = {}
+        if algorithm == TREE:
+            kwargs[
+                "split_feature_group_on_period"
+            ] = self._params.split_feature_group_on_period
+            kwargs["stopping_metric"] = self._params.feature_importance_metric
+            kwargs["sensitivity"] = self._params.feature_importance_sensitivity
+            kwargs["dont_build_tree"] = self._params.dont_build_tree
+            kwargs["extra_groups"] = extra_groups
+            if self._params.feature_importance_is_metric_larger_the_better:
+                # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC)
+                kwargs["is_metric_larger_the_better"] = True
+            elif self._params.feature_importance_is_metric_smaller_the_better:
+                # The user has specified that the stopping metric is one where smaller values are better (e.g. LOSS)
+                kwargs["is_metric_larger_the_better"] = False
+            else:
+                # The user has not specified which direction is better for the stopping metric
+                kwargs["is_metric_larger_the_better"] = None
+            logging.info(f"Using the tree algorithm with kwargs {kwargs}")
+
+        feature_importances = compute_feature_importances(
+            trainer=self,
+            data_dir=self._params.get("feature_importance_data_dir"),
+            feature_config=self._feature_config,
+            algorithm=algorithm,
+            record_count=self._params.feature_importance_example_count,
+            parse_fn=feature_importances_parse_fn,
+            datarecord_filter_fn=datarecord_filter_fn,
+            **kwargs,
+        )
+
+        if not feature_importances:
+            logging.info("Feature importances returned None")
+        else:
+            if write_to_hdfs:
+                logging.info("Writing feature importance to HDFS")
+                write_feature_importances_to_hdfs(
+                    trainer=self,
+                    feature_importances=feature_importances,
+                    output_path=datarecord_filter_run_name,
+                    metric=self._params.get("feature_importance_metric"),
+                )
+            else:
+                logging.info("Not writing feature importance to HDFS")
+
+            logging.info("Writing feature importance to ML Metastore")
+            write_feature_importances_to_ml_dash(
+                trainer=self, feature_importances=feature_importances
+            )
+        return feature_importances
+    def export_model(
+        self,
+        serving_input_receiver_fn: Optional[Callable] = None,
+        export_output_fn: Optional[Callable] = None,
+        export_dir: Optional[str] = None,
+        checkpoint_path: Optional[str] = None,
+        feature_spec: Optional[Dict[str, tf.io.FixedLenFeature]] = None,
+    ) -> str:
+        """
+        Export the model for prediction. Typically, the exported model
+        will later be run in production servers. This method is called
+        by the user to export the PREDICT graph to disk.
+
+        Internally, this method calls ``tf.estimator.Estimator.export_savedmodel``.
+
+        Args:
+          serving_input_receiver_fn (Function):
+            function preparing the model for inference requests.
+            If not set, defaults to the serving input receiver fn set by the FeatureConfig.
+          export_output_fn (Function):
+            Function to export the graph_output (output of build_graph) for
+            prediction. Takes a graph_output dict as sole argument and returns
+            the export_output_fns dict.
+            Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``.
+          export_dir:
+            directory to export a SavedModel for prediction servers.
+            Defaults to ``[save_dir]/exported_models``.
+          checkpoint_path:
+            the checkpoint path to export. If None (the default), the most recent checkpoint
+            found within the model directory ``save_dir`` is chosen.
+
+        Returns:
+          The export directory where the PREDICT graph is saved.
+        """
+        if serving_input_receiver_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            serving_input_receiver_fn = (
+                self._feature_config.get_serving_input_receiver_fn()
+            )
+
+        if feature_spec is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "feature_spec cannot be inferred. "
+                    "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method"
+                )
+            else:
+                feature_spec = self._feature_config.get_feature_spec()
+
+        if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig):
+            raise ValueError(
+                "Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn"
+            )
+        elif not callable(serving_input_receiver_fn):
+            raise ValueError("Expecting Function for serving_input_receiver_fn")
+
+        if export_output_fn is None:
+            export_output_fn = (
+                twml.export_output_fns.batch_prediction_continuous_output_fn
+            )
+
+        return super(DataRecordTrainer, self).export_model(
+            export_dir=export_dir,
+            serving_input_receiver_fn=serving_input_receiver_fn,
+            checkpoint_path=checkpoint_path,
+            export_output_fn=export_output_fn,
+            feature_spec=feature_spec,
+        )
+    def get_train_input_fn(
+        self,
+        parse_fn: Optional[Callable] = None,
+        repeat: Optional[bool] = None,
+        shuffle: bool = True,
+        interleave: bool = True,
+        shuffle_files: Optional[bool] = None,
+        initializable: bool = False,
+        log_tf_data_summaries: bool = False,
+        **kwargs,
+    ) -> Callable:
+        """
+        This method is used to create input function used by estimator.train().
+
+        Args:
+          parse_fn:
+            Function to parse a data record into a set of features.
+            Defaults to the parser returned by the FeatureConfig selected.
+          repeat (optional):
+            Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`.
+            This ensures the training is run for at least `params.train_steps`.
+            Toggling this to `False` results in training finishing when one of the following happens:
+              - The entire dataset has been trained upon once.
+              - `params.train_steps` has been reached.
+          shuffle (optional):
+            Specifies if the files and records in the files need to be shuffled.
+            When `True`, files are shuffled, and records of each files are shuffled.
+            When `False`, files are read in alpha-numerical order. Also when `False`
+            the dataset is sharded among workers for Hogwild and distributed training
+            if no sharding configuration is provided in `params.train_dataset_shards`.
+            Defaults to `True`.
+          interleave (optional):
+            Specifies if records from multiple files need to be interleaved in parallel.
+            Defaults to `True`.
+          shuffle_files (optional):
+            Shuffle the list of files. Defaults to the value of `shuffle` if not provided.
+          initializable (optional):
+            A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or
+            a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value
+            (false) is used for most plain iterators.
+          log_tf_data_summaries (optional):
+            A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the
+            tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output
+            events files. This requires that `initializable` is `True` above.
+
+        Returns:
+          An input_fn that can be consumed by `estimator.train()`.
+        """
+        if parse_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            parse_fn = self._feature_config.get_parse_fn()
+
+        if not callable(parse_fn):
+            raise ValueError("Expecting parse_fn to be a function.")
+
+        if log_tf_data_summaries and not initializable:
+            raise ValueError("Require `initializable` if `log_tf_data_summaries`.")
+
+        if repeat is None:
+            repeat = self.params.train_steps > 0 or self.params.get(
+                "distributed", False
+            )
+
+        if (
+            not shuffle
+            and self.num_workers > 1
+            and self.params.train_dataset_shards is None
+        ):
+            num_shards = self.num_workers
+            shard_index = self.worker_index
+        else:
+            num_shards = self.params.train_dataset_shards
+            shard_index = self.params.train_dataset_shard_index
+
+        return lambda: twml.input_fns.default_input_fn(
+            files=self._train_files,
+            batch_size=self.params.train_batch_size,
+            parse_fn=parse_fn,
+            num_threads=self.params.num_threads,
+            repeat=repeat,
+            keep_rate=self.params.train_keep_rate,
+            parts_downsampling_rate=self.params.train_parts_downsampling_rate,
+            shards=num_shards,
+            shard_index=shard_index,
+            shuffle=shuffle,
+            shuffle_files=(shuffle if shuffle_files is None else shuffle_files),
+            interleave=interleave,
+            initializable=initializable,
+            log_tf_data_summaries=log_tf_data_summaries,
+            **kwargs,
+        )
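+    # Usage sketch: with repeat=None (the default) the input_fn repeats only when
+    # params.train_steps > 0 or distributed training is enabled, so a bounded run
+    # still stops at the end of the dataset:
+    #
+    #   input_fn = trainer.get_train_input_fn(shuffle=False)  # deterministic file order
+    #   trainer.train(input_fn=input_fn, steps=1000)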
+    def get_eval_input_fn(
+        self,
+        parse_fn: Optional[Callable] = None,
+        repeat: Optional[bool] = None,
+        shuffle: bool = True,
+        interleave: bool = True,
+        shuffle_files: Optional[bool] = None,
+        initializable: bool = False,
+        log_tf_data_summaries: bool = False,
+        **kwargs,
+    ) -> Callable:
+        """
+        This method is used to create input function used by estimator.eval().
+
+        Args:
+          parse_fn:
+            Function to parse a data record into a set of features.
+            Defaults to twml.parsers.get_sparse_parse_fn(feature_config).
+          repeat (optional):
+            Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`.
+            This ensures the evaluation is run for at least `params.eval_steps`.
+            Toggling this to `False` results in evaluation finishing when one of the following happens:
+              - The entire dataset has been evaluated upon once.
+              - `params.eval_steps` has been reached.
+          shuffle (optional):
+            Specifies if the files and records in the files need to be shuffled.
+            When `False`, files are read in alpha-numerical order.
+            When `True`, files are shuffled, and records of each files are shuffled.
+            Defaults to `True`.
+          interleave (optional):
+            Specifies if records from multiple files need to be interleaved in parallel.
+            Defaults to `True`.
+          shuffle_files (optional):
+            Shuffles the list of files. Defaults to the value of `shuffle` if not provided.
+          initializable (optional):
+            A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or
+            a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value
+            (false) is used for most plain iterators.
+          log_tf_data_summaries (optional):
+            A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the
+            tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output
+            events files. This requires that `initializable` is `True` above.
+
+        Returns:
+          An input_fn that can be consumed by `estimator.eval()`.
+        """
+        if parse_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            parse_fn = self._feature_config.get_parse_fn()
+
+        if not self._eval_files:
+            raise ValueError(
+                "`eval_files` was not present in `params` passed to `DataRecordTrainer`"
+            )
+
+        if not callable(parse_fn):
+            raise ValueError("Expecting parse_fn to be a function.")
+
+        if log_tf_data_summaries and not initializable:
+            raise ValueError("Require `initializable` if `log_tf_data_summaries`.")
+
+        if repeat is None:
+            repeat = self.params.eval_steps > 0
+
+        return lambda: twml.input_fns.default_input_fn(
+            files=self._eval_files,
+            batch_size=self.params.eval_batch_size,
+            parse_fn=parse_fn,
+            num_threads=self.params.num_threads,
+            repeat=repeat,
+            keep_rate=self.params.eval_keep_rate,
+            parts_downsampling_rate=self.params.eval_parts_downsampling_rate,
+            shuffle=shuffle,
+            shuffle_files=(shuffle if shuffle_files is None else shuffle_files),
+            interleave=interleave,
+            initializable=initializable,
+            log_tf_data_summaries=log_tf_data_summaries,
+            **kwargs,
+        )
+
+    def _assert_train_files(self) -> None:
+        if not self._train_files:
+            raise ValueError(
+                "train.data_dir was not set in params passed to DataRecordTrainer."
+            )
+
+    def _assert_eval_files(self) -> None:
+        if not self._eval_files:
+            raise ValueError(
+                "eval.data_dir was not set in params passed to DataRecordTrainer."
+            )
+
+    def train(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+    ) -> None:
+        """
+        Makes input functions optional. input_fn defaults to self.get_train_input_fn().
+        See Trainer for more detailed documentation.
+        """
+        if input_fn is None:
+            self._assert_train_files()
+        input_fn = input_fn if input_fn else self.get_train_input_fn()
+        super(DataRecordTrainer, self).train(
+            input_fn=input_fn, steps=steps, hooks=hooks
+        )
+
+    def evaluate(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+        name: Optional[str] = None,
+    ) -> Dict[str, float]:
+        """
+        Makes input functions optional. input_fn defaults to self.get_eval_input_fn().
+        See Trainer for more detailed documentation.
+        """
+        if input_fn is None:
+            self._assert_eval_files()
+        input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False)
+        return super(DataRecordTrainer, self).evaluate(
+            input_fn=input_fn, steps=steps, hooks=hooks, name=name
+        )
+
+    def learn(
+        self,
+        train_input_fn: Optional[Callable] = None,
+        eval_input_fn: Optional[Callable] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Overrides ``Trainer.learn`` to make ``input_fn`` functions optional.
+        Respectively, ``train_input_fn`` and ``eval_input_fn`` default to
+        ``self.get_train_input_fn()`` and ``self.get_eval_input_fn()``.
+ See ``Trainer.learn`` for more detailed documentation. + """ + if train_input_fn is None: + self._assert_train_files() + if eval_input_fn is None: + self._assert_eval_files() + train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() + eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() + + super(DataRecordTrainer, self).learn( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, **kwargs + ) + + def train_and_evaluate( + self, + train_input_fn: Optional[Callable] = None, + eval_input_fn: Optional[Callable] = None, + **kwargs, + ) -> None: + """ + Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. + Respectively, ``train_input_fn`` and ``eval_input_fn`` default to + ``self.train_input_fn`` and ``self.eval_input_fn``. + See ``Trainer.train_and_evaluate`` for detailed documentation. + """ + if train_input_fn is None: + self._assert_train_files() + if eval_input_fn is None: + self._assert_eval_files() + train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() + eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() + + super(DataRecordTrainer, self).train_and_evaluate( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, **kwargs + ) + + def _model_fn( + self, + features: Dict[str, tf.Tensor], + labels: tf.Tensor, + mode: tf.estimator.ModeKeys, + params: Dict[str, Any], + config: Optional[tf.estimator.RunConfig] = None, + ) -> tf.estimator.EstimatorSpec: + """ + Overrides the _model_fn to correct for the features shape of the sparse features + extracted with the contrib.FeatureConfig + """ + if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig): + # Fix the shape of the features. The features dictionary will be modified to + # contain the shape changes. + twml.util.fix_shape_sparse(features, self._feature_config) + return super(DataRecordTrainer, self)._model_fn( + features=features, labels=labels, mode=mode, params=params, config=config + ) + + def calibrate( + self, + calibrator, + input_fn: Optional[Callable] = None, + steps: Optional[int] = None, + save_calibrator: bool = True, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + ) -> None: + """ + Makes input functions optional. input_fn defaults to self.train_input_fn. + See Trainer for more detailed documentation. + """ + if input_fn is None: + self._assert_train_files() + input_fn = input_fn if input_fn else self.get_train_input_fn() + super(DataRecordTrainer, self).calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=steps, + save_calibrator=save_calibrator, + hooks=hooks, + ) + + def save_checkpoints_and_export_model( + self, + serving_input_receiver_fn: Callable[ + [], tf.estimator.export.ServingInputReceiver + ], + export_output_fn: Optional[Callable] = None, + export_dir: Optional[str] = None, + checkpoint_path: Optional[str] = None, + input_fn: Optional[Callable] = None, + ) -> None: + """ + Exports saved module after saving checkpoint to save_dir. + Please note that to use this method, you need to assign a loss to the output + of the build_graph (for the train mode). + See export_model for more detailed information. 
+ """ + self.train(input_fn=input_fn, steps=1) + self.export_model( + serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path + ) + + def save_checkpoints_and_evaluate( + self, + input_fn: Optional[Callable] = None, + steps: Optional[int] = None, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + name: Optional[str] = None, + ) -> Dict[str, float]: + """ + Evaluates model after saving checkpoint to save_dir. + Please note that to use this method, you need to assign a loss to the output + of the build_graph (for the train mode). + See evaluate for more detailed information. + """ + self.train(input_fn=input_fn, steps=1) + return self.evaluate(input_fn, steps, hooks, name) diff --git a/twml/twml/trainers/trainer.py b/twml/twml/trainers/trainer.py index e51b4e0fd..7178b3a83 100644 --- a/twml/twml/trainers/trainer.py +++ b/twml/twml/trainers/trainer.py @@ -66,49 +66,58 @@ """ +import argparse import datetime import functools import math -from operator import itemgetter import os import pprint as pp import random -from string import Template import subprocess import sys import time +from operator import itemgetter +from string import Template from threading import Thread +from typing import Any, Callable, Dict, List, Optional, Union +from absl import logging from twitter.common.metrics import AtomicGauge from twitter.deepbird.stats_server import utils as stats_server_utils from twitter.deepbird.stats_server.stats_exporter import StatsExporter from twitter.ml.common import metrics -from twitter.ml.common.kubernetes import kubectl_delete_by_name, Resource -from twitter.ml.twml.status import get_distributed_training_job_status, TrainingJobStatus +from twitter.ml.common.kubernetes import Resource, kubectl_delete_by_name +from twitter.ml.twml.status import ( + TrainingJobStatus, + get_distributed_training_job_status, +) -from absl import logging -from twml.optimizers import LazyAdamOptimizer, optimize_loss, OPTIMIZER_SUMMARIES from twml.contrib.optimizers import DeepGradientCompressionOptimizer +from twml.optimizers import OPTIMIZER_SUMMARIES, LazyAdamOptimizer, optimize_loss from twml.tracking import ExperimentTracker -from twml.util import (delete_file_or_dir, - get_distributed_training_job_path, - sanitize_hdfs_path) +from twml.util import ( + delete_file_or_dir, + get_distributed_training_job_path, + sanitize_hdfs_path, +) + try: - from urllib import quote as encode_url + from urllib import quote as encode_url except ImportError: - from urllib.parse import quote as encode_url -import tensorflow.compat.v1 as tf + from urllib.parse import quote as encode_url + import tensorflow +import tensorflow.compat.v1 as tf import tensorflow_hub as hub - import twitter.ml.twml.kubernetes.status as k8s_status + import twml import twml.export_output_fns import twml.learning_rate_decay import twml.metrics - -_CLUSTER_TEMPLATE = Template('''{ +_CLUSTER_TEMPLATE = Template( + """{ "cluster": { "ps": [$PS], "chief": [$CHIEF], @@ -116,1662 +125,1872 @@ }, "task": {"type": "$TYPE", "index": $INDEX} } -''') +""" +) -def init_from_checkpoint(init_dir, init_map): - """ - Wrapper around tf.train.init_from_checkpoint - """ - if init_dir: - init_dir = sanitize_hdfs_path(init_dir) - tf.train.init_from_checkpoint(init_dir, init_map) +def init_from_checkpoint(init_dir: str, init_map: Dict[str, str] = None) -> None: + """ + Wrapper around tf.train.init_from_checkpoint + """ + if init_dir: + init_dir = sanitize_hdfs_path(init_dir) + tf.train.init_from_checkpoint(init_dir, init_map) class 
Trainer(object): - """ - This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier. - Supports multi-phase training (for example, use a Trainer for MDL calibration, then - another for training the rest of the model, then another for isotonic calibration). - The Trainer also implements a training and evaluation loop via the ``learn()`` method. - Each Trainer is associated to a fixed set of hyper parameters (params), and a single model - specified by ``build_graph``. Given these constraints, a single Trainer can be called - multiple times for training and evaluation over multiple epochs. - - However, if you intend to try different sets of hyper-parameters, we recommend you instantiate - a different Trainer for each such experiment. That way, each experiment can be tracked - in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain - checkpoints of the model (its graph, and variables), and the history of metrics (for example, - evaluation accuracy at each epoch), and other store observations like the average time per step. - The latter metrics can be viewed by pointing - TensorBoard to the save_dir and accessing TensorBoard via your browser. - """ - - def __init__(self, name, params, build_graph_fn, - metric_fn=None, - optimize_loss_fn=None, - run_config=None, - save_dir=None, - init_from_dir=None, - init_map=None, - warm_start_from=None, - profiler_steps=None, - **kwargs): + """ + This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier. + Supports multi-phase training (for example, use a Trainer for MDL calibration, then + another for training the rest of the model, then another for isotonic calibration). + The Trainer also implements a training and evaluation loop via the ``learn()`` method. + Each Trainer is associated to a fixed set of hyper parameters (params), and a single model + specified by ``build_graph``. Given these constraints, a single Trainer can be called + multiple times for training and evaluation over multiple epochs. + + However, if you intend to try different sets of hyper-parameters, we recommend you instantiate + a different Trainer for each such experiment. That way, each experiment can be tracked + in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain + checkpoints of the model (its graph, and variables), and the history of metrics (for example, + evaluation accuracy at each epoch), and other store observations like the average time per step. + The latter metrics can be viewed by pointing + TensorBoard to the save_dir and accessing TensorBoard via your browser. """ - Args: - name (String): - string name of this estimator; used as scope names for variables and tensors. - params (HParams, Namespace, or Dict): - hyper-parameters to be passed to Estimator constructor. - Must include params.train_batch_size and params.eval_batch_size. - Note that params is passed to twml.util.convert_to_hparams() to produce an HParams. - build_graph_fn: - A function for building tensorflow graphs. - This matches TensorFlow Estimator's model_fn signature. - For example, - - .. 
code-block:: python + def __init__( + self, + name: str, + params: Union[Dict, tf.contrib.training.HParams, argparse.Namespace], + build_graph_fn: Callable, + metric_fn: Callable[[tf.Tensor, tf.Tensor], tf.Tensor] = None, + optimize_loss_fn: Callable = optimize_loss, + run_config: tf.estimator.RunConfig = None, + save_dir: str = None, + init_from_dir: str = None, + init_map: Dict[str, str] = None, + warm_start_from_dir: str = None, + profiler_steps: int = 0, + **kwargs, + ): + """ - def build_graph(features, label, mode, params, config=None): - # Implements a simple binary logistic regression model - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) + Args: + name (String): + string name of this estimator; used as scope names for variables and tensors. + params (HParams, Namespace, or Dict): + hyper-parameters to be passed to Estimator constructor. + Must include params.train_batch_size and params.eval_batch_size. + Note that params is passed to twml.util.convert_to_hparams() to produce an HParams. + build_graph_fn: + A function for building tensorflow graphs. + This matches TensorFlow Estimator's model_fn signature. + For example, + + .. code-block:: python + + def build_graph(features, label, mode, params, config=None): + # Implements a simple binary logistic regression model + sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) + logits = twml.layers.full_sparse(sparse_tf, 1 << params.input_size_bits, 1) + if mode == 'infer': + loss = None + else: + loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) + loss = twml.util.weighted_average(loss, features['weights']) + output = tf.nn.sigmoid(logits) + return {'output': output, 'loss': loss} + + features (dict of Tensor keyed by a string name): + input tensors. + mode (tf.estimator.ModeKeys / String): + one of 'train', 'eval', 'infer'. + label (Tensor): + if in ``mode == 'train'`` mode, these contain the corresponding labels for input. + params (HParams): + hyper parameters that control how to build a graph. + config: + the RunConfig object passed to Estimator constructor. + + This function is expected to return a dictionary containing the following keys: + + * 'output': a node representing model output; required. + * 'loss': (required) a loss node used for optimization; required for training and evaluation. + * 'train_op': (optional) an operation that minimizes the loss (as output by `tf.train.Optimizer.minimize`). + If train_op is specified, train_op is used for optimization as opposed to loss. Loss is always logged to tensorboard. + + Notes: + + * any tf.summary written inside build graph are logged to tensorboard during training. + * the ``build_graph_fn`` is called once or twice per epoch (once per training, + once per evaluation). All data loading (and preprocessing) logic not required + for serving should be in the ``input_fn`` passed to ``learn``, ``train``, + ``evaluate``, etc. + + optimize_loss_fn: + Defaults to Trainer.get_train_op. A function that takes params and loss as arguments + and returns a training op. The training op is used to update parameters (that is, to learn). + metric_fn: + A function that returns the eval_metric_ops dict given graph_output, labels and weights. + Defaults to None. + Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn`` + which implements many binary classification metrics. + run_config (RunConfig): + optional configuration to be passed to Estimator constructor. Defaults to None. 
+          save_dir (String):
+            optional directory where to save model checkpoints,
+            tensorboard event files and trained parameters.
+            Overwrites and defaults to run_config.model_dir.
+          init_from_dir (String):
+            optional directory to load weights from.
+            if set to None (the default), do not init from any directory.
+          init_map (map from String to String):
+            Must be specified if init_from_dir is specified.
+            Defines which scopes and variables to load.
+            Keys are the variables and scopes to load from the directory.
+            Values are the destinations (in the current graph) to load into.
+            See tf.init_from_checkpoint for more information.
+            Note that the trainer prepends name_scope of the form `name`/model/ to the name_scope
+            of any variable defined inside `build_graph_fn` and this should be taken into account when
+            defining the values.
+          warm_start_from:
+            Optional string filepath to a checkpoint to warm-start from,
+            or a tf.estimator.WarmStartSettings object to fully configure warm-starting.
+            If the string filepath is provided instead of a WarmStartSettings,
+            then all variables are warm-started, and it is assumed that
+            vocabularies and Tensor names are unchanged.
+          profiler_steps (Integer):
+            Defaults to None. If set, defines the number of steps in the
+            ``tf.train.ProfilerHook``.
+            Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds.
+            When executing ``learn``, ``train`` or ``predict`` methods,
+            with ``profiler_steps`` set to a number,
+            a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data
+            stored in Chrome trace format. To view stored data, use the Chrome browser to follow
+            these steps:
+
+            1) Go to the page chrome://tracing.
+            2) In the upper left corner, you will find the Load button.
+            3) Press it and load the JSON file, which can be found in the ``save_dir``
+
+            *Warning*: This could create too many of these JSON files, which can be a potential problem,
+            e.g. for HDFS there is normally a quota on file count, so use with caution.
+
+            Note: this argument is ignored when a non-None ``hooks`` argument is passed to
+            ``train``, ``learn``, or ``predict`` methods. The hook can be added manually by passing
+            ``trainer.train(..., hooks=myhooks.extend(trainer.get_train_hooks()))``, for example.
+        """
+
+        if tensorflow.__version__ >= "2.0":
+            raise RuntimeError("Trainer not yet supported for Tensorflow >= 2.0")
+
+        self._name = name
+        self._build_graph_fn = build_graph_fn
+        self._metric_fn = metric_fn
+        self._tensorboard_handle = None
+        self._current_estimator_spec = None  # holds the current estimator spec
+        self._profiler_steps = profiler_steps
+        self._export_output_fn = None
+        self._is_early_stopping = False
+
+        # NOTE: Sanitize all HDFS paths first.
+        save_dir = sanitize_hdfs_path(save_dir)
+        init_from_dir = sanitize_hdfs_path(init_from_dir)
+
+        # warm_start_from can be of type tf.estimator.WarmStartSettings.
+        if isinstance(warm_start_from, str):
+            warm_start_from = sanitize_hdfs_path(warm_start_from)
+
+        # convert to twitter.deepbird.hparam.hparam.HParams object
+        params = twml.util.convert_to_hparams(params)
+
+        # keep a copy of the params because calling self._estimator.params creates a deepcopy
+        self._params = params
+        self.check_params()
+
+        self._using_hogwild = True if os.environ.get("TWML_HOGWILD_PORTS") else False
+        # configure Hogwild (needs to be called before RunConfig is created)
+        self._hogwild_setup()
+
+        if not run_config:
+            session_config = tf.ConfigProto()
+            # By default each process tries to allocate (almost) all of the memory.
+            # This option ensures the gpu memory grows dynamically instead.
+            session_config.gpu_options.allow_growth = True  # pylint: disable=no-member
+
+            if "TWML_NUM_CPUS" in os.environ:
+                num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8"))
+                if params.num_mkl_threads > 1:
+                    os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads)
+                    os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads)
+                    session_config.inter_op_parallelism_threads = (
+                        num_available_cpus // params.num_mkl_threads
+                    )
+                    session_config.intra_op_parallelism_threads = params.num_mkl_threads
+
+            run_config = tf.estimator.RunConfig(
+                session_config=session_config,
+                keep_checkpoint_max=self._params.get("keep_checkpoint_max", 20),
+                log_step_count_steps=10000,
+                save_checkpoints_secs=self._params.get("save_checkpoints_secs", 600),
+                tf_random_seed=self._tf_random_seed(),
+            )
+        elif not isinstance(run_config, tf.estimator.RunConfig):
+            raise ValueError(
+                "Expecting run_config argument of type None or tf.estimator.RunConfig. "
+                "Got %s instead." % type(run_config).__name__
+            )
+        elif os.environ.get("TWML_HOGWILD_PORTS"):
+            raise ValueError("Custom RunConfig not supported with Hogwild")
+
+        if run_config.model_dir is None and save_dir is None:
+            raise ValueError(
+                "Expecting either save_dir or run_config.model_dir to be specified. Got None for each."
+            )
+        elif run_config.model_dir is None:
+            run_config = run_config.replace(model_dir=save_dir)
+        elif save_dir is None:
+            save_dir = run_config.model_dir
+
+        self._save_dir = save_dir
+        self.experiment_tracker = ExperimentTracker(
+            self._params, run_config, self._save_dir
+        )
-
+        # Check if we should delete the tsd running this training job. In certain use cases where
+        # there are other tf operations following trainer.train_and_evaluate (or trainer.learn),
+        # additional state files need to be specified to ensure those steps are executed after job restart.
+        kwargs["gke_state_files"] = kwargs.get("gke_state_files", ["_SUCCESS"])
+        self._maybe_del_tsd_exit(kwargs["gke_state_files"])
+        logging.info(
+            "Checkpoint and event files will be saved at save_dir=%s", save_dir
+        )
+        self._optimize_loss_fn = (
+            self.get_train_op if optimize_loss_fn is None else optimize_loss_fn
+        )
-    if mode == 'infer':
-      loss = None
+        # overwrite the current save_dir
+        if self._params.get("overwrite_save_dir") and tf.io.gfile.exists(
+            self._save_dir
+        ):
+            logging.info(
+                "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)"
+                % self._save_dir
+            )
+            # if distributed or hogwild:
+            if self._params.get("distributed", False):
+                # sleep for 30 seconds to allow each worker to get to this point.
+ time.sleep(30) + if run_config.is_chief: + logging.info("Chief deleting the save_dir now") + delete_file_or_dir(self._save_dir) + # sleep for 30 seconds to allow each worker to get to this point. + time.sleep(30) else: - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) - loss = twml.util.weighted_average(loss, features['weights']) + delete_file_or_dir(self._save_dir) + + # Exposing stats to a /vars.json endpoint that will be collected + # by the absorber + if self._params.get("stats_port"): + try: + stats_server_utils.start_stats_server( + self._params.get("stats_port"), self._save_dir + ) + except Exception as err: + logging.error("Failed to start the stats server. Error: %s", str(err)) + + checkpoint = os.path.join(self._save_dir, "checkpoint") + if tf.io.gfile.exists(checkpoint): + logging.info( + "The provided save_dir directory %s already exists." + " Training will be resumed." % checkpoint + ) + + self._maybe_restore_checkpoint = lambda: init_from_checkpoint( + init_from_dir, init_map + ) - output = tf.nn.sigmoid(logits) + if init_from_dir is not None and init_map is None: + raise ValueError("Need to provide init_map when init_from_dir is provided.") - return {'output': output, 'loss': loss} + if not tf.io.gfile.exists(self._save_dir): + # so tensorboard can point to a directory that exists + tf.io.gfile.mkdir(self._save_dir) - Args: - features (dict of Tensor keyed by a string name): - input tensors. - mode (tf.estimator.ModeKeys / String): - one of 'train', 'eval', 'infer'. - label (Tensor): - if in ``mode == 'train'`` mode, these contain the corresponding labels for input. - params (HParams): - hyper parameters that control how to build a graph. - config: - the RunConfig object passed to Estimator constructor. - - This function is expected to return a dictionary containing the following keys: - - * 'output': a node representing model output; required. - * 'loss': (required) a loss node used for optimization; required for training and - evaluation. - * 'train_op': (optional) an operation that minimizes the loss (as output by - `tf.train.Optimizer.minimize`). If train_op is specified, train_op is used - for optimization as opposed to loss. Loss is always logged to tensorboard. - - Notes: - - * any tf.summary written inside build graph are logged to tensorboard during training. - * the ``build_graph_fn`` is called once or twice per epoch (once per training, - once per evaluation). All data loading (and preprocessing) logic not required - for serving should be in the ``input_fn`` passed to ``learn``, ``train``, - ``evalulate``, etc. - - optimize_loss_fn: - Defaults to Trainer.get_train_op. A function that takes params and loss as arguments - and returns a training op. The training op is used to update parameters (that is, to learn). - metric_fn: - A function that returns the eval_metric_ops dict given graph_output, labels and weights. - Defaults to None. - Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn`` - which implements many binary classification metrics. - run_config (RunConfig): - optional configuration to be passed to Estimator constructor. Defaults to None. - save_dir (String): - optional directory where to save model checkpoints, - tensorboard event files and trained parameters. - Overwrites and defaults to run_config.model_dir. - init_from_dir (String): - optional directory to load weights from. - if set to None (the default), do not init from any directory. 
- init_map (map from String to String): - Must be specified if init_from_dir is specified. - Defines which scopes and variables to load. - Keys are the variables and scopes to load from the directory. - Values are the destinations (in the current graph) to load into. - See tf.init_from_checkpoint for more information. - Note that the the trainer prepends name_scope of the form `name`/model/ to the name_scope - of any variable defined inside `build_graph_fn` and this should be taken into account when - defining the values. - warm_start_from: - Optional string filepath to a checkpoint to warm-start from, - or a tf.estimator.WarmStartSettings object to fully configure warm-starting. - If the string filepath is provided instead of a WarmStartSettings, - then all variables are warm-started, and it is assumed that - vocabularies and Tensor names are unchanged. - profiler_steps (Integer): - Defaults to None. If set defines the number of steps in the - `tf.train.ProfileHook `_. - Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds. - When executing ``learn``, ``train`` or ``predict`` methods, - with ``profiler_steps`` set to a number, - a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data - storedin Chrome trace format. To view stored data, use the Chrome browser to follow - these steps: - - 1) Go to the page chrome://tracing. - 2) In the upper left corner, you will find Load button. - 3) Press it and load our JSON file, which can be found in the ``save_dir`` - - *Warning*: This could create too many these json files which can be a potential problem, - e.g. for HDFS there is normally quota forfile count, so use with caution. - - Note: this argument is ignored when a non-None ``hooks`` argument is pasesd to - ``train``, ``learn``, or ``predict`` methods. The hook can be added manually by passing - ``trainer.train(..., hooks=myhooks.extend(trainer.get_train_hooks()))``, for example. - """ - - if tensorflow.__version__ >= "2.0": - RuntimeError("Trainer not yet supported for Tensorflow >= 2.0") - - self._name = name - self._build_graph_fn = build_graph_fn - self._metric_fn = metric_fn - self._tensorboard_handle = None - self._current_estimator_spec = None # holds the current estimator spec - self._profiler_steps = profiler_steps - self._export_output_fn = None - self._is_early_stopping = False - - # NOTE: Sanitize all HDFS paths first. - save_dir = sanitize_hdfs_path(save_dir) - init_from_dir = sanitize_hdfs_path(init_from_dir) - - # warm_start_from can be of type tf.estimator.WarmStartSettings. - if isinstance(warm_start_from, str): - warm_start_from = sanitize_hdfs_path(warm_start_from) - - # convert to twitter.deepbird.hparam.hparam.HParams object - params = twml.util.convert_to_hparams(params) - - # keep a copy of the params because calling self._estimator.params creates a deepcopy - self._params = params - self.check_params() - - self._using_hogwild = True if os.environ.get('TWML_HOGWILD_PORTS') else False - # configure Hogwild (needs to be called before RunConfig is created) - self._hogwild_setup() - - if not run_config: - session_config = tf.ConfigProto() - # By default each process tries to allocate (almost) all of the memory. - # This option ensures the gpu memory grows dynamically instead. 
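# When run_config is None, the constructor above builds a RunConfig equivalent to
# the sketch below (the numeric values are the defaults visible in this diff).
# Note that, per the checks above, a custom RunConfig is rejected when Hogwild
# ports are set.
import tensorflow as tf

session_config = tf.ConfigProto()
session_config.gpu_options.allow_growth = True  # grow GPU memory on demand

run_config = tf.estimator.RunConfig(
    session_config=session_config,
    keep_checkpoint_max=20,
    log_step_count_steps=10000,
    save_checkpoints_secs=600,
)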
- session_config.gpu_options.allow_growth = True # pylint: disable=no-member - - if 'TWML_NUM_CPUS' in os.environ: - num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8")) - if params.num_mkl_threads > 1: - os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads) - os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads) - session_config.inter_op_parallelism_threads = num_available_cpus // params.num_mkl_threads - session_config.intra_op_parallelism_threads = params.num_mkl_threads - - run_config = tf.estimator.RunConfig( - session_config=session_config, - keep_checkpoint_max=self._params.get('keep_checkpoint_max', 20), - log_step_count_steps=10000, - save_checkpoints_secs=self._params.get('save_checkpoints_secs', 600), - tf_random_seed=self._tf_random_seed()) - elif not isinstance(run_config, tf.estimator.RunConfig): - raise ValueError("Expecting run_config argument of type None or tf.estimator.RunConfig" - "Got %s instead." % type(run_config).__name__) - elif os.environ.get('TWML_HOGWILD_PORTS'): - raise ValueError("Custom RunConfig not supported with Hogwild") - - if run_config.model_dir is None and save_dir is None: - raise ValueError( - "Expecting either save_dir or run_config.model_dir to be specified. Got None for each.") - elif run_config.model_dir is None: - run_config = run_config.replace(model_dir=save_dir) - elif save_dir is None: - save_dir = run_config.model_dir - - self._save_dir = save_dir - self.experiment_tracker = ExperimentTracker(self._params, run_config, self._save_dir) - - # Check if should delete the tsd running this training job. In certain use case when - # there are other tf operations following trainer.train_and_evaluate (or trainer.learn), - # additional state files need to be specified to ensure those steps are executed after job restart. - kwargs['gke_state_files'] = kwargs.get('gke_state_files', ['_SUCCESS']) - self._maybe_del_tsd_exit(kwargs['gke_state_files']) - logging.info("Checkpoint and event files will be saved at save_dir=%s", save_dir) - self._optimize_loss_fn = self.get_train_op if optimize_loss_fn is None else optimize_loss_fn - - # overwrite the current save_dir - if self._params.get('overwrite_save_dir') and tf.io.gfile.exists(self._save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % self._save_dir) - # if distributed or hogwild: - if self._params.get('distributed', False): - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - if run_config.is_chief: - logging.info("Chief deleting the save_dir now") - delete_file_or_dir(self._save_dir) - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - else: - delete_file_or_dir(self._save_dir) - - # Exposing stats to a /vars.json endpoint that will be collected - # by the absorber - if self._params.get('stats_port'): - try: - stats_server_utils.start_stats_server(self._params.get('stats_port'), self._save_dir) - except Exception as err: - logging.error('Failed to start the stats server. Error: %s', str(err)) - - checkpoint = os.path.join(self._save_dir, 'checkpoint') - if tf.io.gfile.exists(checkpoint): - logging.info("The provided save_dir directory %s already exists." - " Training will be resumed." 
- % checkpoint) - - self._maybe_restore_checkpoint = lambda: init_from_checkpoint(init_from_dir, init_map) - - if init_from_dir is not None and init_map is None: - raise ValueError("Need to provide init_map when init_from_dir is provided.") - - if not tf.io.gfile.exists(self._save_dir): - # so tensorboard can point to a directory that exists - tf.io.gfile.mkdir(self._save_dir) - - self._estimator = tf.estimator.Estimator( - model_fn=self._model_fn, - params=self._params, # HParams - config=run_config, # RunConfig - warm_start_from=warm_start_from, - model_dir=self._save_dir, # By this point it is same as run_config.model_dir - ) - - # Log parameters that are used to construct trainer. This allows people to see default values. - logging.info("Trainer constructed using the following parameters: ") - pp_params = pp.pformat(self._params.values()) - logging.info(pp_params) - - # Start TensorBoard - if self._params.get('disable_tensorboard', False): - logging.info("Skipping launching TensorBoard [--disable_tensorboard is set]") - elif "tensorboard_port" in self._params.values() and self._params.tensorboard_port is not None: - self.start_tensorboard(self._params.tensorboard_port) - - # Export gauge that will track whether a model was exported - self.stats_exporter = StatsExporter("twml.trainer") - self.export_gauge = AtomicGauge('export_model') - self.stats_exporter.register_metrics(self.export_gauge) - - def _hogwild_setup(self): - """ - Setup the parameters required for hogwild. - """ - self._num_workers = self._params.get('num_workers') or 1 - logging.info("NUM_WORKERS: %d", self._num_workers) - if self._num_workers <= 1: - self._ports = None - return - - # a hogwild job is considered distributed - if 'distributed' in self._params: - self._params.set_hparam('distributed', True) - else: - self._params.add_hparam('distributed', True) - - ports = os.environ.get('TWML_HOGWILD_PORTS') - if ports: - self._ports = [int(port) for port in ports.strip().split(",")] - if (self._num_workers + 1!= len(self._ports)): - raise ValueError("Number of (workers + PS) and ports need to match") - else: - if self._num_workers > 1: - raise ValueError("TWML_HOGWILD_PORTS needs to be set to use hogwild training") - - # Split the number of data threads across multiple workers - num_threads = self._params.get('num_threads') - num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers)) - self._params.set_hparam('num_threads', num_threads_per_worker) - - hogwild_task_type = os.environ.get('TWML_HOGWILD_TASK_TYPE') - hogwild_task_id = int(os.environ.get('TWML_HOGWILD_TASK_ID')) - os.environ['TF_CONFIG'] = self._get_cluster_config(hogwild_task_type, hogwild_task_id) - - def _tf_random_seed(self): - """ Returns user set seed and deal with Hogwild multiple seeds """ - tf_random_seed = self._params.get('tf_random_seed', None) - if tf_random_seed is None: - return None - elif self.using_hogwild and os.environ.get('TWML_HOGWILD_TASK_TYPE') == 'worker': - # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)... 
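# The Hogwild seed rule in the comment above amounts to an offset of 1 + task_id
# for worker tasks; a worked sketch (values hypothetical):
def derived_seed(tf_random_seed, task_type, task_id):
    # chief keeps the base seed; worker_i gets base + 1 + i
    if task_type == "worker":
        return tf_random_seed + 1 + task_id
    return tf_random_seed

assert derived_seed(42, "chief", 0) == 42
assert derived_seed(42, "worker", 1) == 44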
- return tf_random_seed + 1 + int(os.environ.get('TWML_HOGWILD_TASK_ID')) - else: - return tf_random_seed - - def check_params(self): - """ Verify that params has the correct key,values """ - param_values = self._params.values() - - if 'train_batch_size' in param_values: - if not isinstance(self._params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if self._params.train_batch_size <= 0: - raise ValueError("train_batch_size needs to be positive") - else: - raise ValueError("train_batch_size needs to be present in params") - - if 'eval_batch_size' in param_values: - if not isinstance(self._params.eval_batch_size, int): - raise ValueError("Expecting params.eval_batch_size to be an integer.") - if self._params.eval_batch_size <= 0: - raise ValueError("eval_batch_size needs to be positive.") - else: - self._params.add_hparam('eval_batch_size', self._params.train_batch_size) - - if (self._params.get('distributed_training_cleanup') and - not self._params.get('distributed')): - # we only need to support training discontinuation for distributed training - # bc we are still using TSDs on GKE for distributed training - raise ValueError( - "Expecting params.distributed to be set if " - "params.distributed_training_cleanup is set." - ) - - def _get_cluster_config(self, name, index): - """Create a tensorflow cluster config from ports, name and index""" - host = '"localhost:%d"' - ps = host % self._ports[0] - chief = host % self._ports[1] - workers = ", ".join([host % port for port in self._ports[2:]]) - config = _CLUSTER_TEMPLATE.substitute( - PS=ps, - CHIEF=chief, - WORKER=workers, - TYPE=name, - INDEX=index, - ) - return config - - @property - def current_estimator_spec(self): - """ - returns the current estimator (warning: often reset) - """ - return self._current_estimator_spec - - @property - def estimator(self): - """ returns estimator encapsulated by Trainer """ - return self._estimator + self._estimator = tf.estimator.Estimator( + model_fn=self._model_fn, + params=self._params, # HParams + config=run_config, # RunConfig + warm_start_from=warm_start_from, + model_dir=self._save_dir, # By this point it is same as run_config.model_dir + ) - @property - def num_workers(self): - """ returns number of workers """ - return self._estimator.config.num_worker_replicas + # Log parameters that are used to construct trainer. This allows people to see default values. + logging.info("Trainer constructed using the following parameters: ") + pp_params = pp.pformat(self._params.values()) + logging.info(pp_params) + + # Start TensorBoard + if self._params.get("disable_tensorboard", False): + logging.info( + "Skipping launching TensorBoard [--disable_tensorboard is set]" + ) + elif ( + "tensorboard_port" in self._params.values() + and self._params.tensorboard_port is not None + ): + self.start_tensorboard(self._params.tensorboard_port) + + # Export gauge that will track whether a model was exported + self.stats_exporter = StatsExporter("twml.trainer") + self.export_gauge = AtomicGauge("export_model") + self.stats_exporter.register_metrics(self.export_gauge) + + def _hogwild_setup(self) -> None: + """ + Setup the parameters required for hogwild. 
+ """ + self._num_workers = self._params.get("num_workers") or 1 + logging.info("NUM_WORKERS: %d", self._num_workers) + if self._num_workers <= 1: + self._ports = None + return + + # a hogwild job is considered distributed + if "distributed" in self._params: + self._params.set_hparam("distributed", True) + else: + self._params.add_hparam("distributed", True) + + ports = os.environ.get("TWML_HOGWILD_PORTS") + if ports: + self._ports = [int(port) for port in ports.strip().split(",")] + if self._num_workers + 1 != len(self._ports): + raise ValueError("Number of (workers + PS) and ports need to match") + else: + if self._num_workers > 1: + raise ValueError( + "TWML_HOGWILD_PORTS needs to be set to use hogwild training" + ) + + # Split the number of data threads across multiple workers + num_threads = self._params.get("num_threads") + num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers)) + self._params.set_hparam("num_threads", num_threads_per_worker) + + hogwild_task_type = os.environ.get("TWML_HOGWILD_TASK_TYPE") + hogwild_task_id = int(os.environ.get("TWML_HOGWILD_TASK_ID")) + os.environ["TF_CONFIG"] = self._get_cluster_config( + hogwild_task_type, hogwild_task_id + ) - @property - def worker_index(self): - """ - returns index of worker in the cluster - chief has index 0 - non-chief workers have indices 1 through (num_workers - 1) - """ - return self._estimator.config.global_id_in_cluster - - @property - def using_hogwild(self): - """ returns a bool indicating whether hogwild is being used """ - return self._using_hogwild - - def set_estimator(self, estimator): - """ sets the estimator used internally by Trainer """ - if not isinstance(estimator, tf.estimator.Estimator): - raise ValueError("Expecting tf.estimator.Estimator") - self._estimator = estimator - self._params = self.estimator.params - - @property - def params(self): - """ - returns the hyper-parameters passed to the constructor. - """ - return self._params + def _tf_random_seed(self) -> int: + """Returns user set seed and deal with Hogwild multiple seeds""" + tf_random_seed = self._params.get("tf_random_seed", None) + if tf_random_seed is None: + return None + elif ( + self.using_hogwild and os.environ.get("TWML_HOGWILD_TASK_TYPE") == "worker" + ): + # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)... 
+            return tf_random_seed + 1 + int(os.environ.get("TWML_HOGWILD_TASK_ID"))
+        else:
+            return tf_random_seed
+
+    def check_params(self) -> None:
+        """Verify that params has the correct keys and values."""
+        param_values = self._params.values()
+
+        if "train_batch_size" in param_values:
+            if not isinstance(self._params.train_batch_size, int):
+                raise ValueError("Expecting params.train_batch_size to be an integer.")
+            if self._params.train_batch_size <= 0:
+                raise ValueError("train_batch_size needs to be positive")
+        else:
+            raise ValueError("train_batch_size needs to be present in params")
+
+        if "eval_batch_size" in param_values:
+            if not isinstance(self._params.eval_batch_size, int):
+                raise ValueError("Expecting params.eval_batch_size to be an integer.")
+            if self._params.eval_batch_size <= 0:
+                raise ValueError("eval_batch_size needs to be positive.")
+        else:
+            self._params.add_hparam("eval_batch_size", self._params.train_batch_size)
+
+        if self._params.get("distributed_training_cleanup") and not self._params.get(
+            "distributed"
+        ):
+            # we only need to support training discontinuation for distributed training
+            # bc we are still using TSDs on GKE for distributed training
+            raise ValueError(
+                "Expecting params.distributed to be set if "
+                "params.distributed_training_cleanup is set."
+            )
+
+    def _get_cluster_config(self, name, index) -> str:
+        """Create a tensorflow cluster config from ports, name and index"""
+        host = '"localhost:%d"'
+        ps = host % self._ports[0]
+        chief = host % self._ports[1]
+        workers = ", ".join([host % port for port in self._ports[2:]])
+        config = _CLUSTER_TEMPLATE.substitute(
+            PS=ps,
+            CHIEF=chief,
+            WORKER=workers,
+            TYPE=name,
+            INDEX=index,
+        )
+        return config
+
+    @property
+    def current_estimator_spec(self) -> tf.estimator.EstimatorSpec:
+        """returns the current estimator spec (warning: often reset)"""
+        return self._current_estimator_spec
+
+    @property
+    def estimator(self) -> tf.estimator.Estimator:
+        """returns estimator encapsulated by Trainer"""
+        return self._estimator
+
+    @property
+    def num_workers(self) -> int:
+        """returns number of workers"""
+        return self._estimator.config.num_worker_replicas
+
+    @property
+    def worker_index(self) -> int:
+        """
+        returns index of worker in the cluster; the chief has index 0,
+        non-chief workers have indices 1 through (num_workers - 1)
+        """
+        return self._estimator.config.global_id_in_cluster
+
+    @property
+    def using_hogwild(self) -> bool:
+        """returns a bool indicating whether hogwild is being used"""
+        return self._using_hogwild
+
+    def set_estimator(self, estimator: tf.estimator.Estimator) -> None:
+        """sets the estimator used internally by Trainer"""
+        if not isinstance(estimator, tf.estimator.Estimator):
+            raise ValueError("Expecting tf.estimator.Estimator")
+        self._estimator = estimator
+        self._params = self.estimator.params
+
+    @property
+    def params(self):
+        """returns the hyper-parameters passed to the constructor."""
+        return self._params
+
+    @staticmethod
+    def add_parser_arguments() -> argparse.ArgumentParser:
+        """
+        Add common commandline args to parse for the Trainer class.
+        Typically, the user calls this function and then parses cmd-line arguments
+        into an argparse.Namespace object which is then passed to the Trainer constructor
+        via the params argument.
+
+        See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_
+        for a list and description of all cmd-line arguments.
+
+        Returns:
+            argparse.ArgumentParser instance with some useful args already added.
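# The flow this docstring describes, sketched end to end; the module path
# twml.trainer and the extra argument are hypothetical.
import twml

parser = twml.trainer.Trainer.add_parser_arguments()
parser.add_argument("--input_size_bits", type=int, default=18)  # model-specific extra
args = parser.parse_args()

trainer = twml.trainer.Trainer(
    name="my_model",             # hypothetical scope name
    params=args,                 # Namespace is converted via twml.util.convert_to_hparams
    build_graph_fn=build_graph,  # as in the constructor docstring example
)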
+ """ + return twml.argument_parser.get_trainer_parser() + + @staticmethod + def get_train_op( + params: tf.contrib.training.HParams, loss: tf.Tensor + ) -> tf.Operation: + """ + Return a training Op, that is, a `twml.optimizers.optimize_loss + `_ + instance given params and loss. + This method can be overwritten by passing the optimize_loss_fn to the Trainer + constructor. - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. + Args: + params: + tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries, + gradient_noise_scale, clip_gradients and learning_rate_decay (including + other learning rate decay arguments). + loss: + scalar Op returned by the build_graph that specifies the training loss to + be minimized. + """ + optimizer = params.get("optimizer") + + if not optimizer: + optimizer = "SGD" + + if optimizer == "LazyAdam": + optimizer = LazyAdamOptimizer + + if optimizer == "DGC": + optimizer = DeepGradientCompressionOptimizer( + learning_rate=params.learning_rate, + use_locking=False, + name="Sparse", + density=params.get("dgc_density"), + density_decay=params.get("dgc_density_decay"), + density_decay_steps=params.get("dgc_density_decay_steps"), + density_decay_rate=params.get("dgc_density_decay_rate"), + min_density=params.get("dgc_min_density"), + accumulation=params.get("dgc_accumulation"), + ) + + summaries = ["loss"] + if params.get("show_optimizer_summaries"): + summaries = OPTIMIZER_SUMMARIES + + train_op = optimize_loss( + loss=loss, + global_step=tf.train.get_global_step(), + optimizer=optimizer, + learning_rate=params.learning_rate, + summaries=summaries, + colocate_gradients_with_ops=True, + gradient_noise_scale=params.get("gradient_noise_scale"), + clip_gradients=params.get("clip_gradients"), + learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn( + params + ), + ) + return train_op + + def export_model_effects( + self, + export_path: str, + feature_spec: Dict[str, Any] = None, + log_features: bool = True, + ) -> None: + """Export model effects to disk.""" + if feature_spec: + if log_features: + features = feature_spec["features"] + feature_names = [ + ".".join(features[fid]["featureName"].split(".")[1:]) + for fid in features.keys() + ] + features_to_log = ",".join(feature_names) + try: + model_hash = self.experiment_tracker.compute_model_hash(export_path) + metrics.log_usage( + "dbv2", + "export_model_effects", + "v1", + custom_attrs=[ + model_hash, + "feature config present", + features_to_log, + ], + ) + except: # noqa: T803 + logging.info("Failed to log Feature Config features") + + twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) + export_start_time = time.time() + self.experiment_tracker.export_feature_spec(feature_spec) + logging.info( + "Exported feature spec to ML Metastore in %s seconds.", + time.time() - export_start_time, + ) + + self.experiment_tracker.register_model(str(export_path)) + self.export_gauge.increment() + + @property + def best_or_latest_checkpoint(self) -> str: + if self._is_early_stopping: + best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") + checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) + # Return best checkpoint if necessary + if checkpoint_path: + return 
+            else:
+                raise ValueError(
+                    "Best checkpoint not found at %s." % best_checkpoint_path
+                )
+        else:  # Fallback to latest checkpoint from save directory
+            return str(self.latest_checkpoint)
+
+    @property
+    def latest_checkpoint(self) -> str:
+        return str(self.estimator.latest_checkpoint())
+
+    def export_model(
+        self,
+        serving_input_receiver_fn: Callable[
+            [], tf.estimator.export.ServingInputReceiver
+        ],
+        export_output_fn: Callable[
+            [tf.estimator.EstimatorSpec], tf.estimator.export.ExportOutput
+        ] = None,
+        export_dir: str = None,
+        checkpoint_path: str = None,
+        feature_spec: Dict[str, Any] = None,
+        log_features: bool = True,
+    ) -> Optional[str]:
+        """
+        Export the model for prediction. Typically, the exported model
+        will later be run in production servers. This method is called
+        by the user to export the PREDICT graph to disk.
+
+        Internally, this method calls ``tf.estimator.Estimator.export_savedmodel``.
+
+        Note that a valid self._export_output_fn is required.
+        If export_output_fn is provided, it is used to set the self._export_output_fn.

-    See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_
-    for a list and description of all cmd-line arguments.

+        Args:
+            serving_input_receiver_fn:
+                function preparing the model for inference requests.
+                This function returns the ``features`` dict passed to ``build_graph``.
+            export_dir:
+                directory to export a SavedModel for prediction servers.
+                Defaults to ``[save_dir]/exported_models``.
+            checkpoint_path:
+                the checkpoint path to export. If None (the default), the most recent checkpoint
+                found within the model directory is chosen.
+            export_output_fn:
+                Function to export the graph_output (output of build_graph) for
+                prediction. Takes a graph_output dict as sole argument and returns
+                the export_output_fns dict.
+                Defaults to `twml.export_output_fns.default_output_fn`.
+
+        Returns:
+            a string path to the exported directory.
+        """
+        if not self.is_chief():
+            logging.info(
+                "Trainer.export_model ignored due to the process not being chief."
+            )
+            return

-    Returns:
-      argparse.ArgumentParser instance with some useful args already added.
-    """
-    return twml.argument_parser.get_trainer_parser()

+        self._export_output_fn = (
+            export_output_fn or twml.export_output_fns.default_output_fn
+        )

-  @staticmethod
-  def get_train_op(params, loss):
-    """
-    Return a training Op, that is, a `twml.optimizers.optimize_loss
-    `_
-    instance given params and loss.
-    This method can be overwritten by passing the optimize_loss_fn to the Trainer
-    constructor.
-
-    Args:
-      params:
-        tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries,
-        gradient_noise_scale, clip_gradients and learning_rate_decay (including
-        other learning rate decay arguments).
-      loss:
-        scalar Op returned by the build_graph that specifies the training loss to
-        be minimized.
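# get_train_op recognizes params.optimizer values "SGD" (the default), "LazyAdam",
# and "DGC", plus the optimize_loss knobs listed above. A sketch of params that
# exercise them (values hypothetical):
params = {
    "learning_rate": 0.01,
    "optimizer": "LazyAdam",            # or "SGD" / "DGC"
    "clip_gradients": 5.0,              # forwarded to optimize_loss
    "gradient_noise_scale": None,
    "show_optimizer_summaries": False,  # True switches to OPTIMIZER_SUMMARIES
}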
- """ - optimizer = params.get('optimizer') - - if not optimizer: - optimizer = 'SGD' - - if optimizer == 'LazyAdam': - optimizer = LazyAdamOptimizer - - if optimizer == 'DGC': - optimizer = DeepGradientCompressionOptimizer( - learning_rate=params.learning_rate, - use_locking=False, - name="Sparse", - density=params.get('dgc_density'), - density_decay=params.get('dgc_density_decay'), - density_decay_steps=params.get('dgc_density_decay_steps'), - density_decay_rate=params.get('dgc_density_decay_rate'), - min_density=params.get('dgc_min_density'), - accumulation=params.get('dgc_accumulation') - ) - - summaries = ['loss'] - if params.get('show_optimizer_summaries'): - summaries = OPTIMIZER_SUMMARIES - - train_op = optimize_loss( - loss=loss, - global_step=tf.train.get_global_step(), - optimizer=optimizer, - learning_rate=params.learning_rate, - summaries=summaries, - colocate_gradients_with_ops=True, - gradient_noise_scale=params.get('gradient_noise_scale'), - clip_gradients=params.get('clip_gradients'), - learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn(params) - ) - return train_op - - def export_model_effects(self, export_path, feature_spec=None, log_features=True): - - # DO NOT CHANGE THE ORDER. - # This needs to be done before registering the model. - if feature_spec: - if log_features: - features = feature_spec['features'] - feature_names = ['.'.join(features[fid]['featureName'].split('.')[1:]) for fid in features.keys()] - features_to_log = ','.join(feature_names) - try: - model_hash = self.experiment_tracker.compute_model_hash(export_path) - metrics.log_usage('dbv2', 'export_model_effects', 'v1', custom_attrs=[model_hash, "feature config present", features_to_log]) - except: # noqa: T803 - logging.info("Failed to log Feature Config features") - - twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) - export_start_time = time.time() - self.experiment_tracker.export_feature_spec(feature_spec) - logging.info("Exported feature spec to ML Metastore in %s seconds.", time.time() - export_start_time) - - self.experiment_tracker.register_model(str(export_path)) - self.export_gauge.increment() - - @property - def best_or_latest_checkpoint(self): - if self._is_early_stopping: - best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") - checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) - # Return best checkpoint if necessary - if checkpoint_path: - return checkpoint_path - else: - raise ValueError("Best checkpoint not found at %s." % best_checkpoint_path) - else: # Fallback to latest checkpoint from save directory - return self.latest_checkpoint - - @property - def latest_checkpoint(self): - return self.estimator.latest_checkpoint() - - def export_model(self, serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None, - log_features=True): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICTgraph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Note that a valid self._export_output_fn is required. - If export_ouput_fn is provided, it is used to set the self._export_output_fn. - - Args: - serving_input_receiver_fn: - function preparing the model for inference requests. - This funtion returns the ``features`` dict passed to ``build_graph``. 
- export_dir: - directory to export a SavedModel for prediction servers. - Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory is chosen. - export_output_fn: - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to `twml.export_output_fns.default_output_fn`. - - Return: - returns a string path to exported directory. - - # set the export output function - """ - if not self.is_chief(): - logging.info("Trainer.export_model ignored due to the process not being chief.") - return + if not callable(self._export_output_fn): + raise RuntimeError( + "Expecting export_output_fn function. Got %s." + % type(self._export_output_fn).__name__ + ) + + if export_dir: + export_dir = sanitize_hdfs_path(export_dir) + + if checkpoint_path: + checkpoint_path = sanitize_hdfs_path(checkpoint_path) + else: + checkpoint_path = self.best_or_latest_checkpoint + + # actually export the model using the Estimator API + export_path = self._estimator.export_savedmodel( + export_dir_base=export_dir + or os.path.join(self._save_dir, "exported_models"), + serving_input_receiver_fn=serving_input_receiver_fn, + checkpoint_path=checkpoint_path, + ) - self._export_output_fn = export_output_fn or twml.export_output_fns.default_output_fn + # export_path is bytes, need to convert to string for python3 to work. + logging.info("The exported model path is: " + str(export_path)) - if not callable(self._export_output_fn): - raise RuntimeError( - "Expecting export_output_fn function. Got %s." - % type(self._export_output_fn).__name__) + self.export_model_effects(export_path, feature_spec, log_features) - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) + return export_path - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint + def _model_fn( + self, + features: Dict[str, tf.Tensor], + labels: tf.Tensor, + mode: tf.estimator.ModeKeys, + params: tf.contrib.training.HParams, + config=None, + ) -> tf.estimator.EstimatorSpec: + """ + returns tf.estimator.EstimatorSpec that can be used with tf.estimator.Estimators. + You would probably never need to modify this method. + Instead, you should override build_graph, which this method calls. - # actually export the model using the Estimator API - export_path = self._estimator.export_savedmodel( - export_dir_base=export_dir or os.path.join(self._save_dir, 'exported_models'), - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path) + Args: + features: + Dict of input tensors. + labels: + Tensor of target labels. + mode: + an instance of tf.estimator.ModeKeys. + Typically used to toggle TRAINing or EVALuation. + params: + HParams object containing hyper-parameters. 
+ """ + # pylint: disable=too-many-branches + if isinstance(features, dict): + weights = features.get("weights", None) + else: + weights = None + + with tf.variable_scope(self._name + "/model"): + graph_output = self._build_graph_fn(features, labels, mode, params, config) + loss = graph_output["loss"] if "loss" in graph_output else None + + self._maybe_restore_checkpoint() + + with tf.variable_scope(self._name + "/optim"): + train_op = None + if mode == tf.estimator.ModeKeys.TRAIN: + if "train_op" in graph_output: + train_op = graph_output["train_op"] + graph_output[ + "train_op" + ] = None # remove from preds to prevent error + elif loss is not None: + train_op = self._optimize_loss_fn(params, loss) + + if params.get("train_log_metrics") and self._metric_fn: + metric_ops = self._metric_fn( + graph_output=graph_output, labels=labels, weights=weights + ) + for metric_name in metric_ops: + tf.summary.scalar( + name="training_metric_" + metric_name, + tensor=metric_ops[metric_name][1], + ) # index 0 contains value_op, 1 contains update_op + + if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None: + # note that this is ignored by the predict method. + # Estimator only uses export_output_fn for export_model. + export_outputs = self._export_output_fn(graph_output) + else: + export_outputs = None + + if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn: + eval_metric_ops = self._metric_fn( + graph_output=graph_output, labels=labels, weights=weights + ) + else: + eval_metric_ops = None + + # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output + preds = { + key: graph_output[key] + for key in graph_output + if (graph_output[key] is not None) and (key is not "loss") + } + + init_feed_dict = twml.contrib.initializers.get_init_feed_dict() + scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict) + + # Clear the init feed collection to avoid serializing the initializers. + twml.contrib.initializers.clear_init_feed_collection() + + # save estimator for use by later methods and hooks (warning: often reset) + self._current_estimator_spec = tf.estimator.EstimatorSpec( + mode=mode, + predictions=preds, + export_outputs=export_outputs, + loss=loss, + train_op=train_op, + eval_metric_ops=eval_metric_ops, + scaffold=scaffold, + ) - # export_path is bytes, need to convert to string for python3 to work. - logging.info("The exported model path is: " + str(export_path)) + return self._current_estimator_spec - self.export_model_effects(export_path, feature_spec, log_features) + def get_train_hooks(self) -> List[tf.train.SessionRunHook]: + """Return SessionRunHooks used during training. - return export_path + By default training uses one hooks `tf.train.StepCounterHook` for monitoring step speed. - def _model_fn(self, features, labels, mode, params, config=None): - """ - returns tf.estimator.EstimatorSpec that can be used with tf.estimator.Estimators. - You would probably never need to modify this method. - Instead, you should override build_graph, which this method calls. - - Args: - features: - Dict of input tensors. - labels: - Tensor of target labels. - mode: - an instance of tf.estimator.ModeKeys. - Typically used to toggle TRAINing or EVALuation. - params: - HParams object containing hyper-parameters. 
- """ - # pylint: disable=too-many-branches - if isinstance(features, dict): - weights = features.get('weights', None) - else: - weights = None - - with tf.variable_scope(self._name + '/model'): - graph_output = self._build_graph_fn(features, labels, mode, params, config) - loss = graph_output['loss'] if 'loss' in graph_output else None - - self._maybe_restore_checkpoint() - - with tf.variable_scope(self._name + '/optim'): - train_op = None - if mode == tf.estimator.ModeKeys.TRAIN: - if 'train_op' in graph_output: - train_op = graph_output['train_op'] - graph_output['train_op'] = None # remove from preds to prevent error - elif loss is not None: - train_op = self._optimize_loss_fn(params, loss) - - if params.get('train_log_metrics') and self._metric_fn: - metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - for metric_name in metric_ops: - tf.summary.scalar( - name="training_metric_" + metric_name, - tensor=metric_ops[metric_name][1]) # index 0 contains value_op, 1 contains update_op - - if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None: - # note that this is ignored by the predict method. - # Estimator only uses export_output_fn for export_model. - export_outputs = self._export_output_fn(graph_output) - else: - export_outputs = None - - if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn: - eval_metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - else: - eval_metric_ops = None - - # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output - preds = {key: graph_output[key] for key in graph_output if (graph_output[key] is not None) and (key is not 'loss')} - - init_feed_dict = twml.contrib.initializers.get_init_feed_dict() - scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict) - - # Clear the init feed collection to avoid serializing the initializers. - twml.contrib.initializers.clear_init_feed_collection() - - # save estimator for use by later methods and hooks (warning: often reset) - self._current_estimator_spec = tf.estimator.EstimatorSpec( - mode=mode, - predictions=preds, - export_outputs=export_outputs, - loss=loss, - train_op=train_op, - eval_metric_ops=eval_metric_ops, - scaffold=scaffold, - ) - - return self._current_estimator_spec - - def get_train_hooks(self): - """Return SessionRunHooks used during training. - - By default training uses one hooks `tf.train.StepCounterHook` for monitoring step speed. - - If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook` - for monitoring the profile. + If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook` + for monitoring the profile. - """ - # Instead of having every_n_steps be a constant number, - # change it dynamically based on batch size. - # Ideally we should be using every_n_secs, but that seems buggy as of 1.7. - # The every_n_steps = 20K / batch_size - every_n_steps = ((2048 * 100) // self._params.train_batch_size) - step_counter = tf.train.StepCounterHook( - every_n_steps=every_n_steps, output_dir=self._save_dir - ) - train_hooks = [step_counter] - - if self._profiler_steps is not None: - if not self._params.get('distributed') or self._estimator.config.is_chief: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir + """ + # Instead of having every_n_steps be a constant number, + # change it dynamically based on batch size. 
+        # Ideally we should be using every_n_secs, but that seems buggy as of 1.7.
+        # The every_n_steps = 200K / batch_size
+        every_n_steps = (2048 * 100) // self._params.train_batch_size
+        step_counter = tf.train.StepCounterHook(
+            every_n_steps=every_n_steps, output_dir=self._save_dir
+        )
+        train_hooks = [step_counter]
+
+        if self._profiler_steps is not None:
+            if not self._params.get("distributed") or self._estimator.config.is_chief:
+                profiler = tf.train.ProfilerHook(
+                    save_steps=self._profiler_steps, output_dir=self._save_dir
+                )
+                train_hooks.append(profiler)
+
+        return train_hooks
+
+    def is_task_type(self, name: str) -> bool:
+        """
+        Helper function to specify if the current process is of the given worker type.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        if os.environ.get("TF_CONFIG"):
+            if self._estimator.config.task_type == name:
+                return True
+            else:
+                return False
+        return True

-  def is_evaluator(self):
-    """
-    Helper function to let you know if the worker is evaluator.
-    Note: This an only be called *after* self._hogwild_setup() is called in __init__()
-    """
-    return self.is_task_type("evaluator")
-
-  def is_chief(self):
-    """
-    Helper function to let you know if the worker is chief.
-    Note: This an only be called *after* self._hogwild_setup() is called in __init__()
-    """
-    return self.is_task_type("chief") or self.is_task_type("master")
-
-  def is_ps(self):
-    """
-    Helper function to let you know if the task is parameter server.
-    """
-    if os.environ.get('TF_CONFIG') and self._estimator.config.task_type == 'ps':
-      return True
-    return False

+    def is_evaluator(self) -> bool:
+        """
+        Helper function to let you know if the worker is evaluator.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        return self.is_task_type("evaluator")
+
+    def is_chief(self) -> bool:
+        """
+        Helper function to let you know if the worker is chief.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        return self.is_task_type("chief") or self.is_task_type("master")
+
+    def is_ps(self) -> bool:
+        """
+        Helper function to let you know if the task is parameter server.
+        """
+        if os.environ.get("TF_CONFIG") and self._estimator.config.task_type == "ps":
+            return True
+        return False

-  def _exit_ps_after_training_complete(self):
-    """
-    Helper function to shutdown parameter server after training job complete (either succeed or failed).
- """ - if not self.is_ps(): - return - - # No need to exit ps if on the same machine - if os.environ.get('TWML_HOGWILD_PORTS'): - return - - if self._params.get('disable_auto_ps_shutdown', False): - logging.info("Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]") - return - - # checking job status is different on gke vs aurora - if self._is_on_gke(): - get_job_status = functools.partial( - k8s_status.get_training_job_status, - cluster=None, - namespace=os.environ['TWML_JOB_ROLE'], - environment=os.environ['TWML_JOB_ENV'], - job_name=os.environ['TWML_JOB_NAME'], - using_tsd=True) - else: - get_job_status = functools.partial( - get_distributed_training_job_path, - base_job_path=get_distributed_training_job_path() - ) - - def wait_complete_then_exit(): - retry_max = 60 - retry = 0 - while True: - try: - training_status = get_job_status() - if training_status == TrainingJobStatus.FINISHED: - logging.info("Distributed training job succeed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.FAILED: - logging.info("Distributed training job failed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.NOT_FOUND: - raise Exception("Distributed training job status not found.") - else: - poke_interval = random.randrange(60, 90) # prevent spike QPS to aurora endpoint - time.sleep(poke_interval) + def _exit_ps_after_training_complete(self): + """Helper function to shutdown parameter server after training job complete (either succeed or failed).""" + if not self.is_ps(): + return + + # No need to exit ps if on the same machine + if os.environ.get("TWML_HOGWILD_PORTS"): + return + + if self._params.get("disable_auto_ps_shutdown", False): + logging.info( + "Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]" + ) + return + + # checking job status is different on gke vs aurora + if self._is_on_gke(): + get_job_status = functools.partial( + k8s_status.get_training_job_status, + cluster=None, + namespace=os.environ["TWML_JOB_ROLE"], + environment=os.environ["TWML_JOB_ENV"], + job_name=os.environ["TWML_JOB_NAME"], + using_tsd=True, + ) + else: + get_job_status = functools.partial( + get_distributed_training_job_path, + base_job_path=get_distributed_training_job_path(), + ) + + def wait_complete_then_exit() -> None: + """Wait for distributed training job to complete, then exit parameter server.""" + retry_max = 60 retry = 0 - except Exception as e: - if retry >= retry_max: - raise e # only exception in this thread, won't fail parameter server thread - retry += 1 - poke_interval = random.randrange(60, 90) + retry * 10 - logging.warn("Error getting distributed training job status, will retry after %s seconds." % poke_interval) - time.sleep(poke_interval) - Thread(target=wait_complete_then_exit).start() - - def get_eval_hooks(self): # pylint: disable=no-self-use - """ Return SessionRunHooks used during evaluation.""" - return None - - def get_predict_hooks(self): - """ Return hooks used during prediction. - If profiler_steps is set in the constructor to the Trainer, - we pass a tf.Train.ProfilerHook to the estimator's predict function. 
- """ - hooks = [] - if self._profiler_steps is not None: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir - ) - hooks.append(profiler) - return hooks - - def learn(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, - train_steps=None, eval_steps=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, start_epoch=0, - exporters=None, export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` steps. - Each epoch involves ``train_steps`` training steps followed - by ``eval_steps`` evaluation steps. Note that each step - is a ``session.run()``, that is, each batch is a step. - - Args: - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - None-values cause learn() to terminate after *one* call to train() and evaluate(), - which is usually useful when using train_steps=-1 - Non-positive values trains indefinitely in a loop (use with caution), - which is usually useful when used with early stopping. - train_steps: - number of training steps per epoch. For example, 100 means each - training epoch will end after processing 100 batches. - Defaults to params.train_steps. - Non-positive values and None-values go through the entire training set each epoch. - eval_steps: - number of evaluation steps per epoch. - Defaults to params.eval_steps. - Non-positive values and None-values go through the entire evaluation set each epoch. - train_input_fn: - Function to iterate through training set. It is passed to estimator.train. - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - train_hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - eval_hooks: - List of SessionRunHooks uses for evaluation. Defaults to self.get_eval_hooks() - start_epoch: - The epoch from which to start learn. If you want to do training and evaluation - for N epochs, you can call ``learn()`` in a loop as follows: - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - .. code-block:: python - - for epoch in range(1,max_epoch): - trainer.learn(start_epoch=epoch) - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). 
Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - That is, save_dir. - You can point TensorBoard to this directory to get metrics, - or pass it to another Trainer via ``init_from_dir`` when doing - multi-phase training. - """ - # pylint: disable=too-many-branches - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - if os.environ.get('TF_CONFIG'): - raise ValueError("trainer.learn() can not be used with distributed / hogwild setups") - - if exporters and export_output_fn: - self._export_output_fn = export_output_fn - - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if train_steps is None: - train_steps = self.params.train_steps - if train_steps <= 0: - train_steps = None - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if early_stop_patience > 0: - assert train_max_steps is not None, "Early stopping and max_steps=None are not compatible." - # prepare early stopping hook (which also handles logic here) - self._is_early_stopping = True - early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - start_epoch=start_epoch) - # add early stop hook to eval hooks - eval_hooks.append(early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - train_hooks.append(train_early_stop_duration_hook) - - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - eval_hooks.append(eval_early_stop_duration_hook) - - if not self._is_early_stopping: - if (train_max_steps is not None) and (train_max_steps <= 0): - if ((max_duration is not None) and (max_duration < 0)) or (max_duration is None): - logging.warn("train.max_steps is non-positive, and no early or duration stopping is configured. " - "Training job will loop forever.") - - if train_max_steps is not None and train_max_steps > 0: - # we can't pass max_steps AND steps to estimator.train. - # so we pass steps to estimator.train and max_steps to this hook instead... 
-      stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps)
-      train_hooks.append(stop_at_step_hook)
-
-    with self.experiment_tracker.track_experiment(eval_hooks,
-                                                  lambda: self.current_estimator_spec):
-      # alternate training and evaluation epochs
-      epoch = start_epoch
-      while True:
-        logging.info("Training epoch %d", epoch)
-        self._estimator.train(train_input_fn, steps=train_steps, hooks=train_hooks)
-
-        logging.info("Evaluating epoch %d", epoch)
-        eval_result = self._estimator.evaluate(
-          eval_input_fn, steps=eval_steps, hooks=eval_hooks)
-
-        if exporters:
-          checkpoint_path = self.estimator.latest_checkpoint()
-          for exporter in exporters:
-            export_path = os.path.join(self._save_dir, "export", exporter.name)
-            exporter.export(
-              estimator=self.estimator, export_path=export_path,
-              checkpoint_path=checkpoint_path, eval_result=eval_result,
-              is_the_final_export=False)
-
-        # If train_max_step is none. Terminate after one loop.
-        if train_max_steps is None:
-          break
-
-        # If stop_at_step_hook requested a stop, break
-        if train_max_steps > 0 and stop_at_step_hook.stop_requested:
-          break
+        while True:
+            try:
+                training_status = get_job_status()
+                if training_status == TrainingJobStatus.FINISHED:
+                    logging.info(
+                        "Distributed training job succeeded, shutting down parameter server."
+                    )
+                    os._exit(0)
+                elif training_status == TrainingJobStatus.FAILED:
+                    logging.info(
+                        "Distributed training job failed, shutting down parameter server."
+                    )
+                    os._exit(0)
+                elif training_status == TrainingJobStatus.NOT_FOUND:
+                    raise Exception("Distributed training job status not found.")
+                else:
+                    poke_interval = random.randrange(
+                        60, 90
+                    )  # avoid spiking QPS against the aurora endpoint
+                    time.sleep(poke_interval)
+                    retry = 0
+            except Exception as e:
+                if retry >= retry_max:
+                    raise e  # the only exception in this thread; won't fail the parameter server thread
+                retry += 1
+                poke_interval = random.randrange(60, 90) + retry * 10
+                logging.warn(
+                    "Error getting distributed training job status, will retry after %s seconds."
+                    % poke_interval
+                )
+                time.sleep(poke_interval)
+
+        Thread(target=wait_complete_then_exit).start()
+
+    def get_eval_hooks(self) -> None:  # pylint: disable=no-self-use
+        """Return SessionRunHooks used during evaluation."""
+        return None
+
+    def get_predict_hooks(self) -> List[tf.train.SessionRunHook]:
+        """Return hooks used during prediction.
+        If profiler_steps is set in the constructor of the Trainer,
+        we pass a tf.train.ProfilerHook to the estimator's predict function.
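+
+        For instance (a sketch, not from this changeset: the constructor
+        arguments besides ``profiler_steps`` and the ``predict_input_fn``/
+        ``handle`` names are assumed), a Trainer built with ``profiler_steps=100``
+        writes a profiler timeline to ``save_dir`` every 100 prediction steps:
+
+        .. code-block:: python
+
+            trainer = Trainer(name="demo", params=params,
+                              build_graph_fn=build_graph, profiler_steps=100)
+            for prediction in trainer.predict(input_fn=predict_input_fn):
+                handle(prediction)  # each dict is one estimator.predict output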
+ """ + hooks = [] + if self._profiler_steps is not None: + profiler = tf.train.ProfilerHook( + save_steps=self._profiler_steps, output_dir=self._save_dir + ) + hooks.append(profiler) + return hooks + + def learn( + self, + train_input_fn: Optional[Callable[[], tf.data.Dataset]] = None, + eval_input_fn: Optional[Callable[[], tf.data.Dataset]] = None, + train_max_steps: Optional[int] = None, + train_steps: Optional[int] = None, + eval_steps: Optional[int] = None, + train_hooks: Optional[List[tf.train.SessionRunHook]] = None, + eval_hooks: Optional[List[tf.train.SessionRunHook]] = None, + early_stop_metric: Optional[str] = None, + early_stop_patience: Optional[int] = -1, + early_stop_minimize: Optional[bool] = True, + early_stop_tolerance: Optional[int] = 0, + start_epoch: Optional[int] = 0, + exporters: Optional[List[tf.estimator.Exporter]] = None, + export_output_fn: Optional[ + Callable[[tf.estimator.Estimator], tf.estimator.ExportOutput] + ] = None, + max_duration: Optional[int] = None, + ) -> None: + """ + Train and evaluate the estimator for ``train_max_steps`` steps. + Each epoch involves ``train_steps`` training steps followed + by ``eval_steps`` evaluation steps. Note that each step + is a ``session.run()``, that is, each batch is a step. - # early-stopping logic is handled internally by the hook - if early_stop_patience > 0 and early_stop_hook.should_stop: - # but we still need to break here - break - epoch += 1 - - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir + Args: + train_max_steps: + maximum number of global steps of training to run. + Defaults to params.train_max_steps. + None-values cause learn() to terminate after *one* call to train() and evaluate(), + which is usually useful when using train_steps=-1 + Non-positive values trains indefinitely in a loop (use with caution), + which is usually useful when used with early stopping. + train_steps: + number of training steps per epoch. For example, 100 means each + training epoch will end after processing 100 batches. + Defaults to params.train_steps. + Non-positive values and None-values go through the entire training set each epoch. + eval_steps: + number of evaluation steps per epoch. + Defaults to params.eval_steps. + Non-positive values and None-values go through the entire evaluation set each epoch. + train_input_fn: + Function to iterate through training set. It is passed to estimator.train. + eval_input_fn: + Function to iterate through evaluation set. It is passed to estimator.evaluate. + train_hooks: + List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). + eval_hooks: + List of SessionRunHooks uses for evaluation. Defaults to self.get_eval_hooks() + start_epoch: + The epoch from which to start learn. If you want to do training and evaluation + for N epochs, you can call ``learn()`` in a loop as follows: + exporters: + List of exporters called at the end of each evaluation run. + Defaults to none. + export_output_fn: + The output format to use for exported models. + Only used if exporters is not None. + + .. code-block:: python + + for epoch in range(1,max_epoch): + trainer.learn(start_epoch=epoch) + + Early-stopping Args: + early_stop_metric: + String specifying the metric to early-stop on. Required with positive + ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. + The string is used to extract the relevant tensor Op from the dict returned by + the get_eval_metric_ops method. 
+            For ``metrics`` passed to the constructor,
+            the string is one of those. For multi-class (that is, multi-metric)
+            metrics, the string may be appended with a ``_0``, ``_1``, etc. or one
+            of the ``multi_metric_names`` (one per class).
+          early_stop_patience:
+            Maximum number of epochs to wait for an improvement in the early_stop_metric
+            before breaking off training. For example, a patience of 10 means that
+            training will have 10 epochs to improve the metric before it is killed.
+            Whenever the metric is improved before running out of patience,
+            patience is reset to ``early_stop_patience``.
+            Defaults to -1 (that is, no early-stopping).
+          early_stop_minimize:
+            Set this to True (the default) for metrics that need to be minimized
+            (like ``loss``). Metrics like ``accuracy`` that need to be maximized
+            should set this to False.
+          early_stop_tolerance:
+            A non-negative tolerance for comparing early_stop_metric.
+            E.g., when maximizing, the condition is current_metric > best_metric + tolerance.
+            Defaults to 0.
+          max_duration:
+            A float. When this argument is defined, the job will automatically terminate after
+            `max_duration` seconds if it has not already completed.
+
+        Returns:
+          The directory where the checkpoints were saved. That is, save_dir.
+          You can point TensorBoard to this directory to get metrics,
+          or pass it to another Trainer via ``init_from_dir`` when doing
+          multi-phase training.
+        """
+        # pylint: disable=too-many-branches
+
+        if not callable(train_input_fn):
+            raise ValueError("Expecting callable train_input_fn function")
+        if not callable(eval_input_fn):
+            raise ValueError("Expecting callable eval_input_fn function")
+
+        if os.environ.get("TF_CONFIG"):
+            raise ValueError(
+                "trainer.learn() can not be used with distributed / hogwild setups"
+            )
+
+        if exporters and export_output_fn:
+            self._export_output_fn = export_output_fn
+
+        train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks
+        eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks
+        eval_hooks = [] if eval_hooks is None else eval_hooks
 
-  def get_train_spec(self, input_fn, max_steps=None, hooks=None):
-    """Get the TrainSpec used by ``tf.train.train_and_evaluate``."""
-    if not callable(input_fn):
-      raise ValueError("Expecting callable train_input_fn")
 
+        if train_max_steps is None:
+            train_max_steps = self.params.get("train_max_steps")
+
+        if train_steps is None:
+            train_steps = self.params.train_steps
+        if train_steps <= 0:
+            train_steps = None
+
+        if eval_steps is None:
+            eval_steps = self.params.eval_steps
+        if eval_steps <= 0:
+            eval_steps = None
+
+        if early_stop_patience > 0:
+            assert (
+                train_max_steps is not None
+            ), "Early stopping and max_steps=None are not compatible."
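+            # The EarlyStopHook below checks the metric once per evaluation epoch;
+            # train_max_steps must still be set so the train/evaluate loop keeps a
+            # hard upper bound even when the metric never stops improving.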
+ # prepare early stopping hook (which also handles logic here) + self._is_early_stopping = True + early_stop_hook = twml.hooks.EarlyStopHook( + metric=early_stop_metric, + checkpoint_dir=self._save_dir, + patience=early_stop_patience, + minimize=early_stop_minimize, + tolerance=early_stop_tolerance, + get_estimator_spec_fn=lambda: self.current_estimator_spec, + start_epoch=start_epoch, + ) + # add early stop hook to eval hooks + eval_hooks.append(early_stop_hook) + + if max_duration is not None: + train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=False, + save_dir=self._save_dir, + overwrite=True, + ) + train_hooks.append(train_early_stop_duration_hook) + + eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=False, + save_dir=self._save_dir, + overwrite=True, + ) + eval_hooks.append(eval_early_stop_duration_hook) + + if not self._is_early_stopping: + if (train_max_steps is not None) and (train_max_steps <= 0): + if ((max_duration is not None) and (max_duration < 0)) or ( + max_duration is None + ): + logging.warn( + "train.max_steps is non-positive, and no early or duration stopping is configured. " + "Training job will loop forever." + ) + + if train_max_steps is not None and train_max_steps > 0: + # we can't pass max_steps AND steps to estimator.train. + # so we pass steps to estimator.train and max_steps to this hook instead... + stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps) + train_hooks.append(stop_at_step_hook) + + with self.experiment_tracker.track_experiment( + eval_hooks, lambda: self.current_estimator_spec + ): + # alternate training and evaluation epochs + epoch = start_epoch + while True: + logging.info("Training epoch %d", epoch) + self._estimator.train( + train_input_fn, steps=train_steps, hooks=train_hooks + ) + + logging.info("Evaluating epoch %d", epoch) + eval_result = self._estimator.evaluate( + eval_input_fn, steps=eval_steps, hooks=eval_hooks + ) + + if exporters: + checkpoint_path = self.estimator.latest_checkpoint() + for exporter in exporters: + export_path = os.path.join( + self._save_dir, "export", exporter.name + ) + exporter.export( + estimator=self.estimator, + export_path=export_path, + checkpoint_path=checkpoint_path, + eval_result=eval_result, + is_the_final_export=False, + ) + + # If train_max_step is none. Terminate after one loop. 
+ if train_max_steps is None: + break + + # If stop_at_step_hook requested a stop, break + if train_max_steps > 0 and stop_at_step_hook.stop_requested: + break + + # early-stopping logic is handled internally by the hook + if early_stop_patience > 0 and early_stop_hook.should_stop: + # but we still need to break here + break + epoch += 1 + + self.write_state_to_disk(save_dir=self._save_dir, filename="_SUCCESS") + + return self._save_dir + + def get_train_spec( + self, + input_fn: Callable[[], tf.data.Dataset], + max_steps: Optional[int] = None, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + ) -> tf.estimator.TrainSpec: + """Get the TrainSpec used by ``tf.train.train_and_evaluate``.""" + if not callable(input_fn): + raise ValueError("Expecting callable train_input_fn") + + if max_steps is None: + max_steps = self.params.train_max_steps + + if max_steps is not None and max_steps <= 0: + max_steps = None + + hooks = self.get_train_hooks() if hooks is None else hooks + + return tf.estimator.TrainSpec( + input_fn=input_fn, max_steps=max_steps, hooks=hooks + ) - if max_steps is None: - max_steps = self.params.train_max_steps + def get_eval_spec( + self, + input_fn: Callable[[], tf.data.Dataset], + steps: Optional[int] = None, + delay: Optional[int] = None, + period: Optional[int] = None, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + exporters: Optional[List[tf.estimator.Exporter]] = None, + ) -> tf.estimator.EvalSpec: + """Get the EvalSpec used by ``tf.train.train_and_evaluate``.""" + if not callable(input_fn): + raise ValueError("Expecting callable eval_input_fn") + + if steps is None: + steps = self.params.eval_steps + + if steps <= 0: + steps = None + + if delay is None: + delay = self.params.eval_delay + + if period is None: + period = self.params.eval_period + + hooks = self.get_eval_hooks() if hooks is None else hooks + + eval_name = self.params.get("eval_name", None) + + return tf.estimator.EvalSpec( + input_fn=input_fn, + steps=steps, + name=eval_name, + start_delay_secs=delay, + throttle_secs=period, + hooks=hooks, + exporters=exporters, + ) - if max_steps is not None and max_steps <= 0: - max_steps = None + def train_and_evaluate( + self, + train_input_fn: Callable[[], tf.data.Dataset] = None, + eval_input_fn: Callable[[], tf.data.Dataset] = None, + train_max_steps: Optional[int] = None, + eval_steps: Optional[int] = None, + eval_delay: Optional[int] = None, + eval_period: Optional[int] = None, + train_hooks: Optional[List[tf.train.SessionRunHook]] = None, + eval_hooks: Optional[List[tf.train.SessionRunHook]] = None, + early_stop_metric: Optional[str] = None, + early_stop_patience: Optional[int] = -1, + early_stop_minimize: Optional[bool] = True, + early_stop_tolerance: Optional[float] = 0.0, + exporters: Optional[List[tf.estimator.Exporter]] = None, + export_output_fn: Optional[Callable[[tf.estimator.Estimator], None]] = None, + max_duration: Optional[int] = None, + ) -> str: + """ + Train and evaluate the estimator for ``train_max_steps`` + using ``tf.estimator.train_and_evaluate``. + With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method + can be used for distributed training (multi-node or multi-process). + Unlike the ``learn`` method, training is continuous with ``train_max_steps``. + For distributed use case, evaluation happens periodically. + That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_step`` steps + occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint. 
+        TF defaults to saving checkpoints every 10 mins.
+        For the local use case, training occurs for train_max_steps steps followed by a
+        single evaluation. For the local use case we therefore recommend using learn() instead,
+        as it provides early-stopping and multiple evaluations.
+
+        ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds.
+        It will stop after ``train_max_steps`` is reached.
+
+        You must ensure that all workers/servers are assigned the same `save_dir`.
+
+        .. Note::
+
+          If the TF_CONFIG environment variable is set, this function assumes it is running a distributed job.
 
-    hooks = self.get_train_hooks() if hooks is None else hooks
 
+        Args:
+          train_input_fn:
+            Function to iterate through training set. It is passed to estimator.train_and_evaluate.
+          eval_input_fn:
+            Function to iterate through evaluation set. It is passed to estimator.train_and_evaluate.
+          train_max_steps:
+            maximum number of global steps of training to run.
+            Defaults to params.train_max_steps.
+            Non-positive values and None-values train indefinitely (use with caution).
+          eval_steps:
+            number of steps per evaluation.
+            Defaults to params.eval_steps.
+            Non-positive values and None-values go through
+            the entire evaluation set for each evaluation.
+            Note that the number of eval_steps should be high enough to minimize noise.
+            This is especially true for early-stopping.
+          eval_delay:
+            Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s.
+          eval_period:
+            Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s.
+          exporters:
+            List of exporters called at the end of each evaluation run.
+            Defaults to None.
+          export_output_fn:
+            The output format to use for exported models.
+            Only used if exporters is not None.
+
+        Early-stopping Args:
+          early_stop_metric:
+            String specifying the metric to early-stop on. Required with positive
+            ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc.
+            The string is used to extract the relevant tensor Op from the dict returned by
+            the get_eval_metric_ops method. For ``metrics`` passed to the constructor,
+            the string is one of those. For multi-class (that is, multi-metric)
+            metrics, the string may be appended with a ``_0``, ``_1``, etc. or one
+            of the ``multi_metric_names`` (one per class).
+          early_stop_patience:
+            Maximum number of epochs to wait for an improvement in the early_stop_metric
+            before breaking off training. For example, a patience of 10 means that
+            training will have 10 epochs to improve the metric before it is killed.
+            Whenever the metric is improved before running out of patience,
+            patience is reset to ``early_stop_patience``.
+            Defaults to -1 (that is, no early-stopping).
+          early_stop_minimize:
+            Set this to True (the default) for metrics that need to be minimized
+            (like ``loss``). Metrics like ``accuracy`` that need to be maximized
+            should set this to False.
+          early_stop_tolerance:
+            A non-negative tolerance for comparing early_stop_metric.
+            E.g., when maximizing, the condition is current_metric > best_metric + tolerance.
+            Defaults to 0.
+          max_duration:
+            A float. When this argument is defined, the job will automatically terminate after
+            `max_duration` seconds if it has not already completed.
+
+        Returns:
+          The directory where the checkpoints were saved.
+        """
+
+        logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.")
+        logging.info(
+            "Trainer.train_and_evaluate may change or be removed in future versions."
+ ) - return tf.estimator.TrainSpec(input_fn=input_fn, - max_steps=max_steps, - hooks=hooks) + if not callable(train_input_fn): + raise ValueError("Expecting callable train_input_fn function") + if not callable(eval_input_fn): + raise ValueError("Expecting callable eval_input_fn function") + + self._exit_ps_after_training_complete() + + # Maybe export in eval processes. + if self.is_evaluator(): + if self.params.get("eval_name") is not None: + # Do not export if running special eval. + exporters = None + export_output_fn = None + elif exporters and export_output_fn: + self._export_output_fn = export_output_fn + else: + # Default option. + self._export_output_fn = None - def get_eval_spec(self, input_fn, steps=None, delay=None, period=None, - hooks=None, exporters=None): - """Get the EvalSpec used by ``tf.train.train_and_evaluate``.""" - if not callable(input_fn): - raise ValueError("Expecting callable eval_input_fn") + train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks + train_hooks = [] if train_hooks is None else train_hooks - if steps is None: - steps = self.params.eval_steps + eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks + eval_hooks = [] if eval_hooks is None else eval_hooks - if steps <= 0: - steps = None + if train_max_steps is None: + train_max_steps = self.params.get("train_max_steps") + + if eval_steps is None: + eval_steps = self.params.eval_steps + if eval_steps <= 0: + eval_steps = None + + if eval_delay is None: + eval_delay = self.params.eval_delay + if eval_period is None: + eval_period = self.params.eval_period + + if early_stop_patience > 0: + # when training hooks detect this file, they request a stop to training + early_stop_path = os.path.join(self._save_dir, "earlystop_now.txt") + # prepare early stopping hook (which also handles logic here) + + self._is_early_stopping = True + + eval_early_stop_hook = twml.hooks.EarlyStopHook( + metric=early_stop_metric, + checkpoint_dir=self._save_dir, + patience=early_stop_patience, + minimize=early_stop_minimize, + tolerance=early_stop_tolerance, + get_estimator_spec_fn=lambda: self.current_estimator_spec, + file_path=early_stop_path, + exit_on_end=os.environ.get("TF_CONFIG") is not None, + ) # only exit for distributed jobs + # add early stop hook to eval hooks + eval_hooks.append(eval_early_stop_hook) + + # prepare the commensurate training hook + train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path) + train_hooks.append(train_early_stop_hook) + + if max_duration is not None: + train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=False, + save_dir=self._save_dir, + overwrite=self.is_chief(), + ) + eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=os.environ.get("TF_CONFIG") is not None, + save_dir=self._save_dir, + overwrite=False, + ) # only exit for distributed jobs + + train_hooks.append(train_early_stop_duration_hook) + eval_hooks.append(eval_early_stop_duration_hook) + + with self.experiment_tracker.track_experiment( + eval_hooks, lambda: self.current_estimator_spec + ): + train_spec = self.get_train_spec( + train_input_fn, train_max_steps, train_hooks + ) + eval_spec = self.get_eval_spec( + eval_input_fn, + eval_steps, + eval_delay, + eval_period, + eval_hooks, + exporters, + ) + self._train_and_evaluate(train_spec, eval_spec) + + if self.is_chief(): + self.write_state_to_disk(save_dir=self._save_dir, filename="_SUCCESS") + + return self._save_dir + 
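+    # A distributed-usage sketch for train_and_evaluate (illustrative only;
+    # `trainer`, the input functions and the TF_CONFIG cluster spec all come
+    # from user code, not from this file):
+    #
+    #     os.environ["TF_CONFIG"] = json.dumps(cluster_spec)  # one entry per task
+    #     save_dir = trainer.train_and_evaluate(
+    #         train_input_fn=train_input_fn,
+    #         eval_input_fn=eval_input_fn,
+    #         train_max_steps=100000,
+    #         eval_period=600,  # evaluate the newest checkpoint every 10 minutes
+    #     )
+    #     # all workers/evaluators must point at the same save_dir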
+    def _train_and_evaluate(
+        self, train_spec: tf.estimator.TrainSpec, eval_spec: tf.estimator.EvalSpec
+    ) -> None:
+        """
+        Private method that calls
+        ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``.
+        """
+        try:
+            tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)
+        except twml.errors.EarlyStopError:
+            # Ignore the exception if on evaluator.
+            if self.is_evaluator():
+                pass
+            else:
+                raise
 
-    if delay is None:
-      delay = self.params.eval_delay
 
+    def train(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+    ) -> "Trainer":
+        """
+        Train the estimator for `steps` training steps.
 
-    if period is None:
-      period = self.params.eval_period
 
+        Args:
+          steps:
+            number of steps for which to perform training. For example, 100 means
+            training will end after processing 100 batches.
+            Defaults to None, i.e. trains on the entire dataset a single time.
+            Non-positive values and None-values go through the entire training set once.
+          input_fn:
+            Function to iterate through training set. It is passed to estimator.train.
+          hooks:
+            List of SessionRunHooks used for training. Defaults to self.get_train_hooks().
+
+        Returns:
+          The Trainer instance (self), to allow call chaining.
+        """
+        if os.environ.get("TF_CONFIG") and "is_calibrating" not in self.params:
+            raise ValueError(
+                "trainer.train() can not be used with distributed / hogwild setups"
+            )
+
+        if not callable(input_fn):
+            raise ValueError("Expecting callable input_fn function")
+
+        if self._is_early_stopping:
+            raise ValueError(
+                "Can not call train() after learn() when using early stopping."
+            )
+
+        hooks = self.get_train_hooks() if hooks is None else hooks
+        self._estimator.train(input_fn, steps=steps, hooks=hooks)
+        return self
+
+    def evaluate(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+        name: Optional[str] = None,
+    ) -> Optional[Dict[str, float]]:
+        """
+        Evaluate the estimator for `steps` evaluation steps.
 
-    hooks = self.get_eval_hooks() if hooks is None else hooks
 
+        Args:
+          steps:
+            number of steps for which to perform evaluation. For example, 100 means each
+            evaluation will end after processing 100 batches.
+            Defaults to None, i.e. evaluates on the entire dataset a single time.
+            Negative values and None-values go through the entire evaluation set.
+          input_fn:
+            Function to iterate through evaluation set. It is passed to estimator.evaluate.
+          hooks:
+            List of SessionRunHooks used for evaluation. Defaults to None.
+            Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks()
+            as the latter may implement early-stopping, which isn't necessarily the desired
+            behavior when calling evaluate() on its own.
+          name:
+            Name of the evaluation if the user needs to run multiple evaluations on different data sets.
+            Metrics for different evaluations are saved in separate folders,
+            and appear separately in tensorboard.
+
+        Returns:
+          If `is_evaluator()`, returns a dict containing the evaluation metrics specified
+          in `metric_fn` keyed by name, as well as an entry `global_step` that contains
+          the value of the global step for which this evaluation was performed.
+          Otherwise (i.e. `is_evaluator() == False`), returns None.
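+
+        For example (a sketch; ``trainer`` and a held-out ``test_input_fn`` are
+        assumed to exist in the calling code):
+
+        .. code-block:: python
+
+            metrics = trainer.evaluate(input_fn=test_input_fn, name="test")
+            if metrics is not None:  # None on non-evaluator processes
+                print(metrics["loss"], metrics["global_step"])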
+ """ + if not self.is_evaluator(): + return None + + if not callable(input_fn): + raise ValueError("Expecting callable input_fn function") + + hooks = self.get_eval_hooks() if hooks is None else hooks + hooks = [] if hooks is None else hooks + + # for consistency with train/learn + eval_steps = None if steps is not None and steps < 0 else steps + + with self.experiment_tracker.track_experiment( + hooks, lambda: self.current_estimator_spec, name=name + ): + checkpoint = self.best_or_latest_checkpoint + computed_metrics = self._estimator.evaluate( + input_fn, + steps=eval_steps, + hooks=hooks, + checkpoint_path=checkpoint, + name=name, + ) + + return computed_metrics + + def start_tensorboard(self, port: Optional[int] = None) -> None: + """ + Start tensorboard process to visualize logs in save_dir. + """ + logging.info("Starting tensorboard.") + if self._tensorboard_handle: + logging.warn("Tensorboard already running. Nothing done.") + return + + if port is None: + if "tensorboard_port" not in self.params.values(): + raise ValueError("You must specify a port for tensorboard to run on.") + elif self.params.tensorboard_port is None: + return + else: + port = self.params.tensorboard_port - eval_name = self.params.get("eval_name", None) + mldash_path = "experiments" + if self.experiment_tracker.path: + mldash_path += "/%s" % encode_url(self.experiment_tracker.experiment_id) + tensorboard_args = ["--logdir=%s" % self._save_dir, "--port=%d" % port] - return tf.estimator.EvalSpec(input_fn=input_fn, - steps=steps, - name=eval_name, - start_delay_secs=delay, - throttle_secs=period, - hooks=hooks, - exporters=exporters) + try: + args = [ + "email_and_launch_tensorboard", + mldash_path, + "--", + ] + tensorboard_args + self._tensorboard_handle = subprocess.Popen(args) + except OSError: + try: + self._tensorboard_handle = subprocess.Popen( + ["tensorboard"] + tensorboard_args + ) + except OSError: + try: + # this will work with Twitter internal pants build when run locally + args = [ + "./pants", + "run", + "twml:tensorboard", + "--", + ] + tensorboard_args + self._tensorboard_handle = subprocess.Popen(args) + except OSError: + logging.error( + "No tensorboard installed, won't able to visualize training in tensorboard." + ) + + def stop_tensorboard(self) -> None: + """ + Shutdown this Trainer's associated Tensorboard. + """ + if self._tensorboard_handle: + logging.info("Shutting down tensorboard.") + self._tensorboard_handle.kill() + else: + logging.warn("No known tensorboard process. Nothing done.") + + def calibrate( + self, + calibrator: Union[Calibrator, Dict[str, Calibrator]], + steps: Optional[int] = None, + input_fn: Optional[Callable[[], tf.data.Dataset]] = None, + save_calibrator: bool = True, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + ) -> None: + """ + Calibrate the calibrator for `steps` calibration steps using the estimator.train method. + The build_graph passed to the Trainer constructor should + call calibrator.accumulate using something like tf.py_func. + That way, when this method calls estimator.train the calibrator will + accumulate one epoch of samples. After which, this method calls calibrator.calibrate(). + It is up to the user to then call calibrator.save() to save the calibrated Layer + and other information to disk for multi-phase training. 
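+
+        A typical flow (a sketch — ``MyCalibrator`` stands in for any concrete
+        ``twml.contrib.calibrators.Calibrator`` subclass, and ``trainer`` and
+        ``train_input_fn`` come from the calling code):
+
+        .. code-block:: python
+
+            calibrator = MyCalibrator()
+            trainer.calibrate(calibrator=calibrator, input_fn=train_input_fn,
+                              steps=1000, save_calibrator=False)
+            # persist the calibrated layer for the next training phase
+            calibrator.save(trainer.params.save_dir, name=calibrator.name)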
- def train_and_evaluate(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, eval_steps=None, - eval_delay=None, eval_period=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, exporters=None, - export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` - using ``tf.estimator.train_and_evaluate``. - With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method - can be used for distributed training (multi-node or multi-process). - Unlike the ``learn`` method, training is continuous with ``train_max_steps``. - For distributed use case, evaluation happens periodically. - That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_step`` steps - occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint. - TF defaults to saving checkpoints every 10 mins. - For local use case, training occurs for train_max_steps epochs followed by a - single evaluation. For local use case we therefore recommend using learn() instead - as it provides early-stopping and multiple evaluations. - - ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds. - It will stop after ``train_steps`` is reached. - - You must ensure that all workers/servers are assigned the same `save_dir`. - - .. Note:: - - If the TF_CONFIG environment variable is set, this function assumes its running a distribute job. - - Args: - train_input_fn: - Function to iterate through training set. It is passed to estimator.train_and_evalute - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.train_and_evalute. - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - Non-positive values and None-values train indefinitely (use with caution). - eval_steps: - number of steps per evaluation. - Defaults to params.eval_steps. - Non-positive values and None-values go through - the entire evaluation set for each evaluation. - Note that the number of eval_steps should be high enough to minimize noise. - This is especially true for early-stopping. - eval_delay: - Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s. - eval_period: - Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s. - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. 
- Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - """ + Args: + calibrator: + a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}. + steps: + Maximum steps to accumulate examples for calibration. Optional. + If not specified, examples will be accumulated until all downsampled parts are processed. + input_fn: + Function to iterate through training set. It is passed to estimator.train. + hooks: + List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). + save_calibrator: + Boolean (default: True). If set to True it will save the calibrator layer. + """ + + if not callable(input_fn): + raise ValueError("Expecting callable input_fn function") + + # making everything a dict to avoid multiple ifs + if isinstance(calibrator, twml.contrib.calibrators.Calibrator): + calibrator = {"default": calibrator} + + # This is a dummy call to train, since we cannot predict without training + # from the Estimator API + self._estimator.train(input_fn, steps=1) + max_steps = steps if steps is not None else -1 + for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)): + count = 0 + for out in self._estimator.predict( + input_fn, hooks=hooks, yield_single_examples=False + ): + if max_steps > 0 and count > max_steps: + break + clbrt.accumulate_feature(out) + count += 1 + clbrt.calibrate() + + # this step is done to allow us to keep the current phases event file for + # visualization on Tensorboard. It removes all files that + # are not event files. This piece of code should be deprecated when + # we deprecate the MDL calibrator (CX-12329) + for fname in tf.io.gfile.listdir(self._save_dir): + if not fname.startswith("events"): + tf.io.gfile.remove(os.path.join(self._save_dir, fname)) + + if save_calibrator: + # If we only have one calibrator, the calibrator signature + # will be set to default + if len(calibrator) == 1: + calibrator = calibrator["default"] + calibrator.save( + self.params.save_dir, name=calibrator.name, verbose=True + ) + else: + for name, clbrt in calibrator.items(): + clbrt.save( + self.params.save_dir, name=clbrt.name + str(name), verbose=True + ) + + def predict(self, *args, **kwargs) -> tf.estimator.EstimatorSpec: + """ + Wrapper over the tensorflow `Estimator.predict + `_. + method. See that documentation for description of arguments accepted. + + If hooks is passed as an argument, the specified hooks are used. + Else when profiler_steps is specified in the constructor of the Trainer, a + tf.train.ProfilerHook is passed to the predict interface. + Otherwise, hooks is set to an empty list. + """ + if "hooks" not in kwargs and len(args) < 3: + # If hooks is not specified as a keyword argument, nor as a positional argument + # add hooks as a keyword argument. 
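+            # (len(args) < 3 because Estimator.predict takes input_fn and
+            # predict_keys before hooks, its third positional argument)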
+ kwargs["hooks"] = self.get_predict_hooks() + + return self.estimator.predict(*args, **kwargs) + + def hub_export( + self, + name: str, + serving_input_receiver_fn: Callable[ + [], tf.estimator.export.ServingInputReceiver + ], + export_dir: Optional[str] = None, + checkpoint_path: Optional[str] = None, + export_task_type_overrider: Optional[str] = None, + ) -> None: + """ + Exports registered modules into a save directory. + + This method creates a directory under export_path with the save TF Hub. + One sub-directory (named export_name) per module registered via register_module_for_export. - logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.") - logging.info("Trainer.train_and_evaluate may change or be removed in future versions.") - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - self._exit_ps_after_training_complete() - - # Maybe export in eval processes. - if self.is_evaluator(): - if self.params.get("eval_name") is not None: - # Do not export if running special eval. - exporters = None - export_output_fn = None - elif exporters and export_output_fn: - self._export_output_fn = export_output_fn - else: - # Default option. - self._export_output_fn = None + Args: + name: + unique name of the module to export. + serving_input_receiver_fn: + A function with no arguments that returns a ServingInputReceiver. + This is used with the estimator passed to export() to build the graph (in PREDICT mode) + that registers the modules for export. The model in that graph is never run, + so the actual data provided by this input fn does not matter. + export_dir: + A string containing a directory where to write the export directories. + Defaults to the save_dir. + checkpoint_path: + The checkpoint path to export. Defaults to the latest. + export_task_type_overrider: + Specifies the task type that will override the default task type used for export + (hogwild training defaults to evaluator, otherwise, defaults to chief) + """ + if export_task_type_overrider: + if not self.is_task_type(export_task_type_overrider): + logging.info( + f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}" + ) + return + else: + if self._using_hogwild: + if not self.is_evaluator(): + logging.info( + "Trainer.hub_export ignored due to the process not being evaluator." + ) + return + else: + if not self.is_chief(): + logging.info( + "Trainer.hub_export ignored due to the process not being chief." + ) + return + + if export_dir: + export_dir = sanitize_hdfs_path(export_dir) + + if checkpoint_path: + checkpoint_path = sanitize_hdfs_path(checkpoint_path) + else: + checkpoint_path = self.best_or_latest_checkpoint + + export_dir = export_dir if export_dir is not None else self._save_dir + exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) + # The path_exporter by default contains a timestamp directory in its path. 
+ path_exporter = exporter.export( + estimator=self.estimator, + export_path=export_dir, + checkpoint_path=checkpoint_path, + ) - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - train_hooks = [] if train_hooks is None else train_hooks - - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if eval_delay is None: - eval_delay = self.params.eval_delay - if eval_period is None: - eval_period = self.params.eval_period - - if early_stop_patience > 0: - # when training hooks detect this file, they request a stop to training - early_stop_path = os.path.join(self._save_dir, 'earlystop_now.txt') - # prepare early stopping hook (which also handles logic here) - - self._is_early_stopping = True - - eval_early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - file_path=early_stop_path, - exit_on_end=os.environ.get('TF_CONFIG') is not None) # only exit for distributed jobs - # add early stop hook to eval hooks - eval_hooks.append(eval_early_stop_hook) - - # prepare the commensurate training hook - train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path) - train_hooks.append(train_early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=self.is_chief() - ) - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=os.environ.get('TF_CONFIG') is not None, - save_dir=self._save_dir, - overwrite=False - ) # only exit for distributed jobs - - train_hooks.append(train_early_stop_duration_hook) - eval_hooks.append(eval_early_stop_duration_hook) - - with self.experiment_tracker.track_experiment(eval_hooks, lambda: self.current_estimator_spec): - train_spec = self.get_train_spec(train_input_fn, train_max_steps, train_hooks) - eval_spec = self.get_eval_spec(eval_input_fn, eval_steps, - eval_delay, eval_period, - eval_hooks, exporters) - self._train_and_evaluate(train_spec, eval_spec) - - if self.is_chief(): - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir - - def _train_and_evaluate(self, train_spec, eval_spec): - """ - Private method that calls - ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``. - """ - try: - tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec) - except twml.errors.EarlyStopError: - # Ignore the exception if on evaluator. - if self.is_evaluator(): - pass - else: - raise - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Train the estimator for `steps` training steps. - - Args: - steps: - number of steps for which to perform training. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. trains on the entire dataset a single time. - Non-positive values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through training set. It is passed to estimator.train. 
- hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - """ - if os.environ.get('TF_CONFIG') and "is_calibrating" not in self.params: - raise ValueError("trainer.train() can not be used with distributed / hogwild setups") + # LatestModuleExporter.export() returns a binary string on Cloud ML Engine + # but tf.io.gfile.listdir() does not; this is an issue when joining paths + if isinstance(path_exporter, bytes): + path_exporter = path_exporter.decode() - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") + # Copying the saved hub module to export_dir so we don't need to specify + # the timestamp when loading the module. + # This is a workaround due to the current implementation of hub.LatestModuleExporter. + # This works for multiple hub modules. + hub_exported_modules = tf.io.gfile.listdir(path_exporter) - if self._is_early_stopping: - raise ValueError("Can not call train() after learn() when using early stopping.") + backup_dir = os.path.join( + export_dir, "backups", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ) - hooks = self.get_train_hooks() if hooks is None else hooks - self._estimator.train(input_fn, steps=steps, hooks=hooks) - return self + for folder in hub_exported_modules: + hub_module_oldpath = os.path.join(path_exporter, folder) + hub_module_newpath = os.path.join(export_dir, folder) - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Evaluate the estimator for `steps` evaluation steps. - - Args: - steps: - number of steps for which to perform evaluation. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. evaluates on the entire dataset a single time. - Negative values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - hooks: - List of SessionRunHooks used for evaluation. Defaults to None. - Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks() - as the latter may implement early-stopping, which isn't necessarilty the desired - behavior when calling evaluate() on its own. - name: - Name of the evaluation if user needs to run multiple evaluations on different data sets. - Metrics for different evaluations are saved in separate folders, - and appear separately in tensorboard. - - Returns: - If `is_evaluator()`, returns a dict containing the evaluation metrics specified - in `metric_fn` keyed by name, as well as an entry `global_step` that contains - the value of the global step for which this evaluation was performed. - Otherwise (i.e. `is_evaluator() == False`), returns None. - """ - if not self.is_evaluator(): - return None + # If the destination already exists, move to backup + if tf.io.gfile.exists(hub_module_newpath): + # Ensure backup_dir exists + tf.io.gfile.makedirs(backup_dir) + hub_module_backup = os.path.join(backup_dir, folder) + tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") + tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - hooks = self.get_eval_hooks() if hooks is None else hooks - hooks = [] if hooks is None else hooks + # Since the timestamped folder exists but is empty, we can delete it. 
+ tf.io.gfile.rmtree(path_exporter) - # for consistency with train/learn - eval_steps = None if steps is not None and steps < 0 else steps + def _is_on_gke(self) -> bool: + """Returns True if running on gke.""" + cluster = os.environ.get("TWML_JOB_CLUSTER") + if not cluster or cluster in {"smf1", "atla"}: + return False + return True - with self.experiment_tracker.track_experiment(hooks, lambda: self.current_estimator_spec, name=name): - checkpoint = self.best_or_latest_checkpoint - computed_metrics = self._estimator.evaluate( - input_fn, - steps=eval_steps, - hooks=hooks, - checkpoint_path=checkpoint, - name=name - ) + def _maybe_del_tsd_exit(self, state_files: List[str]) -> None: + """Handle potential early exit and TwitterSetDeployment deletion. - return computed_metrics + If: + - distributed training + - running GKE + - training is finished (all state_files exists) + we will exit early and not restart work - def start_tensorboard(self, port=None): - """ - Start tensorboard process to visualize logs in save_dir. - """ - logging.info("Starting tensorboard.") - if self._tensorboard_handle: - logging.warn("Tensorboard already running. Nothing done.") - return - - if port is None: - if 'tensorboard_port' not in self.params.values(): - raise ValueError('You must specify a port for tensorboard to run on.') - elif self.params.tensorboard_port is None: - return - else: - port = self.params.tensorboard_port - - mldash_path = 'experiments' - if self.experiment_tracker.path: - mldash_path += '/%s' % encode_url(self.experiment_tracker.experiment_id) - tensorboard_args = ['--logdir=%s' % self._save_dir, '--port=%d' % port] - - try: - args = ['email_and_launch_tensorboard', mldash_path, '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - try: - self._tensorboard_handle = subprocess.Popen(['tensorboard'] + tensorboard_args) - except OSError: - try: - # this will work with Twitter internal pants build when run locally - args = ['./pants', 'run', 'twml:tensorboard', '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - logging.error("No tensorboard installed, won't able to visualize training in tensorboard.") + If --distributed_training_cleanup = True then we will also handle + cleaning up the TwitterSetDeployments. - def stop_tensorboard(self): - """ - Shutdown this Trainer's associated Tensorboard. - """ - if self._tensorboard_handle: - logging.info("Shutting down tensorboard.") - self._tensorboard_handle.kill() - else: - logging.warn("No known tensorboard process. Nothing done.") - - def calibrate(self, - calibrator, - steps=None, - input_fn=None, - save_calibrator=True, - hooks=None): - """ - Calibrate the calibrator for `steps` calibration steps using the estimator.train method. - The build_graph passed to the Trainer constructor should - call calibrator.accumulate using something like tf.py_func. - That way, when this method calls estimator.train the calibrator will - accumulate one epoch of samples. After which, this method calls calibrator.calibrate(). - It is up to the user to then call calibrator.save() to save the calibrated Layer - and other information to disk for multi-phase training. - - Args: - calibrator: - a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}. - steps: - Maximum steps to accumulate examples for calibration. Optional. - If not specified, examples will be accumulated until all downsampled parts are processed. 
- input_fn: - Function to iterate through training set. It is passed to estimator.train. - hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - save_calibrator: - Boolean (default: True). If set to True it will save the calibrator layer. - """ + Args: + state_files: + A python list indicate state files to determine the finish state of the job. + """ + # job type that is responsible for experiment tracking will remain alive + # until it marks the experiment as finished. + if self.experiment_tracker._env_eligible_for_recording_experiment: + exp_status = self.experiment_tracker.get_run_status() + if exp_status and exp_status not in {"Success", "Failed"}: + logging.info( + f"Not exiting early because experiment is still {exp_status}." + ) + return + + # do not bother if we are on prem + if not self._is_on_gke(): + logging.info("No need to exit early because running on prem.") + return + + states = [ + twml.util.file_exist_in_dir(self._save_dir, state_file) + for state_file in state_files + ] + do_not_restart = self._params.get("distributed") and all(states) + if not do_not_restart: + return - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - # making everything a dict to avoid multiple ifs - if isinstance(calibrator, twml.contrib.calibrators.Calibrator): - calibrator = {"default": calibrator} - - # This is a dummy call to train, since we cannot predict without training - # from the Estimator API - self._estimator.train(input_fn, steps=1) - max_steps = steps if steps is not None else -1 - for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)): - count = 0 - for out in self._estimator.predict(input_fn, hooks=hooks, yield_single_examples=False): - if max_steps > 0 and count > max_steps: - break - clbrt.accumulate_feature(out) - count += 1 - clbrt.calibrate() - - # this step is done to allow us to keep the current phases event file for - # visualization on Tensorboard. It removes all files that - # are not event files. This piece of code should be deprecated when - # we deprecate the MDL calibrator (CX-12329) - for fname in tf.io.gfile.listdir(self._save_dir): - if not fname.startswith("events"): - tf.io.gfile.remove(os.path.join(self._save_dir, fname)) - - if save_calibrator: - # If we only have one calibrator, the calibrator signature - # will be set to default - if len(calibrator) == 1: - calibrator = calibrator['default'] - calibrator.save( - self.params.save_dir, - name=calibrator.name, - verbose=True - ) - else: - for name, clbrt in calibrator.items(): - clbrt.save( - self.params.save_dir, - name=clbrt.name + str(name), - verbose=True - ) - - def predict(self, *args, **kwargs): - """ - Wrapper over the tensorflow `Estimator.predict - `_. - method. See that documentation for description of arguments accepted. - - If hooks is passed as an argument, the specified hooks are used. - Else when profiler_steps is specified in the constructor of the Trainer, a - tf.train.ProfilerHook is passed to the predict interface. - Otherwise, hooks is set to an empty list. - """ - if 'hooks' not in kwargs and len(args) < 3: - # If hooks is not specified as a keyword argument, nor as a positional argument - # add hooks as a keyword argument. 
- kwargs['hooks'] = self.get_predict_hooks() - - return self.estimator.predict(*args, **kwargs) - - def hub_export(self, - name, - serving_input_receiver_fn, - export_dir=None, - checkpoint_path=None, - export_task_type_overrider=None): - """ - Exports registered modules into a save directory. - - This method creates a directory under export_path with the save TF Hub. - One sub-directory (named export_name) per module registered via register_module_for_export. - - Arguments: - name: - unique name of the module to export. - serving_input_receiver_fn: - A function with no arguments that returns a ServingInputReceiver. - This is used with the estimator passed to export() to build the graph (in PREDICT mode) - that registers the modules for export. The model in that graph is never run, - so the actual data provided by this input fn does not matter. - export_dir: - A string containing a directory where to write the export directories. - Defaults to the save_dir. - checkpoint_path: - The checkpoint path to export. Defaults to the latest. - export_task_type_overrider: - Specifies the task type that will override the default task type used for export - (hogwild training defaults to evaluator, otherwise, defaults to chief) - """ - if export_task_type_overrider: - if not self.is_task_type(export_task_type_overrider): - logging.info( - f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}") - return - else: - if self._using_hogwild: - if not self.is_evaluator(): - logging.info("Trainer.hub_export ignored due to the process not being evaluator.") - return - else: - if not self.is_chief(): - logging.info("Trainer.hub_export ignored due to the process not being chief.") - return - - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) - - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint - - export_dir = export_dir if export_dir is not None else self._save_dir - exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) - # The path_exporter by default contains a timestamp directory in its path. - path_exporter = exporter.export(estimator=self.estimator, - export_path=export_dir, - checkpoint_path=checkpoint_path) - - # LatestModuleExporter.export() returns a binary string on Cloud ML Engine - # but tf.io.gfile.listdir() does not; this is an issue when joining paths - if isinstance(path_exporter, bytes): - path_exporter = path_exporter.decode() - - # Copying the saved hub module to export_dir so we don't need to specify - # the timestamp when loading the module. - # This is a workaround due to the current implementation of hub.LatestModuleExporter. - # This works for multiple hub modules. - hub_exported_modules = tf.io.gfile.listdir(path_exporter) - - backup_dir = os.path.join(export_dir, "backups", - datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - - for folder in hub_exported_modules: - hub_module_oldpath = os.path.join(path_exporter, folder) - hub_module_newpath = os.path.join(export_dir, folder) - - # If the destination already exists, move to backup - if tf.io.gfile.exists(hub_module_newpath): - # Ensure backup_dir exists - tf.io.gfile.makedirs(backup_dir) - hub_module_backup = os.path.join(backup_dir, folder) - tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - - tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - - # Since the timestamped folder exists but is empty, we can delete it. 
- tf.io.gfile.rmtree(path_exporter) - - def _is_on_gke(self) -> bool: - """Returns True if running on gke.""" - cluster = os.environ.get('TWML_JOB_CLUSTER') - if not cluster or cluster in {'smf1', 'atla'}: - return False - return True - - def _maybe_del_tsd_exit(self, state_files) -> None: - """Handle potential early exit and TwitterSetDeployment deletion. - - If: - - distributed training - - running GKE - - training is finished (all state_files exists) - we will exit early and not restart work - - If --distributed_training_cleanup = True then we will also handle - cleaning up the TwitterSetDeployments. - - Args: - state_files: A python list indicate state files to determine the finish - state of the job. - """ - # job type that is responsible for experiment tracking will remain alive - # until it marks the experiment as finished. - if self.experiment_tracker._env_eligible_for_recording_experiment: - exp_status = self.experiment_tracker.get_run_status() - if exp_status and exp_status not in {'Success', 'Failed'}: logging.info( - f"Not exiting early because experiment is still {exp_status}." + f"Exiting early because a _SUCCESS file already exists in {self._save_dir}" ) - return - - # do not bother if we are on prem - if not self._is_on_gke(): - logging.info("No need to exit early because running on prem.") - return - - states = [ - twml.util.file_exist_in_dir(self._save_dir, state_file) for state_file in state_files] - do_not_restart = (self._params.get('distributed') and all(states)) - if not do_not_restart: - return - - logging.info( - f"Exiting early because a _SUCCESS file already exists in {self._save_dir}") - if self._params.get('distributed_training_cleanup'): - resource_name = '-'.join([ - os.environ['TWML_JOB_NAME'], - os.environ['TWML_DISTRIBUTED_JOB_TYPE'], - os.environ['TWML_JOB_ENV'], - ]) - logging.info(f"Deleting TwitterSetDeployment {resource_name}") - # each job type will manage its own deletion so that deletion happens - # in the trainer init call for every job type - # otherwise we may kill another job type during an important - # process like experiment tracking management (handled by the evaluator - kubectl_delete_by_name( - zone=None, - namespace=os.environ['TWML_JOB_ROLE'], - resource_type=Resource.TWITTERSETDEPLOYMENTS.value, - resource_name=resource_name, - wait=False, - ) - sys.exit(0) - - def write_state_to_disk(self, save_dir, filename='_SUCCESS') -> None: - """Write state file to disk to indicate the state of training process. This is usually used - to mark the state of training progress and determine the start when job restarts/resumes. - Args: - save_dir: A str of local/gcs/hdfs dir to write the state file. - file_name: A str indicate the state file. Default to `_SUCCESS`. 
- """ - file_path = os.path.join(save_dir, filename) - if tf.io.gfile.exists(file_path): - tf.logging.warn(f'{file_path} already exist.') - return + if self._params.get("distributed_training_cleanup"): + resource_name = "-".join( + [ + os.environ["TWML_JOB_NAME"], + os.environ["TWML_DISTRIBUTED_JOB_TYPE"], + os.environ["TWML_JOB_ENV"], + ] + ) + logging.info(f"Deleting TwitterSetDeployment {resource_name}") + # each job type will manage its own deletion so that deletion happens + # in the trainer init call for every job type + # otherwise we may kill another job type during an important + # process like experiment tracking management (handled by the evaluator + kubectl_delete_by_name( + zone=None, + namespace=os.environ["TWML_JOB_ROLE"], + resource_type=Resource.TWITTERSETDEPLOYMENTS.value, + resource_name=resource_name, + wait=False, + ) + sys.exit(0) + + def write_state_to_disk(self, save_dir, filename="_SUCCESS") -> None: + """ + Write state file to disk to indicate the state of training process. This is usually used + to mark the state of training progress and determine the start when job restarts/resumes. - with tf.io.gfile.GFile(file_path, 'w') as f: - f.write('') \ No newline at end of file + Args: + save_dir: A str of local/gcs/hdfs dir to write the state file. + file_name: A str indicate the state file. Default to `_SUCCESS`. + """ + file_path = os.path.join(save_dir, filename) + if tf.io.gfile.exists(file_path): + tf.logging.warn(file_path + " already exist.") + return + + with tf.io.gfile.GFile(file_path, "w") as f: + f.write("") diff --git a/twml/twml/util.py b/twml/twml/util.py index cd7679a6f..271cb284b 100644 --- a/twml/twml/util.py +++ b/twml/twml/util.py @@ -3,940 +3,1073 @@ """ import argparse -from datetime import datetime import itertools import json import logging as _logging import os import re +from datetime import datetime +from typing import Any, Callable, Dict, Iterable, List, Optional, Union -from twitter.ml.common.resources import AuroraPath -from twitter.deepbird.hparam import HParams -from twitter.deepbird.io.util import ( - _get_feature_id, # noqa: F401 - feature_id, # noqa: F401 - preprocess_feature_regex, # noqa: F401 - preprocess_path, # noqa: F401 - sanitize_hdfs_path, # noqa: F401 - is_string, # noqa: F401 - list_files, # noqa: F401 - match_files, # noqa: F401 -) -from twitter.deepbird.io.legacy.util import ( - batch_apply, # noqa: F401 - boolean_mask, # noqa: F401 - fixed_length_tensor, # noqa: F401 -) -from twitter.deepbird.sparse.util import ( - convert_to_sparse, # noqa: F401 - limit_bits, # noqa: F401 -) - -from dateutil import rrule -from joblib import delayed, Parallel -from six import string_types - +import tensorflow.compat.v1 as tf from absl import logging +from dateutil import rrule +from joblib import Parallel, delayed from libtwml import CLIB, OPLIB # noqa: F401 -import tensorflow.compat.v1 as tf +from six import string_types from tensorflow.python.platform import tf_logging +from twitter.deepbird.hparam import HParams +from twitter.deepbird.io.legacy.util import batch_apply # noqa: F401 +from twitter.deepbird.io.legacy.util import boolean_mask # noqa: F401 +from twitter.deepbird.io.legacy.util import fixed_length_tensor # noqa: F401 +from twitter.deepbird.io.util import _get_feature_id # noqa: F401 +from twitter.deepbird.io.util import feature_id # noqa: F401 +from twitter.deepbird.io.util import is_string # noqa: F401 +from twitter.deepbird.io.util import list_files # noqa: F401 +from twitter.deepbird.io.util import match_files # noqa: 
F401 +from twitter.deepbird.io.util import preprocess_feature_regex # noqa: F401 +from twitter.deepbird.io.util import preprocess_path # noqa: F401 +from twitter.deepbird.io.util import sanitize_hdfs_path # noqa: F401 +from twitter.deepbird.sparse.util import convert_to_sparse # noqa: F401 +from twitter.deepbird.sparse.util import limit_bits # noqa: F401 +from twitter.ml.common.resources import AuroraPath + import twml from twml.feature_config import FeatureConfigBuilder - # big_prime is less than 2**32 # This just needs to be co-prime with powers of 2 # any large prime is sufficient, but it's not necessary. HASHING_PRIME = 2479700537 -def multiplicative_hash(input, hash_constant=HASHING_PRIME): - return input * hash_constant - - -def _return_tensors_from_checkpoint_folder(init_dir, model_name=None): - """Returns tensors list from a checkpoint folder - - Args: - init_dir: Name of the checkpoint directory. - model_name: the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000) if set to None it will default to the - latest model saved in the checkpont file. - - """ - if model_name is None: - # gets the most recently generated model.cpkt file - model_path = tf.train.latest_checkpoint(init_dir) - if model_path is None: - raise ValueError("Could not find a valid model checkpoint inside the directory") - else: - model_path = os.path.join(init_dir, model_name) - reader = tf.train.NewCheckpointReader(model_path) - try: - return (reader.debug_string().decode("utf-8")) - except OSError: - logging.error('Could not decode the string') - - -def get_scope_dict(init_dir, incoming_scope_name, current_scope_name, model_name=None): - """Returns tensors map from a checkpoint file. - - Args: - file_name: - Name of the checkpoint directory. - incoming_scope_name: - scope name of the previous phase - current_scope_name: - scope name of current phase - model_name: - the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000) if set to None it will default - to the latest model saved in the checkpoint file. - Returns: - init_map: - init_map which will be inputted to the checkpoint - """ - init_map = {} - reader_dump = _return_tensors_from_checkpoint_folder(init_dir=init_dir, - model_name=model_name).splitlines() - for member in reader_dump: - # remove global_step since it is not necessary - if 'global_step' not in member: - saved_variables = str(member.split(" ")[0]) - saved_scope = saved_variables.rsplit('/', 1)[0] + "/" - new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1) - # create key in init_map - if saved_scope not in init_map.keys(): # pylint: disable=dict-keys-not-iterating - init_map[saved_scope] = new_scope - return init_map +def multiplicative_hash(input: int, hash_constant: int = HASHING_PRIME) -> int: + return input * hash_constant + + +def _return_tensors_from_checkpoint_folder( + init_dir: str, model_name: Optional[str] = None +) -> Optional[str]: + """Returns tensors list from a checkpoint folder + + Args: + init_dir: Name of the checkpoint directory. + model_name: the model which we will use to obtain the checkpoint + (e.g. model.ckpt-50000) if set to None it will default to the + latest model saved in the checkpoint file. 
+ + Returns: + debug_string (str): + debug string of the checkpoint file + """ + if model_name is None: + # gets the most recently generated model.ckpt file + model_path = tf.train.latest_checkpoint(init_dir) + if model_path is None: + raise ValueError( + "Could not find a valid model checkpoint inside the directory" + ) + else: + model_path = os.path.join(init_dir, model_name) + reader = tf.train.NewCheckpointReader(model_path) + try: + return reader.debug_string().decode("utf-8") + except OSError: + logging.error("Could not decode the string") + + +def get_scope_dict( + init_dir: str, + incoming_scope_name: str, + current_scope_name: str, + model_name: Optional[str] = None, +) -> Dict[str, str]: + """Returns tensors map from a checkpoint file. + + Args: + init_dir: str + Name of the checkpoint directory. + incoming_scope_name: str + scope name of the previous phase + current_scope_name: str + scope name of current phase + model_name: str + the model which we will use to obtain the checkpoint + (e.g. model.ckpt-50000) if set to None it will default + to the latest model saved in the checkpoint file. + Returns: + init_map (dict): + init_map which will be passed to the checkpoint initialization + """ + init_map = {} + reader_dump = _return_tensors_from_checkpoint_folder( + init_dir=init_dir, model_name=model_name + ).splitlines() + for member in reader_dump: + # remove global_step since it is not necessary + if "global_step" not in member: + saved_variables = str(member.split(" ")[0]) + saved_scope = saved_variables.rsplit("/", 1)[0] + "/" + new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1) + # create key in init_map + if ( + saved_scope not in init_map.keys() + ): # pylint: disable=dict-keys-not-iterating + init_map[saved_scope] = new_scope + return init_map def get_init_map( + init_from_dir: str, + exclude_var_names: Optional[List[str]] = None, + exclude_name_scopes: Optional[List[str]] = None, + name_scope_to_remove: Optional[str] = None, + name_scope_to_prepend: Optional[str] = None, +) -> Dict[str, str]: + """ + Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). + + It assumes that the latter part of the variable names are consistent between the checkpoint and + the new model, but their name_scopes may be different. If the checkpoint model has variable names + of the form old/scope/var/foo, and the corresponding variable names for the new model should be + my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and + name_scope_to_prepend = 'my/new/'. + + This function can be used to + + 1. Generate an ``init_map`` that can be passed to the ``Trainer`` init, or + 2. Generate an ``init_map`` directly inside ``build_graph_fn`` and pass it to + ``tf.train.init_from_checkpoint`` there, in which case you do not also need to specify the + ``init_map`` argument to the trainer. + + Parameters + ---------- + init_from_dir: str + Directory containing checkpoint + exclude_var_names: list[str] + List of variables in the checkpoint that should be excluded from the map. + exclude_name_scopes: list[str] + List of name_scopes in the checkpoint model that should be excluded from the map. + name_scope_to_remove: str + portion of name_scope for checkpoint variables that should not be included in variable names + for new model. + name_scope_to_prepend: str + name_scope to prepend to variable names in checkpoint to give variable names for new model.
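The scope-to-scope map returned by `get_scope_dict` is consumed by `tf.train.init_from_checkpoint`; a minimal sketch with hypothetical directories and scope names:

```python
import tensorflow.compat.v1 as tf

init_dir = "hdfs://default/user/me/phase1_ckpt"  # hypothetical

# Map every phase1/* scope in the checkpoint onto phase2/* in the new graph.
init_map = get_scope_dict(
    init_dir,
    incoming_scope_name="phase1",
    current_scope_name="phase2",
)
# Call inside the new model's graph, after its variables are defined.
tf.train.init_from_checkpoint(init_dir, init_map)
```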
+ + Returns + ------- + dict + keys are variable names in the checkpoint and values are variable names in the new model, + into which the checkpoint parameters should be loaded. + """ + vars_to_restore = get_checkpoint_variable_names( init_from_dir, - exclude_var_names=None, - exclude_name_scopes=None, - name_scope_to_remove=None, - name_scope_to_prepend=None): - """ - Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). - - It assumes that the latter part of the variable names are consistent between the checkpoint and - the new model, but their name_scopes may be different. If the checkpoint model has variable names - of the form old/scope/var/foo, and the corresponding variable names for the new model should be - my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and - name_scope_to_prepend = 'my/new/'. - - This function can be used to - - 1. Generate an ``init_map`` map that can be passed to the ``Trainer`` init or - 2. Used to generate an ``init_map`` directly inside ``build_graph_fn``, in - which case it should be passed directly to ``tf.train.init_from_checkpoint`` inside - ``build_graph_fn``, in which case you do not also need to specify the ``init_map`` argument to - the trainer. - - Parameters - ---------- - init_from_dir: Directory containing checkpoint - exclude_var_names: list[str] - List of variables in the checkpoint that should be excluded from the map. - exclude_name_scopes: list[str] - List of name_scopes in the checkpoint model that should be excluded from the map. - name_scope_to_remove: str - portion of name_scope for checkpoint variables that should not be included in variable names - for new model. - name_scope_to_prepend: str - name_scope to prepend to variable names in checkpoint to give variable names for new model. - - Returns - ------- - dict - keys are variable names in the checkpoint and values are variable names in the new model, - into which the checkpoint parameters should be loaded. - """ - vars_to_restore = get_checkpoint_variable_names( - init_from_dir, - exclude_var_names=exclude_var_names, - exclude_scopes=exclude_name_scopes, - ) - - if name_scope_to_prepend is not None: - if not name_scope_to_prepend.endswith('/'): - name_scope_to_prepend += '/' - - if name_scope_to_remove is not None: - if not name_scope_to_remove.endswith('/'): - name_scope_to_remove += '/' - - init_map = {} - - for var_name in vars_to_restore: - var_name_checkpoint = var_name - - if name_scope_to_remove is not None: - var_name = var_name.replace(name_scope_to_remove, '') - - var_name_new_model = var_name + exclude_var_names=exclude_var_names, + exclude_scopes=exclude_name_scopes, + ) if name_scope_to_prepend is not None: - var_name_new_model = name_scope_to_prepend + var_name_new_model - - init_map[var_name_checkpoint] = var_name_new_model - - return init_map - - -def get_checkpoint_variable_names(model_dir, exclude_var_names=None, exclude_scopes=None): - """ - Gets a list of variable names from the latest checkpoint in model_dir. - Removes variables with scope defined by exclude_scopes, and/or with names defined by - exclude_var_names. 
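The docstring's `old/scope/var/foo` example, traced through the two string operations `get_init_map` applies to each checkpoint variable name:

```python
name_scope_to_remove = "old/"
name_scope_to_prepend = "my/new/"

ckpt_name = "old/scope/var/foo"
new_name = name_scope_to_prepend + ckpt_name.replace(name_scope_to_remove, "")
assert new_name == "my/new/scope/var/foo"
```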
- - Args: - model_dir (str): Directory containing checkpoint file for the pre-trained model - exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) - exclude_scopes (list): Optional scopes to exclude - - Returns: - list: variable names - """ - checkpoint_path = tf.train.latest_checkpoint(model_dir) - variables_and_shapes = tf.train.list_variables(checkpoint_path) - - def _keep(name): - if exclude_scopes and any(name.startswith(exc_scope) for exc_scope in exclude_scopes): - return False - if exclude_var_names and any(name.endswith(exc_var) for exc_var in exclude_var_names): - return False - return True - - names = [x[0] for x in variables_and_shapes if _keep(x[0])] - - return names - - -def to_snake_case(name): - """ - Changes name to snake case - """ - intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) - insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() - # If the class is private the name starts with "_" which is not secure - # for creating scopes. We prefix the name with "private" in this case. - if insecure[0] != '_': - return insecure - return 'private' + insecure - - -def copy_phase_inputs(init_dir, dest_dir): - """Automatically copies the .json.tf from the init_dir to save_dir - so we can load multiple parameters at the same time. - - Args: - init_dir: - Name of the checkpoint directory. - dest_dir: - Name of the output directory. - """ - if init_dir is not None: - # we are using tf.io.gfile so we can use it with both local and hdfs paths - for files in tf.io.gfile.listdir(init_dir): - if files.endswith(".json.tf"): - src_file = os.path.join(init_dir, files) - dest_file = os.path.join(dest_dir, files) - if not tf.io.gfile.exists(dest_dir): - # creates the folder - try: - tf.io.gfile.makedirs(dest_dir) - # to prevent racing condition - except OSError: - if not tf.io.gfile.isdir(dest_dir): - raise - # dest_file may be old if it exists and - # dest_file gets copied several times in distributed training - tf.io.gfile.copy(src_file, dest_file, overwrite=True) - - -def rehash_sparse_features_nbits(sp_a, nbits, hash_fn=multiplicative_hash): - """ - Rehash the feature ids of the sparse tensor, - and limit the output to n bits. - - This is useful for making the distribution of - feature_ids more uniform, which may improve performance - in some situations. - - This would typically be used on the output of - PercentileDiscretizer, since it assigns many - bins to low-valued output feature ids. - - Input feature IDs should take values less than 2**32, - and nbits should be less than 32 - - Args: - sp_a: - a tf.SparseTensor object - nbits: - integer number of bits to mask output feature_ids - hash_fn: - Function that takes integer values and returns hashes of these values. - The output does not need to be masked to the desired number of bits, - as this masking will be taken care of. Default value = multiplicative_hash. - - Returns: - a new tf.SparseTensor - """ - - feature_ids = sp_a.indices[:, 1] - feature_ids = hash_fn(feature_ids) - - sample_ids = sp_a.indices[:, 0] - values = sp_a.values - dense_shape = sp_a.dense_shape - - indices = tf.stack([sample_ids, feature_ids], axis=1) - - sp_a = tf.SparseTensor(indices, values, dense_shape) - - # note - we need 2**nbits >= batch size - # otherwise, sample_ids will be squashed by the mask. - return limit_sparse_tensor_size(sp_a, nbits) - - -def convert_to_hparams(opt): - """ - Converts argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams. 
- Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported - tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hapram.HParams. - - NOTE: If you are using estimators, please don't call this method and directly pass python dict - to TensorFlow estimator. Starting TensorFlow 2.0, Estimator will only accept dicts. - """ - - # Convert to dict so we can iterate through it cleanly. - if isinstance(opt, argparse.Namespace): - params_dict = vars(opt) - elif isinstance(opt, dict): - params_dict = opt - elif isinstance(opt, HParams): - logging.warning('If you are using Estimator, please pass python dict directly to Estimator.') - params_dict = opt.values() - else: - raise ValueError("Input can not be of type %s. " - "It can be one of { argparse.Namespace, dict, " - "twitter.deepbird.hparam.HParams}." - % type(opt)) - - params = HParams() - # Hack to convert all parameters from hdfs:/// format to hdfs://default/ - # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical. - for key, val in params_dict.items(): - val = params_dict[key] - # Fix the path if the value is a string - if isinstance(val, str): - params.add_hparam(key, sanitize_hdfs_path(val)) - else: - params.add_hparam(key, val) - - return params + if not name_scope_to_prepend.endswith("/"): + name_scope_to_prepend += "/" - -def dynamic_partition(features, partitions, num_partitions=2, name=None): - """ - Partitions each of the tensor in features using the provided mask. - - Args: - features: - A single tensor or an iterable of tensors (list, tuple, dict) - partitions: - A bool or integer tensor representing the partitions. - - Returns partitioned outputs as a list. Each element of the list is the same type as features. - - This uses tf.dynamic_partition but adds the following niceties: - - features can be a list or dict of different tensor types. - - only a partition tensor is used to partition all the feature tensors recursively. - - the partition tensor is automatically converted into an integer tensor. - - defaults to num_partitions == 2 - """ - - if not isinstance(features, (dict, list, tuple, tf.Tensor)): - raise AssertionError("features container must be a dict, list, or tuple, tf.Tensor") - - if isinstance(partitions, tf.Tensor): - partitions = tf.cast(partitions, tf.int32) - - if isinstance(features, tf.Tensor): - return tf.dynamic_partition(features, partitions, num_partitions, name) - - outputs = [] - for _ in range(num_partitions): - if isinstance(features, (tuple, list)): - # Create an empty list of lists first, will be converted to right type afterwards. - outputs.append([None for _ in range(len(features))]) + if name_scope_to_remove is not None: + if not name_scope_to_remove.endswith("/"): + name_scope_to_remove += "/" + + init_map = {} + + for var_name in vars_to_restore: + var_name_checkpoint = var_name + + if name_scope_to_remove is not None: + var_name = var_name.replace(name_scope_to_remove, "") + + var_name_new_model = var_name + + if name_scope_to_prepend is not None: + var_name_new_model = name_scope_to_prepend + var_name_new_model + + init_map[var_name_checkpoint] = var_name_new_model + + return init_map + + +def get_checkpoint_variable_names( + model_dir: str, + exclude_var_names: Optional[List[str]] = None, + exclude_scopes: Optional[List[str]] = None, +) -> List[str]: + """ + Gets a list of variable names from the latest checkpoint in model_dir. 
+ Removes variables with scope defined by exclude_scopes, and/or with names defined by + exclude_var_names. + + Args: + model_dir (str): Directory containing checkpoint file for the pre-trained model + exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) + exclude_scopes (list): Optional scopes to exclude + + Returns: + list: variable names + """ + checkpoint_path = tf.train.latest_checkpoint(model_dir) + variables_and_shapes = tf.train.list_variables(checkpoint_path) + + def _keep(name: str) -> bool: + if exclude_scopes and any( + name.startswith(exc_scope) for exc_scope in exclude_scopes + ): + return False + if exclude_var_names and any( + name.endswith(exc_var) for exc_var in exclude_var_names + ): + return False + return True + + names = [str(x[0]) for x in variables_and_shapes if _keep(x[0])] + + return names + + +def to_snake_case(name: str) -> str: + """ + Changes name to snake case + """ + intermediate = re.sub("(.)([A-Z][a-z0-9]+)", r"\1_\2", name) + insecure = re.sub("([a-z])([A-Z])", r"\1_\2", intermediate).lower() + # If the class is private the name starts with "_" which is not secure + # for creating scopes. We prefix the name with "private" in this case. + if insecure[0] != "_": + return insecure + return "private" + insecure + + +def copy_phase_inputs(init_dir: str, dest_dir: str): + """Automatically copies the .json.tf from the init_dir to save_dir + so we can load multiple parameters at the same time. + + Args: + init_dir (str): + Name of the checkpoint directory. + dest_dir (str): + Name of the output directory. + """ + if init_dir is not None: + # we are using tf.io.gfile so we can use it with both local and hdfs paths + for files in tf.io.gfile.listdir(init_dir): + if files.endswith(".json.tf"): + src_file = os.path.join(init_dir, files) + dest_file = os.path.join(dest_dir, files) + if not tf.io.gfile.exists(dest_dir): + # creates the folder + try: + tf.io.gfile.makedirs(dest_dir) + # to prevent racing condition + except OSError: + if not tf.io.gfile.isdir(dest_dir): + raise + # dest_file may be old if it exists and + # dest_file gets copied several times in distributed training + tf.io.gfile.copy(src_file, dest_file, overwrite=True) + + +def rehash_sparse_features_nbits( + sp_a: tf.SparseTensor, + nbits: int, + hash_fn: Callable[[int], int] = multiplicative_hash, +) -> tf.SparseTensor: + """ + Rehash the feature ids of the sparse tensor, + and limit the output to n bits. + + This is useful for making the distribution of + feature_ids more uniform, which may improve performance + in some situations. + + This would typically be used on the output of + PercentileDiscretizer, since it assigns many + bins to low-valued output feature ids. + + Input feature IDs should take values less than 2**32, + and nbits should be less than 32 + + Args: + sp_a: + a tf.SparseTensor object + nbits: + integer number of bits to mask output feature_ids + hash_fn: + Function that takes integer values and returns hashes of these values. + The output does not need to be masked to the desired number of bits, + as this masking will be taken care of. Default value = multiplicative_hash. 
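The exclusion semantics of `get_checkpoint_variable_names` reduce to a prefix test on scopes and a suffix test on variable names; a pure-Python illustration on hypothetical names:

```python
names = ["encoder/kernel", "encoder/bias", "head/kernel", "global_step"]
exclude_scopes = ["global_step"]  # matched as a name prefix
exclude_var_names = ["/bias"]     # matched as a name suffix

kept = [
    n
    for n in names
    if not any(n.startswith(scope) for scope in exclude_scopes)
    and not any(n.endswith(var) for var in exclude_var_names)
]
assert kept == ["encoder/kernel", "head/kernel"]
```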
+ + Returns: + a new tf.SparseTensor + """ + + feature_ids = sp_a.indices[:, 1] + feature_ids = hash_fn(feature_ids) + + sample_ids = sp_a.indices[:, 0] + values = sp_a.values + dense_shape = sp_a.dense_shape + + indices = tf.stack([sample_ids, feature_ids], axis=1) + + sp_a = tf.SparseTensor(indices, values, dense_shape) + + # note - we need 2**nbits >= batch size + # otherwise, sample_ids will be squashed by the mask. + return limit_sparse_tensor_size(sp_a, nbits) + + +def convert_to_hparams(opt: Union[argparse.Namespace, dict, HParams]) -> HParams: + """ + Converts argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams. + Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported + tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hparam.HParams. + + NOTE: If you are using estimators, please don't call this method and directly pass python dict + to TensorFlow estimator. Starting TensorFlow 2.0, Estimator will only accept dicts. + """ + + # Convert to dict so we can iterate through it cleanly. + if isinstance(opt, argparse.Namespace): + params_dict = vars(opt) + elif isinstance(opt, dict): + params_dict = opt + elif isinstance(opt, HParams): + logging.warning( + "If you are using Estimator, please pass python dict directly to Estimator." + ) + params_dict = opt.values() else: - outputs.append(dict()) + raise ValueError( + "Input can not be of type %s. " + "It can be one of { argparse.Namespace, dict, " + "twitter.deepbird.hparam.HParams}." % type(opt) + ) + + params = HParams() + # Hack to convert all parameters from hdfs:/// format to hdfs://default/ + # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical. + for key, val in params_dict.items(): + val = params_dict[key] + # Fix the path if the value is a string + if isinstance(val, str): + params.add_hparam(key, sanitize_hdfs_path(val)) + else: + params.add_hparam(key, val) + + return params + + +def dynamic_partition( + features: Iterable, + partitions: tf.Tensor, + num_partitions: int = 2, + name: Optional[str] = None, +) -> list: + """ + Partitions each of the tensors in features using the provided mask. + + Args: + features: + A single tensor or an iterable of tensors (list, tuple, dict) + partitions: + A bool or integer tensor representing the partitions. + + Returns partitioned outputs as a list. Each element of the list is the same type as features. + + This uses tf.dynamic_partition but adds the following niceties: + - features can be a list or dict of different tensor types. + - only a partition tensor is used to partition all the feature tensors recursively. + - the partition tensor is automatically converted into an integer tensor.
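`rehash_sparse_features_nbits` composes the multiplicative hash defined earlier in this module with the bit mask that `limit_sparse_tensor_size` applies; the integer arithmetic, sketched outside TensorFlow:

```python
HASHING_PRIME = 2479700537  # co-prime with powers of 2, as in this module

def multiplicative_hash(x, hash_constant=HASHING_PRIME):
    return x * hash_constant

nbits = 22
mask = (1 << nbits) - 1  # the mask twml.limit_bits applies to tensor indices

feature_id = 7
rehashed = multiplicative_hash(feature_id) & mask  # keep the low nbits bits
assert 0 <= rehashed < (1 << nbits)
```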
+ - defaults to num_partitions == 2 + """ + + if not isinstance(features, (dict, list, tuple, tf.Tensor)): + raise AssertionError( + "features container must be a dict, list, tuple, or tf.Tensor" + ) - iterable = features.items() if isinstance(features, dict) else enumerate(features) - - # Handling partitions of nested classes handled here: - # Recursively call dynamic_partition for containers - for key, feature in iterable: - name_key = None if name is None else name + "_" + str(key) if isinstance(partitions, tf.Tensor): - results = tf.dynamic_partition(feature, partitions, num_partitions, name_key) - else: - results = tf.dynamic_partition(feature, partitions[key], num_partitions[key], name_key) - # Append the result to the proper output container - for idx, result in enumerate(results): - outputs[idx][key] = result - - # if input is tuple, convert list of lists back to list of tuples - if isinstance(features, tuple): - outputs = [type(features)(output) for output in outputs] - - return outputs - - -def write_file(filename, contents, encode=False): - ''' - Optionally encodes contents and writes contents to a file. - - Arguments: - filename: - path to file where the contents will be saved. - Accepts HDFS and local paths. - contents: - contents to save to the file. - Must be a string when encode is False. - encode: - False | 'json'. When encode='json', contents is encoded - with json.dumps. - ''' - if encode == 'json': - contents = json.dumps(contents) - elif not is_string(contents): - raise ValueError("Expecting string for encode=False") - - graph = tf.Graph() - with graph.as_default(): - write = tf.write_file(filename, contents) - - with tf.Session(graph=graph) as sess: - sess.run(write) - - -def read_file(filename, decode=False): - ''' - Reads contents from a file and optionally decodes it. - - Arguments: - filename: - path to file where the contents will be loaded from. - Accepts HDFS and local paths. - decode: - False | 'json'. When decode='json', contents is decoded - with json.loads. When False, contents is returned as is. - - Returns: - contents - ''' - graph = tf.Graph() - with graph.as_default(): - read = tf.read_file(filename) - - with tf.Session(graph=graph) as sess: - contents = (sess.run(read)) - # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str - if not isinstance(contents, str): - contents = contents.decode() - - if decode == 'json': - contents = json.loads(contents) - - return contents - -def setup_tf_logging_formatter(): - formatter = _logging.Formatter( - '%(asctime)s [%(levelname)s] %(name)s: %(message)s', - None) - # Setting up absl logging verbosity - logging.set_verbosity('info') - logging.set_stderrthreshold('info') - logging.get_absl_handler().setFormatter(formatter) - tf.logging.set_verbosity(tf.logging.INFO) - # Set tensorflow logging handler format - if len(tf_logging.get_logger().handlers) > 0: - tf_logging.get_logger().handlers[0].setFormatter(formatter) - - -def set_tensorflow_log_level(log_level): - """ - Sets tensorflow's default logging level. - - 0. all logs are shown. - 1. filter out INFO logs. - 2. filter out WARNINGs and INFOs. - 3. filter out ERRORs, WARNINGs, and INFOs. - - Note that tf.Print output are INFO logs, so setting log_level above 0 would hide - output from tf.Print.
- """ - assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 - os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(log_level) - - -def weighted_average(values, weights): - """ - Compute a weighted average using the given values and weights. - E.g. this is usually used to compute a weighted loss given sample weights. - """ - return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) - - -def backup_checkpoint(checkpoint_path_prefix, - backup_path='backup', - empty_backup=True): - """ - Creates a backup copy of a checkpoint in backup_dir. - This function is used by the Trainer for early-stopping. - - Arguments: - checkpoint_path_prefix: - Prefix of the path to the checkpoint files. - backup_path: - path to a directory where checkpoint files will be backed up. - empty_backup: - When True (the default), the current contents of the backup directory - are removed before the backup is performed. - - Returns: - The number of backed up files. - """ - checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) - - if tf.io.gfile.exists(backup_path) and empty_backup: - tf.io.gfile.rmtree(backup_path) - - tf.io.gfile.mkdir(backup_path) - - n_backup = 0 - # copy all checkpoint files to backup directory (TODO use gfile.glob instead) - try: - checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") - if len(checkpoint_files) == 0: - raise twml.errors.CheckpointNotFoundError("%s not found" % checkpoint_path_prefix) - for filename in checkpoint_files: - n_backup += 1 - tf.io.gfile.copy( - src=filename, - dst=os.path.join(backup_path, os.path.basename(filename)) - ) - except tf.errors.OpError as ex: - raise twml.errors.CheckpointNotFoundError( - f"{str(ex)}\n {checkpoint_path_prefix} not found." + partitions = tf.cast(partitions, tf.int32) + + if isinstance(features, tf.Tensor): + return tf.dynamic_partition(features, partitions, num_partitions, name) + + outputs = [] + for _ in range(num_partitions): + if isinstance(features, (tuple, list)): + # Create an empty list of lists first, will be converted to right type afterwards. + outputs.append([None for _ in range(len(features))]) + else: + outputs.append(dict()) + + iterable = features.items() if isinstance(features, dict) else enumerate(features) + + # Handling partitions of nested classes handled here: + # Recursively call dynamic_partition for containers + for key, feature in iterable: + name_key = None if name is None else name + "_" + str(key) + if isinstance(partitions, tf.Tensor): + results = tf.dynamic_partition( + feature, partitions, num_partitions, name_key + ) + else: + results = tf.dynamic_partition( + feature, partitions[key], num_partitions[key], name_key + ) + # Append the result to the proper output container + for idx, result in enumerate(results): + outputs[idx][key] = result + + # if input is tuple, convert list of lists back to list of tuples + if isinstance(features, tuple): + outputs = [type(features)(output) for output in outputs] + + return outputs + + +def write_file(filename: str, contents: str, encode: bool = False) -> None: + """ + Optionally encodes contents and writes contents to a file. + + Args: + filename: + path to file where the contents will be saved. + Accepts HDFS and local paths. + contents: + contents to save to the file. + Must be a string when encode is False. + encode: + False | 'json'. When encode='json', contents is encoded + with json.dumps. 
+ """ + if encode == "json": + contents = json.dumps(contents) + elif not is_string(contents): + raise ValueError("Expecting string for encode=False") + + graph = tf.Graph() + with graph.as_default(): + write = tf.write_file(filename, contents) + + with tf.Session(graph=graph) as sess: + sess.run(write) + + +def read_file(filename: str, decode: bool = False) -> str: + """ + Reads contents from a file and optionally decodes it. + + Args: + filename: + path to file where the contents will be loaded from. + Accepts HDFS and local paths. + decode: + False | 'json'. When decode='json', contents is decoded + with json.loads. When False, contents is returned as is. + + Returns: + contents + """ + graph = tf.Graph() + with graph.as_default(): + read = tf.read_file(filename) + + with tf.Session(graph=graph) as sess: + contents = sess.run(read) + # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str + if not isinstance(contents, str): + contents = contents.decode() + + if decode == "json": + contents = json.loads(contents) + + return contents + + +def setup_tf_logging_formatter() -> None: + formatter = _logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s", None ) - - # tf.train.latest_checkpoint needs the 'checkpoint' file. - with tf.io.gfile.GFile(os.path.join(backup_path, 'checkpoint'), 'w') as f: - f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) - - return n_backup - - -def set_only_checkpoint(source_path, dest_path, remove_source=True): - """ - Removes the checkpoint and model.ckpt* files from dest_path. - Moves the latest checkpoint from source_path to dest_path. - - Arguments: - source_path: - path to directory containing the latest checkpoint. - Should contain a valid checkpoint file and model.ckpt files. - For early-stopping, this should be the save_dir/best_checkpoint dir. - dest_path: - path to directory where the latest checkpoint files will be moved. - All its checkpoint and model.ckpt* files will be removed. - For early-stopping, this should be the save_dir. - remove_source: - When True (the default), deletes the source directory. - Note that even when False, its checkpoint files are moved to - dest_path anyway. - This deletes the source directory (and any remaining contents). - """ - # make it so that source_path checkpoint is the only checkpoint - source_path_prefix = tf.train.latest_checkpoint(source_path) - if source_path_prefix is not None: - # remove intermediate checkpoints - for filename in tf.io.gfile.listdir(dest_path): - if filename.startswith("model.ckpt"): - tf.io.gfile.Remove(os.path.join(dest_path, filename)) - # move contents of source_path to dest_path - for filename in tf.io.gfile.listdir(source_path): - tf.io.gfile.rename( - oldname=os.path.join(source_path, filename), - newname=os.path.join(dest_path, filename), - overwrite=True) # overwrite "checkpoint" file - # delete the source_path dir - if remove_source: - tf.io.gfile.rmtree(source_path) + # Setting up absl logging verbosity + logging.set_verbosity("info") + logging.set_stderrthreshold("info") + logging.get_absl_handler().setFormatter(formatter) + tf.logging.set_verbosity(tf.logging.INFO) + # Set tensorflow logging handler format + if len(tf_logging.get_logger().handlers) > 0: + tf_logging.get_logger().handlers[0].setFormatter(formatter) + + +def set_tensorflow_log_level(log_level: object) -> None: + """ + Sets tensorflow's default logging level. + + 0. all logs are shown. + 1. filter out INFO logs. + 2. 
filter out WARNINGs and INFOs. + 3. filter out ERRORs, WARNINGs, and INFOs. + + Note that tf.Print output are INFO logs, so setting log_level above 0 would hide + output from tf.Print. + """ + assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 + os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(log_level) + + +def weighted_average(values, weights) -> tf.Tensor: + """ + Compute a weighted average using the given values and weights. + E.g. this is usually used to compute a weighted loss given sample weights. + """ + return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) + + +def backup_checkpoint( + checkpoint_path_prefix: str, + backup_path: str = "backup", + empty_backup: bool = True, +) -> int: + """ + Creates a backup copy of a checkpoint in backup_dir. + This function is used by the Trainer for early-stopping. + + Args: + checkpoint_path_prefix: + Prefix of the path to the checkpoint files. + backup_path: + path to a directory where checkpoint files will be backed up. + empty_backup: + When True (the default), the current contents of the backup directory + are removed before the backup is performed. + + Returns: + The number of backed up files. + """ + checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) + + if tf.io.gfile.exists(backup_path) and empty_backup: + tf.io.gfile.rmtree(backup_path) + + tf.io.gfile.mkdir(backup_path) + + n_backup = 0 + # copy all checkpoint files to backup directory (TODO use gfile.glob instead) + try: + checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") + if len(checkpoint_files) == 0: + raise twml.errors.CheckpointNotFoundError( + "%s not found" % checkpoint_path_prefix + ) + for filename in checkpoint_files: + n_backup += 1 + tf.io.gfile.copy( + src=filename, dst=os.path.join(backup_path, os.path.basename(filename)) + ) + except tf.errors.OpError as ex: + raise twml.errors.CheckpointNotFoundError( + f"{str(ex)}\n {checkpoint_path_prefix} not found." + ) + + # tf.train.latest_checkpoint needs the 'checkpoint' file. + with tf.io.gfile.GFile(os.path.join(backup_path, "checkpoint"), "w") as f: + f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) + + return n_backup + + +def set_only_checkpoint( + source_path: str, + dest_path: str, + remove_source: bool = True, +) -> None: + """ + Removes the checkpoint and model.ckpt* files from dest_path. + Moves the latest checkpoint from source_path to dest_path. + + Args: + source_path: + path to directory containing the latest checkpoint. + Should contain a valid checkpoint file and model.ckpt files. + For early-stopping, this should be the save_dir/best_checkpoint dir. + dest_path: + path to directory where the latest checkpoint files will be moved. + All its checkpoint and model.ckpt* files will be removed. + For early-stopping, this should be the save_dir. + remove_source: + When True (the default), deletes the source directory. + Note that even when False, its checkpoint files are moved to + dest_path anyway. + This deletes the source directory (and any remaining contents). 
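Together, `backup_checkpoint` and `set_only_checkpoint` (defined next) implement the Trainer's early-stopping bookkeeping; a usage sketch with hypothetical paths:

```python
# When evaluation improves, snapshot the current best checkpoint:
n_files = backup_checkpoint(
    checkpoint_path_prefix="/models/run1/model.ckpt-4200",
    backup_path="/models/run1/best_checkpoint",
)

# At the end of training, promote the best checkpoint back to save_dir:
set_only_checkpoint(
    source_path="/models/run1/best_checkpoint",
    dest_path="/models/run1",
)
```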
+ """ + # make it so that source_path checkpoint is the only checkpoint + source_path_prefix = tf.train.latest_checkpoint(source_path) + if source_path_prefix is not None: + # remove intermediate checkpoints + for filename in tf.io.gfile.listdir(dest_path): + if filename.startswith("model.ckpt"): + tf.io.gfile.Remove(os.path.join(dest_path, filename)) + # move contents of source_path to dest_path + for filename in tf.io.gfile.listdir(source_path): + tf.io.gfile.rename( + oldname=os.path.join(source_path, filename), + newname=os.path.join(dest_path, filename), + overwrite=True, + ) # overwrite "checkpoint" file + # delete the source_path dir + if remove_source: + tf.io.gfile.rmtree(source_path) def list_files_by_datetime( - base_path, - start_datetime, - end_datetime=None, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1, - hour_resolution=1, - sort=False + base_path: str, + start_datetime: Optional[datetime] = None, + end_datetime: Optional[datetime] = None, + datetime_prefix_format: str = "%Y/%m/%d/%H", + extension: str = "lzo", + parallelism: int = 1, + hour_resolution: int = 1, + sort: bool = False, +) -> List[str]: + """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. + + Args: + base_path: + The base path. If `None`, returns `None`. + start_datetime: + A `datetime.datetime` or string representing the start of the range (inclusive). + If `None`, it returns `list_files(base_path, extension, sort)`. + end_datetime: + A `datetime.datetime` or string representing the end of the range (inclusive). + If `None`, assumed to be the same as start_datetime. + datetime_prefix_format: + Format compatible with `datetime.datetime.strftime` + (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). + extension: + The extension of the files composing the dataset (e.g. 'lzo'). + parallelism: + The number of threads used to process list patterns (this is mostly useful + when dealing with filesystems such as HDFS in which listing files is a potentially expensive + operation). + hour_resolution: + The separation between consecutive hours. The default value is 1. + sort: + bool, whether to return a sorted list of files. Default False. + + Returns: + A list with all the matching files. + + Raises: + errors.OpError: If there are filesystem / directory listing errors. + """ + if hour_resolution is None: + hour_resolution = 1 + + if base_path is None: + return None + + if start_datetime is None: + return list_files(base_path, extension, sort) + + # Do this in case people want to use a single day for training. 
+ if end_datetime is None: + end_datetime = start_datetime + + assert parallelism > 0 + assert start_datetime <= end_datetime + + if isinstance(start_datetime, str): + start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) + + if isinstance(end_datetime, str): + end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) + + assert isinstance(start_datetime, datetime) + assert isinstance(end_datetime, datetime) + + base_path = preprocess_path(base_path) + + def _handle_missing_globs(pattern: str) -> List[str]: + try: + return tf.io.gfile.glob(pattern) + except tf.errors.NotFoundError as e: + tf.logging.warning(e.message) + return [] + + # a set is used because there might be some repeated globs depending on dt_prefix_format + globs = { + os.path.join(base_path, dt.strftime(datetime_prefix_format), "*.%s" % extension) + for dt in rrule.rrule( + freq=rrule.HOURLY, + interval=hour_resolution, + dtstart=start_datetime, + until=end_datetime, + ) + } + nested_files = Parallel(n_jobs=parallelism, backend="threading")( + delayed(_handle_missing_globs)(p) for p in globs + ) + flattened_files = list(itertools.chain.from_iterable(nested_files)) + + if not flattened_files: + error_msg = f"Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}" + raise OSError(error_msg) + + if sort: + flattened_files = sorted(flattened_files) + + return flattened_files + + +def limit_sparse_tensor_size( + sparse_tf: Union[twml.SparseTensor, tf.SparseTensor], + input_size_bits: int, + mask_indices: bool = True, +) -> tf.SparseTensor: + """ + Returns a ``tf.SparseTensor`` which is the input SparseTensor + limited to the specified input_size_bits + + Args: + sparse_tf: + twml.SparseTensor or tf.SparseTensor + input_size_bits: + The number of bits allocated to the input size. + Input size will be power(2,input_size_bits). + Note that twml.limit_bits truncates any feature keys that + exceed the input size. + mask_indices: + If mask indices is False; only the shape is changed. Defaults to True. + + Returns: + (tf.SparseTensor) The limited sparse tensor + """ + if isinstance(sparse_tf, twml.SparseTensor): + sparse_tf = sparse_tf.to_tf() + if not isinstance(sparse_tf, tf.SparseTensor): + raise TypeError( + "Input argument `sparse_tf` should either be of type" + f"twml.SparseTensor of tf.SparseTensor. Found type: {type(sparse_tf)}" + ) + if mask_indices: + indices = twml.limit_bits(sparse_tf.indices, input_size_bits) + else: + indices = sparse_tf.indices + dense_shape = tf.stack([sparse_tf.dense_shape[0], 1 << input_size_bits]) + return tf.SparseTensor( + indices=indices, values=sparse_tf.values, dense_shape=dense_shape + ) + + +def create_module_spec( + mlp_fn: Callable[ + [tf.estimator.ModeKeys, Dict[str, Any]], tf.estimator.EstimatorSpec + ], + mode: tf.estimator.ModeKeys, + params: Dict[str, Any], + drop_collections: Optional[List[str]] = None, ): - """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. - - Args: - base_path: - The base path. If `None`, returns `None`. - start_datetime: - A `datetime.datetime` or string representing the start of the range (inclusive). - If `None`, it returns `list_files(base_path, extension, sort)`. - end_datetime: - A `datetime.datetime` or string representing the end of the range (inclusive). - If `None`, assumed to be the same as start_datetime. 
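A sketch of calling `list_files_by_datetime` to gather one day of hourly LZO shards (the base path is hypothetical):

```python
from datetime import datetime

files = list_files_by_datetime(
    base_path="hdfs://default/datasets/my_dataset",
    start_datetime=datetime(2023, 1, 1, 0),
    end_datetime=datetime(2023, 1, 1, 23),
    datetime_prefix_format="%Y/%m/%d/%H",
    extension="lzo",
    parallelism=4,
    sort=True,
)
# Matches base_path/2023/01/01/00/*.lzo ... base_path/2023/01/01/23/*.lzo
```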
- datetime_prefix_format: - Format compatible with `datetime.datetime.strftime` - (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). - extension: - The extension of the files composing the dataset (e.g. 'lzo'). - parallelism: - The number of threads used to process list patterns (this is mostly useful - when dealing with filesystems such as HDFS in which listing files is a potentially expensive - operation). - hour_resolution: - The separation between consecutive hours. The default value is 1. - sort: - bool, whether to return a sorted list of files. Default False. - - Returns: - A list with all the matching files. - - Raises: - errors.OpError: If there are filesystem / directory listing errors. - """ - if hour_resolution is None: - hour_resolution = 1 - - if base_path is None: - return None - - if start_datetime is None: - return list_files(base_path, extension, sort) - - # Do this in case people want to use a single day for training. - if end_datetime is None: - end_datetime = start_datetime - - assert parallelism > 0 - assert start_datetime <= end_datetime - - if isinstance(start_datetime, str): - start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) - - if isinstance(end_datetime, str): - end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) - - assert isinstance(start_datetime, datetime) - assert isinstance(end_datetime, datetime) - - base_path = preprocess_path(base_path) - - def _handle_missing_globs(pattern): - try: - return tf.io.gfile.glob(pattern) - except tf.errors.NotFoundError as e: - tf.logging.warning(e.message) - return [] - - # a set is used because there might be some repeated globs depending on dt_prefix_format - globs = { - os.path.join(base_path, dt.strftime(datetime_prefix_format), '*.%s' % extension) - for dt in rrule.rrule( - freq=rrule.HOURLY, interval=hour_resolution, dtstart=start_datetime, until=end_datetime) - } - nested_files = Parallel(n_jobs=parallelism, backend='threading')( - delayed(_handle_missing_globs)(p) for p in globs - ) - flattened_files = list(itertools.chain.from_iterable(nested_files)) - - if not flattened_files: - error_msg = "Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}".format( - base_path=base_path, start_datetime=start_datetime, end_datetime=end_datetime + """ + Creates a standard tags_and_args which should be passed to the create_module_spec + spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args). + + Args: + module_fn: + a function to build a graph for the Module. + mode: + mode in which the Estimator is run + params: + parameters passed to the Estimator + """ + import tensorflow_hub as hub # noqa: F402 + + tags_and_args = [ + (set(), {"params": params, "mode": mode}), # serving graph + ({"train"}, {"params": params, "mode": mode}), # training graph + ] + spec = hub.create_module_spec( + mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections ) - raise OSError(error_msg) - - if sort: - flattened_files = sorted(flattened_files) - - return flattened_files - - -def limit_sparse_tensor_size(sparse_tf, input_size_bits, mask_indices=True): - """ - Returns a ``tf.SparseTensor`` which is the input SparseTensor - limited to the specified input_size_bits - - Args: - sparse_tf: - twml.SparseTensor or tf.SparseTensor - input_size_bits: - The number of bits allocated to the input size. - Input size will be power(2,input_size_bits). 
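A minimal `module_fn` compatible with `create_module_spec` above (the docstring calls it `module_fn`; the parameter itself is named `mlp_fn`): it must accept the `params` and `mode` kwargs injected via `tags_and_args` and register a signature with `hub.add_signature`. Shapes and names here are hypothetical:

```python
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

def mlp_fn(params, mode):
    inputs = tf.placeholder(tf.float32, shape=[None, params["input_dim"]])
    outputs = tf.layers.dense(inputs, params["output_dim"])
    hub.add_signature(inputs=inputs, outputs=outputs)

spec = create_module_spec(
    mlp_fn,
    mode=tf.estimator.ModeKeys.TRAIN,
    params={"input_dim": 8, "output_dim": 2},
)
module = hub.Module(spec, trainable=True, tags={"train"})
```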
- Note that twml.limit_bits truncates any feature keys that - exceed the input size. - mask_indices: - If mask indices is False; only the shape is changed. Defaults to True. - """ - if isinstance(sparse_tf, twml.SparseTensor): - sparse_tf = sparse_tf.to_tf() - if not isinstance(sparse_tf, tf.SparseTensor): - raise TypeError('Input argument `sparse_tf` should either be of type' - 'twml.SparseTensor of tf.SparseTensor. Found type: {}'. - format(type(sparse_tf))) - if mask_indices: - indices = twml.limit_bits(sparse_tf.indices, input_size_bits) - else: - indices = sparse_tf.indices - dense_shape = tf.stack([sparse_tf.dense_shape[0], 1 << input_size_bits]) - return tf.SparseTensor(indices=indices, values=sparse_tf.values, - dense_shape=dense_shape) - - -def create_module_spec(mlp_fn, mode, params, drop_collections=None): - """ - Creates a standard tags_and_args which should be passed to the create_module_spec - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args). - - Args: - module_fn: - a function to build a graph for the Module. - mode: - mode in which the Estimator is run - params: - parameters passed to the Estimator - """ - import tensorflow_hub as hub # noqa: F402 - tags_and_args = [(set(), {"params": params, "mode": mode}), # serving graph - ({"train"}, {"params": params, "mode": mode}) # training graph - ] - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections) - return spec - - -def change_name_scope_from_dir(init_scope_name, final_scope_name, save_dir): - """ - Changes the name of the saved scope to the desired name and saves it - to the same save_dir. - - Args: - init_scope_name: - initial scope name - final_scope_name: - desired (final) scope name - save_dir: - directory which the scopes are saved - - In the follwing section we: - - Read all the variables from the latest checkpoint. - - Make a copy of the variables with new name scope. - - Store both sets of variables into the latest checkpoint. - This essentially doubles up the size of the checkpoint. - But when a job is restarted after this part is done, the checkpoint size doubles again. - To avoid doing this, we create a copy in backup if a backup isn't found. - This allows us always read (from backup) and write same sized checkpoint files. 
- """ - - # Create a backup_checkpoints dir - backup_dir = os.path.join(save_dir, "change_name_scope_backups") - tf.io.gfile.makedirs(backup_dir) - - latest_checkpoint = tf.train.latest_checkpoint(save_dir) - - if latest_checkpoint is None: - raise OSError("No checkpoints found in save_dir: %s" % save_dir) - - latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir) - - if (latest_backup_checkpoint is None or - (os.path.basename(latest_checkpoint) != - os.path.basename(latest_backup_checkpoint))): - backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False) - - variables = tf.train.list_variables(backup_dir) - with tf.Graph().as_default(), tf.Session().as_default() as sess: - new_variables = [] - for name, _ in variables: - var = tf.train.load_variable(backup_dir, name) - # Append both the rename and the original variable - new_variables.append( - tf.Variable(var, name=name.replace(init_scope_name, final_scope_name))) - new_variables.append(tf.Variable(var, name=name)) - # Save this to the checkpoint in the save_dir - saver = tf.train.Saver(new_variables) - sess.run(tf.global_variables_initializer()) - saver.save(sess, latest_checkpoint) # pylint: disable=no-member - - -def hub_import(input, module, module_name, trainable=False): - """ - Loads exported hub module. - - Args: - input: - input to hub module - module: - module path - module_name: - signature of the exported hub module - """ - import tensorflow_hub as hub # noqa: F402 - hub_module = hub.Module(module, trainable=trainable) - output = hub_module(input, signature=module_name) - return output - - -def _extract_hash_space_bits(feature_config): - """ - Extract Sparse Shapes for contrib.FeatureConfig. - Arguments: - feature_config: - Feature Configuration of the type contrib.FeatureConfig - Returns: - Dictionary of tensor names and hash space bits. - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - fc_type = type(feature_config) - raise TypeError(f"Feature config must be of type contrib.FeatureConfig: {fc_type}") - sparse_shapes_dict = {} - for config in feature_config.sparse_extraction_configs: - sparse_shapes_dict[config.output_name] = config.hash_space_bits - return sparse_shapes_dict - - -def fix_shape_sparse(features, feature_config): - """ - Modifies the shape of features which are extracted using the hashing trick. - Features itself is changed by this function. - Arguments: - features: - Feature dictionary extracted by the feature config - feature_config: - Feature Configuration of the type contrib.FeatureConfig - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - raise TypeError(f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}") - sparse_shape = _extract_hash_space_bits(feature_config) - if not isinstance(features, dict): - raise TypeError(f"features must be of dictionary type, it is of {type(features)} type") - for key in set(features) & set(sparse_shape): - features[key] = limit_sparse_tensor_size(features[key], sparse_shape[key], mask_indices=False) - - -def touch_file_in_dir(directory, filename): - """ - Creates a file named filename in directory. 
- - Arguments: - filename: (str) - directory: (str) - """ - file_path = os.path.join(directory, filename) - with tf.io.gfile.GFile(file_path, "w") as f: - f.write("") + return spec + + +def change_name_scope_from_dir( + init_scope_name: str, + final_scope_name: str, + save_dir: str, +) -> None: + """ + Changes the name of the saved scope to the desired name and saves it + to the same save_dir. + + Args: + init_scope_name: + initial scope name + final_scope_name: + desired (final) scope name + save_dir: + directory in which the scopes are saved + + In the following section we: + - Read all the variables from the latest checkpoint. + - Make a copy of the variables with new name scope. + - Store both sets of variables into the latest checkpoint. + This essentially doubles up the size of the checkpoint. + But when a job is restarted after this part is done, the checkpoint size doubles again. + To avoid doing this, we create a copy in backup if a backup isn't found. + This allows us to always read (from backup) and write same-sized checkpoint files. + """ + + # Create a backup_checkpoints dir + backup_dir = os.path.join(save_dir, "change_name_scope_backups") + tf.io.gfile.makedirs(backup_dir) + + latest_checkpoint = tf.train.latest_checkpoint(save_dir) + + if latest_checkpoint is None: + raise OSError("No checkpoints found in save_dir: %s" % save_dir) + + latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir) + + if latest_backup_checkpoint is None or ( + os.path.basename(latest_checkpoint) + != os.path.basename(latest_backup_checkpoint) + ): + backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False) + + variables = tf.train.list_variables(backup_dir) + with tf.Graph().as_default(), tf.Session().as_default() as sess: + new_variables = [] + for name, _ in variables: + var = tf.train.load_variable(backup_dir, name) + # Append both the rename and the original variable + new_variables.append( + tf.Variable(var, name=name.replace(init_scope_name, final_scope_name)) + ) + new_variables.append(tf.Variable(var, name=name)) + # Save this to the checkpoint in the save_dir + saver = tf.train.Saver(new_variables) + sess.run(tf.global_variables_initializer()) + saver.save(sess, latest_checkpoint) # pylint: disable=no-member + + +def hub_import( + input: tf.Tensor, module: str, module_name: str, trainable: bool = False +) -> tf.Tensor: + """ + Loads exported hub module. + + Args: + input: + input to hub module + module: + module path + module_name: + signature of the exported hub module + + Returns: + output of the hub module + """ + import tensorflow_hub as hub # noqa: F402 + + hub_module = hub.Module(module, trainable=trainable) + output = hub_module(input, signature=module_name) + return output + + +def _extract_hash_space_bits( + feature_config: twml.contrib.feature_config.FeatureConfig, +) -> Dict[str, int]: + """ + Extract Sparse Shapes for contrib.FeatureConfig. + + Args: + feature_config: + Feature Configuration of the type contrib.FeatureConfig + + Returns: + Dictionary of tensor names and hash space bits.
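A sketch of applying a previously exported module with `hub_import` (the module path and signature name are hypothetical):

```python
import tensorflow.compat.v1 as tf

inputs = tf.placeholder(tf.float32, shape=[None, 8])
embedding = hub_import(
    inputs,
    module="hdfs://default/user/me/exports/my_encoder",
    module_name="my_encoder",
)
```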
+    """
+    if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
+        fc_type = type(feature_config)
+        raise TypeError(
+            f"Feature config must be of type contrib.FeatureConfig: {fc_type}"
+        )
+    sparse_shapes_dict = {}
+    for config in feature_config.sparse_extraction_configs:
+        sparse_shapes_dict[config.output_name] = config.hash_space_bits
+    return sparse_shapes_dict
+
+
+def fix_shape_sparse(
+    features: dict, feature_config: twml.contrib.feature_config.FeatureConfig
+) -> None:
+    """
+    Modifies the shape of features which are extracted using the hashing trick.
+    The features dictionary is modified in place.
+
+    Args:
+      features:
+        Feature dictionary extracted by the feature config
+      feature_config:
+        Feature Configuration of the type contrib.FeatureConfig
+    """
+    if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
+        raise TypeError(
+            f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}"
+        )
+    sparse_shape = _extract_hash_space_bits(feature_config)
+    if not isinstance(features, dict):
+        raise TypeError(f"features must be a dictionary, got {type(features)}")
+    for key in set(features) & set(sparse_shape):
+        features[key] = limit_sparse_tensor_size(
+            features[key], sparse_shape[key], mask_indices=False
+        )
+
+
+def touch_file_in_dir(directory: str, filename: str) -> None:
+    """
+    Creates a file named filename in directory.
+
+    Args:
+      directory: (str)
+        the directory in which to create the file
+      filename: (str)
+        the name of the file to create
+    """
+    file_path = os.path.join(directory, filename)
+    with tf.io.gfile.GFile(file_path, "w") as f:
+        f.write("")
 
 
 def file_exist_in_dir(directory: str, filename: str) -> bool:
-  file_path = os.path.join(directory, filename)
-  return tf.io.gfile.exists(file_path)
-
-
-def copy_to_local(remote, local, filename, overwrite=False):
-  """Function to file from remote directory to local directory."""
-  assert "hdfs://" not in local
-  tf.io.gfile.makedirs(local)
-  return tf.io.gfile.copy(
-    os.path.join(remote, filename),
-    os.path.join(local, filename),
-    overwrite=overwrite,
-  )
-
-
-def copy_recursive(src, dst, overwrite=False):
-  """
-  Function to copy a directory recursively.
-
-  Arguments:
-    src: Source directory.
-    dst: Destination directory.
-    overwrite: Specifies if files are to be overwritten if they exist.
-  """
-
-  src = src.rstrip("/")
-  dst = dst.rstrip("/")
-
-  for dirname, subdirs, files in tf.io.gfile.walk(src):
-    dst_dirname = dirname.replace(src, dst)
-    tf.io.gfile.makedirs(dst_dirname)
-
-    for f in files:
-      src_f = os.path.join(dirname, f)
-      dst_f = os.path.join(dst_dirname, f)
-
-      tf.logging.info(f"Copying {src_f} to {dst_f}")
-      tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite)
-
-
-def delete_file_or_dir(path):
-  """
-  Delete the file or directory given by `path`
-  Arguments:
-    path:
-      string indicating path of file or directory to remove
-  """
-  if tf.io.gfile.isdir(path):
-    tf.io.gfile.rmtree(path)
-  else:
-    tf.io.gfile.remove(path)
-
-
-def get_distributed_training_job_path():
-  """
-  Function to get distributed training job path.
-  Note: distributed training has three jobs, one parameter server job,
-  one worker job and one evaluator job. All of these three jobs' name
-  share a common base job name.
-  """
-  job_path = AuroraPath(dc=os.environ.get("TWML_JOB_CLUSTER"),
-                        role=os.environ.get("TWML_JOB_ROLE"),
-                        env=os.environ.get("TWML_JOB_ENV"),
-                        job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME"))
-  return job_path
-
-def do_every_n_steps(action, num_steps):
-  """
-  Execute a sequence of TensorFlow operations only once in a while.
-  Specifically, `action` is performed if `global_step` is a
-  multiple of `num_steps`
-
-  Args:
-    action: callable to be performed at regular intervals. This callable
-      must return a TF op with no output tensors.
-    num_steps: period of performing the action, as measured
-      in number of training steps
-
-  Returns:
-    A TensorFlow op with no output tensors, like a tf.print() or tf.no_op().
-    You must use tf.control_dependencies() to execute the op.
-
-  """
-  global_step = tf.train.get_or_create_global_step()
-  condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0)
-  return tf.cond(condition, action, lambda: tf.no_op())
+    """
+    Checks if a file exists in a directory.
+
+    Args:
+      directory: (str)
+        the directory where the file is located
+      filename: (str)
+        the name of the file
+
+    Returns:
+      bool: True if the file exists, False otherwise
+    """
+    file_path = os.path.join(directory, filename)
+    return tf.io.gfile.exists(file_path)
+
+
+def copy_to_local(
+    remote: str,
+    local: str,
+    filename: str,
+    overwrite: bool = False,
+) -> None:
+    """
+    Copies a file from a remote directory to a local directory.
+
+    Args:
+      remote (str): Remote directory.
+      local (str): Local directory.
+      filename (str): Name of the file to be copied.
+      overwrite (bool): Specifies if files are to be overwritten if they exist.
+    """
+    assert "hdfs://" not in local
+    tf.io.gfile.makedirs(local)
+    tf.io.gfile.copy(
+        os.path.join(remote, filename),
+        os.path.join(local, filename),
+        overwrite=overwrite,
+    )
+
+
+def copy_recursive(
+    src: str,
+    dst: str,
+    overwrite: bool = False,
+) -> None:
+    """
+    Function to copy a directory recursively.
+
+    Args:
+      src (str): Source directory.
+      dst (str): Destination directory.
+      overwrite (bool): Specifies if files are to be overwritten if they exist.
+    """
+
+    src = src.rstrip("/")
+    dst = dst.rstrip("/")
+
+    for dirname, subdirs, files in tf.io.gfile.walk(src):
+        dst_dirname = dirname.replace(src, dst)
+        tf.io.gfile.makedirs(dst_dirname)
+
+        for f in files:
+            src_f = os.path.join(dirname, f)
+            dst_f = os.path.join(dst_dirname, f)
+
+            tf.logging.info(f"Copying {src_f} to {dst_f}")
+            tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite)
+
+
+def delete_file_or_dir(path: str) -> None:
+    """
+    Delete the file or directory given by `path`.
+
+    Args:
+      path (str):
+        string indicating path of file or directory to remove
+    """
+    if tf.io.gfile.isdir(path):
+        tf.io.gfile.rmtree(path)
+    else:
+        tf.io.gfile.remove(path)
+
+
+def get_distributed_training_job_path() -> AuroraPath:
+    """
+    Function to get distributed training job path.
+    Note: distributed training has three jobs: one parameter server job,
+    one worker job, and one evaluator job. All three jobs share a common
+    base job name.
+
+    Returns:
+      AuroraPath: The distributed training job path.
+    """
+    job_path = AuroraPath(
+        dc=os.environ.get("TWML_JOB_CLUSTER"),
+        role=os.environ.get("TWML_JOB_ROLE"),
+        env=os.environ.get("TWML_JOB_ENV"),
+        job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME"),
+    )
+    return job_path
+
+
+def do_every_n_steps(
+    action: Callable[[], tf.Operation],
+    num_steps: int,
+) -> tf.Operation:
+    """
+    Execute a sequence of TensorFlow operations only once in a while.
+    Specifically, `action` is performed if `global_step` is a multiple of `num_steps`.
+
+    Args:
+      action: callable to be performed at regular intervals. This callable
+        must return a TF op with no output tensors.
+      num_steps: period of performing the action, as measured
+        in number of training steps
+
+    Returns:
+      A TensorFlow op with no output tensors, like a tf.print() or tf.no_op().
+      You must use tf.control_dependencies() to execute the op.
+    """
+
+    global_step = tf.train.get_or_create_global_step()
+    condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0)
+    return tf.cond(condition, action, lambda: tf.no_op())
diff --git a/twml/twml_common/initializer.py b/twml/twml_common/initializer.py
index 7a9c734c7..31c3c1407 100644
--- a/twml/twml_common/initializer.py
+++ b/twml/twml_common/initializer.py
@@ -1,14 +1,18 @@
+import numpy as np
 import tensorflow.compat.v1 as tf
 
 
 class PartitionInitializer(tf.keras.initializers.Initializer):
-  """Required to initialize partitioned weight with numpy array for tests"""
+    """Required to initialize partitioned weight with numpy array for tests"""
 
-  def __init__(self, np_array):
-    self.np_array = np_array
+    def __init__(self, np_array: np.ndarray):
+        self.np_array = np_array
 
-  def __call__(self, shape, dtype=None, partition_info=None):
-    offset = partition_info.var_offset
-    ix0, ix1 = offset[0], offset[0] + shape[0]
-    iy0, iy1 = offset[1], offset[1] + shape[1]
-    return self.np_array[ix0:ix1, iy0:iy1]
+    def __call__(self, shape, dtype=None, partition_info=None) -> np.ndarray:
+        """Returns the slice of the numpy array for this partition, cast to dtype."""
+        offset = partition_info.var_offset
+        ix0, ix1 = offset[0], offset[0] + shape[0]
+        iy0, iy1 = offset[1], offset[1] + shape[1]
+        if dtype is not None:
+            # dtype may be a tf.DType; convert it to the matching numpy dtype first.
+            np_dtype = tf.as_dtype(dtype).as_numpy_dtype
+            return self.np_array[ix0:ix1, iy0:iy1].astype(np_dtype)
+        return self.np_array[ix0:ix1, iy0:iy1]
diff --git a/twml/twml_common/serialize.py b/twml/twml_common/serialize.py
index 36c53881e..e7210bc0e 100644
--- a/twml/twml_common/serialize.py
+++ b/twml/twml_common/serialize.py
@@ -2,15 +2,36 @@ from thrift.transport import TTransport
 
 
-def serialize(obj):
-  tbuf = TTransport.TMemoryBuffer()
-  iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
-  obj.write(iproto)
-  return tbuf.getvalue()
-
-
-def deserialize(record, bytes):
-  tbuf = TTransport.TMemoryBuffer(bytes)
-  iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
-  record.read(iproto)
-  return record
+def serialize(obj) -> bytes:
+    """
+    Serialize a thrift object into a byte string.
+
+    Args:
+      obj: the thrift object to serialize
+
+    Returns:
+      The serialized object as a byte string.
+    """
+    tbuf = TTransport.TMemoryBuffer()
+    iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
+    obj.write(iproto)
+    return tbuf.getvalue()
+
+
+def deserialize(record, bytes: bytes):
+    """
+    Deserialize a thrift object from a byte string.
+
+    Args:
+      record: the thrift object to deserialize into
+      bytes: the byte string to deserialize from
+
+    Returns:
+      The deserialized thrift object (the same `record` instance).
+    """
+    tbuf = TTransport.TMemoryBuffer(bytes)
+    iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
+    record.read(iproto)
+    return record
diff --git a/twml/twml_common/sparse_inputs.py b/twml/twml_common/sparse_inputs.py
index b8f7939e5..fc2d61e2b 100644
--- a/twml/twml_common/sparse_inputs.py
+++ b/twml/twml_common/sparse_inputs.py
@@ -2,23 +2,59 @@ import tensorflow.compat.v1 as tf
 
 
-def create_sparse_tensor(batch_size, input_size, num_values, dtype=tf.float32):
-  random_indices = np.sort(np.random.randint(batch_size * input_size, size=num_values))
-  test_indices_i = random_indices // input_size
-  test_indices_j = random_indices % input_size
-  test_indices = np.stack([test_indices_i, test_indices_j], axis=1)
-  test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype)
-
-  return tf.SparseTensor(indices=tf.constant(test_indices),
-                         values=tf.constant(test_values),
-                         dense_shape=(batch_size, input_size))
-
-
-def create_reference_input(sparse_input, use_binary_values):
-  if use_binary_values:
-    sp_a = tf.SparseTensor(indices=sparse_input.indices,
-                           values=tf.ones_like(sparse_input.values),
-                           dense_shape=sparse_input.dense_shape)
-  else:
-    sp_a = sparse_input
-  return sp_a
+def create_sparse_tensor(
+    batch_size: int,
+    input_size: int,
+    num_values: int,
+    dtype: tf.DType = tf.float32,
+) -> tf.SparseTensor:
+    """
+    Creates a sparse tensor with `num_values` random values and a dense shape
+    of (`batch_size`, `input_size`).
+
+    Args:
+      batch_size (int): The batch size of the sparse tensor.
+      input_size (int): The input size of the sparse tensor.
+      num_values (int): The number of values in the sparse tensor.
+      dtype (tf.DType): The dtype of the sparse tensor.
+
+    Returns:
+      A sparse tensor with the given batch size, input size, and number of values.
+    """
+    random_indices = np.sort(
+        np.random.randint(batch_size * input_size, size=num_values)
+    )
+    test_indices_i = random_indices // input_size
+    test_indices_j = random_indices % input_size
+    test_indices = np.stack([test_indices_i, test_indices_j], axis=1)
+    test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype)
+
+    return tf.SparseTensor(
+        indices=tf.constant(test_indices),
+        values=tf.constant(test_values),
+        dense_shape=(batch_size, input_size),
+    )
+
+
+def create_reference_input(
+    sparse_input: tf.SparseTensor, use_binary_values: bool
+) -> tf.SparseTensor:
+    """
+    Returns the sparse input unchanged, or a copy whose values are all ones
+    when `use_binary_values` is True.
+
+    Args:
+      sparse_input (tf.SparseTensor): The sparse input.
+      use_binary_values (bool): Whether to use binary values.
+
+    Returns:
+      A reference input for the sparse input.
+    """
+
+    if use_binary_values:
+        sp_a = tf.SparseTensor(
+            indices=sparse_input.indices,
+            values=tf.ones_like(sparse_input.values),
+            dense_shape=sparse_input.dense_shape,
+        )
+    else:
+        sp_a = sparse_input
+    return sp_a
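
Editor's note (not part of the diff): the sketches below show how two of the helpers touched above are intended to be used. They are illustrative only; the import paths (twml.util, twml.twml_common.sparse_inputs), the sizes, and the variable names are assumptions, and both snippets assume TF1 graph mode via tensorflow.compat.v1.

# --- Sketch 1: wiring do_every_n_steps into a train op via tf.control_dependencies ---
import tensorflow.compat.v1 as tf

from twml.util import do_every_n_steps  # assumed module path

tf.disable_eager_execution()

x = tf.placeholder(tf.float32, shape=[None, 4])
w = tf.get_variable("w", shape=[4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

global_step = tf.train.get_or_create_global_step()
minimize = tf.train.GradientDescentOptimizer(0.1).minimize(loss, global_step=global_step)

# Print a heartbeat every 100 steps. Per the docstring, the returned op only
# runs when it is attached through tf.control_dependencies().
log_op = do_every_n_steps(lambda: tf.print("step:", global_step), num_steps=100)
with tf.control_dependencies([minimize, log_op]):
    train_op = tf.no_op(name="train_op")

# --- Sketch 2: composing the sparse-input test helpers ---
from twml.twml_common.sparse_inputs import create_reference_input, create_sparse_tensor  # assumed module path

sp = create_sparse_tensor(batch_size=4, input_size=16, num_values=8)
sp_binary = create_reference_input(sp, use_binary_values=True)  # same sparsity pattern, values all ones

with tf.Session() as sess:
    # validate_indices=False because the randomly drawn indices may contain duplicates.
    dense = sess.run(tf.sparse.to_dense(sp_binary, validate_indices=False))
    print(dense.shape)  # (4, 16)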