diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..0dd6f0561
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,6 @@
+[*.py]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 5ca0973f8..000000000
--- a/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.DS_Store
-
diff --git a/ann/src/main/python/dataflow/faiss_index_bq_dataset.py b/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
index dd45070db..b9eca2fc3 100644
--- a/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
+++ b/ann/src/main/python/dataflow/faiss_index_bq_dataset.py
@@ -3,230 +3,246 @@
 import os
 import pkgutil
 import sys
+from typing import Dict, List, Optional
 from urllib.parse import urlsplit
 
 import apache_beam as beam
-from apache_beam.options.pipeline_options import PipelineOptions
 import faiss
+from apache_beam.options.pipeline_options import PipelineOptions
 
 
-def parse_d6w_config(argv=None):
-  """Parse d6w config.
-  :param argv: d6w config
-  :return: dictionary containing d6w config
-  """
-
-  parser = argparse.ArgumentParser(
-    description="See https://docbird.twitter.biz/d6w/model.html for any parameters inherited from d6w job config"
-  )
-  parser.add_argument("--job_name", dest="job_name", required=True, help="d6w attribute")
-  parser.add_argument("--project", dest="project", required=True, help="d6w attribute")
-  parser.add_argument(
-    "--staging_location", dest="staging_location", required=True, help="d6w attribute"
-  )
-  parser.add_argument("--temp_location", dest="temp_location", required=True, help="d6w attribute")
-  parser.add_argument(
-    "--output_location",
-    dest="output_location",
-    required=True,
-    help="GCS bucket and path where resulting artifacts are uploaded",
-  )
-  parser.add_argument(
-    "--service_account_email", dest="service_account_email", required=True, help="d6w attribute"
-  )
-  parser.add_argument(
-    "--factory_string",
-    dest="factory_string",
-    required=False,
-    help="FAISS factory string describing index to build. See https://github.com/facebookresearch/faiss/wiki/The-index-factory",
-  )
-  parser.add_argument(
-    "--metric",
-    dest="metric",
-    required=True,
-    help="Metric used to compute distance between embeddings. Valid values are 'l2', 'ip', 'l1', 'linf'",
-  )
-  parser.add_argument(
-    "--use_gpu",
-    dest="gpu",
-    required=True,
-    help="--use_gpu=yes if you want to use GPU during index building",
-  )
-
-  known_args, unknown_args = parser.parse_known_args(argv)
-  d6w_config = vars(known_args)
-  d6w_config["gpu"] = d6w_config["gpu"].lower() == "yes"
-  d6w_config["metric"] = parse_metric(d6w_config)
-
-  """
-  WARNING: Currently, d6w (a Twitter tool used to deploy Dataflow jobs to GCP) and
-  PipelineOptions.for_dataflow_runner (a helper method in twitter.ml.common.apache_beam) do not
-  play nicely together. The helper method will overwrite some of the config specified in the d6w
-  file using the defaults in https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24.'
-  However, the d6w output message will still report that the config specified in the d6w file was used.
- """ - logging.warning( - f"The following d6w config parameters will be overwritten by the defaults in " - f"https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24\n" - f"{str(unknown_args)}" - ) - return d6w_config - - -def get_bq_query(): - """ - Query is expected to return rows with unique entityId - """ - return pkgutil.get_data(__name__, "bq.sql").decode("utf-8") - - -def parse_metric(config): - metric_str = config["metric"].lower() - if metric_str == "l2": - return faiss.METRIC_L2 - elif metric_str == "ip": - return faiss.METRIC_INNER_PRODUCT - elif metric_str == "l1": - return faiss.METRIC_L1 - elif metric_str == "linf": - return faiss.METRIC_Linf - else: - raise Exception(f"Unknown metric: {metric_str}") +def parse_d6w_config(argv: Optional[List[str]] = None): + """Parse d6w config. + :param argv: d6w config + :return: dictionary containing d6w config + """ - -def run_pipeline(argv=[]): - config = parse_d6w_config(argv) - argv_with_extras = argv - if config["gpu"]: - argv_with_extras.extend(["--experiments", "use_runner_v2"]) - argv_with_extras.extend( - ["--experiments", "worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver"] + parser = argparse.ArgumentParser( + description="See https://docbird.twitter.biz/d6w/model.html for any parameters inherited from d6w job config" ) - argv_with_extras.extend( - [ - "--worker_harness_container_image", - "gcr.io/twttr-recos-ml-prod/dataflow-gpu/beam2_39_0_py3_7", - ] + parser.add_argument( + "--job_name", dest="job_name", required=True, help="d6w attribute" ) - - options = PipelineOptions(argv_with_extras) - output_bucket_name = urlsplit(config["output_location"]).netloc - - with beam.Pipeline(options=options) as p: - input_data = p | "Read from BigQuery" >> beam.io.ReadFromBigQuery( - method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, - query=get_bq_query(), - use_standard_sql=True, + parser.add_argument( + "--project", dest="project", required=True, help="d6w attribute" + ) + parser.add_argument( + "--staging_location", + dest="staging_location", + required=True, + help="d6w attribute", + ) + parser.add_argument( + "--temp_location", dest="temp_location", required=True, help="d6w attribute" + ) + parser.add_argument( + "--output_location", + dest="output_location", + required=True, + help="GCS bucket and path where resulting artifacts are uploaded", + ) + parser.add_argument( + "--service_account_email", + dest="service_account_email", + required=True, + help="d6w attribute", + ) + parser.add_argument( + "--factory_string", + dest="factory_string", + required=False, + help="FAISS factory string describing index to build. See https://github.com/facebookresearch/faiss/wiki/The-index-factory", + ) + parser.add_argument( + "--metric", + dest="metric", + required=True, + help="Metric used to compute distance between embeddings. 
Valid values are 'l2', 'ip', 'l1', 'linf'", + ) + parser.add_argument( + "--use_gpu", + dest="gpu", + required=True, + help="--use_gpu=yes if you want to use GPU during index building", ) - index_built = input_data | "Build and upload index" >> beam.CombineGlobally( - MergeAndBuildIndex( - output_bucket_name, - config["output_location"], - config["factory_string"], - config["metric"], - config["gpu"], - ) + known_args, unknown_args = parser.parse_known_args(argv) + d6w_config = vars(known_args) + d6w_config["gpu"] = d6w_config["gpu"].lower() == "yes" + d6w_config["metric"] = parse_metric(d6w_config) + + """ + WARNING: Currently, d6w (a Twitter tool used to deploy Dataflow jobs to GCP) and + PipelineOptions.for_dataflow_runner (a helper method in twitter.ml.common.apache_beam) do not + play nicely together. The helper method will overwrite some of the config specified in the d6w + file using the defaults in https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24.' + However, the d6w output message will still report that the config specified in the d6w file was used. + """ + logging.warning( + f"The following d6w config parameters will be overwritten by the defaults in " + f"https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/python/twitter/ml/common/apache_beam/__init__.py?L24\n" + f"{str(unknown_args)}" ) + return d6w_config + + +def get_bq_query(): + """ + Query is expected to return rows with unique entityId + """ + return pkgutil.get_data(__name__, "bq.sql").decode("utf-8") + + +def parse_metric(config: Dict[str, str]): + metric_str = config["metric"].lower() + if metric_str == "l2": + return faiss.METRIC_L2 + elif metric_str == "ip": + return faiss.METRIC_INNER_PRODUCT + elif metric_str == "l1": + return faiss.METRIC_L1 + elif metric_str == "linf": + return faiss.METRIC_Linf + raise Exception(f"Unknown metric: {metric_str}") + - # Make linter happy - index_built +def run_pipeline(argv: List[str] = []): + config = parse_d6w_config(argv) + argv_with_extras = argv + if config["gpu"]: + argv_with_extras.extend(["--experiments", "use_runner_v2"]) + argv_with_extras.extend( + [ + "--experiments", + "worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver", + ] + ) + argv_with_extras.extend( + [ + "--worker_harness_container_image", + "gcr.io/twttr-recos-ml-prod/dataflow-gpu/beam2_39_0_py3_7", + ] + ) + + options = PipelineOptions(argv_with_extras) + output_bucket_name = urlsplit(config["output_location"]).netloc + + with beam.Pipeline(options=options) as p: + input_data = p | "Read from BigQuery" >> beam.io.ReadFromBigQuery( + method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, + query=get_bq_query(), + use_standard_sql=True, + ) + + index_built = input_data | "Build and upload index" >> beam.CombineGlobally( + MergeAndBuildIndex( + output_bucket_name, + config["output_location"], + config["factory_string"], + config["metric"], + config["gpu"], + ) + ) # pylint: disable=unused-variable class MergeAndBuildIndex(beam.CombineFn): - def __init__(self, bucket_name, gcs_output_path, factory_string, metric, gpu): - self.bucket_name = bucket_name - self.gcs_output_path = gcs_output_path - self.factory_string = factory_string - self.metric = metric - self.gpu = gpu - - def create_accumulator(self): - return [] - - def add_input(self, accumulator, element): - accumulator.append(element) - return accumulator - - def merge_accumulators(self, accumulators): - merged = [] - for accum in accumulators: - 
merged.extend(accum) - return merged - - def extract_output(self, rows): - # Reimports are needed on workers - import glob - import subprocess - - import faiss - from google.cloud import storage - import numpy as np - - client = storage.Client() - bucket = client.get_bucket(self.bucket_name) - - logging.info("Building FAISS index") - logging.info(f"There are {len(rows)} rows") - - ids = np.array([x["entityId"] for x in rows]).astype("long") - embeds = np.array([x["embedding"] for x in rows]).astype("float32") - dimensions = len(embeds[0]) - N = ids.shape[0] - logging.info(f"There are {dimensions} dimensions") - - if self.factory_string is None: - M = 48 - - divideable_dimensions = (dimensions // M) * M - if divideable_dimensions != dimensions: - opq_prefix = f"OPQ{M}_{divideable_dimensions}" - else: - opq_prefix = f"OPQ{M}" - - clusters = N // 20 - self.factory_string = f"{opq_prefix},IVF{clusters},PQ{M}" - - logging.info(f"Factory string is {self.factory_string}, metric={self.metric}") - - if self.gpu: - logging.info("Using GPU") - - res = faiss.StandardGpuResources() - cpu_index = faiss.index_factory(dimensions, self.factory_string, self.metric) - cpu_index = faiss.IndexIDMap(cpu_index) - gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index) - gpu_index.train(embeds) - gpu_index.add_with_ids(embeds, ids) - cpu_index = faiss.index_gpu_to_cpu(gpu_index) - else: - logging.info("Using CPU") - - cpu_index = faiss.index_factory(dimensions, self.factory_string, self.metric) - cpu_index = faiss.IndexIDMap(cpu_index) - cpu_index.train(embeds) - cpu_index.add_with_ids(embeds, ids) - - logging.info("Built faiss index") - - local_path = "/indices" - logging.info(f"Writing indices to local {local_path}") - subprocess.run(f"mkdir -p {local_path}".strip().split()) - local_index_path = os.path.join(local_path, "result.index") - - faiss.write_index(cpu_index, local_index_path) - logging.info(f"Done writing indices to local {local_path}") - - logging.info(f"Uploading to GCS with path {self.gcs_output_path}") - assert os.path.isdir(local_path) - for local_file in glob.glob(local_path + "/*"): - remote_path = os.path.join( - self.gcs_output_path.split("/")[-1], local_file[1 + len(local_path) :] - ) - blob = bucket.blob(remote_path) - blob.upload_from_filename(local_file) + def __init__(self, bucket_name, gcs_output_path, factory_string, metric, gpu): + self.bucket_name = bucket_name + self.gcs_output_path = gcs_output_path + self.factory_string = factory_string + self.metric = metric + self.gpu = gpu + + def create_accumulator(self): + return [] + + def add_input(self, accumulator: List, element) -> List: + accumulator.append(element) + return accumulator + + def merge_accumulators(self, accumulators): + merged = [] + for accum in accumulators: + merged.extend(accum) + return merged + + def extract_output(self, rows): + # Reimports are needed on workers + import glob + import subprocess + + import faiss + import numpy as np + from google.cloud import storage + + client = storage.Client() + bucket = client.get_bucket(self.bucket_name) + + logging.info("Building FAISS index") + logging.info(f"There are {len(rows)} rows") + + ids = np.array([x["entityId"] for x in rows]).astype("long") + embeds = np.array([x["embedding"] for x in rows]).astype("float32") + dimensions = len(embeds[0]) + N = ids.shape[0] + logging.info(f"There are {dimensions} dimensions") + + if self.factory_string is None: + M = 48 + + divideable_dimensions = (dimensions // M) * M + if divideable_dimensions != dimensions: + opq_prefix = 
f"OPQ{M}_{divideable_dimensions}" + else: + opq_prefix = f"OPQ{M}" + + clusters = N // 20 + self.factory_string = f"{opq_prefix},IVF{clusters},PQ{M}" + + logging.info(f"Factory string is {self.factory_string}, metric={self.metric}") + + if self.gpu: + logging.info("Using GPU") + + res = faiss.StandardGpuResources() + cpu_index = faiss.index_factory( + dimensions, self.factory_string, self.metric + ) + cpu_index = faiss.IndexIDMap(cpu_index) + gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index) + gpu_index.train(embeds) + gpu_index.add_with_ids(embeds, ids) + cpu_index = faiss.index_gpu_to_cpu(gpu_index) + else: + logging.info("Using CPU") + + cpu_index = faiss.index_factory( + dimensions, self.factory_string, self.metric + ) + cpu_index = faiss.IndexIDMap(cpu_index) + cpu_index.train(embeds) + cpu_index.add_with_ids(embeds, ids) + + logging.info("Built faiss index") + + local_path = "/indices" + logging.info(f"Writing indices to local {local_path}") + subprocess.run(f"mkdir -p {local_path}".strip().split()) + local_index_path = os.path.join(local_path, "result.index") + + faiss.write_index(cpu_index, local_index_path) + logging.info(f"Done writing indices to local {local_path}") + + logging.info(f"Uploading to GCS with path {self.gcs_output_path}") + assert os.path.isdir(local_path) + for local_file in glob.glob(local_path + "/*"): + remote_path = os.path.join( + self.gcs_output_path.split("/")[-1], local_file[1 + len(local_path) :] + ) + blob = bucket.blob(remote_path) + blob.upload_from_filename(local_file) if __name__ == "__main__": - logging.getLogger().setLevel(logging.INFO) - run_pipeline(sys.argv) + logging.getLogger().setLevel(logging.INFO) + run_pipeline(sys.argv) diff --git a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py index 167756c01..7bc382b54 100644 --- a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py +++ b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py @@ -2,82 +2,82 @@ from twml.feature_config import FeatureConfigBuilder -def get_feature_config(data_spec_path, label): - return ( - FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) - .batch_add_features( - [ - ("ebd.author_specific_score", "A"), - ("ebd.has_diff_lang", "A"), - ("ebd.has_english_tweet_diff_ui_lang", "A"), - ("ebd.has_english_ui_diff_tweet_lang", "A"), - ("ebd.is_self_tweet", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.from_verified_account_flag", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - # ("encoded_tweet_features.has_consumer_video_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_link_flag", "A"), - ("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"), - # ("encoded_tweet_features.has_multiple_media_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", "A"), - ("encoded_tweet_features.has_trend_flag", "A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_offensive_flag", 
"A"), - ("encoded_tweet_features.is_reply_flag", "A"), - ("encoded_tweet_features.is_retweet_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - # ("encoded_tweet_features.is_user_new_flag", "A"), - ("encoded_tweet_features.language", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - # ("encoded_tweet_features.profile_is_egg_flag", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.embeds_impression_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - ("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - ("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ] +def get_feature_config(data_spec_path: str, label: str) -> FeatureConfigBuilder: + return ( + FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) + .batch_add_features( + [ + ("ebd.author_specific_score", "A"), + ("ebd.has_diff_lang", "A"), + ("ebd.has_english_tweet_diff_ui_lang", "A"), + ("ebd.has_english_ui_diff_tweet_lang", "A"), + ("ebd.is_self_tweet", "A"), + ("ebd.tweet_age_in_secs", "A"), + ("encoded_tweet_features.favorite_count", "A"), + ("encoded_tweet_features.from_verified_account_flag", "A"), + ("encoded_tweet_features.has_card_flag", "A"), + # ("encoded_tweet_features.has_consumer_video_flag", "A"), + ("encoded_tweet_features.has_image_url_flag", "A"), + ("encoded_tweet_features.has_link_flag", "A"), + ("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"), + # ("encoded_tweet_features.has_multiple_media_flag", "A"), + ("encoded_tweet_features.has_native_image_flag", "A"), + ("encoded_tweet_features.has_news_url_flag", "A"), + ("encoded_tweet_features.has_periscope_flag", "A"), + ("encoded_tweet_features.has_pro_video_flag", "A"), + ("encoded_tweet_features.has_quote_flag", "A"), + ("encoded_tweet_features.has_trend_flag", "A"), + ("encoded_tweet_features.has_video_url_flag", "A"), + ("encoded_tweet_features.has_vine_flag", "A"), + ("encoded_tweet_features.has_visible_link_flag", "A"), + ("encoded_tweet_features.is_offensive_flag", "A"), + ("encoded_tweet_features.is_reply_flag", "A"), + ("encoded_tweet_features.is_retweet_flag", "A"), + ("encoded_tweet_features.is_sensitive_content", "A"), + # ("encoded_tweet_features.is_user_new_flag", "A"), + ("encoded_tweet_features.language", "A"), + ("encoded_tweet_features.link_language", "A"), + 
("encoded_tweet_features.num_hashtags", "A"), + ("encoded_tweet_features.num_mentions", "A"), + # ("encoded_tweet_features.profile_is_egg_flag", "A"), + ("encoded_tweet_features.reply_count", "A"), + ("encoded_tweet_features.retweet_count", "A"), + ("encoded_tweet_features.text_score", "A"), + ("encoded_tweet_features.user_reputation", "A"), + ("extended_encoded_tweet_features.embeds_impression_count", "A"), + ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), + ("extended_encoded_tweet_features.embeds_url_count", "A"), + ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), + ("extended_encoded_tweet_features.favorite_count_v2", "A"), + ("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_dup_content_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_spam_flag", "A"), + ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.quote_count", "A"), + ("extended_encoded_tweet_features.reply_count_v2", "A"), + ("extended_encoded_tweet_features.retweet_count_v2", "A"), + ("extended_encoded_tweet_features.weighted_favorite_count", "A"), + ("extended_encoded_tweet_features.weighted_quote_count", "A"), + ("extended_encoded_tweet_features.weighted_reply_count", "A"), + ("extended_encoded_tweet_features.weighted_retweet_count", "A"), + ] + ) + .add_labels( + [ + label, # Tensor index: 0 + "recap.engagement.is_clicked", # Tensor index: 1 + "recap.engagement.is_favorited", # Tensor index: 2 + "recap.engagement.is_open_linked", # Tensor index: 3 + "recap.engagement.is_photo_expanded", # Tensor index: 4 + "recap.engagement.is_profile_clicked", # Tensor index: 5 + "recap.engagement.is_replied", # Tensor index: 6 + "recap.engagement.is_retweeted", # Tensor index: 7 + "recap.engagement.is_video_playback_50", # Tensor index: 8 + "timelines.earlybird_score", # Tensor index: 9 + ] + ) + .define_weight("meta.record_weight/type=earlybird") + .build() ) - .add_labels( - [ - label, # Tensor index: 0 - "recap.engagement.is_clicked", # Tensor index: 1 - "recap.engagement.is_favorited", # Tensor index: 2 - "recap.engagement.is_open_linked", # Tensor index: 3 - "recap.engagement.is_photo_expanded", # Tensor index: 4 - "recap.engagement.is_profile_clicked", # Tensor index: 5 - "recap.engagement.is_replied", # Tensor index: 6 - "recap.engagement.is_retweeted", # Tensor index: 7 - "recap.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ] - ) - .define_weight("meta.record_weight/type=earlybird") - .build() - ) diff --git a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py index 85b7d7f10..faec156c6 100644 --- a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py +++ b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py @@ -2,73 +2,78 @@ from twml.feature_config import FeatureConfigBuilder -def get_feature_config(data_spec_path, label): - return FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) \ - .batch_add_features( - [ - ("ebd.has_diff_lang", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.composer_source_is_camera_flag", "A"), - 
("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", "A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - ("encoded_tweet_features.is_user_spam_flag", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.decayed_favorite_count", "A"), - ("extended_encoded_tweet_features.decayed_quote_count", "A"), - ("extended_encoded_tweet_features.decayed_reply_count", "A"), - ("extended_encoded_tweet_features.decayed_retweet_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.fake_favorite_count", "A"), - ("extended_encoded_tweet_features.fake_quote_count", "A"), - ("extended_encoded_tweet_features.fake_reply_count", "A"), - ("extended_encoded_tweet_features.fake_retweet_count", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.periscope_exists", "A"), - ("extended_encoded_tweet_features.periscope_has_been_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_currently_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_from_quality_source", "A"), - ("extended_encoded_tweet_features.periscope_is_live", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - ("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - ("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ("timelines.earlybird.visible_token_ratio", "A") - ] - ).add_labels([ - label, # Tensor index: 0 - "itl.engagement.is_clicked", # Tensor index: 1 - "itl.engagement.is_favorited", # Tensor index: 2 - "itl.engagement.is_open_linked", # Tensor index: 3 - "itl.engagement.is_photo_expanded", # Tensor index: 4 - "itl.engagement.is_profile_clicked", # Tensor index: 5 - "itl.engagement.is_replied", # Tensor index: 6 - "itl.engagement.is_retweeted", # Tensor index: 7 - "itl.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ]) \ - .define_weight("meta.record_weight/type=earlybird") \ - .build() +def get_feature_config(data_spec_path: str, label: str) -> FeatureConfigBuilder: + return ( + 
FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) + .batch_add_features( + [ + ("ebd.has_diff_lang", "A"), + ("ebd.tweet_age_in_secs", "A"), + ("encoded_tweet_features.composer_source_is_camera_flag", "A"), + ("encoded_tweet_features.favorite_count", "A"), + ("encoded_tweet_features.has_card_flag", "A"), + ("encoded_tweet_features.has_image_url_flag", "A"), + ("encoded_tweet_features.has_native_image_flag", "A"), + ("encoded_tweet_features.has_news_url_flag", "A"), + ("encoded_tweet_features.has_periscope_flag", "A"), + ("encoded_tweet_features.has_pro_video_flag", "A"), + ("encoded_tweet_features.has_quote_flag", "A"), + ("encoded_tweet_features.has_video_url_flag", "A"), + ("encoded_tweet_features.has_vine_flag", "A"), + ("encoded_tweet_features.has_visible_link_flag", "A"), + ("encoded_tweet_features.is_sensitive_content", "A"), + ("encoded_tweet_features.is_user_spam_flag", "A"), + ("encoded_tweet_features.link_language", "A"), + ("encoded_tweet_features.num_hashtags", "A"), + ("encoded_tweet_features.num_mentions", "A"), + ("encoded_tweet_features.reply_count", "A"), + ("encoded_tweet_features.retweet_count", "A"), + ("encoded_tweet_features.text_score", "A"), + ("encoded_tweet_features.user_reputation", "A"), + ("extended_encoded_tweet_features.decayed_favorite_count", "A"), + ("extended_encoded_tweet_features.decayed_quote_count", "A"), + ("extended_encoded_tweet_features.decayed_reply_count", "A"), + ("extended_encoded_tweet_features.decayed_retweet_count", "A"), + ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), + ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), + ("extended_encoded_tweet_features.fake_favorite_count", "A"), + ("extended_encoded_tweet_features.fake_quote_count", "A"), + ("extended_encoded_tweet_features.fake_reply_count", "A"), + ("extended_encoded_tweet_features.fake_retweet_count", "A"), + ("extended_encoded_tweet_features.favorite_count_v2", "A"), + ("extended_encoded_tweet_features.label_dup_content_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), + ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), + ("extended_encoded_tweet_features.periscope_exists", "A"), + ("extended_encoded_tweet_features.periscope_has_been_featured", "A"), + ("extended_encoded_tweet_features.periscope_is_currently_featured","A"), + ("extended_encoded_tweet_features.periscope_is_from_quality_source", "A"), + ("extended_encoded_tweet_features.periscope_is_live", "A"), + ("extended_encoded_tweet_features.quote_count", "A"), + ("extended_encoded_tweet_features.reply_count_v2", "A"), + ("extended_encoded_tweet_features.retweet_count_v2", "A"), + ("extended_encoded_tweet_features.weighted_favorite_count", "A"), + ("extended_encoded_tweet_features.weighted_quote_count", "A"), + ("extended_encoded_tweet_features.weighted_reply_count", "A"), + ("extended_encoded_tweet_features.weighted_retweet_count", "A"), + ("timelines.earlybird.visible_token_ratio", "A"), + ] + ) + .add_labels( + [ + label, # Tensor index: 0 + "itl.engagement.is_clicked", # Tensor index: 1 + "itl.engagement.is_favorited", # Tensor index: 2 + "itl.engagement.is_open_linked", # Tensor index: 3 + "itl.engagement.is_photo_expanded", # Tensor index: 4 + "itl.engagement.is_profile_clicked", # Tensor index: 5 + "itl.engagement.is_replied", # Tensor index: 6 + "itl.engagement.is_retweeted", # Tensor index: 7 + "itl.engagement.is_video_playback_50", # Tensor index: 8 + 
"timelines.earlybird_score", # Tensor index: 9 + ] + ) + .define_weight("meta.record_weight/type=earlybird") + .build() + ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py index 57178b92c..d20fccb52 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py @@ -1,21 +1,30 @@ # checkstyle: noqa INDEX_BY_LABEL = { - "is_clicked": 1, - "is_favorited": 2, - "is_open_linked": 3, - "is_photo_expanded": 4, - "is_profile_clicked": 5, - "is_replied": 6, - "is_retweeted": 7, - "is_video_playback_50": 8 + "is_clicked": 1, + "is_favorited": 2, + "is_open_linked": 3, + "is_photo_expanded": 4, + "is_profile_clicked": 5, + "is_replied": 6, + "is_retweeted": 7, + "is_video_playback_50": 8, } TARGET_LABEL_IDX = 0 + EB_SCORE_IDX = 9 -LABEL_NAMES = [label_name for label_name, _ in sorted(INDEX_BY_LABEL.items(), key=lambda item: item[1])] +LABEL_NAMES = [ + label_name + for label_name, _ in sorted(INDEX_BY_LABEL.items(), key=lambda item: item[1]) +] -PREDICTED_CLASSES = \ - ["tf_target"] + ["tf_" + label_name for label_name in LABEL_NAMES] + ["tf_timelines.earlybird_score"] + \ - ["lolly_target"] + ["lolly_" + label_name for label_name in LABEL_NAMES] + ["lolly_timelines.earlybird_score"] +PREDICTED_CLASSES = ( + ["tf_target"] + + ["tf_" + label_name for label_name in LABEL_NAMES] + + ["tf_timelines.earlybird_score"] + + ["lolly_target"] + + ["lolly_" + label_name for label_name in LABEL_NAMES] + + ["lolly_timelines.earlybird_score"] +) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py index cf0c38ecc..f361c6cd5 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py @@ -1,43 +1,65 @@ # checkstyle: noqa +from typing import Dict, Final +from tensorflow import Tensor +from twml import DefaultSubcommandArgParse +import argparse import tensorflow.compat.v1 as tf + from .constants import INDEX_BY_LABEL, LABEL_NAMES # TODO: Read these from command line arguments, since they specify the existing example weights in the input data. -DEFAULT_WEIGHT_BY_LABEL = { - "is_clicked": 0.3, - "is_favorited": 1.0, - "is_open_linked": 0.1, - "is_photo_expanded": 0.03, - "is_profile_clicked": 1.0, - "is_replied": 9.0, - "is_retweeted": 1.0, - "is_video_playback_50": 0.01 + +DEFAULT_WEIGHT_BY_LABEL: Final[Dict[str, float]] = { + "is_clicked": 0.3, + "is_favorited": 1.0, + "is_open_linked": 0.1, + "is_photo_expanded": 0.03, + "is_profile_clicked": 1.0, + "is_replied": 9.0, + "is_retweeted": 1.0, + "is_video_playback_50": 0.01, } -def add_weight_arguments(parser): - for label_name in LABEL_NAMES: - parser.add_argument( - _make_weight_cli_argument_name(label_name), - type=float, - default=DEFAULT_WEIGHT_BY_LABEL[label_name], - dest=_make_weight_param_name(label_name) - ) - -def make_weights_tensor(input_weights, label, params): - ''' - Replaces the weights for each positive engagement and keeps the input weights for negative examples. 
-  '''
-  weight_tensors = [input_weights]
-  for label_name in LABEL_NAMES:
-    index, default_weight = INDEX_BY_LABEL[label_name], DEFAULT_WEIGHT_BY_LABEL[label_name]
-    weight_param_name =_make_weight_param_name(label_name)
-    weight_tensors.append(
-      tf.reshape(tf.math.scalar_mul(getattr(params, weight_param_name) - default_weight, label[:, index]), [-1, 1])
-    )
-  return tf.math.accumulate_n(weight_tensors)
-
-def _make_weight_cli_argument_name(label_name):
-  return f"--weight.{label_name}"
-
-def _make_weight_param_name(label_name):
-  return f"weight_{label_name}"
+def add_weight_arguments(parser: DefaultSubcommandArgParse) -> None:
+    """Adds command line arguments for example weights."""
+
+    for label_name in LABEL_NAMES:
+        parser.add_argument(
+            _make_weight_cli_argument_name(label_name),
+            type=float,
+            default=DEFAULT_WEIGHT_BY_LABEL[label_name],
+            dest=_make_weight_param_name(label_name),
+        )
+
+
+def make_weights_tensor(
+    input_weights: tf.Tensor, label: tf.Tensor, params
+) -> tf.Tensor:
+    """Replaces the weights for each positive engagement and keeps the input weights for negative examples."""
+
+    weight_tensors = [input_weights]
+    for label_name in LABEL_NAMES:
+        index, default_weight = (
+            INDEX_BY_LABEL[label_name],
+            DEFAULT_WEIGHT_BY_LABEL[label_name],
+        )
+        weight_param_name = _make_weight_param_name(label_name)
+        weight_tensors.append(
+            tf.reshape(
+                tf.math.scalar_mul(
+                    getattr(params, weight_param_name) - default_weight, label[:, index]
+                ),
+                [-1, 1],
+            )
+        )
+    return tf.math.accumulate_n(weight_tensors)
+
+def _make_weight_cli_argument_name(label_name: str) -> str:
+    """Returns the name of the command line argument that holds the weight for the given label."""
+
+    return f"--weight.{label_name}"
+
+def _make_weight_param_name(label_name: str) -> str:
+    """Returns the name of the parameter that holds the weight for the given label."""
+
+    return f"weight_{label_name}"
\ No newline at end of file
diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py
index 723dd626c..5d7e7124a 100644
--- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py
+++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py
@@ -1,23 +1,36 @@
 # checkstyle: noqa
 import tensorflow.compat.v1 as tf
+
 from ..constants import EB_SCORE_IDX
+
 # The rationale behind this logic is available at TQ-9678.
-def get_lolly_logits(labels):
-  '''
-  :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
-  :return: tf.Tensor of shape (batch size) with the extracted lolly logits.
-  '''
-  eb_lolly_scores = get_lolly_scores(labels)
-  inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores)
-  lolly_activations = tf.math.subtract(tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores))
-  return lolly_activations
-
-def get_lolly_scores(labels):
-  '''
-  :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
-  :return: tf.Tensor of shape (batch size) with the extracted lolly scores.
- ''' - logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) - eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) - return eb_lolly_scores +def get_lolly_logits(labels: tf.Tensor) -> tf.Tensor: + """ + Args: + labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. + + Returns: + tf.Tensor of shape (batch size) with the extracted lolly logits. + """ + + eb_lolly_scores = get_lolly_scores(labels) + inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores) + lolly_activations = tf.math.subtract( + tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores) + ) + return lolly_activations + + +def get_lolly_scores(labels: tf.Tensor) -> tf.Tensor: + """ + Args: + labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. + + Returns: + tf.Tensor of shape (batch size) with the extracted lolly scores. + """ + + logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) + eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) + return eb_lolly_scores diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py index cb39c67a7..b2454a870 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py @@ -1,145 +1,153 @@ import re +from typing import Tuple from twitter.deepbird.io.util import _get_feature_id class Parser(object): - def parse(self, line): - match = re.search(self.pattern(), line) - if match: - return self._parse_match(match) - return None + """Base class for parsers.""" - def pattern(self): - raise NotImplementedError + def parse(self, line: str) -> object: + match = re.search(self.pattern(), line) + if match: + return self._parse_match(match) + return None - def _parse_match(self, match): - raise NotImplementedError + def _parse_match(self, match: re.Match) -> float: + return float(match.group(1)) + + def pattern(self): + raise NotImplementedError class BiasParser(Parser): - ''' - Parses the bias feature available in lolly model tsv files. - ''' + """Parses the bias feature available in lolly model tsv files.""" - def pattern(self): - ''' - Matches lines like: - unified_engagement bias -0.935945 - :return: a RegEx that extracts feature weight. - ''' - return r"\t(bias)\t([^\s]+)" + def pattern(self) -> str: + """ + Matches lines like: + unified_engagement bias -0.935945 + :return: a RegEx that extracts feature weight. + """ + return r"\t(bias)\t([^\s]+)" - def _parse_match(self, match): - return float(match.group(2)) + def _parse_match(self, match: re.Match) -> float: + return float(match.group(2)) class BinaryFeatureParser(Parser): - ''' - Parses binary features available in lolly model tsv files. - ''' + """Parses binary features available in lolly model tsv files.""" - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 - :return: a RegEx that extracts feature name and weight. - ''' - return r"\t([\w\.]+)\t([^\s]+)" + def pattern(self) -> str: + """ + Matches lines like: + unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 + :return: a RegEx that extracts feature name and weight. 
+ """ + return r"\t([\w\.]+)\t([^\s]+)" - def _parse_match(self, match): - return (match.group(1), float(match.group(2))) + def _parse_match(self, match: re.Match) -> Tuple[str, float]: + return (match.group(1), float(match.group(2))) class DiscretizedFeatureParser(Parser): - ''' - Parses discretized features available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 - :return: a RegEx that extracts feature name, bin boundaries and weight. - ''' - return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" - - def _parse_match(self, match): - left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")] - return ( - match.group(1), - left_bin_side, - right_bin_side, - float(match.group(3)) - ) + """Parses discretized features available in lolly model tsv files.""" + + def pattern(self) -> str: + """ + Matches lines like: + unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 + :return: a RegEx that extracts feature name, bin boundaries and weight. + """ + return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" + + def _parse_match(self, match: re.Match) -> Tuple[str, float, float, float]: + left_bin_side, right_bin_side = [ + float(number) for number in match.group(2).split("_") + ] + return (match.group(1), left_bin_side, right_bin_side, float(match.group(3))) class LollyModelFeaturesParser(Parser): - def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()): - self._bias_parser = bias_parser - self._binary_feature_parser = binary_feature_parser - self._discretized_feature_parser = discretized_feature_parser - - def parse(self, lolly_model_reader): - parsed_features = { - "bias": None, - "binary": {}, - "discretized": {} - } - def process_line_fn(line): - bias_parser_result = self._bias_parser.parse(line) - if bias_parser_result: - parsed_features["bias"] = bias_parser_result - return - - binary_feature_parser_result = self._binary_feature_parser.parse(line) - if binary_feature_parser_result: - name, value = binary_feature_parser_result - parsed_features["binary"][name] = value - return - - discretized_feature_parser_result = self._discretized_feature_parser.parse(line) - if discretized_feature_parser_result: - name, left_bin, right_bin, weight = discretized_feature_parser_result - discretized_features = parsed_features["discretized"] - if name not in discretized_features: - discretized_features[name] = [] - discretized_features[name].append((left_bin, right_bin, weight)) - - lolly_model_reader.read(process_line_fn) - - return parsed_features + """Parses lolly model tsv files.""" + + def __init__( + self, + bias_parser: BiasParser = BiasParser(), + binary_feature_parser: BinaryFeatureParser = BinaryFeatureParser(), + discretized_feature_parser: DiscretizedFeatureParser = DiscretizedFeatureParser(), + ): + self._bias_parser = bias_parser + self._binary_feature_parser = binary_feature_parser + self._discretized_feature_parser = discretized_feature_parser + + def parse(self, lolly_model_reader: object) -> dict: + parsed_features = {"bias": None, "binary": {}, "discretized": {}} + + def process_line_fn(line: str) -> None: + bias_parser_result = self._bias_parser.parse(line) + if bias_parser_result: + parsed_features["bias"] = bias_parser_result + 
+                return
+
+            binary_feature_parser_result = self._binary_feature_parser.parse(line)
+            if binary_feature_parser_result:
+                name, value = binary_feature_parser_result
+                parsed_features["binary"][name] = value
+                return
+
+            discretized_feature_parser_result = self._discretized_feature_parser.parse(
+                line
+            )
+            if discretized_feature_parser_result:
+                name, left_bin, right_bin, weight = discretized_feature_parser_result
+                discretized_features = parsed_features["discretized"]
+                if name not in discretized_features:
+                    discretized_features[name] = []
+                discretized_features[name].append((left_bin, right_bin, weight))
+
+        lolly_model_reader.read(process_line_fn)
+
+        return parsed_features
 
 
 class DBv2DataExampleParser(Parser):
-  '''
-  Parses data records printed by the DBv2 train.py build_graph function.
-  Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]]
-  '''
-
-  def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()):
-    self.features = lolly_model_features_parser.parse(lolly_model_reader)
-    self.feature_name_by_dbv2_id = {}
-
-    for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()):
-      self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name
-
-  def pattern(self):
-    '''
-    :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values.
-    '''
-    return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. ]+)\]\]"
-
-  def _parse_match(self, match):
-    feature_ids = match.group(3).split(" ")
-    feature_values = match.group(4).split(" ")
-
-    value_by_feature_name = {}
-    for index in range(len(feature_ids)):
-      feature_id = feature_ids[index]
-      if feature_id not in self.feature_name_by_dbv2_id:
-        print("Missing feature with id: " + str(feature_id))
-        continue
-      value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index])
-
-    return value_by_feature_name
+    """
+    Parses data records printed by the DBv2 train.py build_graph function.
+    Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]]
+    """
+
+    def __init__(
+        self,
+        lolly_model_reader: object,
+        lolly_model_features_parser: LollyModelFeaturesParser = LollyModelFeaturesParser(),
+    ):
+        self.features = lolly_model_features_parser.parse(lolly_model_reader)
+        self.feature_name_by_dbv2_id = {}
+
+        for feature_name in list(self.features["binary"].keys()) + list(
+            self.features["discretized"].keys()
+        ):
+            self.feature_name_by_dbv2_id[
+                str(_get_feature_id(feature_name))
+            ] = feature_name
+
+    def pattern(self) -> str:
+        """
+        :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values.
+        """
]+)\]\]" + + def _parse_match(self, match) -> dict: + feature_ids = match.group(3).split(" ") + feature_values = match.group(4).split(" ") + + value_by_feature_name = dict() + for index in range(len(feature_ids)): + feature_id = feature_ids[index] + if feature_id not in self.feature_name_by_dbv2_id: + print("Missing feature with id: " + str(feature_id)) + continue + value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float( + feature_values[index] + ) + + return value_by_feature_name diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py index ab33ee4e7..3c76233af 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py @@ -1,8 +1,11 @@ +from typing import Callable + + class LollyModelReader(object): - def __init__(self, lolly_model_file_path): - self._lolly_model_file_path = lolly_model_file_path + def __init__(self, lolly_model_file_path: str): + self._lolly_model_file_path = lolly_model_file_path - def read(self, process_line_fn): - with open(self._lolly_model_file_path, "r") as file: - for line in file: - process_line_fn(line) + def read(self, process_line_fn: Callable[[str], None]): + with open(self._lolly_model_file_path, "r") as file: + for line in file: + process_line_fn(line) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py index 5692616c2..b018844c5 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py @@ -4,10 +4,8 @@ from .reader import LollyModelReader from .scorer import LollyModelScorer - if __name__ == "__main__": - lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) - lolly_model_scorer = LollyModelScorer(data_example_parser=DBv2DataExampleParser(lolly_model_reader)) - - score = lolly_model_scorer.score(data_example=sys.argv[2]) - print(score) + lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) + lolly_model_scorer = LollyModelScorer(DBv2DataExampleParser(lolly_model_reader)) + score = lolly_model_scorer.score(data_example=sys.argv[2]) + print(score) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py index 621c43388..932e3b51f 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py @@ -1,37 +1,56 @@ +from typing import List + +from python.twitter.deepbird.projects.timelines.scripts.models.earlybird.lolly.parsers import ( + DBv2DataExampleParser, +) + + class LollyModelScorer(object): - def __init__(self, data_example_parser): - self._data_example_parser = data_example_parser - - def score(self, data_example): - value_by_feature_name = self._data_example_parser.parse(data_example) - features = self._data_example_parser.features - return self._score(value_by_feature_name, features) - - def _score(self, value_by_feature_name, features): - score = features["bias"] - score += 
-    score += self._score_binary_features(features["binary"], value_by_feature_name)
-    score += self._score_discretized_features(features["discretized"], value_by_feature_name)
-    return score
-
-  def _score_binary_features(self, binary_features, value_by_feature_name):
-    score = 0.0
-    for binary_feature_name, binary_feature_weight in binary_features.items():
-      if binary_feature_name in value_by_feature_name:
-        score += binary_feature_weight
-    return score
-
-  def _score_discretized_features(self, discretized_features, value_by_feature_name):
-    score = 0.0
-    for discretized_feature_name, buckets in discretized_features.items():
-      if discretized_feature_name in value_by_feature_name:
-        feature_value = value_by_feature_name[discretized_feature_name]
-        score += self._find_matching_bucket_weight(buckets, feature_value)
-    return score
-
-  def _find_matching_bucket_weight(self, buckets, feature_value):
-    for left_side, right_side, weight in buckets:
-      # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
-      if feature_value >= left_side and feature_value < right_side:
-        return weight
-
-    raise LookupError("Couldn't find a matching bucket for the given feature value.")
+    def __init__(self, data_example_parser: DBv2DataExampleParser):
+        self._data_example_parser = data_example_parser
+
+    def score(self, data_example: str) -> float:
+        value_by_feature_name = self._data_example_parser.parse(data_example)
+        features = self._data_example_parser.features
+        return self._score(value_by_feature_name, features)
+
+    def _score(self, value_by_feature_name: dict, features: dict) -> float:
+        score = features["bias"]
+        score += self._score_binary_features(features["binary"], value_by_feature_name)
+        score += self._score_discretized_features(
+            features["discretized"], value_by_feature_name
+        )
+        return score
+
+    def _score_binary_features(
+        self, binary_features: dict, value_by_feature_name: dict
+    ) -> float:
+        score = 0.0
+        for binary_feature_name, binary_feature_weight in binary_features.items():
+            if binary_feature_name in value_by_feature_name:
+                score += binary_feature_weight
+        return score
+
+    def _score_discretized_features(
+        self, discretized_features: dict, value_by_feature_name: dict
+    ) -> float:
+        score = 0.0
+        for discretized_feature_name, buckets in discretized_features.items():
+            if discretized_feature_name in value_by_feature_name:
+                feature_value = value_by_feature_name[discretized_feature_name]
+                score += self._find_matching_bucket_weight(buckets, feature_value)
+        return score
+
+    def _find_matching_bucket_weight(
+        self,
+        buckets: List[Tuple[float, float, float]],
+        feature_value: float,
+    ) -> float:
+        for left_side, right_side, weight in buckets:
+            # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
+            if feature_value >= left_side and feature_value < right_side:
+                return weight
+
+        raise LookupError(
+            "Couldn't find a matching bucket for the given feature value."
+ ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py index 2d0342551..4a576d749 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py @@ -1,91 +1,97 @@ +from typing import Any, Dict, List, Tuple + from .parsers import LollyModelFeaturesParser class TFModelInitializerBuilder: - - def __init__(self, model_features_parser=LollyModelFeaturesParser()): - self._model_features_parser = model_features_parser - - def build(self, lolly_model_reader): - ''' - :param lolly_model_reader: LollyModelReader instance - :return: tf_model_initializer dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... - "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] + def __init__(self, model_features_parser=LollyModelFeaturesParser()): + self._model_features_parser = model_features_parser + + def build(self, lolly_model_reader: object) -> Dict[str, Dict[str, Any]]: + """ + :param lolly_model_reader: LollyModelReader instance + :return: tf_model_initializer dictionary of the following format: + { + "features": { + "bias": 0.0, + "binary": { + # (feature name : feature weight) pairs + "feature_name_1": 0.0, + ... + "feature_nameN": 0.0 + }, + "discretized": { + # (feature name : index aligned lists of bin_boundaries and weights + "feature_name_1": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + ... 
+ "feature_name_K": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + } } - } } - } - ''' - tf_model_initializer = { - "features": {} - } - - features = self._model_features_parser.parse(lolly_model_reader) - tf_model_initializer["features"]["bias"] = features["bias"] - self._set_discretized_features(features["discretized"], tf_model_initializer) - - self._dedup_binary_features(features["binary"], features["discretized"]) - tf_model_initializer["features"]["binary"] = features["binary"] - - return tf_model_initializer - - def _set_discretized_features(self, discretized_features, tf_model_initializer): - if len(discretized_features) == 0: - return - - num_bins = max([len(bins) for bins in discretized_features.values()]) - - bin_boundaries_and_weights = {} - for feature_name in discretized_features: - bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights( - discretized_features[feature_name], num_bins) - - tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights - - def _dedup_binary_features(self, binary_features, discretized_features): - [binary_features.pop(feature_name) for feature_name in discretized_features] - - def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins): - bin_boundary_weight_pairs = [] - - for bucket in discretized_feature_buckets: - bin_boundary_weight_pairs.append([bucket[0], bucket[2]]) - - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. - for bin_boundary_weight_pair in bin_boundary_weight_pairs: - if bin_boundary_weight_pair[0] < float("inf"): - bin_boundary_weight_pair[0] *= -1 - - while len(bin_boundary_weight_pairs) < num_bins: - bin_boundary_weight_pairs.append([float("inf"), float(0)]) - - bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0]) - - bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) - - return { - "bin_boundaries": bin_boundaries, - "weights": weights - } + """ + tf_model_initializer = {"features": {}} + + features = self._model_features_parser.parse(lolly_model_reader) + tf_model_initializer["features"]["bias"] = features["bias"] + self._set_discretized_features(features["discretized"], tf_model_initializer) + + self._dedup_binary_features(features["binary"], features["discretized"]) + tf_model_initializer["features"]["binary"] = features["binary"] + + return tf_model_initializer + + def _set_discretized_features( + self, discretized_features: dict, tf_model_initializer: dict + ) -> None: + if len(discretized_features) == 0: + return + + num_bins = max([len(bins) for bins in discretized_features.values()]) + + bin_boundaries_and_weights = {} + for feature_name in discretized_features: + bin_boundaries_and_weights[ + feature_name + ] = self._extract_bin_boundaries_and_weights( + discretized_features[feature_name], num_bins + ) + + tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights + + def _dedup_binary_features( + self, binary_features: dict, discretized_features: dict + ) -> None: + [binary_features.pop(feature_name) for feature_name in discretized_features] + + def _extract_bin_boundaries_and_weights( + self, + discretized_feature_buckets: List[List[float]], + num_bins: int, + ) -> Dict[str, Tuple[float]]: + bin_boundary_weight_pairs = [ + [bucket[0], bucket[2]] for bucket in 
discretized_feature_buckets + ] + + # The default DBv2 HashingDiscretizer bin membership interval is (a, b] + # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) + # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. + for bin_boundary_weight_pair in bin_boundary_weight_pairs: + if bin_boundary_weight_pair[0] < float("inf"): + bin_boundary_weight_pair[0] *= -1 + + while len(bin_boundary_weight_pairs) < num_bins: + bin_boundary_weight_pairs.append([float("inf"), float(0)]) + + bin_boundary_weight_pairs.sort( + key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0] + ) + + bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) + + return {"bin_boundaries": bin_boundaries, "weights": weights} diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py index 6919914f8..21bca238d 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py @@ -1,120 +1,147 @@ # checkstyle: noqa +from typing import Dict, List, Optional + import tensorflow.compat.v1 as tf -from collections import OrderedDict + +import twml + from .constants import EB_SCORE_IDX from .lolly.data_helpers import get_lolly_scores -import twml -def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - This function was copied from twml/metrics.py with the following adjustments: - - Override example weights with the ones set in graph_output. - - Tile labels in order to support per engagement metrics for both TF and Lolly scores. - - Add lolly_tf_score_MSE metric. - Note: All custom lines have a comment that starts with 'Added' - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) - metrics.remove('pr_curve') - - def get_eval_metric_ops(graph_output, labels, weights): +def get_multi_binary_class_metric_fn( + metrics: Dict[str, float], + classes: Optional[List[str]] = None, + class_dim: int = 1, +) -> callable: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + This function was copied from twml/metrics.py with the following adjustments: + - Override example weights with the ones set in graph_output. + - Tile labels in order to support per engagement metrics for both TF and Lolly scores. + - Add lolly_tf_score_MSE metric. + Note: All custom lines have a comment that starts with 'Added' """ - - # Added to support the example weights overriding. - weights = graph_output["weights"] - # Added to support per engagement metrics for both TF and Lolly scores. - labels = tf.tile(labels, [1, 2]) - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if not hard_preds: - hard_preds = tf.greater_equal(preds, threshold) - - shape = labels.get_shape() - - # basic sanity check: multi_metric dimension must exist - assert len(shape) > class_dim, "Dimension specified by class_dim does not exist." 
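Aside: _extract_bin_boundaries_and_weights above is the crux of the Lolly-to-TF conversion, so a worked example is useful. The sketch below reproduces its logic on made-up bucket data (each bucket is a [boundary, _, weight] triple, matching how bucket[0] and bucket[2] are read in the method); it is illustrative only.

def invert_and_pad_bins(discretized_feature_buckets, num_bins):
    # Keep (boundary, weight) pairs; bucket[1] is unused, as in the original.
    pairs = [[bucket[0], bucket[2]] for bucket in discretized_feature_buckets]

    # Negating every finite boundary turns membership in (a, b] into
    # membership in [-b, -a), i.e. the [a, b) convention; the feature values
    # are negated to match at serving time (see get_feature_values in
    # train.py further below).
    for pair in pairs:
        if pair[0] < float("inf"):
            pair[0] *= -1

    # Pad so every feature has the same bin count, then restore sort order.
    while len(pairs) < num_bins:
        pairs.append([float("inf"), 0.0])
    pairs.sort(key=lambda pair: pair[0])

    bin_boundaries, weights = zip(*pairs)
    return {"bin_boundaries": bin_boundaries, "weights": weights}

print(invert_and_pad_bins([[1.0, 0, 0.25], [5.0, 0, 0.75]], num_bins=3))
# {'bin_boundaries': (-5.0, -1.0, inf), 'weights': (0.75, 0.25, 0.0)}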
- - num_labels = shape[class_dim] - # If we are doing multi-class / multi-label metric, the number of classes / labels must - # be know at graph construction time. This dimension cannot have size None. - assert num_labels is not None, "The multi-metric dimension cannot be None." - assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - metric_factory, requires_threshold = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) + # pylint: disable=invalid-name,dict-keys-not-iterating + if metrics is None: + # remove expensive metrics by default for faster eval + metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) + metrics.remove("pr_curve") + + def get_eval_metric_ops( + graph_output: Dict[str, tf.Tensor], + labels: tf.Tensor, + weights: tf.Tensor, + ) -> dict: + """ + Args: + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples. + + Returns: + dict of metric name to tuple of (value_op, update_op). + """ + + # Added to support the example weights overriding. + weights = graph_output["weights"] + # Added to support per engagement metrics for both TF and Lolly scores. + labels = tf.tile(labels, [1, 2]) + + eval_metric_ops = dict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + if not hard_preds: + hard_preds = tf.greater_equal(preds, threshold) + + shape = labels.get_shape() + + # basic sanity check: multi_metric dimension must exist + assert ( + len(shape) > class_dim + ), "Dimension specified by class_dim does not exist." + + num_labels = shape[class_dim] + # If we are doing multi-class / multi-label metric, the number of classes / labels must + # be know at graph construction time. This dimension cannot have size None. + assert num_labels is not None, "The multi-metric dimension cannot be None." 
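For orientation before the per-class loop below: the labels are tiled so that every engagement is evaluated twice, once against the TF logits and once against the Lolly scores that train.py concatenates onto the output, and tf.gather slices out one class column at a time. A small illustrative sketch (shapes and values are made up):

import tensorflow.compat.v1 as tf

labels = tf.constant([[1.0, 0.0, 1.0],
                      [0.0, 1.0, 0.0]])  # [batch=2, num_labels=3]

# tf.tile(labels, [1, 2]) doubles the label columns, giving shape [2, 6],
# so the same per-engagement metrics run on both halves of the output.
tiled = tf.tile(labels, [1, 2])

# tf.gather with indices=[i] keeps a [batch, 1] column per class, the shape
# the binary metric factories expect.
class_labels = tf.gather(tiled, indices=[1], axis=1)  # [[0.0], [1.0]]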
+ assert ( + classes is None or len(classes) == num_labels + ), "Number of classes must match the number of labels" + + weights_shape = weights.get_shape() if weights is not None else None + if weights_shape is None: + num_weights = None + elif len(weights_shape) > 1: + num_weights = weights_shape[class_dim] else: - raise ValueError('Cannot find the metric named ' + metric_name) - - # Added to compare TF and Lolly scores. - eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels) - - return eval_metric_ops - - return get_eval_metric_ops - - -def get_mse(predictions, labels): - lolly_scores = get_lolly_scores(labels) - tf_scores = predictions[:, EB_SCORE_IDX] - squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) - - value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") - update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") - - return value_op, update_op + num_weights = 1 + + for i in range(num_labels): + # add metrics to eval_metric_ops dict + for metric_name in metrics: + metric_name = metric_name.lower() # metric name are case insensitive. + + class_metric_name = ( + metric_name + "_" + (classes[i] if classes is not None else str(i)) + ) + + if class_metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + class_labels = tf.gather(labels, indices=[i], axis=class_dim) + class_preds = tf.gather(preds, indices=[i], axis=class_dim) + class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) + + if num_weights is None: + class_weights = None + elif num_weights == num_labels: + class_weights = tf.gather(weights, indices=[i], axis=class_dim) + elif num_weights == 1: + class_weights = weights + else: + raise ValueError( + "num_weights (%d) and num_labels (%d) do not match" + % (num_weights, num_labels) + ) + + ( + metric_factory, + requires_threshold, + ) = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + if metric_factory: + value_op, update_op = metric_factory( + labels=class_labels, + predictions=( + class_hard_preds if requires_threshold else class_preds + ), + weights=class_weights, + name=class_metric_name, + ) + eval_metric_ops[class_metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + # Added to compare TF and Lolly scores. 
+ eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels) + return eval_metric_ops + + return get_eval_metric_ops + + +def get_mse(predictions: tf.Tensor, labels: tf.Tensor) -> tf.Tensor: + lolly_scores = get_lolly_scores(labels) + tf_scores = predictions[:, EB_SCORE_IDX] + squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) + + value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") + update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") + + return value_op, update_op diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py index 82c31bde0..619f9306c 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py @@ -1,62 +1,62 @@ -from .hashing_utils import make_feature_id +from typing import Any, Dict -from twml.contrib.layers.hashing_discretizer import HashingDiscretizer import numpy as np +from twml.contrib.layers.hashing_discretizer import HashingDiscretizer -class TFModelDiscretizerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :param tf_model_initializer: dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... - "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - } - } - } - :return: a HashingDiscretizer instance. - ''' - discretized_features = tf_model_initializer["features"]["discretized"] - - max_bins = 0 - - feature_ids = [] - bin_vals = [] - for feature_name in discretized_features: - bin_boundaries = discretized_features[feature_name]["bin_boundaries"] - feature_id = make_feature_id(feature_name, self.num_bits) - feature_ids.append(feature_id) - np_bin_boundaries = [np.float(bin_boundary) for bin_boundary in bin_boundaries] - bin_vals.append(np_bin_boundaries) - - max_bins = max(max_bins, len(np_bin_boundaries)) +from .hashing_utils import make_feature_id - feature_ids_np = np.array(feature_ids) - bin_vals_np = np.array(bin_vals).flatten() - return HashingDiscretizer( - feature_ids=feature_ids_np, - bin_vals=bin_vals_np, - n_bin=max_bins, - out_bits=self.num_bits - ) +class TFModelDiscretizerBuilder(object): + def __init__(self, num_bits: int): + self.num_bits = num_bits + + def build(self, tf_model_initializer: Dict[str, Any]) -> HashingDiscretizer: + """ + :param tf_model_initializer: dictionary of the following format: + { + "features": { + "bias": 0.0, + "binary": { + # (feature name : feature weight) pairs + "feature_name_1": 0.0, + ... + "feature_nameN": 0.0 + }, + "discretized": { + # (feature name : index aligned lists of bin_boundaries and weights + "feature_name_1": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + ... + "feature_name_K": { + "bin_boundaries": [1, ..., inf], + "weights": [0.0, ..., 0.0] + } + } + } + } + :return: a HashingDiscretizer instance. 
+ """ + discretized_features = tf_model_initializer["features"]["discretized"] + max_bins = 0 + feature_ids = [] + bin_vals = [] + for feature_name in discretized_features: + bin_boundaries = discretized_features[feature_name]["bin_boundaries"] + feature_id = make_feature_id(feature_name, self.num_bits) + feature_ids.append(feature_id) + np_bin_boundaries = [ + np.float(bin_boundary) for bin_boundary in bin_boundaries + ] + bin_vals.append(np_bin_boundaries) + max_bins = max(max_bins, len(np_bin_boundaries)) + feature_ids_np = np.array(feature_ids) + bin_vals_np = np.array(bin_vals).flatten() + return HashingDiscretizer( + feature_ids=feature_ids_np, + bin_vals=bin_vals_np, + n_bin=max_bins, + out_bits=self.num_bits, + ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py index 2c57f8d63..acb668587 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py @@ -1,29 +1,33 @@ +import numpy as np from twitter.deepbird.io.util import _get_feature_id -import numpy as np + +def numpy_hashing_uniform(the_id: int, bin_idx: int, output_bits: int) -> int: + """ + integer_multiplicative_hashing + This is a reimplementation, for testing purposes, of the + c++ version found in hashing_discretizer_impl.cpp + """ + + hashing_constant = 2654435761 + N = 32 + with np.errstate(over="ignore"): + the_id *= hashing_constant + the_id += bin_idx + the_id *= hashing_constant + the_id >>= N - output_bits + the_id &= (1 << output_bits) - 1 + return the_id -def numpy_hashing_uniform(the_id, bin_idx, output_bits): - """ - integer_multiplicative_hashing - This is a reimplementation, for testing purposes, of the - c++ version found in hashing_discretizer_impl.cpp - """ - hashing_constant = 2654435761 - N = 32 - with np.errstate(over='ignore'): - the_id *= hashing_constant - the_id += bin_idx - the_id *= hashing_constant - the_id >>= N - output_bits - the_id &= (1 << output_bits) - 1 - return the_id +def make_feature_id(name: str, num_bits: int) -> np.int64: + """Returns a feature id for the given feature name.""" + feature_id = _get_feature_id(name) + return np.int64(limit_bits(feature_id, num_bits)) -def make_feature_id(name, num_bits): - feature_id = _get_feature_id(name) - return np.int64(limit_bits(feature_id, num_bits)) +def limit_bits(value: int, num_bits: int) -> int: + """Limits the number of bits in the given value.""" -def limit_bits(value, num_bits): - return value & ((2 ** num_bits) - 1) + return value & ((1< Tuple[Callable, Callable]: + """ + :return: (bias_initializer, weight_initializer) + """ + + initial_weights = np.zeros((1 << self.num_bits, 1)) + features = tf_model_initializer["features"] + + self._set_binary_feature_weights(initial_weights, features["binary"]) + self._set_discretized_feature_weights(initial_weights, features["discretized"]) + + return tf.constant_initializer( + features["bias"] + ), twml.contrib.initializers.PartitionConstant(initial_weights) + + def _set_binary_feature_weights( + self, + initial_weights: np.ndarray, + binary_features: Dict[str, float], + ) -> None: + """set weights for binary features""" + + for feature_name, weight in binary_features.items(): + feature_id = make_feature_id(feature_name, self.num_bits) + initial_weights[feature_id][0] = weight + + def 
_set_discretized_feature_weights( + self, + initial_weights: np.ndarray, + discretized_features: Dict[str, Dict[str, Any]], + ) -> None: + """set weights for discretized features""" + + for feature_name, discretized_feature in discretized_features.items(): + feature_id = make_feature_id(feature_name, self.num_bits) + for bin_idx, weight in enumerate(discretized_feature["weights"]): + final_bucket_id = numpy_hashing_uniform( + feature_id, bin_idx, self.num_bits + ) + initial_weights[final_bucket_id][0] = weight diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py index 6ef181f5f..f5cc3deab 100644 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py +++ b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py @@ -1,212 +1,275 @@ # checkstyle: noqa +from datetime import datetime +from typing import Any, Dict, Optional + import tensorflow.compat.v1 as tf -from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops import tensorflow_hub as hub - -from datetime import datetime from tensorflow.compat.v1 import logging +from tensorflow.python.estimator.export.export import ( + build_raw_serving_input_receiver_fn, +) +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops from twitter.deepbird.projects.timelines.configs import all_configs + +import twml +from twml.contrib.calibrators.common_calibrators import ( + build_percentile_discretizer_graph, + calibrate_discretizer_and_export, +) from twml.trainers import DataRecordTrainer -from twml.contrib.calibrators.common_calibrators import build_percentile_discretizer_graph -from twml.contrib.calibrators.common_calibrators import calibrate_discretizer_and_export -from .metrics import get_multi_binary_class_metric_fn -from .constants import TARGET_LABEL_IDX, PREDICTED_CLASSES + +from .constants import PREDICTED_CLASSES, TARGET_LABEL_IDX from .example_weights import add_weight_arguments, make_weights_tensor from .lolly.data_helpers import get_lolly_logits -from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder from .lolly.reader import LollyModelReader +from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder +from .metrics import get_multi_binary_class_metric_fn from .tf_model.discretizer_builder import TFModelDiscretizerBuilder from .tf_model.weights_initializer_builder import TFModelWeightsInitializerBuilder -import twml -def get_feature_values(features_values, params): - if params.lolly_model_tsv: - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries. - # - # Thus, invert the feature values, so that HashingDiscretizer can to find the correct bucket. 
- return tf.multiply(features_values, -1.0) - else: +def get_feature_values( + features_values: tf.Tensor, params: tf.contrib.training.HParams +) -> tf.Tensor: + if params.lolly_model_tsv: + # The default DBv2 HashingDiscretizer bin membership interval is (a, b] + # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) + # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries. + # Thus, invert the feature values, so that HashingDiscretizer can to find the correct bucket. + return tf.multiply(features_values, -1.0) return features_values -def build_graph(features, label, mode, params, config=None): - weights = None - if "weights" in features: - weights = make_weights_tensor(features["weights"], label, params) - num_bits = params.input_size_bits +def build_graph( + features: Dict[str, tf.Tensor], + label: tf.Tensor, + mode: str, + params: tf.contrib.training.HParams, + config: Optional[tf.estimator.RunConfig] = None, +) -> Dict[str, Any]: + weights = None + if "weights" in features: + weights = make_weights_tensor(features["weights"], label, params) - if mode == "infer": - indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) - dense_shape = tf.stack([features["input_sparse_tensor_shape"][0], 1 << num_bits]) - sparse_tf = tf.SparseTensor( - indices=indices, - values=get_feature_values(features["input_sparse_tensor_values"], params), - dense_shape=dense_shape - ) - else: - features["values"] = get_feature_values(features["values"], params) - sparse_tf = twml.util.convert_to_sparse(features, num_bits) - - if params.lolly_model_tsv: - tf_model_initializer = TFModelInitializerBuilder().build(LollyModelReader(params.lolly_model_tsv)) - bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder(num_bits).build(tf_model_initializer) - discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) - else: - discretizer = hub.Module(params.discretizer_save_dir) - bias_initializer, weight_initializer = None, None - - input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") - - logits = twml.layers.full_sparse( - inputs=input_sparse, - output_size=1, - bias_initializer=bias_initializer, - weight_initializer=weight_initializer, - use_sparse_grads=(mode == "train"), - use_binary_values=True, - name="full_sparse_1" - ) - - loss = None - - if mode != "infer": - lolly_activations = get_lolly_logits(label) - - if opt.print_data_examples: - logits = print_data_example(logits, lolly_activations, features) - - if params.replicate_lolly: - loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) + num_bits = params.input_size_bits + + if mode == "infer": + indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) + dense_shape = tf.stack( + [features["input_sparse_tensor_shape"][0], 1 << num_bits] + ) + sparse_tf = tf.SparseTensor( + indices=indices, + values=get_feature_values(features["input_sparse_tensor_values"], params), + dense_shape=dense_shape, + ) else: - batch_size = tf.shape(label)[0] - target_label = tf.reshape(tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1)) - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_label, logits=logits) - loss = twml.util.weighted_average(loss, weights) - - num_labels = tf.shape(label)[1] - eb_scores = tf.tile(lolly_activations, [1, num_labels]) - logits = tf.tile(logits, [1, num_labels]) - logits = tf.concat([logits, eb_scores], axis=1) - - output = tf.nn.sigmoid(logits) - - 
return {"output": output, "loss": loss, "weights": weights} - -def print_data_example(logits, lolly_activations, features): - return tf.Print( - logits, - [logits, lolly_activations, tf.reshape(features['keys'], (1, -1)), tf.reshape(tf.multiply(features['values'], -1.0), (1, -1))], - message="DATA EXAMPLE = ", - summarize=10000 - ) - -def earlybird_output_fn(graph_output): - export_outputs = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: - tf.estimator.export.PredictOutput( - {"prediction": tf.identity(graph_output["output"], name="output_scores")} - ) - } - return export_outputs + features["values"] = get_feature_values(features["values"], params) + sparse_tf = twml.util.convert_to_sparse(features, num_bits) -if __name__ == "__main__": - parser = DataRecordTrainer.add_parser_arguments() - - parser = twml.contrib.calibrators.add_discretizer_arguments(parser) - - parser.add_argument("--label", type=str, help="label for the engagement") - parser.add_argument("--model.use_existing_discretizer", action="store_true", - dest="model_use_existing_discretizer", - help="Load a pre-trained calibration or train a new one") - parser.add_argument("--input_size_bits", type=int) - parser.add_argument("--export_module_name", type=str, default="base_mlp", dest="export_module_name") - parser.add_argument("--feature_config", type=str) - parser.add_argument("--replicate_lolly", type=bool, default=False, dest="replicate_lolly", - help="Train a regression model with MSE loss and the logged Earlybird score as a label") - parser.add_argument("--lolly_model_tsv", type=str, required=False, dest="lolly_model_tsv", - help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" - "No discretizer gets trained or loaded if set.") - parser.add_argument("--print_data_examples", type=bool, default=False, dest="print_data_examples", - help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'") - add_weight_arguments(parser) - - opt = parser.parse_args() - - feature_config_module = all_configs.select_feature_config(opt.feature_config) - - feature_config = feature_config_module.get_feature_config(data_spec_path=opt.data_spec, label=opt.label) - - parse_fn = twml.parsers.get_sparse_parse_fn( - feature_config, - keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes")) - - if not opt.lolly_model_tsv: - if opt.model_use_existing_discretizer: - logging.info("Skipping discretizer calibration [model.use_existing_discretizer=True]") - logging.info(f"Using calibration at {opt.discretizer_save_dir}") + if params.lolly_model_tsv: + tf_model_initializer = TFModelInitializerBuilder().build( + LollyModelReader(params.lolly_model_tsv) + ) + bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder( + num_bits + ).build(tf_model_initializer) + discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) else: - logging.info("Calibrating new discretizer [model.use_existing_discretizer=False]") - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( - opt.discretizer_num_bins, - opt.discretizer_output_size_bits - ) - calibrate_discretizer_and_export(name="recap_earlybird_hashing_discretizer", - params=opt, - calibrator=calibrator, - build_graph_fn=build_percentile_discretizer_graph, - feature_config=feature_config) - - trainer = DataRecordTrainer( - name="earlybird", - params=opt, - build_graph_fn=build_graph, - save_dir=opt.save_dir, - feature_config=feature_config, - 
metric_fn=get_multi_binary_class_metric_fn( - metrics=["roc_auc"], - classes=PREDICTED_CLASSES - ), - warm_start_from=None - ) - - train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) - eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) - - logging.info("Training and Evaluation ...") - trainingStartTime = datetime.now() - trainer.train_and_evaluate(train_input_fn=train_input_fn, eval_input_fn=eval_input_fn) - trainingEndTime = datetime.now() - logging.info("Training and Evaluation time: " + str(trainingEndTime - trainingStartTime)) - - if trainer._estimator.config.is_chief: - serving_input_in_earlybird = { - "input_sparse_tensor_indices": array_ops.placeholder( - name="input_sparse_tensor_indices", - shape=[None, 2], - dtype=dtypes.int64), - "input_sparse_tensor_values": array_ops.placeholder( - name="input_sparse_tensor_values", - shape=[None], - dtype=dtypes.float32), - "input_sparse_tensor_shape": array_ops.placeholder( - name="input_sparse_tensor_shape", - shape=[2], - dtype=dtypes.int64) + discretizer = hub.Module(params.discretizer_save_dir) + bias_initializer, weight_initializer = None, None + + input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") + + logits = twml.layers.full_sparse( + inputs=input_sparse, + output_size=1, + bias_initializer=bias_initializer, + weight_initializer=weight_initializer, + use_sparse_grads=(mode == "train"), + use_binary_values=True, + name="full_sparse_1", + ) + + loss = None + + if mode != "infer": + lolly_activations = get_lolly_logits(label) + + if opt.print_data_examples: + logits = print_data_example(logits, lolly_activations, features) + + if params.replicate_lolly: + loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) + else: + batch_size = tf.shape(label)[0] + target_label = tf.reshape( + tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1) + ) + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=target_label, logits=logits + ) + loss = twml.util.weighted_average(loss, weights) + + num_labels = tf.shape(label)[1] + eb_scores = tf.tile(lolly_activations, [1, num_labels]) + logits = tf.tile(logits, [1, num_labels]) + logits = tf.concat([logits, eb_scores], axis=1) + + output = tf.nn.sigmoid(logits) + + return {"output": output, "loss": loss, "weights": weights} + + +def print_data_example( + logits: tf.Tensor, + lolly_activations: tf.Tensor, + features: Dict[str, tf.Tensor], +) -> tf.Tensor: + return tf.Print( + logits, + [ + logits, + lolly_activations, + tf.reshape(features["keys"], (1, -1)), + tf.reshape(tf.multiply(features["values"], -1.0), (1, -1)), + ], + message="DATA EXAMPLE = ", + summarize=10000, + ) + + +def earlybird_output_fn(graph_output: Dict[str, Any]) -> Dict[str, Any]: + export_outputs = { + tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput( + {"prediction": tf.identity(graph_output["output"], name="output_scores")} + ) } - serving_input_receiver_fn = build_raw_serving_input_receiver_fn(serving_input_in_earlybird) - twml.contrib.export.export_fn.export_all_models( - trainer=trainer, - export_dir=opt.export_dir, - parse_fn=parse_fn, - serving_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=earlybird_output_fn, - feature_spec=feature_config.get_feature_spec() + return export_outputs + + +if __name__ == "__main__": + parser = DataRecordTrainer.add_parser_arguments() + + parser = twml.contrib.calibrators.add_discretizer_arguments(parser) + + parser.add_argument("--label", 
type=str, help="label for the engagement") + parser.add_argument( + "--model.use_existing_discretizer", + action="store_true", + dest="model_use_existing_discretizer", + help="Load a pre-trained calibration or train a new one", + ) + parser.add_argument("--input_size_bits", type=int) + parser.add_argument( + "--export_module_name", type=str, default="base_mlp", dest="export_module_name" + ) + parser.add_argument("--feature_config", type=str) + parser.add_argument( + "--replicate_lolly", + type=bool, + default=False, + dest="replicate_lolly", + help="Train a regression model with MSE loss and the logged Earlybird score as a label", + ) + parser.add_argument( + "--lolly_model_tsv", + type=str, + required=False, + dest="lolly_model_tsv", + help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" + "No discretizer gets trained or loaded if set.", + ) + parser.add_argument( + "--print_data_examples", + type=bool, + default=False, + dest="print_data_examples", + help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'", ) - logging.info("The export model path is: " + opt.export_dir) + add_weight_arguments(parser) + + opt = parser.parse_args() + + feature_config_module = all_configs.select_feature_config(opt.feature_config) + + feature_config = feature_config_module.get_feature_config( + data_spec_path=opt.data_spec, label=opt.label + ) + + parse_fn = twml.parsers.get_sparse_parse_fn( + feature_config, + keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes"), + ) + + if not opt.lolly_model_tsv: + if opt.model_use_existing_discretizer: + logging.info( + "Skipping discretizer calibration [model.use_existing_discretizer=True]" + ) + logging.info(f"Using calibration at {opt.discretizer_save_dir}") + else: + logging.info( + "Calibrating new discretizer [model.use_existing_discretizer=False]" + ) + calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( + opt.discretizer_num_bins, opt.discretizer_output_size_bits + ) + calibrate_discretizer_and_export( + name="recap_earlybird_hashing_discretizer", + params=opt, + calibrator=calibrator, + build_graph_fn=build_percentile_discretizer_graph, + feature_config=feature_config, + ) + + trainer = DataRecordTrainer( + name="earlybird", + params=opt, + build_graph_fn=build_graph, + save_dir=opt.save_dir, + feature_config=feature_config, + metric_fn=get_multi_binary_class_metric_fn( + metrics=["roc_auc"], classes=PREDICTED_CLASSES + ), + warm_start_from=None, + ) + + train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) + eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) + + logging.info("Training and Evaluation ...") + trainingStartTime = datetime.now() + trainer.train_and_evaluate( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn + ) + trainingEndTime = datetime.now() + logging.info( + "Training and Evaluation time: " + str(trainingEndTime - trainingStartTime) + ) + + if trainer._estimator.config.is_chief: + serving_input_in_earlybird = { + "input_sparse_tensor_indices": array_ops.placeholder( + name="input_sparse_tensor_indices", shape=[None, 2], dtype=dtypes.int64 + ), + "input_sparse_tensor_values": array_ops.placeholder( + name="input_sparse_tensor_values", shape=[None], dtype=dtypes.float32 + ), + "input_sparse_tensor_shape": array_ops.placeholder( + name="input_sparse_tensor_shape", shape=[2], dtype=dtypes.int64 + ), + } + serving_input_receiver_fn = build_raw_serving_input_receiver_fn( + serving_input_in_earlybird + 
) + twml.contrib.export.export_fn.export_all_models( + trainer=trainer, + export_dir=opt.export_dir, + parse_fn=parse_fn, + serving_input_receiver_fn=serving_input_receiver_fn, + export_output_fn=earlybird_output_fn, + feature_spec=feature_config.get_feature_spec(), + ) + logging.info("The export model path is: " + opt.export_dir) diff --git a/trust_and_safety_models/abusive/abusive_model.py b/trust_and_safety_models/abusive/abusive_model.py index 06fff4ed2..5cc7d5086 100644 --- a/trust_and_safety_models/abusive/abusive_model.py +++ b/trust_and_safety_models/abusive/abusive_model.py @@ -1,48 +1,57 @@ import tensorflow as tf -physical_devices = tf.config.list_physical_devices('GPU') +physical_devices = tf.config.list_physical_devices("GPU") for device in physical_devices: tf.config.experimental.set_memory_growth(device, True) -from twitter.hmli.nimbus.modeling.model_config import FeatureType, EncodingType, Feature, Model, LogType -from twitter.hmli.nimbus.modeling.feature_loader import BigQueryFeatureLoader -from twitter.cuad.representation.models.text_encoder import TextEncoder -from twitter.cuad.representation.models.optimization import create_optimizer -from twitter.hmli.nimbus.modeling.feature_encoder import FeatureEncoder - import numpy as np import pandas as pd import utils +from twitter.cuad.representation.models.optimization import create_optimizer +from twitter.cuad.representation.models.text_encoder import TextEncoder +from twitter.hmli.nimbus.modeling.feature_encoder import FeatureEncoder +from twitter.hmli.nimbus.modeling.feature_loader import BigQueryFeatureLoader +from twitter.hmli.nimbus.modeling.model_config import ( + EncodingType, + Feature, + FeatureType, + LogType, + Model, +) -cat_names = [ -... -] +cat_names = [...] -category_features = [Feature(name=cat_name, ftype=FeatureType.CONTINUOUS) for cat_name in cat_names] +category_features = [ + Feature(name=cat_name, ftype=FeatureType.CONTINUOUS) for cat_name in cat_names +] features = [ - Feature(name="tweet_text_with_media_annotations", ftype=FeatureType.STRING, encoding=EncodingType.BERT), - Feature(name="precision_nsfw", ftype=FeatureType.CONTINUOUS), - Feature(name="has_media", ftype=FeatureType.BINARY), - Feature(name="num_media", ftype=FeatureType.DISCRETE) + Feature( + name="tweet_text_with_media_annotations", + ftype=FeatureType.STRING, + encoding=EncodingType.BERT, + ), + Feature(name="precision_nsfw", ftype=FeatureType.CONTINUOUS), + Feature(name="has_media", ftype=FeatureType.BINARY), + Feature(name="num_media", ftype=FeatureType.DISCRETE), ] + category_features ptos_prototype = Model( - name='ptos_prototype', - export_path="...", - features=features, + name="ptos_prototype", + export_path="...", + features=features, ) print(ptos_prototype) cq_loader = BigQueryFeatureLoader(gcp_project=COMPUTE_PROJECT) labels = [ - "has_non_punitive_action", - "has_punitive_action", - "has_punitive_action_contains_self_harm", - "has_punitive_action_encourage_self_harm", - "has_punitive_action_episodic", - "has_punitive_action_episodic_hateful_conduct", - "has_punitive_action_other_abuse_policy", - "has_punitive_action_without_self_harm" + "has_non_punitive_action", + "has_punitive_action", + "has_punitive_action_contains_self_harm", + "has_punitive_action_encourage_self_harm", + "has_punitive_action_episodic", + "has_punitive_action_episodic_hateful_conduct", + "has_punitive_action_other_abuse_policy", + "has_punitive_action_without_self_harm", ] train_query = f""" @@ -64,112 +73,128 @@ 
print(train.describe(model=ptos_prototype)) params = { - 'max_seq_lengths': 128, - 'batch_size': 196, - 'lr': 1e-5, - 'optimizer_type': 'adamw', - 'warmup_steps': 0, - 'cls_dropout_rate': 0.1, - 'epochs': 30, - 'steps_per_epoch': 5000, - 'model_type': 'twitter_multilingual_bert_base_cased_mlm', - 'mixed_precision': True, + "max_seq_lengths": 128, + "batch_size": 196, + "lr": 1e-5, + "optimizer_type": "adamw", + "warmup_steps": 0, + "cls_dropout_rate": 0.1, + "epochs": 30, + "steps_per_epoch": 5000, + "model_type": "twitter_multilingual_bert_base_cased_mlm", + "mixed_precision": True, } params + def parse_labeled_data(row_dict): - label = [row_dict.pop(l) for l in labels] - return row_dict, label + label = [row_dict.pop(l) for l in labels] + return row_dict, label -mirrored_strategy = tf.distribute.MirroredStrategy() -BATCH_SIZE = params['batch_size'] * mirrored_strategy.num_replicas_in_sync -train_ds = train.to_tf_dataset().map(parse_labeled_data).shuffle(BATCH_SIZE*100).batch(BATCH_SIZE).repeat() +mirrored_strategy = tf.distribute.MirroredStrategy() +BATCH_SIZE = params["batch_size"] * mirrored_strategy.num_replicas_in_sync + +train_ds = ( + train.to_tf_dataset() + .map(parse_labeled_data) + .shuffle(BATCH_SIZE * 100) + .batch(BATCH_SIZE) + .repeat() +) val_ds = val.to_tf_dataset().map(parse_labeled_data).batch(BATCH_SIZE) for record in train_ds: - tf.print(record) - break + tf.print(record) + break + def get_positive_weights(): - """Computes positive weights used for class imbalance from training data.""" - label_weights_df = utils.get_label_weights( - "tos-data-media-full", - project_id="twttr-abusive-interact-prod", - dataset_id="tos_policy" - ) - pos_weight_tensor = tf.cast( - label_weights_df.sort_values(by='label').positive_class_weight, - dtype=tf.float32 - ) - return pos_weight_tensor + """Computes positive weights used for class imbalance from training data.""" + label_weights_df = utils.get_label_weights( + "tos-data-media-full", + project_id="twttr-abusive-interact-prod", + dataset_id="tos_policy", + ) + pos_weight_tensor = tf.cast( + label_weights_df.sort_values(by="label").positive_class_weight, dtype=tf.float32 + ) + return pos_weight_tensor + pos_weight_tensor = get_positive_weights() print(pos_weight_tensor) + class TextEncoderPooledOutput(TextEncoder): - def call(self, x): - return super().call([x])["pooled_output"] + def call(self, x): + return super().call([x])["pooled_output"] + + def get_config(self): + return super().get_config() - def get_config(self): - return super().get_config() with mirrored_strategy.scope(): - text_encoder_pooled_output = TextEncoderPooledOutput( - params['max_seq_lengths'], - model_type=params['model_type'], - trainable=True - ) - - fe = FeatureEncoder(train) - inputs, preprocessing_head = fe.build_model_head(model=ptos_prototype, text_encoder=text_encoder_pooled_output) - - cls_dropout = tf.keras.layers.Dropout(params['cls_dropout_rate'], name="cls_dropout") - outputs = cls_dropout(preprocessing_head) - outputs = tf.keras.layers.Dense(8, name="output", dtype="float32")(outputs) - - model = tf.keras.Model( - inputs=inputs, - outputs=outputs - ) - pr_auc = tf.keras.metrics.AUC(curve="PR", num_thresholds=1000, multi_label=True, from_logits=True) - - custom_loss = lambda y_true, y_pred: utils.multilabel_weighted_loss(y_true, y_pred, weights=pos_weight_tensor) - optimizer = create_optimizer( - init_lr=params["lr"], - num_train_steps=(params["epochs"] * params["steps_per_epoch"]), - num_warmup_steps=params["warmup_steps"], - 
optimizer_type=params["optimizer_type"], - ) - if params.get("mixed_precision"): - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) - - model.compile( - optimizer=optimizer, - loss=custom_loss, - metrics=[pr_auc] - ) + text_encoder_pooled_output = TextEncoderPooledOutput( + params["max_seq_lengths"], model_type=params["model_type"], trainable=True + ) + + fe = FeatureEncoder(train) + inputs, preprocessing_head = fe.build_model_head( + model=ptos_prototype, text_encoder=text_encoder_pooled_output + ) + + cls_dropout = tf.keras.layers.Dropout( + params["cls_dropout_rate"], name="cls_dropout" + ) + outputs = cls_dropout(preprocessing_head) + outputs = tf.keras.layers.Dense(8, name="output", dtype="float32")(outputs) + + model = tf.keras.Model(inputs=inputs, outputs=outputs) + pr_auc = tf.keras.metrics.AUC( + curve="PR", num_thresholds=1000, multi_label=True, from_logits=True + ) + + custom_loss = lambda y_true, y_pred: utils.multilabel_weighted_loss( + y_true, y_pred, weights=pos_weight_tensor + ) + optimizer = create_optimizer( + init_lr=params["lr"], + num_train_steps=(params["epochs"] * params["steps_per_epoch"]), + num_warmup_steps=params["warmup_steps"], + optimizer_type=params["optimizer_type"], + ) + if params.get("mixed_precision"): + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimizer + ) + + model.compile(optimizer=optimizer, loss=custom_loss, metrics=[pr_auc]) model.weights model.summary() pr_auc.name import getpass + import wandb from wandb.keras import WandbCallback + try: - wandb_key = ... - wandb.login(...) - run = wandb.init(project='ptos_with_media', - group='new-split-trains', - notes='tweet text with only (num_media, precision_nsfw). on full train set, new split.', - entity='absv', - config=params, - name='tweet-text-w-nsfw-1.1', - sync_tensorboard=True) + wandb_key = ... + wandb.login(...) + run = wandb.init( + project="ptos_with_media", + group="new-split-trains", + notes="tweet text with only (num_media, precision_nsfw). on full train set, new split.", + entity="absv", + config=params, + name="tweet-text-w-nsfw-1.1", + sync_tensorboard=True, + ) except FileNotFoundError: - print('Wandb key not found') - run = wandb.init(mode='disabled') + print("Wandb key not found") + run = wandb.init(mode="disabled") import datetime import os @@ -179,27 +204,34 @@ def get_config(self): print("Saving model checkpoints here: ", checkpoint_path) cp_callback = tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(checkpoint_path, "model.{epoch:04d}.tf"), - verbose=1, - monitor=f'val_{pr_auc.name}', - mode='max', - save_freq='epoch', - save_best_only=True + filepath=os.path.join(checkpoint_path, "model.{epoch:04d}.tf"), + verbose=1, + monitor=f"val_{pr_auc.name}", + mode="max", + save_freq="epoch", + save_best_only=True, ) -early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=7, - monitor=f"val_{pr_auc.name}", - mode="max") +early_stopping_callback = tf.keras.callbacks.EarlyStopping( + patience=7, monitor=f"val_{pr_auc.name}", mode="max" +) -model.fit(train_ds, epochs=params["epochs"], validation_data=val_ds, callbacks=[cp_callback, early_stopping_callback], - steps_per_epoch=params["steps_per_epoch"], - verbose=2) +model.fit( + train_ds, + epochs=params["epochs"], + validation_data=val_ds, + callbacks=[cp_callback, early_stopping_callback], + steps_per_epoch=params["steps_per_epoch"], + verbose=2, +) import tensorflow_hub as hub gs_model_path = ... 
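The custom_loss above delegates to utils.multilabel_weighted_loss, an internal helper whose source is not part of this diff. A minimal sketch of one plausible implementation, assuming it applies the per-label positive-class weights to sigmoid cross-entropy on raw logits; the real helper may differ:

import tensorflow as tf

def multilabel_weighted_loss_sketch(y_true, y_pred, weights):
    # Hypothetical stand-in for utils.multilabel_weighted_loss. weights is a
    # [num_labels] tensor of positive-class weights such as pos_weight_tensor
    # above; y_pred holds raw logits, consistent with the from_logits=True
    # setting of the PR-AUC metric.
    per_label = tf.nn.weighted_cross_entropy_with_logits(
        labels=tf.cast(y_true, tf.float32), logits=y_pred, pos_weight=weights
    )
    return tf.reduce_mean(per_label)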
reloaded_keras_layer = hub.KerasLayer(gs_model_path) -inputs = tf.keras.layers.Input(name="tweet__core__tweet__text", shape=(1,), dtype=tf.string) +inputs = tf.keras.layers.Input( + name="tweet__core__tweet__text", shape=(1,), dtype=tf.string +) output = reloaded_keras_layer(inputs) v7_model = tf.keras.models.Model(inputs=inputs, outputs=output) pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc") @@ -210,7 +242,7 @@ def get_config(self): candidate_model = model with mirrored_strategy.scope(): - candidate_eval = candidate_model.evaluate(val_ds) + candidate_eval = candidate_model.evaluate(val_ds) test_query = f""" SELECT @@ -229,48 +261,64 @@ def get_config(self): test_only_media = test.filter(lambda x, y: tf.equal(x["has_media"], True)) test_only_nsfw = test.filter(lambda x, y: tf.greater_equal(x["precision_nsfw"], 0.95)) test_no_media = test.filter(lambda x, y: tf.equal(x["has_media"], False)) -test_media_not_nsfw = test.filter(lambda x, y: tf.logical_and(tf.equal(x["has_media"], True), tf.less(x["precision_nsfw"], 0.95))) +test_media_not_nsfw = test.filter( + lambda x, y: tf.logical_and( + tf.equal(x["has_media"], True), tf.less(x["precision_nsfw"], 0.95) + ) +) for d in [test, test_only_media, test_only_nsfw, test_no_media, test_media_not_nsfw]: - print(d.reduce(0, lambda x, _: x + 1).numpy()) + print(d.reduce(0, lambda x, _: x + 1).numpy()) -from notebook_eval_utils import SparseMultilabelEvaluator, EvalConfig from dataclasses import asdict +from notebook_eval_utils import EvalConfig, SparseMultilabelEvaluator + + def display_metrics(probs, targets, labels=labels): - eval_config = EvalConfig(prediction_threshold=0.5, precision_k=0.9) - for eval_mode, y_mask in [("implicit", np.ones(targets.shape))]: - print("Evaluation mode", eval_mode) - metrics = SparseMultilabelEvaluator.evaluate( - targets, np.array(probs), y_mask, classes=labels, eval_config=eval_config - ) - metrics_df = pd.DataFrame.from_dict(asdict(metrics)["per_topic_metrics"]).transpose() - metrics_df["pos_to_neg"] = metrics_df["num_pos_samples"] / (metrics_df["num_neg_samples"] + 1) - display(metrics_df.median()) - display(metrics_df) - return metrics_df + eval_config = EvalConfig(prediction_threshold=0.5, precision_k=0.9) + for eval_mode, y_mask in [("implicit", np.ones(targets.shape))]: + print("Evaluation mode", eval_mode) + metrics = SparseMultilabelEvaluator.evaluate( + targets, np.array(probs), y_mask, classes=labels, eval_config=eval_config + ) + metrics_df = pd.DataFrame.from_dict( + asdict(metrics)["per_topic_metrics"] + ).transpose() + metrics_df["pos_to_neg"] = metrics_df["num_pos_samples"] / ( + metrics_df["num_neg_samples"] + 1 + ) + display(metrics_df.median()) + display(metrics_df) + return metrics_df def eval_model(model, df): - with mirrored_strategy.scope(): - targets = np.stack(list(df.map(lambda x, y: y).as_numpy_iterator()), axis=0) - df = df.padded_batch(BATCH_SIZE) - preds = model.predict(df) - return display_metrics(preds, targets) - -subsets = {"test": test, - "test_only_media": test_only_media, - "test_only_nsfw": test_only_nsfw, - "test_no_media": test_no_media, - "test_media_not_nsfw": test_media_not_nsfw} + with mirrored_strategy.scope(): + targets = np.stack(list(df.map(lambda x, y: y).as_numpy_iterator()), axis=0) + df = df.padded_batch(BATCH_SIZE) + preds = model.predict(df) + return display_metrics(preds, targets) + + +subsets = { + "test": test, + "test_only_media": test_only_media, + "test_only_nsfw": test_only_nsfw, + "test_no_media": test_no_media, + "test_media_not_nsfw": 
test_media_not_nsfw, +} metrics = {} for name, df in subsets.items(): - metrics[name] = eval_model(candidate_model, df) + metrics[name] = eval_model(candidate_model, df) [(name, m.pr_auc) for name, m in metrics.items()] -for name, x in [(name, m.pr_auc.to_string(index=False).strip().split("\n")) for name, m in metrics.items()]: - print(name) - for y in x: - print(y.strip(), end="\t") - print(".") +for name, x in [ + (name, m.pr_auc.to_string(index=False).strip().split("\n")) + for name, m in metrics.items() +]: + print(name) + for y in x: + print(y.strip(), end="\t") + print(".") for d in [test, test_only_media, test_only_nsfw, test_no_media, test_media_not_nsfw]: - print(d.reduce(0, lambda x, _: x + 1).numpy()) \ No newline at end of file + print(d.reduce(0, lambda x, _: x + 1).numpy()) diff --git a/trust_and_safety_models/nsfw/nsfw_media.py b/trust_and_safety_models/nsfw/nsfw_media.py index b5dfebb65..4975b4b32 100644 --- a/trust_and_safety_models/nsfw/nsfw_media.py +++ b/trust_and_safety_models/nsfw/nsfw_media.py @@ -1,51 +1,55 @@ -import kerastuner as kt +import glob import math +import os +import random + +import kerastuner as kt import numpy as np import pandas as pd -import random import sklearn.metrics import tensorflow as tf -import os -import glob - -from tqdm import tqdm +from google.cloud import storage from matplotlib import pyplot as plt -from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense -from google.cloud import storage +from tensorflow.keras.models import Sequential +from tqdm import tqdm -physical_devices = tf.config.list_physical_devices('GPU') +physical_devices = tf.config.list_physical_devices("GPU") physical_devices -tf.config.set_visible_devices([tf.config.PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')], 'GPU') -tf.config.get_visible_devices('GPU') +tf.config.set_visible_devices( + [tf.config.PhysicalDevice(name="/physical_device:GPU:1", device_type="GPU")], "GPU" +) +tf.config.get_visible_devices("GPU") + def decode_fn_embedding(example_proto): - - feature_description = { - "embedding": tf.io.FixedLenFeature([256], dtype=tf.float32), - "labels": tf.io.FixedLenFeature([], dtype=tf.int64), - } - - example = tf.io.parse_single_example( - example_proto, - feature_description - ) - - return example - -def preprocess_embedding_example(example_dict, positive_label=1, features_as_dict=False): - labels = example_dict["labels"] - label = tf.math.reduce_any(labels == positive_label) - label = tf.cast(label, tf.int32) - embedding = example_dict["embedding"] - - if features_as_dict: - features = {"embedding": embedding} - else: - features = embedding - - return features, label + feature_description = { + "embedding": tf.io.FixedLenFeature([256], dtype=tf.float32), + "labels": tf.io.FixedLenFeature([], dtype=tf.int64), + } + + example = tf.io.parse_single_example(example_proto, feature_description) + + return example + + +def preprocess_embedding_example( + example_dict, positive_label=1, features_as_dict=False +): + labels = example_dict["labels"] + label = tf.math.reduce_any(labels == positive_label) + label = tf.cast(label, tf.int32) + embedding = example_dict["embedding"] + + if features_as_dict: + features = {"embedding": embedding} + else: + features = embedding + + return features, label + + input_root = ... sens_prev_input_root = ... 
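The hunk below wires up an optional rejection-resampling step behind do_resample. One subtlety worth calling out: the transform emits (class, example) pairs, which is why the pipeline strips the class key again with .map(lambda _, b: (b)). A self-contained sketch on synthetic data:

import tensorflow as tf

# Roughly 10% positives, rebalanced toward 50/50 by dropping examples from
# the over-represented class (data here is synthetic).
ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([1000, 4]),
     tf.cast(tf.random.uniform([1000]) < 0.1, tf.int32))
)
resample = tf.data.experimental.rejection_resample(
    class_func=lambda features, label: label,
    target_dist=[0.5, 0.5],
    seed=0,
)
# apply() yields (class, (features, label)); drop the class key again.
balanced = ds.apply(resample).map(lambda _, example: example)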
@@ -58,161 +62,189 @@ def preprocess_embedding_example(example_dict, positive_label=1, features_as_dic validation_batch_size = 256 do_resample = False + + def class_func(features, label): - return label + return label + resample_fn = tf.data.experimental.rejection_resample( - class_func, target_dist = [0.5, 0.5], seed=0 + class_func, target_dist=[0.5, 0.5], seed=0 ) train_glob = f"{input_root}/train/tfrecord/*.tfrecord" train_files = tf.io.gfile.glob(train_glob) if use_sens_prev_data: - train_sens_prev_glob = f"{sens_prev_input_root}/train/tfrecord/*.tfrecord" - train_sens_prev_files = tf.io.gfile.glob(train_sens_prev_glob) - train_files = train_files + train_sens_prev_files - + train_sens_prev_glob = f"{sens_prev_input_root}/train/tfrecord/*.tfrecord" + train_sens_prev_files = tf.io.gfile.glob(train_sens_prev_glob) + train_files = train_files + train_sens_prev_files + random.shuffle(train_files) if not len(train_files): - raise ValueError(f"Did not find any train files matching {train_glob}") + raise ValueError(f"Did not find any train files matching {train_glob}") test_glob = f"{input_root}/test/tfrecord/*.tfrecord" -test_files = tf.io.gfile.glob(test_glob) +test_files = tf.io.gfile.glob(test_glob) if not len(test_files): - raise ValueError(f"Did not find any eval files matching {test_glob}") - + raise ValueError(f"Did not find any eval files matching {test_glob}") + test_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) -test_ds = test_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=test_batch_size) - +test_ds = test_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) +).batch(batch_size=test_batch_size) + if use_sens_prev_data: - test_sens_prev_glob = f"{sens_prev_input_root}/test/tfrecord/*.tfrecord" - test_sens_prev_files = tf.io.gfile.glob(test_sens_prev_glob) - - if not len(test_sens_prev_files): - raise ValueError(f"Did not find any eval files matching {test_sens_prev_glob}") - - test_sens_prev_ds = tf.data.TFRecordDataset(test_sens_prev_files).map(decode_fn_embedding) - test_sens_prev_ds = test_sens_prev_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=test_batch_size) + test_sens_prev_glob = f"{sens_prev_input_root}/test/tfrecord/*.tfrecord" + test_sens_prev_files = tf.io.gfile.glob(test_sens_prev_glob) + + if not len(test_sens_prev_files): + raise ValueError(f"Did not find any eval files matching {test_sens_prev_glob}") + + test_sens_prev_ds = tf.data.TFRecordDataset(test_sens_prev_files).map( + decode_fn_embedding + ) + test_sens_prev_ds = test_sens_prev_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=test_batch_size) train_ds = tf.data.TFRecordDataset(train_files).map(decode_fn_embedding) -train_ds = train_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)) +train_ds = train_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) +) if do_resample: - train_ds = train_ds.apply(resample_fn).map(lambda _,b:(b)) + train_ds = train_ds.apply(resample_fn).map(lambda _, b: (b)) train_ds = train_ds.batch(batch_size=256).shuffle(buffer_size=10) train_ds = train_ds.repeat() - - -if has_validation_data: - eval_glob = f"{input_root}/validation/tfrecord/*.tfrecord" - eval_files = tf.io.gfile.glob(eval_glob) - - if use_sens_prev_data: - eval_sens_prev_glob = f"{sens_prev_input_root}/validation/tfrecord/*.tfrecord" - eval_sens_prev_files 
= tf.io.gfile.glob(eval_sens_prev_glob) - eval_files = eval_files + eval_sens_prev_files - - - if not len(eval_files): - raise ValueError(f"Did not find any eval files matching {eval_glob}") - - eval_ds = tf.data.TFRecordDataset(eval_files).map(decode_fn_embedding) - eval_ds = eval_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=validation_batch_size) + + +if has_validation_data: + eval_glob = f"{input_root}/validation/tfrecord/*.tfrecord" + eval_files = tf.io.gfile.glob(eval_glob) + + if use_sens_prev_data: + eval_sens_prev_glob = f"{sens_prev_input_root}/validation/tfrecord/*.tfrecord" + eval_sens_prev_files = tf.io.gfile.glob(eval_sens_prev_glob) + eval_files = eval_files + eval_sens_prev_files + + if not len(eval_files): + raise ValueError(f"Did not find any eval files matching {eval_glob}") + + eval_ds = tf.data.TFRecordDataset(eval_files).map(decode_fn_embedding) + eval_ds = eval_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=validation_batch_size) else: - - eval_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) - eval_ds = eval_ds.map(lambda x: preprocess_embedding_example(x, positive_label=positive_label)).batch(batch_size=validation_batch_size) + eval_ds = tf.data.TFRecordDataset(test_files).map(decode_fn_embedding) + eval_ds = eval_ds.map( + lambda x: preprocess_embedding_example(x, positive_label=positive_label) + ).batch(batch_size=validation_batch_size) check_ds = tf.data.TFRecordDataset(train_files).map(decode_fn_embedding) cnt = 0 pos_cnt = 0 for example in tqdm(check_ds): - label = example['labels'] - if label == 1: - pos_cnt += 1 - cnt += 1 -print(f'{cnt} train entries with {pos_cnt} positive') + label = example["labels"] + if label == 1: + pos_cnt += 1 + cnt += 1 +print(f"{cnt} train entries with {pos_cnt} positive") metrics = [] metrics.append( - tf.keras.metrics.PrecisionAtRecall( - recall=0.9, num_thresholds=200, class_id=None, name=None, dtype=None - ) + tf.keras.metrics.PrecisionAtRecall( + recall=0.9, num_thresholds=200, class_id=None, name=None, dtype=None + ) ) metrics.append( - tf.keras.metrics.AUC( - num_thresholds=200, - curve="PR", - ) + tf.keras.metrics.AUC( + num_thresholds=200, + curve="PR", + ) ) + + def build_model(hp): - model = Sequential() + model = Sequential() + + optimizer = tf.keras.optimizers.Adam( + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-08, + amsgrad=False, + name="Adam", + ) + + activation = hp.Choice("activation", ["tanh", "gelu"]) + kernel_initializer = hp.Choice( + "kernel_initializer", ["he_uniform", "glorot_uniform"] + ) + for i in range(hp.Int("num_layers", 1, 2)): + model.add(tf.keras.layers.BatchNormalization()) + + units = hp.Int("units", min_value=128, max_value=256, step=128) + + if i == 0: + model.add( + Dense( + units=units, + activation=activation, + kernel_initializer=kernel_initializer, + input_shape=(None, 256), + ) + ) + else: + model.add( + Dense( + units=units, + activation=activation, + kernel_initializer=kernel_initializer, + ) + ) + + model.add(Dense(1, activation="sigmoid", kernel_initializer=kernel_initializer)) + model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics) + + return model - optimizer = tf.keras.optimizers.Adam( - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-08, - amsgrad=False, - name="Adam", - ) - - activation=hp.Choice("activation", ["tanh", "gelu"]) - kernel_initializer=hp.Choice("kernel_initializer", ["he_uniform", 
"glorot_uniform"]) - for i in range(hp.Int("num_layers", 1, 2)): - model.add(tf.keras.layers.BatchNormalization()) - - units=hp.Int("units", min_value=128, max_value=256, step=128) - - if i == 0: - model.add( - Dense( - units=units, - activation=activation, - kernel_initializer=kernel_initializer, - input_shape=(None, 256) - ) - ) - else: - model.add( - Dense( - units=units, - activation=activation, - kernel_initializer=kernel_initializer, - ) - ) - - model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer)) - model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics) - - return model tuner = kt.tuners.BayesianOptimization( - build_model, - objective=kt.Objective('val_loss', direction="min"), - max_trials=30, - directory='tuner_dir', - project_name='with_twitter_clip') + build_model, + objective=kt.Objective("val_loss", direction="min"), + max_trials=30, + directory="tuner_dir", + project_name="with_twitter_clip", +) -callbacks = [tf.keras.callbacks.EarlyStopping( - monitor='val_loss', min_delta=0, patience=5, verbose=0, - mode='auto', baseline=None, restore_best_weights=True -)] +callbacks = [ + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", + min_delta=0, + patience=5, + verbose=0, + mode="auto", + baseline=None, + restore_best_weights=True, + ) +] steps_per_epoch = 400 -tuner.search(train_ds, - epochs=100, - batch_size=256, - steps_per_epoch=steps_per_epoch, - verbose=2, - validation_data=eval_ds, - callbacks=callbacks) +tuner.search( + train_ds, + epochs=100, + batch_size=256, + steps_per_epoch=steps_per_epoch, + verbose=2, + validation_data=eval_ds, + callbacks=callbacks, +) tuner.results_summary() models = tuner.get_best_models(num_models=2) @@ -230,109 +262,126 @@ def build_model(hp): epsilon=1e-08, amsgrad=False, name="Adam", - ) -best_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics) +) +best_model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics) best_model.summary() -callbacks = [tf.keras.callbacks.EarlyStopping( - monitor='val_loss', min_delta=0, patience=10, verbose=0, - mode='auto', baseline=None, restore_best_weights=True -)] -history = best_model.fit(train_ds, epochs=100, validation_data=eval_ds, steps_per_epoch=steps_per_epoch, callbacks=callbacks) +callbacks = [ + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", + min_delta=0, + patience=10, + verbose=0, + mode="auto", + baseline=None, + restore_best_weights=True, + ) +] +history = best_model.fit( + train_ds, + epochs=100, + validation_data=eval_ds, + steps_per_epoch=steps_per_epoch, + callbacks=callbacks, +) -model_name = 'twitter_hypertuned' -model_path = f'models/nsfw_Keras_with_CLIP_{model_name}' +model_name = "twitter_hypertuned" +model_path = f"models/nsfw_Keras_with_CLIP_{model_name}" tf.keras.models.save_model(best_model, model_path) + def copy_local_directory_to_gcs(local_path, bucket, gcs_path): """Recursively copy a directory of files to GCS. local_path should be a directory and not have a trailing slash. 
""" assert os.path.isdir(local_path) - for local_file in glob.glob(local_path + '/**'): + for local_file in glob.glob(local_path + "/**"): if not os.path.isfile(local_file): dir_name = os.path.basename(os.path.normpath(local_file)) copy_local_directory_to_gcs(local_file, bucket, f"{gcs_path}/{dir_name}") else: - remote_path = os.path.join(gcs_path, local_file[1 + len(local_path) :]) - blob = bucket.blob(remote_path) - blob.upload_from_filename(local_file) + remote_path = os.path.join(gcs_path, local_file[1 + len(local_path) :]) + blob = bucket.blob(remote_path) + blob.upload_from_filename(local_file) + client = storage.Client(project=...) bucket = client.get_bucket(...) copy_local_directory_to_gcs(model_path, bucket, model_path) -copy_local_directory_to_gcs('tuner_dir', bucket, 'tuner_dir') +copy_local_directory_to_gcs("tuner_dir", bucket, "tuner_dir") loaded_model = tf.keras.models.load_model(model_path) print(history.history.keys()) -plt.figure(figsize = (20, 5)) +plt.figure(figsize=(20, 5)) plt.subplot(1, 3, 1) -plt.plot(history.history['auc']) -plt.plot(history.history['val_auc']) -plt.title('model auc') -plt.ylabel('auc') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["auc"]) +plt.plot(history.history["val_auc"]) +plt.title("model auc") +plt.ylabel("auc") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") plt.subplot(1, 3, 2) -plt.plot(history.history['loss']) -plt.plot(history.history['val_loss']) -plt.title('model loss') -plt.ylabel('loss') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["loss"]) +plt.plot(history.history["val_loss"]) +plt.title("model loss") +plt.ylabel("loss") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") plt.subplot(1, 3, 3) -plt.plot(history.history['precision_at_recall']) -plt.plot(history.history['val_precision_at_recall']) -plt.title('model precision at 0.9 recall') -plt.ylabel('precision_at_recall') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') +plt.plot(history.history["precision_at_recall"]) +plt.plot(history.history["val_precision_at_recall"]) +plt.title("model precision at 0.9 recall") +plt.ylabel("precision_at_recall") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") -plt.savefig('history_with_twitter_clip.pdf') +plt.savefig("history_with_twitter_clip.pdf") test_labels = [] test_preds = [] for batch_features, batch_labels in tqdm(test_ds): - test_preds.extend(loaded_model.predict_proba(batch_features)) - test_labels.extend(batch_labels.numpy()) - + test_preds.extend(loaded_model.predict_proba(batch_features)) + test_labels.extend(batch_labels.numpy()) + test_sens_prev_labels = [] test_sens_prev_preds = [] for batch_features, batch_labels in tqdm(test_sens_prev_ds): - test_sens_prev_preds.extend(loaded_model.predict_proba(batch_features)) - test_sens_prev_labels.extend(batch_labels.numpy()) - + test_sens_prev_preds.extend(loaded_model.predict_proba(batch_features)) + test_sens_prev_labels.extend(batch_labels.numpy()) + n_test_pos = 0 n_test_neg = 0 n_test = 0 for label in test_labels: - n_test +=1 - if label == 1: - n_test_pos +=1 - else: - n_test_neg +=1 + n_test += 1 + if label == 1: + n_test_pos += 1 + else: + n_test_neg += 1 -print(f'n_test = {n_test}, n_pos = {n_test_pos}, n_neg = {n_test_neg}') +print(f"n_test = {n_test}, n_pos = {n_test_pos}, n_neg = {n_test_neg}") n_test_sens_prev_pos = 0 n_test_sens_prev_neg = 0 n_test_sens_prev = 0 for label in 
test_sens_prev_labels: - n_test_sens_prev +=1 - if label == 1: - n_test_sens_prev_pos +=1 - else: - n_test_sens_prev_neg +=1 + n_test_sens_prev += 1 + if label == 1: + n_test_sens_prev_pos += 1 + else: + n_test_sens_prev_neg += 1 -print(f'n_test_sens_prev = {n_test_sens_prev}, n_pos_sens_prev = {n_test_sens_prev_pos}, n_neg = {n_test_sens_prev_neg}') +print( + f"n_test_sens_prev = {n_test_sens_prev}, n_pos_sens_prev = {n_test_sens_prev_pos}, n_neg = {n_test_sens_prev_neg}" +) test_weights = np.ones(np.asarray(test_preds).shape) @@ -340,9 +389,7 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): test_preds = np.asarray(test_preds) test_weights = np.asarray(test_weights) -pr = sklearn.metrics.precision_recall_curve( - test_labels, - test_preds) +pr = sklearn.metrics.precision_recall_curve(test_labels, test_preds) auc = sklearn.metrics.auc(pr[1], pr[0]) plt.plot(pr[1], pr[0]) @@ -355,25 +402,26 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): test_sens_prev_weights = np.asarray(test_sens_prev_weights) pr_sens_prev = sklearn.metrics.precision_recall_curve( - test_sens_prev_labels, - test_sens_prev_preds) + test_sens_prev_labels, test_sens_prev_preds +) auc_sens_prev = sklearn.metrics.auc(pr_sens_prev[1], pr_sens_prev[0]) plt.plot(pr_sens_prev[1], pr_sens_prev[0]) plt.title("nsfw (sens prev test set)") df = pd.DataFrame( - { - "label": test_labels.squeeze(), - "preds_keras": np.asarray(test_preds).flatten(), - }) + { + "label": test_labels.squeeze(), + "preds_keras": np.asarray(test_preds).flatten(), + } +) plt.figure(figsize=(15, 10)) df["preds_keras"].hist() plt.title("Keras predictions", size=20) -plt.xlabel('score') +plt.xlabel("score") plt.ylabel("freq") -plt.figure(figsize = (20, 5)) +plt.figure(figsize=(20, 5)) plt.subplot(1, 3, 1) plt.plot(pr[2], pr[0][0:-1]) @@ -393,15 +441,19 @@ def copy_local_directory_to_gcs(local_path, bucket, gcs_path): plt.xlabel("recall") plt.ylabel("precision") -plt.savefig('with_twitter_clip.pdf') +plt.savefig("with_twitter_clip.pdf") + def get_point_for_recall(recall_value, recall, precision): - idx = np.argmin(np.abs(recall - recall_value)) - return (recall[idx], precision[idx]) + idx = np.argmin(np.abs(recall - recall_value)) + return (recall[idx], precision[idx]) + def get_point_for_precision(precision_value, recall, precision): - idx = np.argmin(np.abs(precision - precision_value)) - return (recall[idx], precision[idx]) + idx = np.argmin(np.abs(precision - precision_value)) + return (recall[idx], precision[idx]) + + precision, recall, thresholds = pr auc_precision_recall = sklearn.metrics.auc(recall, precision) @@ -416,23 +468,23 @@ def get_point_for_precision(precision_value, recall, precision): ptAt50 = get_point_for_recall(0.5, recall, precision) print(ptAt50) -plt.plot( [ptAt50[0],ptAt50[0]], [0,ptAt50[1]], 'r') -plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], 'r') +plt.plot([ptAt50[0], ptAt50[0]], [0, ptAt50[1]], "r") +plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], "r") ptAt90 = get_point_for_recall(0.9, recall, precision) print(ptAt90) -plt.plot( [ptAt90[0],ptAt90[0]], [0,ptAt90[1]], 'b') -plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], 'b') +plt.plot([ptAt90[0], ptAt90[0]], [0, ptAt90[1]], "b") +plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], "b") ptAt50fmt = "%.4f" % ptAt50[1] ptAt90fmt = "%.4f" % ptAt90[1] aucFmt = "%.4f" % auc_precision_recall plt.title( - f"Keras (nsfw MU test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...}} ({...} pos), N_test={n_test} ({n_test_pos} pos)", - 
size=20 + f"Keras (nsfw MU test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test} ({n_test_pos} pos)", + size=20, ) plt.subplots_adjust(top=0.72) -plt.savefig('recall_precision_nsfw_Keras_with_twitter_CLIP_MU_test.pdf') +plt.savefig("recall_precision_nsfw_Keras_with_twitter_CLIP_MU_test.pdf") precision, recall, thresholds = pr_sens_prev @@ -447,20 +499,20 @@ def get_point_for_precision(precision_value, recall, precision): ptAt50 = get_point_for_recall(0.5, recall, precision) print(ptAt50) -plt.plot( [ptAt50[0],ptAt50[0]], [0,ptAt50[1]], 'r') -plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], 'r') +plt.plot([ptAt50[0], ptAt50[0]], [0, ptAt50[1]], "r") +plt.plot([0, ptAt50[0]], [ptAt50[1], ptAt50[1]], "r") ptAt90 = get_point_for_recall(0.9, recall, precision) print(ptAt90) -plt.plot( [ptAt90[0],ptAt90[0]], [0,ptAt90[1]], 'b') -plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], 'b') +plt.plot([ptAt90[0], ptAt90[0]], [0, ptAt90[1]], "b") +plt.plot([0, ptAt90[0]], [ptAt90[1], ptAt90[1]], "b") ptAt50fmt = "%.4f" % ptAt50[1] ptAt90fmt = "%.4f" % ptAt90[1] aucFmt = "%.4f" % auc_precision_recall plt.title( - f"Keras (nsfw sens prev test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test_sens_prev} ({n_test_sens_prev_pos} pos)", - size=20 + f"Keras (nsfw sens prev test)\nAUC={aucFmt}\np={ptAt50fmt} @ r=0.5\np={ptAt90fmt} @ r=0.9\nN_train={...} ({...} pos), N_test={n_test_sens_prev} ({n_test_sens_prev_pos} pos)", + size=20, ) plt.subplots_adjust(top=0.72) -plt.savefig('recall_precision_nsfw_Keras_with_twitter_CLIP_sens_prev_test.pdf') \ No newline at end of file +plt.savefig("recall_precision_nsfw_Keras_with_twitter_CLIP_sens_prev_test.pdf") diff --git a/trust_and_safety_models/nsfw/nsfw_text.py b/trust_and_safety_models/nsfw/nsfw_text.py index 980fc8fd4..0d7735371 100644 --- a/trust_and_safety_models/nsfw/nsfw_text.py +++ b/trust_and_safety_models/nsfw/nsfw_text.py @@ -1,41 +1,47 @@ +import os +import re from datetime import datetime from functools import reduce -import os + +import matplotlib.pyplot as plt import pandas as pd -import re -from sklearn.metrics import average_precision_score, classification_report, precision_recall_curve, PrecisionRecallDisplay -from sklearn.model_selection import train_test_split import tensorflow as tf -import matplotlib.pyplot as plt -import re - +from sklearn.metrics import ( + PrecisionRecallDisplay, + average_precision_score, + classification_report, + precision_recall_curve, +) +from sklearn.model_selection import train_test_split from twitter.cuad.representation.models.optimization import create_optimizer from twitter.cuad.representation.models.text_encoder import TextEncoder -pd.set_option('display.max_colwidth', None) -pd.set_option('display.expand_frame_repr', False) +pd.set_option("display.max_colwidth", None) +pd.set_option("display.expand_frame_repr", False) print(tf.__version__) print(tf.config.list_physical_devices()) -log_path = os.path.join('pnsfwtweettext_model_runs', datetime.now().strftime('%Y-%m-%d_%H.%M.%S')) +log_path = os.path.join( + "pnsfwtweettext_model_runs", datetime.now().strftime("%Y-%m-%d_%H.%M.%S") +) -tweet_text_feature = 'text' +tweet_text_feature = "text" params = { - 'batch_size': 32, - 'max_seq_lengths': 256, - 'model_type': 'twitter_bert_base_en_uncased_augmented_mlm', - 'trainable_text_encoder': True, - 'lr': 5e-5, - 'epochs': 10, + "batch_size": 32, + "max_seq_lengths": 256, + "model_type": 
"twitter_bert_base_en_uncased_augmented_mlm", + "trainable_text_encoder": True, + "lr": 5e-5, + "epochs": 10, } REGEX_PATTERNS = [ - r'^RT @[A-Za-z0-9_]+: ', + r"^RT @[A-Za-z0-9_]+: ", r"@[A-Za-z0-9_]+", - r'https:\/\/t\.co\/[A-Za-z0-9]{10}', - r'@\?\?\?\?\?', + r"https:\/\/t\.co\/[A-Za-z0-9]{10}", + r"@\?\?\?\?\?", ] EMOJI_PATTERN = re.compile( @@ -52,34 +58,40 @@ "\U0001FA70-\U0001FAFF" "\U00002702-\U000027B0" "])" - ) +) + def clean_tweet(text): for pattern in REGEX_PATTERNS: - text = re.sub(pattern, '', text) + text = re.sub(pattern, "", text) + + text = re.sub(EMOJI_PATTERN, r" \1 ", text) + + text = re.sub(r"\n", " ", text) - text = re.sub(EMOJI_PATTERN, r' \1 ', text) - - text = re.sub(r'\n', ' ', text) - return text.strip().lower() -df['processed_text'] = df['text'].astype(str).map(clean_tweet) +df["processed_text"] = df["text"].astype(str).map(clean_tweet) df.sample(10) -X_train, X_val, y_train, y_val = train_test_split(df[['processed_text']], df['is_nsfw'], test_size=0.1, random_state=1) +X_train, X_val, y_train, y_val = train_test_split( + df[["processed_text"]], df["is_nsfw"], test_size=0.1, random_state=1 +) + def df_to_ds(X, y, shuffle=False): - ds = tf.data.Dataset.from_tensor_slices(( - X.values, - tf.one_hot(tf.cast(y.values, tf.int32), depth=2, axis=-1) - )) - - if shuffle: - ds = ds.shuffle(1000, seed=1, reshuffle_each_iteration=True) - - return ds.map(lambda text, label: ({ tweet_text_feature: text }, label)).batch(params['batch_size']) + ds = tf.data.Dataset.from_tensor_slices( + (X.values, tf.one_hot(tf.cast(y.values, tf.int32), depth=2, axis=-1)) + ) + + if shuffle: + ds = ds.shuffle(1000, seed=1, reshuffle_each_iteration=True) + + return ds.map(lambda text, label: ({tweet_text_feature: text}, label)).batch( + params["batch_size"] + ) + ds_train = df_to_ds(X_train, y_train, shuffle=True) ds_val = df_to_ds(X_val, y_val) @@ -87,51 +99,47 @@ def df_to_ds(X, y, shuffle=False): inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name=tweet_text_feature) encoder = TextEncoder( - max_seq_lengths=params['max_seq_lengths'], - model_type=params['model_type'], - trainable=params['trainable_text_encoder'], - local_preprocessor_path='demo-preprocessor' + max_seq_lengths=params["max_seq_lengths"], + model_type=params["model_type"], + trainable=params["trainable_text_encoder"], + local_preprocessor_path="demo-preprocessor", ) embedding = encoder([inputs])["pooled_output"] -predictions = tf.keras.layers.Dense(2, activation='softmax')(embedding) +predictions = tf.keras.layers.Dense(2, activation="softmax")(embedding) model = tf.keras.models.Model(inputs=inputs, outputs=predictions) model.summary() optimizer = create_optimizer( - params['lr'], - params['epochs'] * len(ds_train), - 0, - weight_decay_rate=0.01, - optimizer_type='adamw' + params["lr"], + params["epochs"] * len(ds_train), + 0, + weight_decay_rate=0.01, + optimizer_type="adamw", ) bce = tf.keras.losses.BinaryCrossentropy(from_logits=False) -pr_auc = tf.keras.metrics.AUC(curve='PR', num_thresholds=1000, from_logits=False) +pr_auc = tf.keras.metrics.AUC(curve="PR", num_thresholds=1000, from_logits=False) model.compile(optimizer=optimizer, loss=bce, metrics=[pr_auc]) callbacks = [ - tf.keras.callbacks.EarlyStopping( - monitor='val_loss', - mode='min', - patience=1, - restore_best_weights=True - ), - tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(log_path, 'checkpoints', '{epoch:02d}'), - save_freq='epoch' - ), - tf.keras.callbacks.TensorBoard( - log_dir=os.path.join(log_path, 'scalars'), - 
update_freq='batch', - write_graph=False - ) + tf.keras.callbacks.EarlyStopping( + monitor="val_loss", mode="min", patience=1, restore_best_weights=True + ), + tf.keras.callbacks.ModelCheckpoint( + filepath=os.path.join(log_path, "checkpoints", "{epoch:02d}"), save_freq="epoch" + ), + tf.keras.callbacks.TensorBoard( + log_dir=os.path.join(log_path, "scalars"), + update_freq="batch", + write_graph=False, + ), ] history = model.fit( - ds_train, - epochs=params['epochs'], - callbacks=callbacks, - validation_data=ds_val, - steps_per_epoch=len(ds_train) + ds_train, + epochs=params["epochs"], + callbacks=callbacks, + validation_data=ds_val, + steps_per_epoch=len(ds_train), ) model.predict(["xxx 🍑"]) diff --git a/trust_and_safety_models/toxicity/data/data_preprocessing.py b/trust_and_safety_models/toxicity/data/data_preprocessing.py index f7da608f6..7d2ece32e 100644 --- a/trust_and_safety_models/toxicity/data/data_preprocessing.py +++ b/trust_and_safety_models/toxicity/data/data_preprocessing.py @@ -1,118 +1,130 @@ -from abc import ABC import re - -from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35 +from abc import ABC import numpy as np - +import pandas as pd +from toxicity_ml_pipeline.settings.hcomp_settings import TOXIC_35 TOXIC_35_set = set(TOXIC_35) -url_group = r"(\bhttps?:\/\/\S+)" -mention_group = r"(\B@\S+)" -urls_mentions_re = re.compile(url_group + r"|" + mention_group, re.IGNORECASE) -url_re = re.compile(url_group, re.IGNORECASE) -mention_re = re.compile(mention_group, re.IGNORECASE) -newline_re = re.compile(r"\n+", re.IGNORECASE) -and_re = re.compile(r"&\s?amp\s?;", re.IGNORECASE) +URL_GROUP = r"(\bhttps?:\/\/\S+)" +MENTION_GROUP = r"(\B@\S+)" +URLS_MENTIONS_RE = re.compile(URL_GROUP + r"|" + MENTION_GROUP, re.IGNORECASE) +URL_RE = re.compile(URL_GROUP, re.IGNORECASE) +MENTION_RE = re.compile(MENTION_GROUP, re.IGNORECASE) +NEWLINE_RE = re.compile(r"\n+", re.IGNORECASE) +AND_RE = re.compile(r"&\s?amp\s?;", re.IGNORECASE) class DataframeCleaner(ABC): - def __init__(self): - pass - - def _clean(self, df): - return df + def __init__(self): + pass - def _systematic_preprocessing(self, df): - df.reset_index(inplace=True, drop=True) - if "media_url" in df.columns: - print(".... removing tweets with media") - df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0) - else: - print("WARNING you are not removing tweets with media to train a BERT model.") + def _clean(self, df: pd.DataFrame) -> pd.DataFrame: + return df - print(".... deleting duplicates") - df.drop_duplicates("text", inplace=True, keep="last") - print(f"Got {df.shape[0]} after cleaning") - - return df.reset_index(inplace=False, drop=True) - - def _postprocess(self, df, *args, **kwargs): - return df + def _systematic_preprocessing(self, df: pd.DataFrame) -> pd.DataFrame: + df.reset_index(inplace=True, drop=True) + if "media_url" in df.columns: + print(".... removing tweets with media") + df.drop(df[~df.media_url.isna()].index, inplace=True, axis=0) + else: + print( + "WARNING you are not removing tweets with media to train a BERT model." + ) - def __call__(self, df, *args, **kwargs): - print(f"Got {df.shape[0]} before cleaning") + print(".... 
deleting duplicates") + df.drop_duplicates("text", inplace=True, keep="last") + print(f"Got {df.shape[0]} after cleaning") - df["raw_text"] = df.text - df = self._clean(df) + return df.reset_index(inplace=False, drop=True) - df = self._systematic_preprocessing(df) + def _postprocess(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame: + return df - return self._postprocess(df, *args, **kwargs) + def __call__(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame: + print(f"Got {df.shape[0]} before cleaning") + df["raw_text"] = df.text + df = self._clean(df) + df = self._systematic_preprocessing(df) + return self._postprocess(df, *args, **kwargs) -def mapping_func(el): - if el.aggregated_content in TOXIC_35_set: - return 2 - if el.label == 1: - return 1 - return 0 +def mapping_func(el: pd.Series) -> int: + if el.aggregated_content in TOXIC_35_set: + return 2 + if el.label == 1: + return 1 + return 0 class DefaultENNoPreprocessor(DataframeCleaner): - def _postprocess(self, df, *args, **kwargs): - if "toxic_count" in df.columns and "non_toxic_count" in df.columns: - df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count) - df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0) - - if "label_column" in kwargs and kwargs["label_column"] != "label": - if kwargs["label_column"] == "aggregated_content": - print("Replacing v3 label by v3.5 label.") - if "num_classes" in kwargs and kwargs["num_classes"] < 3: - df["label"] = np.where(df.aggregated_content.isin(TOXIC_35_set), 1, 0) - elif "num_classes" in kwargs and kwargs["num_classes"] == 3: - print("Making it a 3-class pb") - df["label"] = df.apply(mapping_func, axis=1) - else: - raise NotImplementedError - elif kwargs['label_column'] in df.columns: - df['label'] = df[kwargs['label_column']] - if kwargs['class_weight'] is not None: - df["class_weight"] = np.where(df['label'] == 1, 1-kwargs['class_weight'], - kwargs['class_weight']) - else: - raise NotImplementedError - - if "filter_low_agreements" in kwargs and kwargs["filter_low_agreements"] == True: - df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True) - raise NotImplementedError - - return df + def _postprocess(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame: + if "toxic_count" in df.columns and "non_toxic_count" in df.columns: + df["vote"] = df.toxic_count / (df.toxic_count + df.non_toxic_count) + df["agreement_rate"] = np.max((df.vote, 1 - df.vote), axis=0) + + if "label_column" in kwargs and kwargs["label_column"] != "label": + if kwargs["label_column"] == "aggregated_content": + print("Replacing v3 label by v3.5 label.") + if "num_classes" in kwargs and kwargs["num_classes"] < 3: + df["label"] = np.where( + df.aggregated_content.isin(TOXIC_35_set), 1, 0 + ) + elif "num_classes" in kwargs and kwargs["num_classes"] == 3: + print("Making it a 3-class pb") + df["label"] = df.apply(mapping_func, axis=1) + else: + raise NotImplementedError + elif kwargs["label_column"] in df.columns: + df["label"] = df[kwargs["label_column"]] + if kwargs["class_weight"] is not None: + df["class_weight"] = np.where( + df["label"] == 1, + 1 - kwargs["class_weight"], + kwargs["class_weight"], + ) + else: + raise NotImplementedError + + if ( + "filter_low_agreements" in kwargs + and kwargs["filter_low_agreements"] == True + ): + df.drop(df[(df.agreement_rate <= 0.6)].index, axis=0, inplace=True) + raise NotImplementedError + + return df class DefaultENPreprocessor(DefaultENNoPreprocessor): - def _clean(self, adhoc_df): - print( - ".... 
removing \\n and replacing @mentions and URLs by placeholders. " - "Emoji filtering is not done." - ) - adhoc_df["text"] = [url_re.sub("URL", tweet) for tweet in adhoc_df.raw_text.values] - adhoc_df["text"] = [mention_re.sub("MENTION", tweet) for tweet in adhoc_df.text.values] - adhoc_df["text"] = [ - newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values - ] - adhoc_df["text"] = [and_re.sub("&", tweet) for tweet in adhoc_df.text.values] - - return adhoc_df + def _clean(self, adhoc_df: pd.DataFrame) -> pd.DataFrame: + print( + "... removing \\n and replacing @mentions and URLs by placeholders. " + "Emoji filtering is not done." + ) + adhoc_df["text"] = [ + URL_RE.sub("URL", tweet) for tweet in adhoc_df.raw_text.values + ] + adhoc_df["text"] = [ + MENTION_RE.sub("MENTION", tweet) for tweet in adhoc_df.text.values + ] + adhoc_df["text"] = [ + NEWLINE_RE.sub(" ", tweet).lstrip(" ").rstrip(" ") + for tweet in adhoc_df.text.values + ] + adhoc_df["text"] = [AND_RE.sub("&", tweet) for tweet in adhoc_df.text.values] + return adhoc_df class Defaulti18nPreprocessor(DataframeCleaner): - def _clean(self, adhoc_df): - print(".... removing @mentions, \\n and URLs. Emoji filtering is not done.") - adhoc_df["text"] = [urls_mentions_re.sub("", tweet) for tweet in adhoc_df.raw_text.values] - adhoc_df["text"] = [ - newline_re.sub(" ", tweet).lstrip(" ").rstrip(" ") for tweet in adhoc_df.text.values - ] - - return adhoc_df + def _clean(self, adhoc_df): + print("... removing @mentions, \\n and URLs. Emoji filtering is not done.") + adhoc_df["text"] = [ + URLS_MENTIONS_RE.sub("", tweet) for tweet in adhoc_df.raw_text.values + ] + adhoc_df["text"] = [ + NEWLINE_RE.sub(" ", tweet).lstrip(" ").rstrip(" ") + for tweet in adhoc_df.text.values + ] + return adhoc_df diff --git a/trust_and_safety_models/toxicity/data/dataframe_loader.py b/trust_and_safety_models/toxicity/data/dataframe_loader.py index f3855d6b5..b9b1613ff 100644 --- a/trust_and_safety_models/toxicity/data/dataframe_loader.py +++ b/trust_and_safety_models/toxicity/data/dataframe_loader.py @@ -1,348 +1,359 @@ +import pickle from abc import ABC, abstractmethod from datetime import date from importlib import import_module -import pickle +from typing import Optional, Tuple +import numpy as np +import pandas as pd from toxicity_ml_pipeline.settings.default_settings_tox import ( - CLIENT, - EXISTING_TASK_VERSIONS, - GCS_ADDRESS, - TRAINING_DATA_LOCATION, -) + CLIENT, EXISTING_TASK_VERSIONS, GCS_ADDRESS, TRAINING_DATA_LOCATION) from toxicity_ml_pipeline.utils.helpers import execute_command, execute_query -from toxicity_ml_pipeline.utils.queries import ( - FULL_QUERY, - FULL_QUERY_W_TWEET_TYPES, - PARSER_UDF, - QUERY_SETTINGS, -) - -import numpy as np -import pandas +from toxicity_ml_pipeline.utils.queries import (FULL_QUERY, + FULL_QUERY_W_TWEET_TYPES, + PARSER_UDF, QUERY_SETTINGS) class DataframeLoader(ABC): + def __init__(self, project: str): + self.project = project - def __init__(self, project): - self.project = project - - @abstractmethod - def produce_query(self): - pass + @abstractmethod + def produce_query(self): + pass - @abstractmethod - def load_data(self, test=False): - pass + @abstractmethod + def load_data(self, test=False): + pass class ENLoader(DataframeLoader): - def __init__(self, project, setting_file): - super(ENLoader, self).__init__(project=project) - self.date_begin = setting_file.DATE_BEGIN - self.date_end = setting_file.DATE_END - TASK_VERSION = setting_file.TASK_VERSION - if TASK_VERSION not in 
EXISTING_TASK_VERSIONS: - raise ValueError - self.task_version = TASK_VERSION - self.query_settings = dict(QUERY_SETTINGS) - self.full_query = FULL_QUERY - - def produce_query(self, date_begin, date_end, task_version=None, **keys): - task_version = self.task_version if task_version is None else task_version - - if task_version in keys["table"]: - table_name = keys["table"][task_version] - print(f"Loading {table_name}") - - main_query = keys["main"].format( - table=table_name, - parser_udf=PARSER_UDF[task_version], - date_begin=date_begin, - date_end=date_end, - ) - - return self.full_query.format( - main_table_query=main_query, date_begin=date_begin, date_end=date_end - ) - return "" - - def _reload(self, test, file_keyword): - query = f"SELECT * from `{TRAINING_DATA_LOCATION.format(project=self.project)}_{file_keyword}`" - - if test: - query += " ORDER BY RAND() LIMIT 1000" - try: - df = execute_query(client=CLIENT, query=query) - except Exception: - print( - "Loading from BQ failed, trying to load from GCS. " - "NB: use this option only for intermediate files, which will be deleted at the end of " - "the project." - ) - copy_cmd = f"gsutil cp {GCS_ADDRESS.format(project=self.project)}/training_data/{file_keyword}.pkl ." - execute_command(copy_cmd) - try: - with open(f"{file_keyword}.pkl", "rb") as file: - df = pickle.load(file) - except Exception: - return None - - if test: - df = df.sample(frac=1) - return df.iloc[:1000] - - return df - - def load_data(self, test=False, **kwargs): - if "reload" in kwargs and kwargs["reload"]: - df = self._reload(test, kwargs["reload"]) - if df is not None and df.shape[0] > 0: + def __init__(self, project: str, setting_file): + super(ENLoader, self).__init__(project=project) + self.date_begin = setting_file.DATE_BEGIN + self.date_end = setting_file.DATE_END + TASK_VERSION = setting_file.TASK_VERSION + if TASK_VERSION not in EXISTING_TASK_VERSIONS: + raise ValueError + self.task_version = TASK_VERSION + self.query_settings = dict(QUERY_SETTINGS) + self.full_query = FULL_QUERY + + def produce_query( + self, date_begin: str, date_end: str, task_version: float = None, **keys + ) -> str: + task_version = self.task_version if task_version is None else task_version + + if task_version in keys["table"]: + table_name = keys["table"][task_version] + print(f"Loading {table_name}") + + main_query = keys["main"].format( + table=table_name, + parser_udf=PARSER_UDF[task_version], + date_begin=date_begin, + date_end=date_end, + ) + + return self.full_query.format( + main_table_query=main_query, date_begin=date_begin, date_end=date_end + ) + return "" + + def _reload(self, test: bool, file_keyword: str) -> pd.DataFrame: + query = f"SELECT * from `{TRAINING_DATA_LOCATION.format(project=self.project)}_{file_keyword}`" + if test: + query += " ORDER BY RAND() LIMIT 1000" + try: + df = execute_query(client=CLIENT, query=query) + except Exception: + print( + "Loading from BQ failed, trying to load from GCS. " + "NB: use this option only for intermediate files, which will be deleted at the end of " + "the project." + ) + copy_cmd = f"gsutil cp {GCS_ADDRESS.format(project=self.project)}/training_data/{file_keyword}.pkl ." 
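+            # fall back to the pickled intermediate file on GCS when the
+            # BigQuery read above fails; the copy is loaded locally below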
+ execute_command(copy_cmd) + try: + with open(f"{file_keyword}.pkl", "rb") as file: + df = pickle.load(file) + except Exception: + return None + + if test: + df = df.sample(frac=1) + return df.iloc[:1000] + return df + + def load_data(self, test: bool = False, **kwargs) -> Optional[pd.DataFrame]: + if "reload" in kwargs and kwargs["reload"]: + df = self._reload(test, kwargs["reload"]) + if df is not None and df.shape[0] > 0: + return df + + df = None + query_settings = self.query_settings + if test: + query_settings = {"fairness": self.query_settings["fairness"]} + query_settings["fairness"]["main"] += " LIMIT 500" + for table, query_info in query_settings.items(): + curr_query = self.produce_query( + date_begin=self.date_begin, date_end=self.date_end, **query_info + ) + if curr_query == "": + continue + curr_df = execute_query(client=CLIENT, query=curr_query) + curr_df["origin"] = table + df = curr_df if df is None else pd.concat((df, curr_df)) + df["loading_date"] = date.today() + df["date"] = pd.to_datetime(df.date) return df - df = None - query_settings = self.query_settings - if test: - query_settings = {"fairness": self.query_settings["fairness"]} - query_settings["fairness"]["main"] += " LIMIT 500" - - for table, query_info in query_settings.items(): - curr_query = self.produce_query( - date_begin=self.date_begin, date_end=self.date_end, **query_info - ) - if curr_query == "": - continue - curr_df = execute_query(client=CLIENT, query=curr_query) - curr_df["origin"] = table - df = curr_df if df is None else pandas.concat((df, curr_df)) - - df["loading_date"] = date.today() - df["date"] = pandas.to_datetime(df.date) - return df - - def load_precision_set( - self, begin_date="...", end_date="...", with_tweet_types=False, task_version=3.5 - ): - if with_tweet_types: - self.full_query = FULL_QUERY_W_TWEET_TYPES - - query_settings = self.query_settings - curr_query = self.produce_query( - date_begin=begin_date, - date_end=end_date, - task_version=task_version, - **query_settings["precision"], - ) - curr_df = execute_query(client=CLIENT, query=curr_query) - - curr_df.rename(columns={"media_url": "media_presence"}, inplace=True) - return curr_df + def load_precision_set( + self, + begin_date: str = "...", + end_date: str = "...", + with_tweet_types: bool = False, + task_version: float = 3.5, + ) -> pd.DataFrame: + if with_tweet_types: + self.full_query = FULL_QUERY_W_TWEET_TYPES + + query_settings = self.query_settings + curr_query = self.produce_query( + date_begin=begin_date, + date_end=end_date, + task_version=task_version, + **query_settings["precision"], + ) + curr_df = execute_query(client=CLIENT, query=curr_query) + + curr_df.rename(columns={"media_url": "media_presence"}, inplace=True) + return curr_df class ENLoaderWithSampling(ENLoader): + keywords = { + "politics": [...], + "insults": [...], + "race": [...], + } + n = ... + N = ... 
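+    # `keywords` maps each topic group to a (redacted) keyword list used by
+    # sample_keywords(); `n` is the overall sampling budget and `N` the part
+    # reserved for the second, keyword- or origin-based set in sample()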
+ + def __init__(self, project: str): + self.raw_loader = ENLoader(project=project) + if project == ...: + self.project = project + else: + raise ValueError + + def sample_with_weights(self, df: pd.DataFrame, n: int) -> pd.DataFrame: + w = df["label"].value_counts(normalize=True)[1] + dist = np.full((df.shape[0],), w) + sampled_df = df.sample(n=n, weights=dist, replace=False) + return sampled_df + + def sample_keywords( + self, df: pd.DataFrame, N: int, group: str + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + print("\nmatching", group, "keywords...") + keyword_list = self.keywords[group] + match_df = df.loc[ + df.text.str.lower().str.contains("|".join(keyword_list), regex=True) + ] + + print("sampling N/3 from", group) + if match_df.shape[0] <= N / 3: + print( + "WARNING: Sampling only", + match_df.shape[0], + "instead of", + N / 3, + "examples from race focused tweets due to insufficient data", + ) + sample_df = match_df + + else: + print( + "sampling", + group, + "at", + round(match_df["label"].value_counts(normalize=True)[1], 3), + "% action rate", + ) + sample_df = self.sample_with_weights(match_df, int(N / 3)) + print(sample_df.shape) + print(sample_df.label.value_counts(normalize=True)) + + print( + "\nshape of df before dropping sampled rows after", + group, + "matching..", + df.shape[0], + ) + df = df.loc[df.index.difference(sample_df.index),] + print( + "\nshape of df after dropping sampled rows after", + group, + "matching..", + df.shape[0], + ) + return df, sample_df + + def sample_first_set_helper( + self, train_df: pd.DataFrame, first_set: pd.DataFrame, new_n: int + ) -> pd.DataFrame: + if first_set == "prev": + fset = train_df.loc[ + train_df["origin"].isin(["prevalence", "causal prevalence"]) + ] + print( + f"sampling prev at {fset['label'].value_counts(normalize=True)[1]:.3f}% action rate" + ) + else: + fset = train_df + + n_fset = self.sample_with_weights(fset, new_n) + print("len of sampled first set", n_fset.shape[0]) + print(n_fset.label.value_counts(normalize=True)) + + return n_fset + + def sample( + self, + df: pd.DataFrame, + first_set: pd.DataFrame, + second_set: pd.DataFrame, + keyword_sampling: bool, + n: int, + N: int, + ) -> pd.DataFrame: + train_df = df[df.origin != "precision"] + val_test_df = df[df.origin == "precision"] + + print("\nsampling first set of data") + new_n = n - N if second_set is not None else n + n_fset = self.sample_first_set_helper(train_df, first_set, new_n) + + print("\nsampling second set of data") + train_df = train_df.loc[train_df.index.difference(n_fset.index),] + + if second_set is None: + print("no second set sampling being done") + df = n_fset.append(val_test_df) + return df + + if second_set == "prev": + sset = train_df.loc[ + train_df["origin"].isin(["prevalence", "causal prevalence"]) + ] + elif second_set == "fdr": + sset = train_df.loc[train_df["origin"] == "fdr"] + else: + sset = train_df + + if keyword_sampling == True: + print("sampling based off of keywords defined...") + print("second set is", second_set, "with length", sset.shape[0]) + sset, n_politics = self.sample_keywords(sset, N, "politics") + sset, n_insults = self.sample_keywords(sset, N, "insults") + sset, n_race = self.sample_keywords(sset, N, "race") + n_sset = n_politics.append([n_insults, n_race]) + print("len of sampled second set", n_sset.shape[0]) + else: + print( + "No keyword sampling. 
Instead random sampling from", + second_set, + "at", + round(sset["label"].value_counts(normalize=True)[1], 3), + "% action rate", + ) + n_sset = self.sample_with_weights(sset, N) + print("len of sampled second set", n_sset.shape[0]) + print(n_sset.label.value_counts(normalize=True)) + + df = n_fset.append([n_sset, val_test_df]) + df = df.sample(frac=1).reset_index(drop=True) + return df + + def load_data( + self, + first_set: str = "prev", + second_set: str = None, + keyword_sampling: bool = False, + test: bool = False, + **kwargs, + ) -> pd.DataFrame: + n = kwargs.get("n", self.n) + N = kwargs.get("N", self.N) - keywords = { - "politics": [ -... - ], - "insults": [ -... - ], - "race": [ -... - ], - } - n = ... - N = ... - - def __init__(self, project): - self.raw_loader = ENLoader(project=project) - if project == ...: - self.project = project - else: - raise ValueError - - def sample_with_weights(self, df, n): - w = df["label"].value_counts(normalize=True)[1] - dist = np.full((df.shape[0],), w) - sampled_df = df.sample(n=n, weights=dist, replace=False) - return sampled_df - - def sample_keywords(self, df, N, group): - print("\nmatching", group, "keywords...") - - keyword_list = self.keywords[group] - match_df = df.loc[df.text.str.lower().str.contains("|".join(keyword_list), regex=True)] - - print("sampling N/3 from", group) - if match_df.shape[0] <= N / 3: - print( - "WARNING: Sampling only", - match_df.shape[0], - "instead of", - N / 3, - "examples from race focused tweets due to insufficient data", - ) - sample_df = match_df - - else: - print( - "sampling", - group, - "at", - round(match_df["label"].value_counts(normalize=True)[1], 3), - "% action rate", - ) - sample_df = self.sample_with_weights(match_df, int(N / 3)) - print(sample_df.shape) - print(sample_df.label.value_counts(normalize=True)) - - print("\nshape of df before dropping sampled rows after", group, "matching..", df.shape[0]) - df = df.loc[ - df.index.difference(sample_df.index), - ] - print("\nshape of df after dropping sampled rows after", group, "matching..", df.shape[0]) - - return df, sample_df - - def sample_first_set_helper(self, train_df, first_set, new_n): - if first_set == "prev": - fset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])] - print( - "sampling prev at", round(fset["label"].value_counts(normalize=True)[1], 3), "% action rate" - ) - else: - fset = train_df - - n_fset = self.sample_with_weights(fset, new_n) - print("len of sampled first set", n_fset.shape[0]) - print(n_fset.label.value_counts(normalize=True)) - - return n_fset - - def sample(self, df, first_set, second_set, keyword_sampling, n, N): - train_df = df[df.origin != "precision"] - val_test_df = df[df.origin == "precision"] - - print("\nsampling first set of data") - new_n = n - N if second_set is not None else n - n_fset = self.sample_first_set_helper(train_df, first_set, new_n) - - print("\nsampling second set of data") - train_df = train_df.loc[ - train_df.index.difference(n_fset.index), - ] - - if second_set is None: - print("no second set sampling being done") - df = n_fset.append(val_test_df) - return df - - if second_set == "prev": - sset = train_df.loc[train_df["origin"].isin(["prevalence", "causal prevalence"])] - - elif second_set == "fdr": - sset = train_df.loc[train_df["origin"] == "fdr"] - - else: - sset = train_df - - if keyword_sampling == True: - print("sampling based off of keywords defined...") - print("second set is", second_set, "with length", sset.shape[0]) - - sset, n_politics = 
self.sample_keywords(sset, N, "politics") - sset, n_insults = self.sample_keywords(sset, N, "insults") - sset, n_race = self.sample_keywords(sset, N, "race") - - n_sset = n_politics.append([n_insults, n_race]) - print("len of sampled second set", n_sset.shape[0]) - - else: - print( - "No keyword sampling. Instead random sampling from", - second_set, - "at", - round(sset["label"].value_counts(normalize=True)[1], 3), - "% action rate", - ) - n_sset = self.sample_with_weights(sset, N) - print("len of sampled second set", n_sset.shape[0]) - print(n_sset.label.value_counts(normalize=True)) - - df = n_fset.append([n_sset, val_test_df]) - df = df.sample(frac=1).reset_index(drop=True) - - return df - - def load_data( - self, first_set="prev", second_set=None, keyword_sampling=False, test=False, **kwargs - ): - n = kwargs.get("n", self.n) - N = kwargs.get("N", self.N) - - df = self.raw_loader.load_data(test=test, **kwargs) - return self.sample(df, first_set, second_set, keyword_sampling, n, N) + df = self.raw_loader.load_data(test=test, **kwargs) + return self.sample(df, first_set, second_set, keyword_sampling, n, N) class I18nLoader(DataframeLoader): - def __init__(self): - super().__init__(project=...) - from archive.settings.... import ACCEPTED_LANGUAGES, QUERY_SETTINGS - - self.accepted_languages = ACCEPTED_LANGUAGES - self.query_settings = dict(QUERY_SETTINGS) - - def produce_query(self, language, query, dataset, table, lang): - query = query.format(dataset=dataset, table=table) - add_query = f"AND reviewed.{lang}='{language}'" - query += add_query - - return query - - def query_keys(self, language, task=2, size="50"): - if task == 2: - if language == "ar": - self.query_settings["adhoc_v2"]["table"] = "..." - elif language == "tr": - self.query_settings["adhoc_v2"]["table"] = "..." - elif language == "es": - self.query_settings["adhoc_v2"]["table"] = f"..." - else: - self.query_settings["adhoc_v2"]["table"] = "..." - - return self.query_settings["adhoc_v2"] - - if task == 3: - return self.query_settings["adhoc_v3"] - - raise ValueError(f"There are no other tasks than 2 or 3. {task} does not exist.") - - def load_data(self, language, test=False, task=2): - if language not in self.accepted_languages: - raise ValueError( - f"Language not in the data {language}. Accepted values are " f"{self.accepted_languages}" - ) - - print(".... adhoc data") - key_dict = self.query_keys(language=language, task=task) - query_adhoc = self.produce_query(language=language, **key_dict) - if test: - query_adhoc += " LIMIT 500" - adhoc_df = execute_query(CLIENT, query_adhoc) - - if not (test or language == "tr" or task == 3): - if language == "es": - print(".... additional adhoc data") - key_dict = self.query_keys(language=language, size="100") - query_adhoc = self.produce_query(language=language, **key_dict) - adhoc_df = pandas.concat( - (adhoc_df, execute_query(CLIENT, query_adhoc)), axis=0, ignore_index=True + def __init__(self): + super().__init__(project=...) + from archive.settings.... import ACCEPTED_LANGUAGES, QUERY_SETTINGS + + self.accepted_languages = ACCEPTED_LANGUAGES + self.query_settings = dict(QUERY_SETTINGS) + + def produce_query(self, language: str, query: str, dataset: str, table: str, lang: str) -> str: + query = query.format(dataset=dataset, table=table) + add_query = f"AND reviewed.{lang}='{language}'" + query += add_query + return query + + def query_keys(self, language: str, task: int=2, size: str="50"): + if task == 2: + if language == "ar": + self.query_settings["adhoc_v2"]["table"] = "..." 
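+            # table names are redacted; each supported language resolves to
+            # its own BigQuery table for the v2 adhoc annotations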
+ elif language == "tr": + self.query_settings["adhoc_v2"]["table"] = "..." + elif language == "es": + self.query_settings["adhoc_v2"]["table"] = f"..." + else: + self.query_settings["adhoc_v2"]["table"] = "..." + return self.query_settings["adhoc_v2"] + if task == 3: + return self.query_settings["adhoc_v3"] + raise ValueError( + f"There are no other tasks than 2 or 3. {task} does not exist." ) - print(".... prevalence data") - query_prev = self.produce_query(language=language, **self.query_settings["prevalence_v2"]) - prev_df = execute_query(CLIENT, query_prev) - prev_df["description"] = "Prevalence" - adhoc_df = pandas.concat((adhoc_df, prev_df), axis=0, ignore_index=True) + def load_data(self, language: str, test: bool=False, task: int=2): + if language not in self.accepted_languages: + raise ValueError( + f"Language not in the data {language}. Accepted values are " + f"{self.accepted_languages}" + ) - return self.clean(adhoc_df) + print(".... adhoc data") + key_dict = self.query_keys(language=language, task=task) + query_adhoc = self.produce_query(language=language, **key_dict) + if test: + query_adhoc += " LIMIT 500" + adhoc_df = execute_query(CLIENT, query_adhoc) + + if not (test or language == "tr" or task == 3): + if language == "es": + print(".... additional adhoc data") + key_dict = self.query_keys(language=language, size="100") + query_adhoc = self.produce_query(language=language, **key_dict) + adhoc_df = pd.concat( + (adhoc_df, execute_query(CLIENT, query_adhoc)), + axis=0, + ignore_index=True, + ) + + print(".... prevalence data") + query_prev = self.produce_query( + language=language, **self.query_settings["prevalence_v2"] + ) + prev_df = execute_query(CLIENT, query_prev) + prev_df["description"] = "Prevalence" + adhoc_df = pd.concat((adhoc_df, prev_df), axis=0, ignore_index=True) + + return self.clean(adhoc_df) diff --git a/trust_and_safety_models/toxicity/data/mb_generator.py b/trust_and_safety_models/toxicity/data/mb_generator.py index 58a89f8c5..350a6ebdd 100644 --- a/trust_and_safety_models/toxicity/data/mb_generator.py +++ b/trust_and_safety_models/toxicity/data/mb_generator.py @@ -1,284 +1,325 @@ -from importlib import import_module import os +from importlib import import_module +from typing import Tuple, Union +import numpy as np +import pandas as pd +import tensorflow as tf +from sklearn.model_selection import StratifiedKFold from toxicity_ml_pipeline.settings.default_settings_tox import ( - INNER_CV, - LOCAL_DIR, - MAX_SEQ_LENGTH, - NUM_PREFETCH, - NUM_WORKERS, - OUTER_CV, - TARGET_POS_PER_EPOCH, + INNER_CV, + LOCAL_DIR, + MAX_SEQ_LENGTH, + NUM_PREFETCH, + NUM_WORKERS, + OUTER_CV, + TARGET_POS_PER_EPOCH, ) from toxicity_ml_pipeline.utils.helpers import execute_command -import numpy as np -import pandas -from sklearn.model_selection import StratifiedKFold -import tensorflow as tf - - try: - from transformers import AutoTokenizer, DataCollatorWithPadding + from transformers import AutoTokenizer, DataCollatorWithPadding except ModuleNotFoundError: - print("...") + print("...") else: - from datasets import Dataset + from datasets import Dataset class BalancedMiniBatchLoader(object): - def __init__( - self, - fold, - mb_size, - seed, - perc_training_tox, - scope="TOX", - project=..., - dual_head=None, - n_outer_splits=None, - n_inner_splits=None, - sample_weights=None, - huggingface=False, - ): - if 0 >= perc_training_tox or perc_training_tox > 0.5: - raise ValueError("Perc_training_tox should be in ]0; 0.5]") - - self.perc_training_tox = perc_training_tox - if not 
n_outer_splits: - n_outer_splits = OUTER_CV - if isinstance(n_outer_splits, int): - self.n_outer_splits = n_outer_splits - self.get_outer_fold = self._get_outer_cv_fold - if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold: - raise ValueError(f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [.") - - elif n_outer_splits == "time": - self.get_outer_fold = self._get_time_fold - if fold != "time": - raise ValueError( - "To avoid repeating the same run many times, the external fold" - "should be time when test data is split according to dates." - ) - try: - setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings") - except ModuleNotFoundError: - raise ValueError(f"You need to define a setting file for your project {project}.") - self.test_begin_date = setting_file.TEST_BEGIN_DATE - self.test_end_date = setting_file.TEST_END_DATE - - else: - raise ValueError( - f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}" - ) - - self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV - - self.seed = seed - self.mb_size = mb_size - self.fold = fold - - self.sample_weights = sample_weights - self.dual_head = dual_head - self.huggingface = huggingface - if self.huggingface: - self._load_tokenizer() - - def _load_tokenizer(self): - print("Making a local copy of Bertweet-base model") - local_model_dir = os.path.join(LOCAL_DIR, "models") - cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}" - execute_command(cmd) - - self.tokenizer = AutoTokenizer.from_pretrained( - os.path.join(local_model_dir, "bertweet-base"), normalization=True - ) - - def tokenize_function(self, el): - return self.tokenizer( - el["text"], - max_length=MAX_SEQ_LENGTH, - padding="max_length", - truncation=True, - add_special_tokens=True, - return_token_type_ids=False, - return_attention_mask=False, - ) - - def _get_stratified_kfold(self, n_splits): - return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed) - - def _get_time_fold(self, df): - test_begin_date = pandas.to_datetime(self.test_begin_date).date() - test_end_date = pandas.to_datetime(self.test_end_date).date() - print(f"Test is going from {test_begin_date} to {test_end_date}.") - test_data = df.query("@test_begin_date <= date <= @test_end_date") - - query = "date < @test_begin_date" - other_set = df.query(query) - return other_set, test_data - - def _get_outer_cv_fold(self, df): - labels = df.int_label - stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits) - - k = 0 - for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels): - if k == self.fold: - break - k += 1 - - train_data = df.iloc[train_index].copy() - test_data = df.iloc[test_index].copy() - - return train_data, test_data - - def get_steps_per_epoch(self, nb_pos_examples): - return int(max(TARGET_POS_PER_EPOCH, nb_pos_examples) / self.mb_size / self.perc_training_tox) - - def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle=True): - huggingface_ds = Dataset.from_pandas(group).map(self.tokenize_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, return_tensors="tf") - tensorflow_ds = huggingface_ds.to_tf_dataset( - columns=["input_ids"], - label_cols=["labels"], - shuffle=shuffle, - batch_size=self.mb_size if mb_size is None else mb_size, - collate_fn=data_collator, - ) - - if shuffle: - return tensorflow_ds.repeat() - return tensorflow_ds - - def 
make_pure_tensorflow_ds(self, df, nb_samples): - buffer_size = nb_samples * 2 - - if self.sample_weights is not None: - if self.sample_weights not in df.columns: - raise ValueError - ds = tf.data.Dataset.from_tensor_slices( - (df.text.values, df.label.values, df[self.sample_weights].values) - ) - elif self.dual_head: - label_d = {f'{e}_output': df[f'{e}_label'].values for e in self.dual_head} - label_d['content_output'] = tf.keras.utils.to_categorical(label_d['content_output'], num_classes=3) - ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d)) - - else: - ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values)) - ds = ds.shuffle(buffer_size, seed=self.seed, reshuffle_each_iteration=True).repeat() - return ds - - def get_balanced_dataset(self, training_data, size_limit=None, return_as_batch=True): - training_data = training_data.sample(frac=1, random_state=self.seed) - nb_samples = training_data.shape[0] if not size_limit else size_limit - - num_classes = training_data.int_label.nunique() - toxic_class = training_data.int_label.max() - if size_limit: - training_data = training_data[: size_limit * num_classes] - - print( - ".... {} examples, incl. {:.2f}% tox in train, {} classes".format( - nb_samples, - 100 * training_data[training_data.int_label == toxic_class].shape[0] / nb_samples, - num_classes, - ) - ) - label_groups = training_data.groupby("int_label") - if self.huggingface: - label_datasets = { - label: self.make_huggingface_tensorflow_ds(group) for label, group in label_groups - } - - else: - label_datasets = { - label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2) - for label, group in label_groups - } - - datasets = [label_datasets[0], label_datasets[1]] - weights = [1 - self.perc_training_tox, self.perc_training_tox] - if num_classes == 3: - datasets.append(label_datasets[2]) - weights = [1 - self.perc_training_tox, self.perc_training_tox / 2, self.perc_training_tox / 2] - elif num_classes != 2: - raise ValueError("Currently it should not be possible to get other than 2 or 3 classes") - resampled_ds = tf.data.experimental.sample_from_datasets(datasets, weights, seed=self.seed) - - if return_as_batch and not self.huggingface: - return resampled_ds.batch( - self.mb_size, drop_remainder=True, num_parallel_calls=NUM_WORKERS, deterministic=True - ).prefetch(NUM_PREFETCH) - - return resampled_ds - - @staticmethod - def _compute_int_labels(full_df): - if full_df.label.dtype == int: - full_df["int_label"] = full_df.label - - elif "int_label" not in full_df.columns: - if full_df.label.max() > 1: - raise ValueError("Binarizing labels that should not be.") - full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0) - - return full_df - - def __call__(self, full_df, *args, **kwargs): - full_df = self._compute_int_labels(full_df) - - train_data, test_data = self.get_outer_fold(df=full_df) - - stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits) - for train_index, val_index in stratifier.split( - np.zeros(train_data.shape[0]), train_data.int_label + def __init__( + self, + fold: int, + mb_size: int, + seed: int, + perc_training_tox: float, + scope: str = "TOX", + project=..., + dual_head=None, + n_outer_splits: Union[str, int] = None, + n_inner_splits: int = None, + sample_weights=None, + huggingface: bool = False, ): - curr_train_data = train_data.iloc[train_index] + if 0 >= perc_training_tox or perc_training_tox > 0.5: + raise ValueError("Perc_training_tox should be in ]0; 0.5]") + + self.perc_training_tox = 
perc_training_tox + if not n_outer_splits: + n_outer_splits = OUTER_CV + if isinstance(n_outer_splits, int): + self.n_outer_splits = n_outer_splits + self.get_outer_fold = self._get_outer_cv_fold + if fold < 0 or fold >= self.n_outer_splits or int(fold) != fold: + raise ValueError( + f"Number of fold should be an integer in [0 ; {self.n_outer_splits} [." + ) + + elif n_outer_splits == "time": + self.get_outer_fold = self._get_time_fold + if fold != "time": + raise ValueError( + "To avoid repeating the same run many times, the external fold" + "should be time when test data is split according to dates." + ) + try: + setting_file = import_module( + f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings" + ) + except ModuleNotFoundError: + raise ValueError( + f"You need to define a setting file for your project {project}." + ) + self.test_begin_date = setting_file.TEST_BEGIN_DATE + self.test_end_date = setting_file.TEST_END_DATE + + else: + raise ValueError( + f"Argument n_outer_splits should either an integer or 'time'. Provided: {n_outer_splits}" + ) + + self.n_inner_splits = n_inner_splits if n_inner_splits is not None else INNER_CV + self.seed = seed + self.mb_size = mb_size + self.fold = fold + self.sample_weights = sample_weights + self.dual_head = dual_head + self.huggingface = huggingface + if self.huggingface: + self._load_tokenizer() + + def _load_tokenizer(self): + print("Making a local copy of Bertweet-base model") + local_model_dir = os.path.join(LOCAL_DIR, "models") + cmd = f"mkdir {local_model_dir} ; gsutil -m cp -r gs://... {local_model_dir}" + execute_command(cmd) + + self.tokenizer = AutoTokenizer.from_pretrained( + os.path.join(local_model_dir, "bertweet-base"), normalization=True + ) + + def tokenize_function(self, el: dict) -> dict: + return self.tokenizer( + el["text"], + max_length=MAX_SEQ_LENGTH, + padding="max_length", + truncation=True, + add_special_tokens=True, + return_token_type_ids=False, + return_attention_mask=False, + ) + + def _get_stratified_kfold(self, n_splits: int) -> StratifiedKFold: + return StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=self.seed) + + def _get_time_fold(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + test_begin_date = pd.to_datetime(self.test_begin_date).date() + test_end_date = pd.to_datetime(self.test_end_date).date() + print(f"Test is going from {test_begin_date} to {test_end_date}.") + test_data = df.query("@test_begin_date <= date <= @test_end_date") + + query = "date < @test_begin_date" + other_set = df.query(query) + return other_set, test_data + + def _get_outer_cv_fold(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + labels = df.int_label + stratifier = self._get_stratified_kfold(n_splits=self.n_outer_splits) + + k = 0 + for train_index, test_index in stratifier.split(np.zeros(len(labels)), labels): + if k == self.fold: + break + k += 1 + train_data = df.iloc[train_index].copy() + test_data = df.iloc[test_index].copy() + return train_data, test_data + + def get_steps_per_epoch(self, nb_pos_examples: int) -> int: + return int( + max(TARGET_POS_PER_EPOCH, nb_pos_examples) + / self.mb_size + / self.perc_training_tox + ) - mini_batches = self.get_balanced_dataset(curr_train_data) + def make_huggingface_tensorflow_ds(self, group, mb_size=None, shuffle: bool = True): + huggingface_ds = Dataset.from_pandas(group).map( + self.tokenize_function, batched=True + ) + data_collator = DataCollatorWithPadding( + tokenizer=self.tokenizer, return_tensors="tf" + ) + tensorflow_ds = 
huggingface_ds.to_tf_dataset( + columns=["input_ids"], + label_cols=["labels"], + shuffle=shuffle, + batch_size=self.mb_size if mb_size is None else mb_size, + collate_fn=data_collator, + ) + + if shuffle: + return tensorflow_ds.repeat() + return tensorflow_ds + + def make_pure_tensorflow_ds( + self, df: pd.DataFrame, nb_samples: int + ) -> tf.data.Dataset: + buffer_size = nb_samples * 2 + + if self.sample_weights is not None: + if self.sample_weights not in df.columns: + raise ValueError( + f"Sample-weight column {self.sample_weights} is not in the dataframe." + ) + ds = tf.data.Dataset.from_tensor_slices( + (df.text.values, df.label.values, df[self.sample_weights].values) + ) + elif self.dual_head: + label_d = {f"{e}_output": df[f"{e}_label"].values for e in self.dual_head} + label_d["content_output"] = tf.keras.utils.to_categorical( + label_d["content_output"], num_classes=3 + ) + ds = tf.data.Dataset.from_tensor_slices((df.text.values, label_d)) + + else: + ds = tf.data.Dataset.from_tensor_slices((df.text.values, df.label.values)) + ds = ds.shuffle( + buffer_size, seed=self.seed, reshuffle_each_iteration=True + ).repeat() + return ds + + def get_balanced_dataset( + self, + training_data: pd.DataFrame, + size_limit: int = None, + return_as_batch: bool = True, + ) -> tf.data.Dataset: + training_data = training_data.sample(frac=1, random_state=self.seed) + nb_samples = training_data.shape[0] if not size_limit else size_limit + + num_classes = training_data.int_label.nunique() + toxic_class = training_data.int_label.max() + if size_limit: + training_data = training_data[: size_limit * num_classes] + + percent_tox = ( + 100 + * training_data[training_data.int_label == toxic_class].shape[0] + / nb_samples + ) + print( + f"... {nb_samples} examples, incl. {percent_tox:.2f}% tox in train, {num_classes} classes" + ) + label_groups = training_data.groupby("int_label") + if self.huggingface: + label_datasets = { + label: self.make_huggingface_tensorflow_ds(group) + for label, group in label_groups + } + + else: + label_datasets = { + label: self.make_pure_tensorflow_ds(group, nb_samples=nb_samples * 2) + for label, group in label_groups + } + + datasets = [label_datasets[0], label_datasets[1]] + weights = [1 - self.perc_training_tox, self.perc_training_tox] + if num_classes == 3: + datasets.append(label_datasets[2]) + weights = [ + 1 - self.perc_training_tox, + self.perc_training_tox / 2, + self.perc_training_tox / 2, + ] + elif num_classes != 2: + raise ValueError( + f"Only 2 or 3 classes are supported; got {num_classes}." + ) + resampled_ds = tf.data.experimental.sample_from_datasets( + datasets, weights, seed=self.seed + ) + + if return_as_batch and not self.huggingface: + return resampled_ds.batch( + self.mb_size, + drop_remainder=True, + num_parallel_calls=NUM_WORKERS, + deterministic=True, + ).prefetch(NUM_PREFETCH) - steps_per_epoch = self.get_steps_per_epoch( - nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0] - ) + return resampled_ds - val_data = train_data.iloc[val_index].copy() + @staticmethod + def _compute_int_labels(full_df: pd.DataFrame) -> pd.DataFrame: + if full_df.label.dtype == int: + full_df["int_label"] = full_df.label + elif "int_label" not in full_df.columns: + if full_df.label.max() > 1: + raise ValueError("Cannot binarize labels with values above 1.") + full_df["int_label"] = np.where(full_df.label >= 0.5, 1, 0) - yield mini_batches, steps_per_epoch, val_data, test_data + return full_df - def simple_cv_load(self, full_df): + def __call__(self, full_df: pd.DataFrame, 
*args, **kwargs): + full_df = self._compute_int_labels(full_df) - train_data, test_data = self.get_outer_fold(df=full_df) - if test_data.shape[0] == 0: - test_data = train_data.iloc[:500] + train_data, test_data = self.get_outer_fold(df=full_df) - mini_batches = self.get_balanced_dataset(train_data) - steps_per_epoch = self.get_steps_per_epoch( - nb_pos_examples=train_data[train_data.int_label != 0].shape[0] - ) + stratifier = self._get_stratified_kfold(n_splits=self.n_inner_splits) + for train_index, val_index in stratifier.split( + np.zeros(train_data.shape[0]), train_data.int_label + ): + curr_train_data = train_data.iloc[train_index] + + mini_batches = self.get_balanced_dataset(curr_train_data) + + steps_per_epoch = self.get_steps_per_epoch( + nb_pos_examples=curr_train_data[curr_train_data.int_label != 0].shape[0] + ) + + val_data = train_data.iloc[val_index].copy() + + yield mini_batches, steps_per_epoch, val_data, test_data + + def simple_cv_load( + self, full_df: pd.DataFrame + ) -> Tuple[tf.data.Dataset, pd.DataFrame, int]: + full_df = self._compute_int_labels(full_df) + + train_data, test_data = self.get_outer_fold(df=full_df) + if test_data.shape[0] == 0: + test_data = train_data.iloc[:500] + + mini_batches = self.get_balanced_dataset(train_data) + steps_per_epoch = self.get_steps_per_epoch( + nb_pos_examples=train_data[train_data.int_label != 0].shape[0] + ) - return mini_batches, test_data, steps_per_epoch + return mini_batches, test_data, steps_per_epoch - def no_cv_load(self, full_df): - full_df = self._compute_int_labels(full_df) + def no_cv_load( + self, full_df: pd.DataFrame + ) -> Tuple[tf.data.Dataset, pd.DataFrame, int]: + full_df = self._compute_int_labels(full_df) - val_test = full_df[full_df.origin == "precision"].copy(deep=True) - val_data, test_data = self.get_outer_fold(df=val_test) + val_test = full_df[full_df.origin == "precision"].copy(deep=True) + val_data, test_data = self.get_outer_fold(df=val_test) - train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0) - if test_data.shape[0] == 0: - test_data = train_data.iloc[:500] + train_data = full_df.drop(full_df[full_df.origin == "precision"].index, axis=0) + if test_data.shape[0] == 0: + test_data = train_data.iloc[:500] - mini_batches = self.get_balanced_dataset(train_data) - if train_data.int_label.nunique() == 1: - raise ValueError('Should be at least two labels') + mini_batches = self.get_balanced_dataset(train_data) + if train_data.int_label.nunique() == 1: + raise ValueError("Should be at least two labels") - num_examples = train_data[train_data.int_label == 1].shape[0] - if train_data.int_label.nunique() > 2: - second_most_frequent_label = train_data.loc[train_data.int_label != 0, 'int_label'].mode().values[0] - num_examples = train_data[train_data.int_label == second_most_frequent_label].shape[0] * 2 - steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples) + num_examples = train_data[train_data.int_label == 1].shape[0] + if train_data.int_label.nunique() > 2: + second_most_frequent_label = ( + train_data.loc[train_data.int_label != 0, "int_label"].mode().values[0] + ) + num_examples = ( + train_data[train_data.int_label == second_most_frequent_label].shape[0] + * 2 + ) + steps_per_epoch = self.get_steps_per_epoch(nb_pos_examples=num_examples) - return mini_batches, steps_per_epoch, val_data, test_data + return mini_batches, steps_per_epoch, val_data, test_data diff --git a/trust_and_safety_models/toxicity/load_model.py 
b/trust_and_safety_models/toxicity/load_model.py index 7b271066f..b052c34ce 100644 --- a/trust_and_safety_models/toxicity/load_model.py +++ b/trust_and_safety_models/toxicity/load_model.py @@ -1,227 +1,253 @@ import os from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR, MAX_SEQ_LENGTH + try: - from toxicity_ml_pipeline.optim.losses import MaskedBCE + from toxicity_ml_pipeline.optim.losses import MaskedBCE except ImportError: - print('No MaskedBCE loss') -from toxicity_ml_pipeline.utils.helpers import execute_command - + print("No MaskedBCE loss") import tensorflow as tf - +from toxicity_ml_pipeline.utils.helpers import execute_command try: - from twitter.cuad.representation.models.text_encoder import TextEncoder + from twitter.cuad.representation.models.text_encoder import TextEncoder except ModuleNotFoundError: - print("No TextEncoder package") + print("No TextEncoder package") try: - from transformers import TFAutoModelForSequenceClassification + from transformers import TFAutoModelForSequenceClassification except ModuleNotFoundError: - print("No HuggingFace package") + print("No HuggingFace package") LOCAL_MODEL_DIR = os.path.join(LOCAL_DIR, "models") -def reload_model_weights(weights_dir, language, **kwargs): - optimizer = tf.keras.optimizers.Adam(0.01) - model_type = ( - "twitter_bert_base_en_uncased_mlm" - if language == "en" - else "twitter_multilingual_bert_base_cased_mlm" - ) - model = load(optimizer=optimizer, seed=42, model_type=model_type, **kwargs) - model.load_weights(weights_dir) - - return model - - -def _locally_copy_models(model_type): - if model_type == "twitter_multilingual_bert_base_cased_mlm": - preprocessor = "bert_multi_cased_preprocess_3" - elif model_type == "twitter_bert_base_en_uncased_mlm": - preprocessor = "bert_en_uncased_preprocess_3" - else: - raise NotImplementedError - - copy_cmd = """mkdir {local_dir} -gsutil cp -r ... 
-gsutil cp -r ...""" - execute_command( - copy_cmd.format(model_type=model_type, preprocessor=preprocessor, local_dir=LOCAL_MODEL_DIR) - ) - - return preprocessor - - -def load_encoder(model_type, trainable): - try: - model = TextEncoder( - max_seq_lengths=MAX_SEQ_LENGTH, - model_type=model_type, - cluster="gcp", - trainable=trainable, - enable_dynamic_shapes=True, +def reload_model_weights(weights_dir, language: str, **kwargs): + optimizer = tf.keras.optimizers.Adam(0.01) + model_type = ( + "twitter_bert_base_en_uncased_mlm" + if language == "en" + else "twitter_multilingual_bert_base_cased_mlm" ) - except (OSError, tf.errors.AbortedError) as e: - print(e) - preprocessor = _locally_copy_models(model_type) - - model = TextEncoder( - max_seq_lengths=MAX_SEQ_LENGTH, - local_model_path=f"models/{model_type}", - local_preprocessor_path=f"models/{preprocessor}", - cluster="gcp", - trainable=trainable, - enable_dynamic_shapes=True, - ) - - return model - + model = load(optimizer=optimizer, seed=42, model_type=model_type, **kwargs) + model.load_weights(weights_dir) -def get_loss(loss_name, from_logits, **kwargs): - loss_name = loss_name.lower() - if loss_name == "bce": - print("Binary CE loss") - return tf.keras.losses.BinaryCrossentropy(from_logits=from_logits) + return model - if loss_name == "cce": - print("Categorical cross-entropy loss") - return tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits) - if loss_name == "scce": - print("Sparse categorical cross-entropy loss") - return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits) +def _locally_copy_models(model_type: str): + if model_type == "twitter_multilingual_bert_base_cased_mlm": + preprocessor = "bert_multi_cased_preprocess_3" + elif model_type == "twitter_bert_base_en_uncased_mlm": + preprocessor = "bert_en_uncased_preprocess_3" + else: + raise NotImplementedError - if loss_name == "focal_bce": - gamma = kwargs.get("gamma", 2) - print("Focal binary CE loss", gamma) - return tf.keras.losses.BinaryFocalCrossentropy(gamma=gamma, from_logits=from_logits) + copy_cmd = "mkdir {local_dir}\ngsutil cp -r ...\ngsutil cp -r ..." 
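+ # the two gsutil source paths are elided ("...") in this file; at run time + # the command mirrors the {model_type} encoder and {preprocessor} assets + # into {local_dir} (LOCAL_MODEL_DIR) so load_encoder can fall back to a local copy.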
+ execute_command( + copy_cmd.format( + model_type=model_type, preprocessor=preprocessor, local_dir=LOCAL_MODEL_DIR + ) + ) - if loss_name == 'masked_bce': - multitask = kwargs.get("multitask", False) - if from_logits or multitask: - raise NotImplementedError - print(f'Masked Binary Cross Entropy') - return MaskedBCE() + return preprocessor + + + def load_encoder(model_type: str, trainable: bool): + try: + model = TextEncoder( + max_seq_lengths=MAX_SEQ_LENGTH, + model_type=model_type, + cluster="gcp", + trainable=trainable, + enable_dynamic_shapes=True, + ) + except (OSError, tf.errors.AbortedError) as e: + print(e) + preprocessor = _locally_copy_models(model_type) + + model = TextEncoder( + max_seq_lengths=MAX_SEQ_LENGTH, + local_model_path=f"models/{model_type}", + local_preprocessor_path=f"models/{preprocessor}", + cluster="gcp", + trainable=trainable, + enable_dynamic_shapes=True, + ) + + return model + + + def get_loss(loss_name: str, from_logits: bool, **kwargs): + loss_name = loss_name.lower() + if loss_name == "bce": + print("Binary CE loss") + return tf.keras.losses.BinaryCrossentropy(from_logits=from_logits) + + if loss_name == "cce": + print("Categorical cross-entropy loss") + return tf.keras.losses.CategoricalCrossentropy(from_logits=from_logits) + + if loss_name == "scce": + print("Sparse categorical cross-entropy loss") + return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits) + + if loss_name == "focal_bce": + gamma = kwargs.get("gamma", 2) + print("Focal binary CE loss", gamma) + return tf.keras.losses.BinaryFocalCrossentropy( + gamma=gamma, from_logits=from_logits + ) + + if loss_name == "masked_bce": + multitask = kwargs.get("multitask", False) + if from_logits or multitask: + raise NotImplementedError + print("Masked Binary Cross Entropy") + return MaskedBCE() + + if loss_name == "inv_kl_loss": + raise NotImplementedError + + raise ValueError( + f"This loss name is not valid: {loss_name}. Accepted loss names: BCE, masked BCE, CCE, sCCE, " + f"Focal_BCE, inv_KL_loss" + ) - if loss_name == "inv_kl_loss": - raise NotImplementedError - raise ValueError( - f"This loss name is not valid: {loss_name}. 
Accepted loss names: BCE, masked BCE, CCE, sCCE, " - f"Focal_BCE, inv_KL_loss" - ) +def _add_additional_embedding_layer(doc_embedding, glorot, seed: int): + doc_embedding = tf.keras.layers.Dense( + 768, activation="tanh", kernel_initializer=glorot + )(doc_embedding) + doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) + return doc_embedding -def _add_additional_embedding_layer(doc_embedding, glorot, seed): - doc_embedding = tf.keras.layers.Dense(768, activation="tanh", kernel_initializer=glorot)(doc_embedding) - doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) - return doc_embedding def _get_bias(**kwargs): - smart_bias_value = kwargs.get('smart_bias_value', 0) - print('Smart bias init to ', smart_bias_value) - output_bias = tf.keras.initializers.Constant(smart_bias_value) - return output_bias + smart_bias_value = kwargs.get("smart_bias_value", 0) + print("Smart bias init to ", smart_bias_value) + output_bias = tf.keras.initializers.Constant(smart_bias_value) + return output_bias def load_inhouse_bert(model_type, trainable, seed, **kwargs): - inputs = tf.keras.layers.Input(shape=(), dtype=tf.string) - encoder = load_encoder(model_type=model_type, trainable=trainable) - doc_embedding = encoder([inputs])["pooled_output"] - doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) - - glorot = tf.keras.initializers.glorot_uniform(seed=seed) - if kwargs.get("additional_layer", False): - doc_embedding = _add_additional_embedding_layer(doc_embedding, glorot, seed) - - if kwargs.get('content_num_classes', None): - probs = get_last_layer(glorot=glorot, last_layer_name='target_output', **kwargs)(doc_embedding) - second_probs = get_last_layer(num_classes=kwargs['content_num_classes'], - last_layer_name='content_output', - glorot=glorot)(doc_embedding) - probs = [probs, second_probs] - else: - probs = get_last_layer(glorot=glorot, **kwargs)(doc_embedding) - model = tf.keras.models.Model(inputs=inputs, outputs=probs) - - return model, False + inputs = tf.keras.layers.Input(shape=(), dtype=tf.string) + encoder = load_encoder(model_type=model_type, trainable=trainable) + doc_embedding = encoder([inputs])["pooled_output"] + doc_embedding = tf.keras.layers.Dropout(rate=0.1, seed=seed)(doc_embedding) + + glorot = tf.keras.initializers.glorot_uniform(seed=seed) + if kwargs.get("additional_layer", False): + doc_embedding = _add_additional_embedding_layer(doc_embedding, glorot, seed) + + if kwargs.get("content_num_classes", None): + probs = get_last_layer( + glorot=glorot, last_layer_name="target_output", **kwargs + )(doc_embedding) + second_probs = get_last_layer( + num_classes=kwargs["content_num_classes"], + last_layer_name="content_output", + glorot=glorot, + )(doc_embedding) + probs = [probs, second_probs] + else: + probs = get_last_layer(glorot=glorot, **kwargs)(doc_embedding) + model = tf.keras.models.Model(inputs=inputs, outputs=probs) + + return model, False -def get_last_layer(**kwargs): - output_bias = _get_bias(**kwargs) - if 'glorot' in kwargs: - glorot = kwargs['glorot'] - else: - glorot = tf.keras.initializers.glorot_uniform(seed=kwargs['seed']) - layer_name = kwargs.get('last_layer_name', 'dense_1') - - if kwargs.get('num_classes', 1) > 1: - last_layer = tf.keras.layers.Dense( - kwargs["num_classes"], activation="softmax", kernel_initializer=glorot, - bias_initializer=output_bias, name=layer_name - ) - elif kwargs.get('num_raters', 1) > 1: - if kwargs.get('multitask', False): - raise NotImplementedError - last_layer = 
tf.keras.layers.Dense( - kwargs['num_raters'], activation="sigmoid", kernel_initializer=glorot, - bias_initializer=output_bias, name='probs') - - else: - last_layer = tf.keras.layers.Dense( - 1, activation="sigmoid", kernel_initializer=glorot, - bias_initializer=output_bias, name=layer_name - ) +def get_last_layer(**kwargs): + output_bias = _get_bias(**kwargs) + if "glorot" in kwargs: + glorot = kwargs["glorot"] + else: + glorot = tf.keras.initializers.glorot_uniform(seed=kwargs["seed"]) + layer_name = kwargs.get("last_layer_name", "dense_1") + + if kwargs.get("num_classes", 1) > 1: + last_layer = tf.keras.layers.Dense( + kwargs["num_classes"], + activation="softmax", + kernel_initializer=glorot, + bias_initializer=output_bias, + name=layer_name, + ) + + elif kwargs.get("num_raters", 1) > 1: + if kwargs.get("multitask", False): + raise NotImplementedError + last_layer = tf.keras.layers.Dense( + kwargs["num_raters"], + activation="sigmoid", + kernel_initializer=glorot, + bias_initializer=output_bias, + name="probs", + ) + + else: + last_layer = tf.keras.layers.Dense( + 1, + activation="sigmoid", + kernel_initializer=glorot, + bias_initializer=output_bias, + name=layer_name, + ) + + return last_layer - return last_layer def load_bertweet(**kwargs): - bert = TFAutoModelForSequenceClassification.from_pretrained( - os.path.join(LOCAL_MODEL_DIR, "bertweet-base"), - num_labels=1, - classifier_dropout=0.1, - hidden_size=768, - ) - if "num_classes" in kwargs and kwargs["num_classes"] > 2: - raise NotImplementedError + bert = TFAutoModelForSequenceClassification.from_pretrained( + os.path.join(LOCAL_MODEL_DIR, "bertweet-base"), + num_labels=1, + classifier_dropout=0.1, + hidden_size=768, + ) + if "num_classes" in kwargs and kwargs["num_classes"] > 2: + raise NotImplementedError - return bert, True + return bert, True def load( - optimizer, - seed, - model_type="twitter_multilingual_bert_base_cased_mlm", - loss_name="BCE", - trainable=True, - **kwargs, + optimizer: str, + seed: int, + model_type: str = "twitter_multilingual_bert_base_cased_mlm", + loss_name: str = "BCE", + trainable: bool = True, + **kwargs, ): - if model_type == "bertweet-base": - model, from_logits = load_bertweet() - else: - model, from_logits = load_inhouse_bert(model_type, trainable, seed, **kwargs) - - pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc", from_logits=from_logits) - roc_auc = tf.keras.metrics.AUC(curve="ROC", name="roc_auc", from_logits=from_logits) - - loss = get_loss(loss_name, from_logits, **kwargs) - if kwargs.get('content_num_classes', None): - second_loss = get_loss(loss_name=kwargs['content_loss_name'], from_logits=from_logits) - loss_weights = {'content_output': kwargs['content_loss_weight'], 'target_output': 1} - model.compile( - optimizer=optimizer, - loss={'content_output': second_loss, 'target_output': loss}, - loss_weights=loss_weights, - metrics=[pr_auc, roc_auc], - ) - - else: - model.compile( - optimizer=optimizer, - loss=loss, - metrics=[pr_auc, roc_auc], - ) - print(model.summary(), "logits: ", from_logits) - - return model \ No newline at end of file + if model_type == "bertweet-base": + model, from_logits = load_bertweet() + else: + model, from_logits = load_inhouse_bert(model_type, trainable, seed, **kwargs) + + pr_auc = tf.keras.metrics.AUC(curve="PR", name="pr_auc", from_logits=from_logits) + roc_auc = tf.keras.metrics.AUC(curve="ROC", name="roc_auc", from_logits=from_logits) + + loss = get_loss(loss_name, from_logits, **kwargs) + if kwargs.get("content_num_classes", None): + 
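# dual-head models are compiled with one loss per output head: the content + # head is weighted by content_loss_weight, the target head by 1. +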
second_loss = get_loss( + loss_name=kwargs["content_loss_name"], from_logits=from_logits + ) + loss_weights = { + "content_output": kwargs["content_loss_weight"], + "target_output": 1, + } + model.compile( + optimizer=optimizer, + loss={"content_output": second_loss, "target_output": loss}, + loss_weights=loss_weights, + metrics=[pr_auc, roc_auc], + ) + + else: + model.compile( + optimizer=optimizer, + loss=loss, + metrics=[pr_auc, roc_auc], + ) + print(model.summary(), "logits: ", from_logits) + + return model diff --git a/trust_and_safety_models/toxicity/optim/callbacks.py b/trust_and_safety_models/toxicity/optim/callbacks.py index bbf8d7c97..bced640df 100644 --- a/trust_and_safety_models/toxicity/optim/callbacks.py +++ b/trust_and_safety_models/toxicity/optim/callbacks.py @@ -1,220 +1,246 @@ -from collections import defaultdict import os +from collections import defaultdict +from typing import List -from toxicity_ml_pipeline.settings.default_settings_tox import REMOTE_LOGDIR -from toxicity_ml_pipeline.settings.default_settings_abs import LABEL_NAMES -from toxicity_ml_pipeline.utils.absv_utils import parse_labeled_data -from toxicity_ml_pipeline.utils.helpers import compute_precision_fixed_recall, execute_command - -from sklearn.metrics import average_precision_score, roc_auc_score +import numpy as np import tensorflow as tf import wandb +from sklearn.metrics import average_precision_score, roc_auc_score +from toxicity_ml_pipeline.settings.default_settings_abs import LABEL_NAMES +from toxicity_ml_pipeline.settings.default_settings_tox import REMOTE_LOGDIR +from toxicity_ml_pipeline.utils.absv_utils import parse_labeled_data +from toxicity_ml_pipeline.utils.helpers import ( + compute_precision_fixed_recall, + execute_command, +) class NothingCallback(tf.keras.callbacks.Callback): - def on_epoch_begin(self, epoch, logs=None): - print("ici, ", epoch) + def on_epoch_begin(self, epoch, logs=None): + print("epoch begin ", epoch) - def on_epoch_end(self, epoch, logs=None): - print("fin ", epoch) + def on_epoch_end(self, epoch, logs=None): + print("epoch end ", epoch) - def on_train_batch_end(self, batch, logs=None): - print("fin de batch ", batch) + def on_train_batch_end(self, batch, logs=None): + print("end of batch ", batch) class ControlledStoppingCheckpointCallback(tf.keras.callbacks.ModelCheckpoint): - def __init__(self, stopping_epoch, *args, **kwargs): - super().__init__(*args, **kwargs) - self.stopping_epoch = stopping_epoch + def __init__(self, stopping_epoch, *args, **kwargs): + super().__init__(*args, **kwargs) + self.stopping_epoch = stopping_epoch - def on_epoch_end(self, epoch, logs=None): - super().on_epoch_end(epoch, logs) - if epoch == self.stopping_epoch: - self.model.stop_training = True + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs) + if epoch == self.stopping_epoch: + self.model.stop_training = True class SyncingTensorBoard(tf.keras.callbacks.TensorBoard): - def __init__(self, remote_logdir=None, *args, **kwargs): - super().__init__(*args, **kwargs) - self.remote_logdir = remote_logdir if remote_logdir is not None else REMOTE_LOGDIR + def __init__(self, remote_logdir=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.remote_logdir = ( + remote_logdir if remote_logdir is not None else REMOTE_LOGDIR + ) - def on_epoch_end(self, epoch, logs=None): - super().on_epoch_end(epoch, logs=logs) - self.synchronize() + def on_epoch_end(self, epoch, logs=None): + super().on_epoch_end(epoch, logs=logs) + self.synchronize() - def synchronize(self): - 
base_dir = os.path.dirname(self.log_dir) - cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}" - execute_command(cmd) + def synchronize(self): + base_dir = os.path.dirname(self.log_dir) + cmd = f"gsutil -m rsync -r {base_dir} {self.remote_logdir}" + execute_command(cmd) class GradientLoggingTensorBoard(SyncingTensorBoard): - def __init__(self, loader, val_data, freq, *args, **kwargs): - super().__init__(*args, **kwargs) - val_dataset = loader.get_balanced_dataset( - training_data=val_data, size_limit=50, return_as_batch=False - ) - data_args = list(val_dataset.batch(32).take(1))[0] - self.x_batch, self.y_batch = data_args[0], data_args[1] - self.freq = freq - self.counter = 0 - - def _log_gradients(self): - writer = self._train_writer - - with writer.as_default(): - with tf.GradientTape() as tape: - y_pred = self.model(self.x_batch) - loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred) - gradient_norm = tf.linalg.global_norm(tape.gradient(loss, self.model.trainable_weights)) - - tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter) - writer.flush() - - def on_train_batch_end(self, batch, logs=None): - super().on_batch_end(batch, logs=logs) - self.counter += 1 - if batch % self.freq == 0: - self._log_gradients() + def __init__(self, loader, val_data, freq, *args, **kwargs): + super().__init__(*args, **kwargs) + val_dataset = loader.get_balanced_dataset( + training_data=val_data, size_limit=50, return_as_batch=False + ) + data_args = list(val_dataset.batch(32).take(1))[0] + self.x_batch, self.y_batch = data_args[0], data_args[1] + self.freq = freq + self.counter = 0 + + def _log_gradients(self): + writer = self._train_writer + + with writer.as_default(): + with tf.GradientTape() as tape: + y_pred = self.model(self.x_batch) + loss = self.model.compiled_loss(y_true=self.y_batch, y_pred=y_pred) + gradient_norm = tf.linalg.global_norm( + tape.gradient(loss, self.model.trainable_weights) + ) + + tf.summary.scalar("gradient_norm", data=gradient_norm, step=self.counter) + writer.flush() + + def on_train_batch_end(self, batch, logs=None): + super().on_batch_end(batch, logs=logs) + self.counter += 1 + if batch % self.freq == 0: + self._log_gradients() class AdditionalResultLogger(tf.keras.callbacks.Callback): - def __init__( - self, - data, - set_, - fixed_recall=0.85, - from_logits=False, - dataset_transform_func=None, - batch_size=64, - dual_head=None, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.set_ = set_ - if data is None: - return None - - self.single_head = True - try: - self.labels = data.int_label.values - except AttributeError: - self.labels = data.to_dataframe()[LABEL_NAMES].values.astype('int') - self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size) - self.label_names = LABEL_NAMES - else: - self.label_names = [''] - if dual_head: - self.label_names = [f'{e}_label' for e in dual_head] - self.labels = {f'{e}_output': data[f'{e}_label'].values for e in dual_head} - self.single_head = False - if dataset_transform_func is None: - self.data = data.text.values - else: - self.data = dataset_transform_func(data, mb_size=batch_size, shuffle=False) - - finally: - if len(self.label_names) == 1: - self.metric_kw = {} - else: - self.metric_kw = {'average': None} - - self.counter = 0 - self.best_metrics = defaultdict(float) - self.from_logits = from_logits - print(f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}") - - if 1 < fixed_recall <= 100: - fixed_recall = fixed_recall / 
100 - elif not (0 < fixed_recall <= 100): - raise ValueError("Threshold should be between 0 and 1, or 0 and 100") - self.fixed_recall = fixed_recall - self.batch_size = batch_size - - def compute_precision_fixed_recall(self, labels, preds): - result, _ = compute_precision_fixed_recall(labels=labels, preds=preds, - fixed_recall=self.fixed_recall) - - return result - - def on_epoch_end(self, epoch, logs=None): - self.additional_evaluations(step=epoch, eval_time="epoch") - - def on_train_batch_end(self, batch, logs=None): - self.counter += 1 - if self.counter % 2000 == 0: - self.additional_evaluations(step=self.counter, eval_time="batch") - - def _binary_evaluations(self, preds, label_name=None, class_index=None): - mask = None - curr_labels = self.labels - if label_name is not None: - curr_labels = self.labels[label_name] - if class_index is not None: - curr_labels = (curr_labels == class_index).astype(int) - - if -1 in curr_labels: - mask = curr_labels != -1 - curr_labels = curr_labels[mask] - preds = preds[mask] - - return { - f"precision_recall{self.fixed_recall}": self.compute_precision_fixed_recall( - labels=curr_labels, preds=preds - ), - "pr_auc": average_precision_score(y_true=curr_labels, y_score=preds), - "roc_auc": roc_auc_score(y_true=curr_labels, y_score=preds), - } - - - def _multiclass_evaluations(self, preds): - pr_auc_l = average_precision_score(y_true=self.labels, y_score=preds, **self.metric_kw) - roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw) - metrics = {} - for i, label in enumerate(self.label_names): - metrics[f'pr_auc_{label}'] = pr_auc_l[i] - metrics[f'roc_auc_{label}'] = roc_auc_l[i] - - return metrics - - def additional_evaluations(self, step, eval_time): - print("Evaluating ", self.set_, eval_time, step) - - preds = self.model.predict(x=self.data, batch_size=self.batch_size) - if self.from_logits: - preds = tf.keras.activations.sigmoid(preds.logits).numpy() - - if self.single_head: - if len(self.label_names) == 1: - metrics = self._binary_evaluations(preds) - else: - metrics = self._multiclass_evaluations(preds) - else: - if preds[0].shape[1] == 1: - binary_preds = preds[0] - multic_preds = preds[1] - else: - binary_preds = preds[1] - multic_preds = preds[0] - - binary_metrics = self._binary_evaluations(binary_preds, label_name='target_output') - metrics = {f'{k}_target': v for k, v in binary_metrics.items()} - num_classes = multic_preds.shape[1] - for class_ in range(num_classes): - binary_metrics = self._binary_evaluations(multic_preds[:, class_], label_name='content_output', class_index=class_) - metrics.update({f'{k}_content_{class_}': v for k, v in binary_metrics.items()}) - - for k, v in metrics.items(): - self.best_metrics[f"max_{k}"] = max(v, self.best_metrics[f"max_{k}"]) - - self.log_metrics(metrics, step=step, eval_time=eval_time) - - def log_metrics(self, metrics_d, step, eval_time): - commit = False if self.set_ == "validation" else True - to_report = {self.set_: {**metrics_d, **self.best_metrics}} - - if eval_time == "epoch": - to_report["epoch"] = step - - wandb.log(to_report, commit=commit) + def __init__( + self, + data: tf.data.Dataset, + set_: str, + fixed_recall: float = 0.85, + from_logits: bool = False, + dataset_transform_func: callable = None, + batch_size: int = 64, + dual_head: List[str] = None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.set_ = set_ + if data is None: + return None + + self.single_head = True + try: + self.labels = data.int_label.values + except AttributeError: + 
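# the data object has no int_label column (ABS-style datasets): evaluate + # on its LABEL_NAMES columns via to_dataframe() instead. +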
self.labels = data.to_dataframe()[LABEL_NAMES].values.astype("int") + self.data = data.to_tf_dataset().map(parse_labeled_data).batch(batch_size) + self.label_names = LABEL_NAMES + else: + self.label_names = [""] + if dual_head: + self.label_names = [f"{e}_label" for e in dual_head] + self.labels = { + f"{e}_output": data[f"{e}_label"].values for e in dual_head + } + self.single_head = False + if dataset_transform_func is None: + self.data = data.text.values + else: + self.data = dataset_transform_func( + data, mb_size=batch_size, shuffle=False + ) + + finally: + if len(self.label_names) == 1: + self.metric_kw = {} + else: + self.metric_kw = {"average": None} + + self.counter = 0 + self.best_metrics = defaultdict(float) + self.from_logits = from_logits + print( + f"Loaded callback for {set_}, from_logits: {from_logits}, labels {self.label_names}" + ) + + if 1 < fixed_recall <= 100: + fixed_recall = fixed_recall / 100 + elif not (0 < fixed_recall <= 100): + raise ValueError("Threshold should be between 0 and 1, or 0 and 100") + self.fixed_recall = fixed_recall + self.batch_size = batch_size + + def compute_precision_fixed_recall(self, labels: np.ndarray, preds: np.ndarray): + result, _ = compute_precision_fixed_recall( + labels=labels, preds=preds, fixed_recall=self.fixed_recall + ) + + return result + + def on_epoch_end(self, epoch, logs=None): + self.additional_evaluations(step=epoch, eval_time="epoch") + + def on_train_batch_end(self, batch, logs=None): + self.counter += 1 + if self.counter % 2000 == 0: + self.additional_evaluations(step=self.counter, eval_time="batch") + + def _binary_evaluations( + self, preds: np.ndarray, label_name=None, class_index: int = None + ): + mask = None + curr_labels = self.labels + if label_name is not None: + curr_labels = self.labels[label_name] + if class_index is not None: + curr_labels = (curr_labels == class_index).astype(int) + + if -1 in curr_labels: + mask = curr_labels != -1 + curr_labels = curr_labels[mask] + preds = preds[mask] + + return { + f"precision_recall{self.fixed_recall}": self.compute_precision_fixed_recall( + labels=curr_labels, preds=preds + ), + "pr_auc": average_precision_score(y_true=curr_labels, y_score=preds), + "roc_auc": roc_auc_score(y_true=curr_labels, y_score=preds), + } + + def _multiclass_evaluations(self, preds: np.ndarray): + pr_auc_l = average_precision_score( + y_true=self.labels, y_score=preds, **self.metric_kw + ) + roc_auc_l = roc_auc_score(y_true=self.labels, y_score=preds, **self.metric_kw) + metrics = {} + for i, label in enumerate(self.label_names): + metrics[f"pr_auc_{label}"] = pr_auc_l[i] + metrics[f"roc_auc_{label}"] = roc_auc_l[i] + + return metrics + + def additional_evaluations(self, step: int, eval_time: str): + print("Evaluating ", self.set_, eval_time, step) + + preds = self.model.predict(x=self.data, batch_size=self.batch_size) + if self.from_logits: + preds = tf.keras.activations.sigmoid(preds.logits).numpy() + + if self.single_head: + if len(self.label_names) == 1: + metrics = self._binary_evaluations(preds) + else: + metrics = self._multiclass_evaluations(preds) + else: + if preds[0].shape[1] == 1: + binary_preds = preds[0] + multic_preds = preds[1] + else: + binary_preds = preds[1] + multic_preds = preds[0] + + binary_metrics = self._binary_evaluations( + binary_preds, label_name="target_output" + ) + metrics = {f"{k}_target": v for k, v in binary_metrics.items()} + num_classes = multic_preds.shape[1] + for class_ in range(num_classes): + binary_metrics = self._binary_evaluations( + 
multic_preds[:, class_], + label_name="content_output", + class_index=class_, + ) + metrics.update( + {f"{k}_content_{class_}": v for k, v in binary_metrics.items()} + ) + + for k, v in metrics.items(): + self.best_metrics[f"max_{k}"] = max(v, self.best_metrics[f"max_{k}"]) + + self.log_metrics(metrics, step=step, eval_time=eval_time) + + def log_metrics(self, metrics_d: dict, step: int, eval_time: str): + commit = False if self.set_ == "validation" else True + to_report = {self.set_: {**metrics_d, **self.best_metrics}} + + if eval_time == "epoch": + to_report["epoch"] = step + + wandb.log(to_report, commit=commit) diff --git a/trust_and_safety_models/toxicity/optim/losses.py b/trust_and_safety_models/toxicity/optim/losses.py index 273c6676e..ede9a3c0c 100644 --- a/trust_and_safety_models/toxicity/optim/losses.py +++ b/trust_and_safety_models/toxicity/optim/losses.py @@ -1,56 +1,59 @@ import tensorflow as tf -from keras.utils import tf_utils -from keras.utils import losses_utils from keras import backend +from keras.utils import losses_utils, tf_utils -def inv_kl_divergence(y_true, y_pred): - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - y_true = backend.clip(y_true, backend.epsilon(), 1) - y_pred = backend.clip(y_pred, backend.epsilon(), 1) - return tf.reduce_sum(y_pred * tf.math.log(y_pred / y_true), axis=-1) -def masked_bce(y_true, y_pred): - y_true = tf.cast(y_true, dtype=tf.float32) - mask = y_true != -1 - - return tf.keras.metrics.binary_crossentropy(tf.boolean_mask(y_true, mask), - tf.boolean_mask(y_pred, mask)) +def inv_kl_divergence(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + y_true = backend.clip(y_true, backend.epsilon(), 1) + y_pred = backend.clip(y_pred, backend.epsilon(), 1) + return tf.reduce_sum(y_pred * tf.math.log(y_pred / y_true), axis=-1) + + +def masked_bce(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + y_true = tf.cast(y_true, dtype=tf.float32) + mask = y_true != -1 + + return tf.keras.metrics.binary_crossentropy( + tf.boolean_mask(y_true, mask), tf.boolean_mask(y_pred, mask) + ) class LossFunctionWrapper(tf.keras.losses.Loss): - def __init__(self, - fn, - reduction=losses_utils.ReductionV2.AUTO, - name=None, - **kwargs): - super().__init__(reduction=reduction, name=name) - self.fn = fn - self._fn_kwargs = kwargs - - def call(self, y_true, y_pred): - if tf.is_tensor(y_pred) and tf.is_tensor(y_true): - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true) - - ag_fn = tf.__internal__.autograph.tf_convert(self.fn, tf.__internal__.autograph.control_status_ctx()) - return ag_fn(y_true, y_pred, **self._fn_kwargs) - - def get_config(self): - config = {} - for k, v in self._fn_kwargs.items(): - config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def __init__( + self, fn, reduction=losses_utils.ReductionV2.AUTO, name=None, **kwargs + ): + super().__init__(reduction=reduction, name=name) + self.fn = fn + self._fn_kwargs = kwargs + + def call(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + if tf.is_tensor(y_pred) and tf.is_tensor(y_true): + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true) + + ag_fn = tf.__internal__.autograph.tf_convert( + self.fn, tf.__internal__.autograph.control_status_ctx() + ) + return ag_fn(y_true, y_pred, **self._fn_kwargs) + + def 
get_config(self) -> dict: + config = {} + for k, v in self._fn_kwargs.items(): + config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + class InvKLD(LossFunctionWrapper): - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='inv_kl_divergence'): - super().__init__(inv_kl_divergence, name=name, reduction=reduction) + def __init__( + self, reduction=losses_utils.ReductionV2.AUTO, name: str = "inv_kl_divergence" + ): + super().__init__(inv_kl_divergence, name=name, reduction=reduction) class MaskedBCE(LossFunctionWrapper): - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='masked_bce'): - super().__init__(masked_bce, name=name, reduction=reduction) + def __init__( + self, reduction=losses_utils.ReductionV2.AUTO, name: str = "masked_bce" + ): + super().__init__(masked_bce, name=name, reduction=reduction) diff --git a/trust_and_safety_models/toxicity/optim/schedulers.py b/trust_and_safety_models/toxicity/optim/schedulers.py index 59f6c9afa..4a3d5091e 100644 --- a/trust_and_safety_models/toxicity/optim/schedulers.py +++ b/trust_and_safety_models/toxicity/optim/schedulers.py @@ -4,41 +4,41 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): - def __init__( - self, - initial_learning_rate: float, - decay_schedule_fn: Callable, - warmup_steps: int, - power: float = 1.0, - name: str = "", - ): - super().__init__() - self.initial_learning_rate = initial_learning_rate - self.warmup_steps = warmup_steps - self.power = power - self.decay_schedule_fn = decay_schedule_fn - self.name = name + def __init__( + self, + initial_learning_rate: float, + decay_schedule_fn: Callable, + warmup_steps: int, + power: float = 1.0, + name: str = "", + ): + super().__init__() + self.initial_learning_rate = initial_learning_rate + self.warmup_steps = warmup_steps + self.power = power + self.decay_schedule_fn = decay_schedule_fn + self.name = name - def __call__(self, step): - with tf.name_scope(self.name or "WarmUp") as name: - global_step_float = tf.cast(step, tf.float32) - warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) - warmup_percent_done = global_step_float / warmup_steps_float - warmup_learning_rate = self.initial_learning_rate * tf.math.pow( - warmup_percent_done, self.power - ) - return tf.cond( - global_step_float < warmup_steps_float, - lambda: warmup_learning_rate, - lambda: self.decay_schedule_fn(step - self.warmup_steps), - name=name, - ) + def __call__(self, step): + with tf.name_scope(self.name or "WarmUp") as name: + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) + warmup_percent_done = global_step_float / warmup_steps_float + warmup_learning_rate = self.initial_learning_rate * tf.math.pow( + warmup_percent_done, self.power + ) + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: self.decay_schedule_fn(step - self.warmup_steps), + name=name, + ) - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_schedule_fn": self.decay_schedule_fn, - "warmup_steps": self.warmup_steps, - "power": self.power, - "name": self.name, - } + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_schedule_fn": self.decay_schedule_fn, + "warmup_steps": self.warmup_steps, + "power": self.power, + "name": self.name, + } diff --git 
a/trust_and_safety_models/toxicity/rescoring.py b/trust_and_safety_models/toxicity/rescoring.py index 71d95ed76..0fe1d71b3 100644 --- a/trust_and_safety_models/toxicity/rescoring.py +++ b/trust_and_safety_models/toxicity/rescoring.py @@ -1,54 +1,72 @@ +import numpy as np +import pandas as pd +import tensorflow as tf from toxicity_ml_pipeline.load_model import reload_model_weights from toxicity_ml_pipeline.utils.helpers import load_inference_func, upload_model -import numpy as np -import tensorflow as tf +def score( + language: str, + df: pd.DataFrame, + gcs_model_path: str, + batch_size: int = 64, + text_col: str = "text", + kw: str = "", + **kwargs, +): + if language != "en": + raise NotImplementedError( + "Data preprocessing not implemented here, needs to be added for i18n models" + ) + model_folder = upload_model(full_gcs_model_path=gcs_model_path) + try: + inference_func = load_inference_func(model_folder) + except OSError: + model = reload_model_weights(model_folder, language, **kwargs) + preds = model.predict(x=df[text_col], batch_size=batch_size) + if type(preds) != list: + if len(preds.shape) > 1 and preds.shape[1] > 1: + if "num_classes" in kwargs and kwargs["num_classes"] > 1: + raise NotImplementedError + preds = np.mean(preds, 1) -def score(language, df, gcs_model_path, batch_size=64, text_col="text", kw="", **kwargs): - if language != "en": - raise NotImplementedError( - "Data preprocessing not implemented here, needs to be added for i18n models" - ) - model_folder = upload_model(full_gcs_model_path=gcs_model_path) - try: - inference_func = load_inference_func(model_folder) - except OSError: - model = reload_model_weights(model_folder, language, **kwargs) - preds = model.predict(x=df[text_col], batch_size=batch_size) - if type(preds) != list: - if len(preds.shape)> 1 and preds.shape[1] > 1: - if 'num_classes' in kwargs and kwargs['num_classes'] > 1: - raise NotImplementedError - preds = np.mean(preds, 1) - - df[f"prediction_{kw}"] = preds - else: - if len(preds) > 2: - raise NotImplementedError - for preds_arr in preds: - if preds_arr.shape[1] == 1: - df[f"prediction_{kw}_target"] = preds_arr + df[f"prediction_{kw}"] = preds else: - for ind in range(preds_arr.shape[1]): - df[f"prediction_{kw}_content_{ind}"] = preds_arr[:, ind] + if len(preds) > 2: + raise NotImplementedError + for preds_arr in preds: + if preds_arr.shape[1] == 1: + df[f"prediction_{kw}_target"] = preds_arr + else: + for ind in range(preds_arr.shape[1]): + df[f"prediction_{kw}_content_{ind}"] = preds_arr[:, ind] - return df - else: - return _get_score(inference_func, df, kw=kw, batch_size=batch_size, text_col=text_col) + return df + else: + return _get_score( + inference_func, df, kw=kw, batch_size=batch_size, text_col=text_col + ) -def _get_score(inference_func, df, text_col="text", kw="", batch_size=64): - score_col = f"prediction_{kw}" - beginning = 0 - end = df.shape[0] - predictions = np.zeros(shape=end, dtype=float) +def _get_score( + inference_func: tf.function, + df: pd.DataFrame, + text_col: str = "text", + kw: str = "", + batch_size: int = 64, +) -> pd.DataFrame: + score_col = f"prediction_{kw}" + beginning = 0 + end = df.shape[0] + predictions = np.zeros(shape=end, dtype=float) - while beginning < end: - mb = df[text_col].values[beginning : beginning + batch_size] - res = inference_func(input_1=tf.constant(mb)) - predictions[beginning : beginning + batch_size] = list(res.values())[0].numpy()[:, 0] - beginning += batch_size + while beginning < end: + mb = df[text_col].values[beginning : beginning + 
batch_size] + res = inference_func(input_1=tf.constant(mb)) + predictions[beginning : beginning + batch_size] = list(res.values())[0].numpy()[ + :, 0 + ] + beginning += batch_size - df[score_col] = predictions - return df + df[score_col] = predictions + return df diff --git a/trust_and_safety_models/toxicity/settings/default_settings_tox.py b/trust_and_safety_models/toxicity/settings/default_settings_tox.py index 0968b9adc..dbae08a50 100644 --- a/trust_and_safety_models/toxicity/settings/default_settings_tox.py +++ b/trust_and_safety_models/toxicity/settings/default_settings_tox.py @@ -1,20 +1,19 @@ import os - TEAM_PROJECT = "twttr-toxicity-prod" try: - from google.cloud import bigquery + from google.cloud import bigquery except (ModuleNotFoundError, ImportError): - print("No Google packages") - CLIENT = None + print("No Google packages") + CLIENT = None else: - from google.auth.exceptions import DefaultCredentialsError + from google.auth.exceptions import DefaultCredentialsError - try: - CLIENT = bigquery.Client(project=TEAM_PROJECT) - except DefaultCredentialsError as e: - CLIENT = None - print("Issue at logging time", e) + try: + CLIENT = bigquery.Client(project=TEAM_PROJECT) + except DefaultCredentialsError as e: + CLIENT = None + print("Issue at logging time", e) TRAINING_DATA_LOCATION = f"..." GCS_ADDRESS = "..." diff --git a/trust_and_safety_models/toxicity/train.py b/trust_and_safety_models/toxicity/train.py index de450ee7b..26c4bf7a1 100644 --- a/trust_and_safety_models/toxicity/train.py +++ b/trust_and_safety_models/toxicity/train.py @@ -1,401 +1,444 @@ +import os from datetime import datetime from importlib import import_module -import os +import numpy as np +import pandas as pd +import tensorflow as tf from toxicity_ml_pipeline.data.data_preprocessing import ( - DefaultENNoPreprocessor, - DefaultENPreprocessor, + DefaultENNoPreprocessor, + DefaultENPreprocessor, ) from toxicity_ml_pipeline.data.dataframe_loader import ENLoader, ENLoaderWithSampling from toxicity_ml_pipeline.data.mb_generator import BalancedMiniBatchLoader -from toxicity_ml_pipeline.load_model import load, get_last_layer +from toxicity_ml_pipeline.load_model import get_last_layer, load from toxicity_ml_pipeline.optim.callbacks import ( - AdditionalResultLogger, - ControlledStoppingCheckpointCallback, - GradientLoggingTensorBoard, - SyncingTensorBoard, + AdditionalResultLogger, + ControlledStoppingCheckpointCallback, + GradientLoggingTensorBoard, + SyncingTensorBoard, ) from toxicity_ml_pipeline.optim.schedulers import WarmUp from toxicity_ml_pipeline.settings.default_settings_abs import GCS_ADDRESS as ABS_GCS +from toxicity_ml_pipeline.settings.default_settings_tox import GCS_ADDRESS as TOX_GCS from toxicity_ml_pipeline.settings.default_settings_tox import ( - GCS_ADDRESS as TOX_GCS, - MODEL_DIR, - RANDOM_SEED, - REMOTE_LOGDIR, - WARM_UP_PERC, + MODEL_DIR, + RANDOM_SEED, + REMOTE_LOGDIR, + WARM_UP_PERC, ) from toxicity_ml_pipeline.utils.helpers import check_gpu, set_seeds, upload_model -import numpy as np -import tensorflow as tf - - try: - from tensorflow_addons.optimizers import AdamW + from tensorflow_addons.optimizers import AdamW except ModuleNotFoundError: - print("No TFA") + print("No TFA") class Trainer(object): - OPTIMIZERS = ["Adam", "AdamW"] - - def __init__( - self, - optimizer_name, - weight_decay, - learning_rate, - mb_size, - train_epochs, - content_loss_weight=1, - language="en", - scope='TOX', - project=..., - experiment_id="default", - gradient_clipping=None, - fold="time", - seed=RANDOM_SEED, 
- log_gradients=False, - kw="", - stopping_epoch=None, - test=False, - ): - self.seed = seed - self.weight_decay = weight_decay - self.learning_rate = learning_rate - self.mb_size = mb_size - self.train_epochs = train_epochs - self.gradient_clipping = gradient_clipping - - if optimizer_name not in self.OPTIMIZERS: - raise ValueError( - f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}." - ) - self.optimizer_name = optimizer_name - self.log_gradients = log_gradients - self.test = test - self.fold = fold - self.stopping_epoch = stopping_epoch - self.language = language - if scope == 'TOX': - GCS_ADDRESS = TOX_GCS.format(project=project) - elif scope == 'ABS': - GCS_ADDRESS = ABS_GCS - else: - raise ValueError - GCS_ADDRESS = GCS_ADDRESS.format(project=project) - try: - self.setting_file = import_module(f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings") - except ModuleNotFoundError: - raise ValueError(f"You need to define a setting file for your project {project}.") - experiment_settings = self.setting_file.experiment_settings - - self.project = project - self.remote_logdir = REMOTE_LOGDIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) - self.model_dir = MODEL_DIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) - - if experiment_id not in experiment_settings: - raise ValueError("This is not an experiment id as defined in the settings file.") - - for var, default_value in experiment_settings["default"].items(): - override_val = experiment_settings[experiment_id].get(var, default_value) - print("Setting ", var, override_val) - self.__setattr__(var, override_val) - - self.content_loss_weight = content_loss_weight if self.dual_head else None - - self.mb_loader = BalancedMiniBatchLoader( - fold=self.fold, - seed=self.seed, - perc_training_tox=self.perc_training_tox, - mb_size=self.mb_size, - n_outer_splits="time", - scope=scope, - project=project, - dual_head=self.dual_head, - sample_weights=self.sample_weights, - huggingface=("bertweet" in self.model_type), - ) - self._init_dirnames(kw=kw, experiment_id=experiment_id) - print("------- Checking there is a GPU") - check_gpu() - - def _init_dirnames(self, kw, experiment_id): - kw = "test" if self.test else kw - hyper_param_kw = "" - if self.optimizer_name == "AdamW": - hyper_param_kw += f"{self.weight_decay}_" - if self.gradient_clipping: - hyper_param_kw += f"{self.gradient_clipping}_" - if self.content_loss_weight: - hyper_param_kw += f"{self.content_loss_weight}_" - experiment_name = ( - f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_" - f"{self.optimizer_name}_" - f"{self.learning_rate}_" - f"{hyper_param_kw}" - f"{self.mb_size}_" - f"{self.perc_training_tox}_" - f"{self.train_epochs}_seed{self.seed}" - ) - print("------- Experiment name: ", experiment_name) - self.logdir = ( - f"..." - if self.test - else f"..." 
- ) - self.checkpoint_path = f"{self.model_dir}/{experiment_name}" - - @staticmethod - def _additional_writers(logdir, metric_name): - return tf.summary.create_file_writer(os.path.join(logdir, metric_name)) - - def get_callbacks(self, fold, val_data, test_data): - fold_logdir = self.logdir + f"_fold{fold}" - fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}" - - tb_args = { - "log_dir": fold_logdir, - "histogram_freq": 0, - "update_freq": 500, - "embeddings_freq": 0, - "remote_logdir": f"{self.remote_logdir}_{self.language}" - if not self.test - else f"{self.remote_logdir}_test", - } - tensorboard_callback = ( - GradientLoggingTensorBoard(loader=self.mb_loader, val_data=val_data, freq=10, **tb_args) - if self.log_gradients - else SyncingTensorBoard(**tb_args) - ) - - callbacks = [tensorboard_callback] - if "bertweet" in self.model_type: - from_logits = True - dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds - else: - from_logits = False - dataset_transform_func = None - - fixed_recall = 0.85 if not self.dual_head else 0.5 - val_callback = AdditionalResultLogger( - data=val_data, - set_="validation", - from_logits=from_logits, - dataset_transform_func=dataset_transform_func, - dual_head=self.dual_head, - fixed_recall=fixed_recall - ) - if val_callback is not None: - callbacks.append(val_callback) - - test_callback = AdditionalResultLogger( - data=test_data, - set_="test", - from_logits=from_logits, - dataset_transform_func=dataset_transform_func, - dual_head=self.dual_head, - fixed_recall=fixed_recall - ) - callbacks.append(test_callback) - - checkpoint_args = { - "filepath": fold_checkpoint_path, - "verbose": 0, - "monitor": "val_pr_auc", - "save_weights_only": True, - "mode": "max", - "save_freq": "epoch", - } - if self.stopping_epoch: - checkpoint_callback = ControlledStoppingCheckpointCallback( - **checkpoint_args, - stopping_epoch=self.stopping_epoch, - save_best_only=False, - ) - callbacks.append(checkpoint_callback) - - return callbacks - - def get_lr_schedule(self, steps_per_epoch): - total_num_steps = steps_per_epoch * self.train_epochs - - warm_up_perc = WARM_UP_PERC if self.learning_rate >= 1e-3 else 0 - warm_up_steps = int(total_num_steps * warm_up_perc) - if self.linear_lr_decay: - learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( - self.learning_rate, - total_num_steps - warm_up_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False, - ) - else: - print('Constant learning rate') - learning_rate_fn = self.learning_rate - - if warm_up_perc > 0: - print(f".... using warm-up for {warm_up_steps} steps") - warm_up_schedule = WarmUp( - initial_learning_rate=self.learning_rate, - decay_schedule_fn=learning_rate_fn, - warmup_steps=warm_up_steps, - ) - return warm_up_schedule - return learning_rate_fn - - def get_optimizer(self, schedule): - optim_args = { - "learning_rate": schedule, - "beta_1": 0.9, - "beta_2": 0.999, - "epsilon": 1e-6, - "amsgrad": False, - } - if self.gradient_clipping: - optim_args["global_clipnorm"] = self.gradient_clipping - - print(f".... 
{self.optimizer_name} w global clipnorm {self.gradient_clipping}") - if self.optimizer_name == "Adam": - return tf.keras.optimizers.Adam(**optim_args) - - if self.optimizer_name == "AdamW": - optim_args["weight_decay"] = self.weight_decay - return AdamW(**optim_args) - raise NotImplementedError - - def get_training_actors(self, steps_per_epoch, val_data, test_data, fold): - callbacks = self.get_callbacks(fold=fold, val_data=val_data, test_data=test_data) - schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch) - - optimizer = self.get_optimizer(schedule) - - return optimizer, callbacks - - def load_data(self): - if self.project == 435 or self.project == 211: - if self.dataset_type is None: - data_loader = ENLoader(project=self.project, setting_file=self.setting_file) - dataset_type_args = {} - else: - data_loader = ENLoaderWithSampling(project=self.project, setting_file=self.setting_file) - dataset_type_args = self.dataset_type - - df = data_loader.load_data( - language=self.language, test=self.test, reload=self.dataset_reload, **dataset_type_args - ) - - return df - - def preprocess(self, df): - if self.project == 435 or self.project == 211: - if self.preprocessing is None: - data_prepro = DefaultENNoPreprocessor() - elif self.preprocessing == "default": - data_prepro = DefaultENPreprocessor() - else: + OPTIMIZERS = ["Adam", "AdamW"] + + def __init__( + self, + optimizer_name: str, + weight_decay: float, + learning_rate: float, + mb_size: int, + train_epochs: int, + content_loss_weight: float = 1.0, + language: str = "en", + scope: str = "TOX", + project=..., + experiment_id: str = "default", + gradient_clipping: float = None, + fold: str = "time", + seed: int = RANDOM_SEED, + log_gradients: bool = False, + kw: str = "", + stopping_epoch: int = None, + test: bool = False, + ): + self.seed = seed + self.weight_decay = weight_decay + self.learning_rate = learning_rate + self.mb_size = mb_size + self.train_epochs = train_epochs + self.gradient_clipping = gradient_clipping + + if optimizer_name not in self.OPTIMIZERS: + raise ValueError( + f"Optimizer {optimizer_name} not implemented. Accepted values {self.OPTIMIZERS}." + ) + self.optimizer_name = optimizer_name + self.log_gradients = log_gradients + self.test = test + self.fold = fold + self.stopping_epoch = stopping_epoch + self.language = language + if scope == "TOX": + GCS_ADDRESS = TOX_GCS.format(project=project) + elif scope == "ABS": + GCS_ADDRESS = ABS_GCS + else: + raise ValueError + GCS_ADDRESS = GCS_ADDRESS.format(project=project) + try: + self.setting_file = import_module( + f"toxicity_ml_pipeline.settings.{scope.lower()}{project}_settings" + ) + except ModuleNotFoundError: + raise ValueError( + f"You need to define a setting file for your project {project}." + ) + experiment_settings = self.setting_file.experiment_settings + + self.project = project + self.remote_logdir = REMOTE_LOGDIR.format( + GCS_ADDRESS=GCS_ADDRESS, project=project + ) + self.model_dir = MODEL_DIR.format(GCS_ADDRESS=GCS_ADDRESS, project=project) + + if experiment_id not in experiment_settings: + raise ValueError( + "This is not an experiment id as defined in the settings file." 
+ ) + + for var, default_value in experiment_settings["default"].items(): + override_val = experiment_settings[experiment_id].get(var, default_value) + print("Setting ", var, override_val) + self.__setattr__(var, override_val) + + self.content_loss_weight = content_loss_weight if self.dual_head else None + + self.mb_loader = BalancedMiniBatchLoader( + fold=self.fold, + seed=self.seed, + perc_training_tox=self.perc_training_tox, + mb_size=self.mb_size, + n_outer_splits="time", + scope=scope, + project=project, + dual_head=self.dual_head, + sample_weights=self.sample_weights, + huggingface=("bertweet" in self.model_type), + ) + self._init_dirnames(kw=kw, experiment_id=experiment_id) + print("------- Checking there is a GPU") + check_gpu() + + def _init_dirnames(self, kw: str, experiment_id: str): + kw = "test" if self.test else kw + hyper_param_kw = "" + if self.optimizer_name == "AdamW": + hyper_param_kw += f"{self.weight_decay}_" + if self.gradient_clipping: + hyper_param_kw += f"{self.gradient_clipping}_" + if self.content_loss_weight: + hyper_param_kw += f"{self.content_loss_weight}_" + experiment_name = ( + f"{self.language}{str(datetime.now()).replace(' ', '')[:-7]}{kw}_{experiment_id}{self.fold}_" + f"{self.optimizer_name}_" + f"{self.learning_rate}_" + f"{hyper_param_kw}" + f"{self.mb_size}_" + f"{self.perc_training_tox}_" + f"{self.train_epochs}_seed{self.seed}" + ) + print("------- Experiment name: ", experiment_name) + self.logdir = f"..." if self.test else f"..." + self.checkpoint_path = f"{self.model_dir}/{experiment_name}" + + @staticmethod + def _additional_writers(logdir: str, metric_name: str) -> tf.summary.SummaryWriter: + return tf.summary.create_file_writer(os.path.join(logdir, metric_name)) + + def get_callbacks(self, fold, val_data, test_data): + fold_logdir = self.logdir + f"_fold{fold}" + fold_checkpoint_path = self.checkpoint_path + f"_fold{fold}/{{epoch:02d}}" + + tb_args = { + "log_dir": fold_logdir, + "histogram_freq": 0, + "update_freq": 500, + "embeddings_freq": 0, + "remote_logdir": f"{self.remote_logdir}_{self.language}" + if not self.test + else f"{self.remote_logdir}_test", + } + tensorboard_callback = ( + GradientLoggingTensorBoard( + loader=self.mb_loader, val_data=val_data, freq=10, **tb_args + ) + if self.log_gradients + else SyncingTensorBoard(**tb_args) + ) + + callbacks = [tensorboard_callback] + if "bertweet" in self.model_type: + from_logits = True + dataset_transform_func = self.mb_loader.make_huggingface_tensorflow_ds + else: + from_logits = False + dataset_transform_func = None + + fixed_recall = 0.85 if not self.dual_head else 0.5 + val_callback = AdditionalResultLogger( + data=val_data, + set_="validation", + from_logits=from_logits, + dataset_transform_func=dataset_transform_func, + dual_head=self.dual_head, + fixed_recall=fixed_recall, + ) + if val_callback is not None: + callbacks.append(val_callback) + + test_callback = AdditionalResultLogger( + data=test_data, + set_="test", + from_logits=from_logits, + dataset_transform_func=dataset_transform_func, + dual_head=self.dual_head, + fixed_recall=fixed_recall, + ) + callbacks.append(test_callback) + + checkpoint_args = { + "filepath": fold_checkpoint_path, + "verbose": 0, + "monitor": "val_pr_auc", + "save_weights_only": True, + "mode": "max", + "save_freq": "epoch", + } + if self.stopping_epoch: + checkpoint_callback = ControlledStoppingCheckpointCallback( + **checkpoint_args, + stopping_epoch=self.stopping_epoch, + save_best_only=False, + ) + callbacks.append(checkpoint_callback) + + return 
callbacks + + def get_lr_schedule(self, steps_per_epoch): + total_num_steps = steps_per_epoch * self.train_epochs + + warm_up_perc = WARM_UP_PERC if self.learning_rate >= 1e-3 else 0 + warm_up_steps = int(total_num_steps * warm_up_perc) + if self.linear_lr_decay: + learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( + self.learning_rate, + total_num_steps - warm_up_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False, + ) + else: + print("Constant learning rate") + learning_rate_fn = self.learning_rate + + if warm_up_perc > 0: + print(f".... using warm-up for {warm_up_steps} steps") + warm_up_schedule = WarmUp( + initial_learning_rate=self.learning_rate, + decay_schedule_fn=learning_rate_fn, + warmup_steps=warm_up_steps, + ) + return warm_up_schedule + return learning_rate_fn + + def get_optimizer( + self, schedule: tf.keras.optimizers.schedules.LearningRateSchedule + ): + optim_args = { + "learning_rate": schedule, + "beta_1": 0.9, + "beta_2": 0.999, + "epsilon": 1e-6, + "amsgrad": False, + } + if self.gradient_clipping: + optim_args["global_clipnorm"] = self.gradient_clipping + + print(f".... {self.optimizer_name} w global clipnorm {self.gradient_clipping}") + if self.optimizer_name == "Adam": + return tf.keras.optimizers.Adam(**optim_args) + + if self.optimizer_name == "AdamW": + optim_args["weight_decay"] = self.weight_decay + return AdamW(**optim_args) raise NotImplementedError - return data_prepro( - df=df, - label_column=self.label_column, - class_weight=self.perc_training_tox if self.sample_weights == 'class_weight' else None, - filter_low_agreements=self.filter_low_agreements, - num_classes=self.num_classes, - ) - - def load_model(self, optimizer): - smart_bias_value = ( - np.log(self.perc_training_tox / (1 - self.perc_training_tox)) if self.smart_bias_init else 0 - ) - model = load( - optimizer, - seed=self.seed, - trainable=self.trainable, - model_type=self.model_type, - loss_name=self.loss_name, - num_classes=self.num_classes, - additional_layer=self.additional_layer, - smart_bias_value=smart_bias_value, - content_num_classes=self.content_num_classes, - content_loss_name=self.content_loss_name, - content_loss_weight=self.content_loss_weight - ) - - if self.model_reload is not False: - model_folder = upload_model(full_gcs_model_path=os.path.join(self.model_dir, self.model_reload)) - model.load_weights(model_folder) - if self.scratch_last_layer: - print('Putting the last layer back to scratch') - model.layers[-1] = get_last_layer(seed=self.seed, - num_classes=self.num_classes, - smart_bias_value=smart_bias_value) - - return model - - def _train_single_fold(self, mb_generator, test_data, steps_per_epoch, fold, val_data=None): - steps_per_epoch = 100 if self.test else steps_per_epoch - - optimizer, callbacks = self.get_training_actors( - steps_per_epoch=steps_per_epoch, val_data=val_data, test_data=test_data, fold=fold - ) - print("Loading model") - model = self.load_model(optimizer) - print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training") - training_args = { - "epochs": self.train_epochs, - "steps_per_epoch": steps_per_epoch, - "batch_size": self.mb_size, - "callbacks": callbacks, - "verbose": 2, - } - - model.fit(mb_generator, **training_args) - return - - def train_full_model(self): - print("Setting up random seed.") - set_seeds(self.seed) - - print(f"Loading {self.language} data") - df = self.load_data() - df = self.preprocess(df=df) - - print("Going to train on everything but the test dataset") - mini_batches, test_data, steps_per_epoch 
= self.mb_loader.simple_cv_load(df) - - self._train_single_fold( - mb_generator=mini_batches, test_data=test_data, steps_per_epoch=steps_per_epoch, fold="full" - ) - - def train(self): - print("Setting up random seed.") - set_seeds(self.seed) - - print(f"Loading {self.language} data") - df = self.load_data() - df = self.preprocess(df=df) - - print("Loading MB generator") - i = 0 - if self.project == 435 or self.project == 211: - mb_generator, steps_per_epoch, val_data, test_data = self.mb_loader.no_cv_load(full_df=df) - self._train_single_fold( - mb_generator=mb_generator, - val_data=val_data, - test_data=test_data, - steps_per_epoch=steps_per_epoch, - fold=i, - ) - else: - raise ValueError("Sure you want to do multiple fold training") - for mb_generator, steps_per_epoch, val_data, test_data in self.mb_loader(full_df=df): + def get_training_actors(self, steps_per_epoch, val_data, test_data, fold): + callbacks = self.get_callbacks( + fold=fold, val_data=val_data, test_data=test_data + ) + schedule = self.get_lr_schedule(steps_per_epoch=steps_per_epoch) + + optimizer = self.get_optimizer(schedule) + + return optimizer, callbacks + + def load_data(self) -> pd.DataFrame: + if self.project == 435 or self.project == 211: + if self.dataset_type is None: + data_loader = ENLoader( + project=self.project, setting_file=self.setting_file + ) + dataset_type_args = {} + else: + data_loader = ENLoaderWithSampling( + project=self.project, setting_file=self.setting_file + ) + dataset_type_args = self.dataset_type + + df = data_loader.load_data( + language=self.language, + test=self.test, + reload=self.dataset_reload, + **dataset_type_args, + ) + + return df + + def preprocess(self, df: pd.DataFrame): + if self.project == 435 or self.project == 211: + if self.preprocessing is None: + data_prepro = DefaultENNoPreprocessor() + elif self.preprocessing == "default": + data_prepro = DefaultENPreprocessor() + else: + raise NotImplementedError + + return data_prepro( + df=df, + label_column=self.label_column, + class_weight=self.perc_training_tox + if self.sample_weights == "class_weight" + else None, + filter_low_agreements=self.filter_low_agreements, + num_classes=self.num_classes, + ) + + def load_model(self, optimizer: tf.keras.optimizers.Optimizer): + smart_bias_value = ( + np.log(self.perc_training_tox / (1 - self.perc_training_tox)) + if self.smart_bias_init + else 0 + ) + model = load( + optimizer, + seed=self.seed, + trainable=self.trainable, + model_type=self.model_type, + loss_name=self.loss_name, + num_classes=self.num_classes, + additional_layer=self.additional_layer, + smart_bias_value=smart_bias_value, + content_num_classes=self.content_num_classes, + content_loss_name=self.content_loss_name, + content_loss_weight=self.content_loss_weight, + ) + + if self.model_reload is not False: + model_folder = upload_model( + full_gcs_model_path=os.path.join(self.model_dir, self.model_reload) + ) + model.load_weights(model_folder) + if self.scratch_last_layer: + print("Putting the last layer back to scratch") + model.layers[-1] = get_last_layer( + seed=self.seed, + num_classes=self.num_classes, + smart_bias_value=smart_bias_value, + ) + + return model + + def _train_single_fold( + self, + mb_generator: tf.data.Dataset, + test_data: pd.DataFrame, + steps_per_epoch: int, + fold: int, + val_data: pd.DataFrame = None, + ): + steps_per_epoch = 100 if self.test else steps_per_epoch + + optimizer, callbacks = self.get_training_actors( + steps_per_epoch=steps_per_epoch, + val_data=val_data, + test_data=test_data, + 
fold=fold,
+        )
+        print("Loading model")
+        model = self.load_model(optimizer)
+        print(f"Nb of steps per epoch: {steps_per_epoch} ---- launching training")
+        training_args = {
+            "epochs": self.train_epochs,
+            "steps_per_epoch": steps_per_epoch,
+            "batch_size": self.mb_size,
+            "callbacks": callbacks,
+            "verbose": 2,
+        }
+
+        model.fit(mb_generator, **training_args)
+
+    def train_full_model(self):
+        print("Setting up random seed.")
+        set_seeds(self.seed)
+
+        print(f"Loading {self.language} data")
+        df = self.load_data()
+        df = self.preprocess(df=df)
+
+        print("Going to train on everything but the test dataset")
+        mini_batches, test_data, steps_per_epoch = self.mb_loader.simple_cv_load(df)
+
         self._train_single_fold(
-      mb_generator=mb_generator,
-      val_data=val_data,
-      test_data=test_data,
-      steps_per_epoch=steps_per_epoch,
-      fold=i,
+            mb_generator=mini_batches,
+            test_data=test_data,
+            steps_per_epoch=steps_per_epoch,
+            fold="full",
         )
-    i += 1
-    if i == 3:
-      break
+
+    def train(self):
+        print("Setting up random seed.")
+        set_seeds(self.seed)
+        print(f"Loading {self.language} data")
+        df = self.load_data()
+        df = self.preprocess(df=df)
+        print("Loading MB generator")
+        i = 0
+
+        if self.project == 435 or self.project == 211:
+            (
+                mb_generator,
+                steps_per_epoch,
+                val_data,
+                test_data,
+            ) = self.mb_loader.no_cv_load(full_df=df)
+            self._train_single_fold(
+                mb_generator=mb_generator,
+                val_data=val_data,
+                test_data=test_data,
+                steps_per_epoch=steps_per_epoch,
+                fold=i,
+            )
+        else:
+            raise ValueError("Multiple-fold training is not supported for this project.")
+
+        # NOTE: for projects 435/211, execution falls through to this capped CV
+        # loop after the single-fold run above, mirroring the pre-format control flow.
+        for mb_generator, steps_per_epoch, val_data, test_data in self.mb_loader(
+            full_df=df
+        ):
+            self._train_single_fold(
+                mb_generator=mb_generator,
+                val_data=val_data,
+                test_data=test_data,
+                steps_per_epoch=steps_per_epoch,
+                fold=i,
+            )
+            i += 1
+            if i == 3:
+                break
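For orientation, the reformatted trainer above is driven roughly as in the
following sketch. This is hypothetical usage, not part of the patch: the class
name `ToxicityTrainer`, the import path, and all argument values are
assumptions made for illustration.

    # Hypothetical sketch -- class name, module path and values are assumed.
    from toxicity_ml_pipeline.train import ToxicityTrainer

    trainer = ToxicityTrainer(
        optimizer_name="AdamW",  # must be in OPTIMIZERS, else __init__ raises
        weight_decay=0.01,
        learning_rate=1e-5,
        mb_size=32,
        train_epochs=10,
        project=435,             # 435/211 take the no_cv_load path in train()
    )
    trainer.train()              # single fold, then the capped 3-fold CV loop
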
diff --git a/trust_and_safety_models/toxicity/utils/helpers.py b/trust_and_safety_models/toxicity/utils/helpers.py
index c21d7eb1c..8924978bb 100644
--- a/trust_and_safety_models/toxicity/utils/helpers.py
+++ b/trust_and_safety_models/toxicity/utils/helpers.py
@@ -3,97 +3,101 @@
 import random as python_random
 import subprocess

-from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR
-
 import numpy as np
 from sklearn.metrics import precision_recall_curve
-
+from toxicity_ml_pipeline.settings.default_settings_tox import LOCAL_DIR

 try:
-  import tensorflow as tf
+    import tensorflow as tf
 except ModuleNotFoundError:
-  pass
-
+    pass

-def upload_model(full_gcs_model_path):
-  folder_name = full_gcs_model_path
-  if folder_name[:5] != "gs://":
-    folder_name = "gs://" + folder_name
-  dirname = os.path.dirname(folder_name)
-  epoch = os.path.basename(folder_name)

+def upload_model(full_gcs_model_path: str):
+    folder_name = full_gcs_model_path
+    if folder_name[:5] != "gs://":
+        folder_name = "gs://" + folder_name
+
+    dirname = os.path.dirname(folder_name)
+    epoch = os.path.basename(folder_name)
+
+    model_dir = os.path.join(LOCAL_DIR, "models")
+    cmd = f"mkdir {model_dir}"
+    try:
+        execute_command(cmd)
+    except subprocess.CalledProcessError:
+        pass
+    model_dir = os.path.join(model_dir, os.path.basename(dirname))
+    cmd = f"mkdir {model_dir}"
+    try:
+        execute_command(cmd)
+    except subprocess.CalledProcessError:
+        pass
+
+    try:
+        _ = int(epoch)
+    except ValueError:
+        cmd = f"gsutil rsync -r '{folder_name}' {model_dir}"
+        weights_dir = model_dir
+
+    else:
+        cmd = f"gsutil cp '{dirname}/checkpoint' {model_dir}/"
+        execute_command(cmd)
+        cmd = f"gsutil cp '{os.path.join(dirname, epoch)}*' {model_dir}/"
+        weights_dir = f"{model_dir}/{epoch}"

-  model_dir = os.path.join(LOCAL_DIR, "models")
-  cmd = f"mkdir {model_dir}"
-  try:
-    execute_command(cmd)
-  except subprocess.CalledProcessError:
-    pass
-  model_dir = os.path.join(model_dir, os.path.basename(dirname))
-  cmd = f"mkdir {model_dir}"
-  try: execute_command(cmd)
-  except subprocess.CalledProcessError:
-    pass
-
-  try:
-    _ = int(epoch)
-  except ValueError:
-    cmd = f"gsutil rsync -r '{folder_name}' {model_dir}"
-    weights_dir = model_dir
+    execute_command(cmd)
+    return weights_dir

-  else:
-    cmd = f"gsutil cp '{dirname}/checkpoint' {model_dir}/"
-    execute_command(cmd)
-    cmd = f"gsutil cp '{os.path.join(dirname, epoch)}*' {model_dir}/"
-    weights_dir = f"{model_dir}/{epoch}"
-    execute_command(cmd)
-  return weights_dir

+def compute_precision_fixed_recall(
+    labels: np.ndarray, preds: np.ndarray, fixed_recall: float
+):
+    precision_values, recall_values, thresholds = precision_recall_curve(
+        y_true=labels, probas_pred=preds
+    )
+    index_recall = bisect.bisect_left(-recall_values, -1 * fixed_recall)
+    result = precision_values[index_recall - 1]
+    print(f"Precision at {recall_values[index_recall-1]} recall: {result}")

-def compute_precision_fixed_recall(labels, preds, fixed_recall):
-  precision_values, recall_values, thresholds = precision_recall_curve(y_true=labels, probas_pred=preds)
-  index_recall = bisect.bisect_left(-recall_values, -1 * fixed_recall)
-  result = precision_values[index_recall - 1]
-  print(f"Precision at {recall_values[index_recall-1]} recall: {result}")
+    return result, thresholds[index_recall - 1]

-  return result, thresholds[index_recall - 1]

-def load_inference_func(model_folder):
-  model = tf.saved_model.load(model_folder, ["serve"])
-  inference_func = model.signatures["serving_default"]
-  return inference_func
+def load_inference_func(model_folder: str):
+    model = tf.saved_model.load(model_folder, ["serve"])
+    inference_func = model.signatures["serving_default"]
+    return inference_func


 def execute_query(client, query):
-  job = client.query(query)
-  df = job.result().to_dataframe()
-  return df
+    job = client.query(query)
+    df = job.result().to_dataframe()
+    return df


-def execute_command(cmd, print_=True):
-  s = subprocess.run(cmd, shell=True, capture_output=print_, check=True)
-  if print_:
-    print(s.stderr.decode("utf-8"))
-    print(s.stdout.decode("utf-8"))
+def execute_command(cmd: str, print_: bool = True):
+    s = subprocess.run(cmd, shell=True, capture_output=print_, check=True)
+    if print_:
+        print(s.stderr.decode("utf-8"))
+        print(s.stdout.decode("utf-8"))


 def check_gpu():
-  try:
-    execute_command("nvidia-smi")
-  except subprocess.CalledProcessError:
-    print("There is no GPU when there should be one.")
-    raise AttributeError
-
-  l = tf.config.list_physical_devices("GPU")
-  if len(l) == 0:
-    raise ModuleNotFoundError("Tensorflow has not found the GPU. Check your installation")
-  print(l)
-
-
-def set_seeds(seed):
-  np.random.seed(seed)
-
-  python_random.seed(seed)
-
-  tf.random.set_seed(seed)
+    try:
+        execute_command("nvidia-smi")
+    except subprocess.CalledProcessError:
+        raise AttributeError("There is no GPU when there should be one.")
+
+    l = tf.config.list_physical_devices("GPU")
+    if len(l) == 0:
+        raise ModuleNotFoundError(
+            "Tensorflow has not found the GPU. 
Check your installation" + ) + print(l) + + +def set_seeds(seed: int): + np.random.seed(seed) + python_random.seed(seed) + tf.random.set_seed(seed) diff --git a/twml/libtwml/setup.py b/twml/libtwml/setup.py index 2dcfa105d..ebd76e577 100644 --- a/twml/libtwml/setup.py +++ b/twml/libtwml/setup.py @@ -1,12 +1,12 @@ """ libtwml setup.py module """ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( - name='libtwml', - version='2.0', - description="Tensorflow C++ ops for twml", - packages=find_packages(), - data_files=[('', ['libtwml_tf.so'])], + name="libtwml", + version="2.0", + description="Tensorflow C++ ops for twml", + packages=find_packages(), + data_files=[("", ["libtwml_tf.so"])], ) diff --git a/twml/libtwml/src/ops/scripts/get_inc.py b/twml/libtwml/src/ops/scripts/get_inc.py index c50edfa90..df92dea44 100644 --- a/twml/libtwml/src/ops/scripts/get_inc.py +++ b/twml/libtwml/src/ops/scripts/get_inc.py @@ -2,4 +2,4 @@ import tensorflow.compat.v1 as tf -print(tf.sysconfig.get_include(), end='') +print(tf.sysconfig.get_include(), end="") diff --git a/twml/libtwml/src/ops/scripts/get_lib.py b/twml/libtwml/src/ops/scripts/get_lib.py index 7150c48b7..c212e27d0 100644 --- a/twml/libtwml/src/ops/scripts/get_lib.py +++ b/twml/libtwml/src/ops/scripts/get_lib.py @@ -2,4 +2,4 @@ import tensorflow.compat.v1 as tf -print(tf.sysconfig.get_lib(), end='') +print(tf.sysconfig.get_lib(), end="") diff --git a/twml/setup.py b/twml/setup.py index 7e4003bae..bdd548874 100644 --- a/twml/setup.py +++ b/twml/setup.py @@ -2,28 +2,27 @@ from setuptools import find_packages, setup - THIS_DIR = os.path.dirname(os.path.realpath(__file__)) -TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, 'twml/tests/data') +TWML_TEST_DATA_DIR = os.path.join(THIS_DIR, "twml/tests/data") data_files = [] for parent, children, files in os.walk(TWML_TEST_DATA_DIR): - data_files += [os.path.join(parent, f) for f in files] + data_files += [os.path.join(parent, f) for f in files] setup( - name='twml', - version='2.0', - description="Tensorflow wrapper for twml", - packages=find_packages(exclude=["build"]), - install_requires=[ - 'thriftpy2', - 'numpy', - 'pyyaml', - 'future', - 'scikit-learn', - 'scipy' - ], - package_data={ - 'twml': data_files, - }, + name="twml", + version="2.0", + description="Tensorflow wrapper for twml", + packages=find_packages(exclude=["build"]), + install_requires=[ + "thriftpy2", + "numpy", + "pyyaml", + "future", + "scikit-learn", + "scipy", + ], + package_data={ + "twml": data_files, + }, ) diff --git a/twml/twml/__init__.py b/twml/twml/__init__.py index 0c96df68b..0abfbed35 100644 --- a/twml/twml/__init__.py +++ b/twml/twml/__init__.py @@ -2,60 +2,67 @@ import os +import tensorflow.compat.v1 as tf # noqa: F402 + # Import from twitter.deepbird from twitter.deepbird.logging.log_level import set_logging_level # noqa: F401 from twitter.deepbird.sparse import SparseTensor # noqa: F401 from twitter.deepbird.sparse import sparse_dense_matmul # noqa: F401 -from .util import dynamic_partition, feature_id, limit_bits, limit_sparse_tensor_size # noqa: F401 -from .util import write_file, fixed_length_tensor, setup_tf_logging_formatter # noqa: F401 -from .array import Array # noqa: F401 +from . import constants # noqa: F401 +from . import errors # noqa: F401 +from . import layers # noqa: F401 +from . import lookup # noqa: F401 +from . import readers # noqa: F401 +from . import summary # noqa: F401 +from . 
import tensorboard # noqa: F401 -# Module to parse feature patterns and match them from data_spec.json -from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 +# Custom argparser for Trainer +from .argument_parser import * # noqa: T400 +from .array import Array # noqa: F401 +from .block_format_writer import * # noqa: T400 # Data record streaming, reading, writing, and parsing. from .dataset import * # noqa: T400 -from .readers import * # noqa: T400 -from .block_format_writer import * # noqa: T400 # Graph output functions from .export_output_fns import * # noqa: T400 -# Input parsers -from .parsers import * # noqa: T400 - -# Input functions -from .input_fns import * # noqa: T400 +# Module to parse feature patterns and match them from data_spec.json +from .feature_config import FeatureConfig, FeatureConfigBuilder # noqa: F401 # Feature filter functions from .filters import * # noqa: T400 -# Custom argparser for Trainer -from .argument_parser import * # noqa: T400 +# Input functions +from .input_fns import * # noqa: T400 -from . import constants # noqa: F401 -from . import errors # noqa: F401 -from . import layers # noqa: F401 -from . import lookup # noqa: F401 -from . import readers # noqa: F401 -from . import summary # noqa: F401 -from . import tensorboard # noqa: F401 +# Input parsers +from .parsers import * # noqa: T400 +from .readers import * # noqa: T400 +from .util import feature_id # noqa: F401 +from .util import ( + dynamic_partition, + fixed_length_tensor, + limit_bits, + limit_sparse_tensor_size, + setup_tf_logging_formatter, + write_file, +) -import tensorflow.compat.v1 as tf # noqa: F402 tf.disable_eager_execution() # TODO: Figure out a better way to deal with this. -if 'OMP_NUM_THREADS' not in os.environ and 'MKL_NUM_THREADS' not in os.environ: - os.environ["OMP_NUM_THREADS"] = '1' +if "OMP_NUM_THREADS" not in os.environ and "MKL_NUM_THREADS" not in os.environ: + os.environ["OMP_NUM_THREADS"] = "1" # Import all custom C++ ops -from libtwml import add1, partition_sparse_tensor, CLIB # noqa: F401 +from libtwml import CLIB, add1, partition_sparse_tensor # noqa: F401 # Configure logging levels to info for various frameworks -set_logging_level('INFO') +set_logging_level("INFO") from . import contrib # noqa: F401 from . import hooks # noqa: F401 -from . import trainers # noqa: F401 from . import metrics # noqa: F401 +from . import trainers # noqa: F401 diff --git a/twml/twml/argument_parser.py b/twml/twml/argument_parser.py index c771eebdf..29ab45c86 100644 --- a/twml/twml/argument_parser.py +++ b/twml/twml/argument_parser.py @@ -3,559 +3,804 @@ Command-line argument parsing for the Trainer. """ import argparse +import tempfile from argparse import ArgumentError from operator import attrgetter -import tempfile +from typing import List -import twml import tensorflow.compat.v1 as tf +import twml SERIAL = "serial" TREE = "tree" LOG_LEVELS = { - "debug": tf.logging.DEBUG, - "info": tf.logging.INFO, - "warn": tf.logging.WARN, - "error": tf.logging.ERROR} + "debug": tf.logging.DEBUG, + "info": tf.logging.INFO, + "warn": tf.logging.WARN, + "error": tf.logging.ERROR, +} class SortingHelpFormatter(argparse.HelpFormatter): - """ - Used to sort args alphabetically in the help message. 
- """ - - def add_arguments(self, actions): - actions = sorted(actions, key=attrgetter('option_strings')) - super(SortingHelpFormatter, self).add_arguments(actions) - - -def _set_log_level(level=None): - """Sets the tensorflow log level to the input level.""" - if level is None: - return None - level = level.lower() - if level not in LOG_LEVELS.keys(): - raise ValueError(f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}.") - tf.logging.set_verbosity(LOG_LEVELS[level]) - tf.logging.info(f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}") - return level - - -def get_trainer_parser(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) - - parser.add_argument( - "--save_dir", type=str, default=tempfile.mkdtemp(), - help="Path to the training result directory." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR ") - parser.add_argument( - "--export_dir", type=str, default=None, - help="Path to the directory to export a SavedModel for prediction servers.") - parser.add_argument( - "--log_aggregation_app_id", type=str, default=None, - help="specify app_id for log aggregation. disabled by default.") - parser.add_argument( - "--train.batch_size", "--train_batch_size", type=int, default=32, - dest='train_batch_size', - help="number of samples per training batch") - parser.add_argument( - "--eval.batch_size", "--eval_batch_size", type=int, default=32, - dest='eval_batch_size', - help="number of samples per cross-validation batch. Defaults to train_batch_size") - parser.add_argument( - "--train.learning_rate", "--learning_rate", type=float, default=0.002, - dest='learning_rate', - help="learning rate. Scales the gradient update.") - parser.add_argument( - "--train.steps", "--train_steps", type=int, default=-1, - dest='train_steps', - help="number of training batches before running evaluation." - "Defaults to -1 (runs through entire dataset). " - "Only used for Trainer.[train,learn]. " - "For Trainer.train_and_evaluate, use train.max_steps instead. ") - parser.add_argument( - "--eval.steps", "--eval_steps", type=int, default=-1, - dest="eval_steps", - help="number of steps per evaluation. Each batch is a step." - "Defaults to -1 (runs through entire dataset). ") - parser.add_argument( - "--eval.period", "--eval_period", type=int, default=600, - dest="eval_period", - help="Trainer.train_and_evaluate waits for this long after each evaluation. " - "Defaults to 600 seconds (evaluate every ten minutes). " - "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " - "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. 
" - "eval.period is time between successive evals.") - parser.add_argument( - "--eval.delay", "--eval_delay", type=int, default=120, - dest="eval_delay", - help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" - "Defaults to 120 seconds (evaluate after first 2 minutes of training). " - "eval.delay is time to wait before doing first eval. " - "eval.period is time between successive evals.") - parser.add_argument( - "--train.max_steps", "--train_max_steps", type=int, default=None, - dest="train_max_steps", - help="Stop training after this many global steps. Each training batch is its own step." - "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." - "If set to a non-positive value, loop forever. Usually useful with early stopping.") - parser.add_argument( - "--train.log_metrics", dest="train_log_metrics", action="store_true", default=False, - help="Set this to true to see metrics during training. " - "WARNING: metrics during training does not represent model performance. " - "WARNING: use for debugging only as this slows down training.") - parser.add_argument( - "--train.early_stop_patience", "--early_stop_patience", type=int, default=-1, - dest="early_stop_patience", - help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." - "Defaults to -1 (no early-stopping)." - "NOTE: This can not be enabled when --distributed is also set.") - parser.add_argument( - "--train.early_stop_tolerance", "--early_stop_tolerance", type=float, default=0, - dest="early_stop_tolerance", - help="a non-negative tolerance for comparing early_stop_metric." - "e.g. when maximizing the condition is current_metric > best_metric + tolerance." - "Defaults to 0.") - parser.add_argument( - "--train.dataset_shards", "--train_dataset_shards", - dest="train_dataset_shards", - type=int, default=None, - help="An int value that indicates the number of partitions (shards) for the dataset. This is" - " useful for codistillation and other techniques that require each worker to train on disjoint" - " partitions of the dataset.") - parser.add_argument( - "--train.dataset_shard_index", "--train_dataset_shard_index", - dest="train_dataset_shard_index", - type=int, default=None, - help="An int value (starting at zero) that indicates which partition (shard) of the dataset" - " to use if --train.dataset_shards is set.") - parser.add_argument( - "--continue_from_checkpoint", dest="continue_from_checkpoint", action="store_true", - help="DEPRECATED. This option is currently a no-op." - " Continuing from the provided checkpoint is now the default." - " Use --overwrite_save_dir if you would like to override it instead" - " and restart training from scratch.") - parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") - parser.add_argument( - "--data_threads", "--num_threads", type=int, default=2, - dest="num_threads", - help="Number of threads to use for loading the dataset. " - "num_threads is deprecated and to be removed in future versions. 
Use data_threads.") - parser.add_argument( - "--max_duration", "--max_duration", type=float, default=None, - dest="max_duration", - help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.") - parser.add_argument( - "--num_workers", type=int, default=None, - help="Number of workers to use when training in hogwild manner on a single node.") - parser.add_argument( - "--distributed", dest="distributed", action="store_true", - help="Pass this flag to use train_and_evaluate to train in a distributed fashion" - "NOTE: You can not use early stopping when --distributed is enabled" - ) - parser.add_argument( - "--distributed_training_cleanup", - dest="distributed_training_cleanup", - action="store_true", - help="Set if using distributed training on GKE to stop TwitterSetDeployment" - "from continuing training upon restarts (will be deprecated once we migrate off" - "TwitterSetDeployment for distributed training on GKE)." - ) - parser.add_argument( - "--disable_auto_ps_shutdown", default=False, action="store_true", - help="Disable the functionality of automatically shutting down parameter server after " - "distributed training complete (either succeed or failed)." - ) - parser.add_argument( - "--disable_tensorboard", default=False, action="store_true", - help="Do not start the TensorBoard server." - ) - parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.") - parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." - "Not user-facing as it is set automatically by the twml_cli." - ) - parser.add_argument( - "--stats_port", type=int, default=None, - help="Port to listen on for stats endpoints" - ) - parser.add_argument( - "--experiment_tracking_path", - dest="experiment_tracking_path", - type=str, default=None, - help="The tracking path of this experiment. Format: \ - user_name:project_name:experiment_name:run_name. The path is used to track and display \ - a record of this experiment on ML Dashboard. Note: this embedded experiment tracking is \ - disabled when the deprecated Model Repo TrackRun is used in your model config. ") - parser.add_argument( - "--disable_experiment_tracking", - dest="disable_experiment_tracking", - action="store_true", - help="Whether experiment tracking should be disabled.") - parser.add_argument( - "--config.save_checkpoints_secs", "--save_checkpoints_secs", type=int, default=600, - dest='save_checkpoints_secs', - help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " - "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.") - parser.add_argument( - "--config.keep_checkpoint_max", "--keep_checkpoint_max", type=int, default=20, - dest='keep_checkpoint_max', - help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " - "Specifies how many checkpoints to keep. Defaults to 20.") - parser.add_argument( - "--config.tf_random_seed", "--tf_random_seed", type=int, default=None, - dest='tf_random_seed', - help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " - "Specifies the seed to use. 
Defaults to None.") - parser.add_argument( - "--optimizer", type=str, default='SGD', - help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.") - parser.add_argument( - "--gradient_noise_scale", type=float, default=None, - help="adds 0-mean normal noise scaled by this value. Defaults to None.") - parser.add_argument( - "--clip_gradients", type=float, default=None, - help="If specified, a global clipping is applied to prevent " - "the norm of the gradient to exceed this value. Defaults to None.") - parser.add_argument( - "--dgc.density", "--dgc_density", type=float, default=0.1, - dest="dgc_density", - help="Specifies gradient density level when using deep gradient compression optimizer." - "E.g., default value being 0.1 means that only top 10%% most significant rows " - "(based on absolute value sums) are kept." - ) - parser.add_argument( - "--dgc.density_decay", "--dgc_density_decay", type=bool, default=True, - dest="dgc_density_decay", - help="Specifies whether to (exponentially) decay the gradient density level when" - " doing gradient compression. If set 'False', the 'density_decay_steps', " - "'density_decay_rate' and 'min_density' arguments will be ignored." - ) - parser.add_argument( - "--dgc.density_decay_steps", "--dgc_density_decay_steps", type=int, default=10000, - dest="dgc_density_decay_steps", - help="Specifies the step interval to perform density decay." - ) - parser.add_argument( - "--dgc.density_decay_rate", "--dgc_density_decay_rate", type=float, default=0.5, - dest="dgc_density_decay_rate", - help="Specifies the decay rate when perfoming density decay." - ) - parser.add_argument( - "--dgc.min_density", "--dgc_min_density", type=float, default=0.1, - dest="dgc_min_density", - help="Specifies the minimum density level when perfoming density decay." - ) - parser.add_argument( - "--dgc.accumulation", "--dgc_accumulation", type=bool, default=False, - dest="dgc_accumulation", - help="Specifies whether to accumulate small gradients when using deep gradient compression " - "optimizer." - ) - parser.add_argument( - "--show_optimizer_summaries", dest="show_optimizer_summaries", action="store_true", - help="When specified, displays gradients and learning rate in tensorboard." - "Turning it on has 10-20%% performance hit. Enable for debugging only") - - parser.add_argument( - "--num_mkl_threads", dest="num_mkl_threads", default=1, type=int, - help="Specifies how many threads to use for MKL" - "inter_op_ parallelism_threds is set to TWML_NUM_CPUS / num_mkl_threads." - "intra_op_parallelism_threads is set to num_mkl_threads.") - - parser.add_argument("--verbosity", type=_set_log_level, choices=LOG_LEVELS.keys(), default=None, - help="Sets log level to a given verbosity.") - - parser.add_argument( - "--feature_importance.algorithm", dest="feature_importance_algorithm", - type=str, default=TREE, choices=[SERIAL, TREE], - help=""" - There are two algorithms that the module supports, `serial` and `tree`. - The `serial` algorithm computes feature importances for each feature, and - the `tree` algorithm groups features by feature name prefix, computes feature - importances for groups of features, and then only 'zooms-in' on a group when the - importance is greater than the `--feature_importance.sensitivity` value. The `tree` algorithm - will usually run faster, but for relatively unimportant features it will only compute an - upper bound rather than an exact importance value. 
We suggest that users generally stick - to the `tree` algorithm, unless if they have a very small number of features or - near-random model performance. - """) - - parser.add_argument( - "--feature_importance.sensitivity", dest="feature_importance_sensitivity", type=float, default=0.03, - help=""" - The maximum amount that permuting a feature group can cause the model performance (determined - by `feature_importance.metric`) to drop before the algorithm decides to not expand the feature - group. This is only used for the `tree` algorithm. - """) - - parser.add_argument( - "--feature_importance.dont_build_tree", dest="dont_build_tree", action="store_true", default=False, - help=""" - If True, don't build the feature trie for the tree algorithm and only use the extra_groups - """) - - parser.add_argument( - "--feature_importance.split_feature_group_on_period", dest="split_feature_group_on_period", action="store_true", default=False, - help="If true, split feature groups by the period rather than the optimal prefix. Only used for the TREE algorithm") - - parser.add_argument( - "--feature_importance.example_count", dest="feature_importance_example_count", type=int, default=10000, - help=""" - The number of examples used to compute feature importance. - Larger values yield more reliable results, but also take longer to compute. - These records are loaded into memory. This number is agnostic to batch size. - """) - - parser.add_argument( - "--feature_importance.data_dir", dest="feature_importance_data_dir", type=str, default=None, - help="Path to the dataset used to compute feature importance." - "supports local filesystem path and hdfs://default/ which requires " - "setting HDFS configuration via env variable HADOOP_CONF_DIR " - "Defaults to eval_data_dir") - - parser.add_argument( - "--feature_importance.metric", dest="feature_importance_metric", type=str, default="roc_auc", - help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.") - - parser.add_argument( - "--feature_importance.is_metric_larger_the_better", dest="feature_importance_is_metric_larger_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)") - - parser.add_argument( - "--feature_importance.is_metric_smaller_the_better", dest="feature_importance_is_metric_smaller_the_better", action="store_true", default=False, - help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. LOSS)") - - subparsers = parser.add_subparsers(help='Learning Rate Decay Functions. Can only pass 1.' - 'Should be specified after all the optional arguments' - 'and followed by its specific args' - 'e.g. --learning_rate 0.01 inverse_learning_rate_decay_fn' - ' --decay_rate 0.0004 --min_learning_rate 0.001', - dest='learning_rate_decay') - - # Create the parser for the "exponential_learning_rate_decay_fn" - parser_exponential = subparsers.add_parser('exponential_learning_rate_decay', - help='Exponential learning rate decay. 
' - 'Exponential decay implements:' - 'decayed_learning_rate = learning_rate * ' - 'exponential_decay_rate ^ ' - '(global_step / decay_steps') - parser_exponential.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'exponential' learning_rate_decay.") - parser_exponential.add_argument( - "--exponential_decay_rate", type=float, default=None, - help="Required for 'exponential' learning_rate_decay. Must be positive. ") - - # Create the parser for the "polynomial_learning_rate_decay_fn" - parser_polynomial = subparsers.add_parser('polynomial_learning_rate_decay', - help='Polynomial learning rate decay. ' - 'Polynomial decay implements: ' - 'global_step = min(global_step, decay_steps)' - 'decayed_learning_rate = ' - '(learning_rate - end_learning_rate) * ' - '(1 - global_step / decay_steps) ^ ' - '(polynomial_power) + end_learning_rate' - 'So for linear decay you can use a ' - 'polynomial_power=1 (the default)') - parser_polynomial.add_argument( - "--end_learning_rate", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay (ignored otherwise).") - parser_polynomial.add_argument( - "--polynomial_power", type=float, default=0.0001, - help="Required for 'polynomial' learning_rate_decay." - "The power of the polynomial. Defaults to linear, 1.0.") - parser_polynomial.add_argument( - "--decay_steps", type=float, default=None, - help="Required for 'polynomial' learning_rate_decay. ") - - # Create the parser for the "piecewise_constant_learning_rate_decay_fn" - parser_piecewise_constant = subparsers.add_parser('piecewise_constant_learning_rate_decay', - help='Piecewise Constant ' - 'learning rate decay. ' - 'For piecewise_constant, ' - 'consider this example: ' - 'We want to use a learning rate ' - 'that is 1.0 for' - 'the first 100000 steps,' - '0.5 for steps 100001 to 110000, ' - 'and 0.1 for any additional steps. ' - 'To do so, specify ' - '--piecewise_constant_boundaries=100000,110000' - '--piecewise_constant_values=1.0,0.5,0.1') - parser_piecewise_constant.add_argument( - "--piecewise_constant_values", - action=parse_comma_separated_list(element_type=float), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated floats or ints that specifies the values " - "for the intervals defined by boundaries. It should have one more " - "element than boundaries.") - parser_piecewise_constant.add_argument( - "--piecewise_constant_boundaries", - action=parse_comma_separated_list(element_type=int), - default=None, - help="Required for 'piecewise_constant_values' learning_rate_decay. " - "A list of comma seperated integers, with strictly increasing entries.") - - # Create the parser for the "inverse_learning_rate_decay_fn" - parser_inverse = subparsers.add_parser('inverse_learning_rate_decay', - help='Inverse Leaning rate decay. ' - 'Inverse implements:' - 'decayed_lr = max(lr /(1 + decay_rate * ' - 'floor(global_step /decay_step)),' - ' min_learning_rate)' - 'When decay_step=1 this mimics the behaviour' - 'of the default learning rate decay' - 'of DeepBird v1.') - - parser_inverse.add_argument( - "--decay_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay. 
Rate in which we decay the learning rate.") - parser_inverse.add_argument( - "--min_learning_rate", type=float, default=None, - help="Required for 'inverse' learning_rate_decay.Minimum possible learning_rate.") - parser_inverse.add_argument( - "--decay_steps", type=float, default=1, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_learning_rate_decay_fn" - parser_cosine = subparsers.add_parser('cosine_learning_rate_decay', - help='Cosine Leaning rate decay. ' - 'Cosine implements:' - 'decayed_lr = 0.5 * (1 + cos(pi *\ - global_step / decay_steps)) * lr' - ) - - parser_cosine.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number.\ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine.add_argument( - "--decay_steps", type=float, - help="Required for 'inverse' learning_rate_decay.") - - # Create the parser for the "cosine_restart_learning_rate_decay_fn" - parser_cosine_restart = subparsers.add_parser('cosine_restarts_learning_rate_decay', - help='Applies cosine decay with restarts \ - to the learning rate' - 'See [Loshchilov & Hutter, ICLR2016],\ - SGDR: Stochastic' - 'Gradient Descent with Warm Restarts.' - 'https://arxiv.org/abs/1608.03983' - ) - parser_cosine_restart.add_argument( - "--first_decay_steps", type=float, - help="Required for 'cosine_restart' learning_rate_decay.") - parser_cosine_restart.add_argument( - "--alpha", type=float, default=0, - help="A scalar float32 or float64 Tensor or a Python number. \ - Minimum learning rate value as a fraction of learning_rate.") - parser_cosine_restart.add_argument( - "--t_mul", type=float, default=2, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the number of iterations in the i-th period") - parser_cosine_restart.add_argument( - "--m_mul", type=float, default=1, - help="A scalar float32 or float64 Tensor or a Python number. \ - Used to derive the initial learning rate of the i-th period.") - - # Create dummy parser for None, which is the default. - parser_default = subparsers.add_parser( - 'no_learning_rate_decay', - help='No learning rate decay') # noqa: F841 - - parser.set_default_subparser('no_learning_rate_decay') - - return parser + """ + Used to sort args alphabetically in the help message. + """ + def add_arguments(self, actions: argparse.Action) -> None: + actions = sorted(actions, key=attrgetter("option_strings")) + super(SortingHelpFormatter, self).add_arguments(actions) -class DefaultSubcommandArgParse(argparse.ArgumentParser): - """ - Subclass of argparse.ArgumentParser that sets default parser - """ - _DEFAULT_SUBPARSER = None - def set_default_subparser(self, name): +def _set_log_level(level: str = None) -> str: + """Sets the tensorflow log level to the input level.""" + if level is None: + return None + level = level.lower() + if level not in LOG_LEVELS.keys(): + raise ValueError( + f"Unexpected log level {level} was given but expected one of {LOG_LEVELS.keys()}." + ) + tf.logging.set_verbosity(LOG_LEVELS[level]) + tf.logging.info( + f"Setting tensorflow logging level to {level} or {LOG_LEVELS[level]}" + ) + return level + + +def get_trainer_parser() -> argparse.ArgumentParser: """ - sets the default subparser + Add common commandline args to parse for the Trainer class. + Typically, the user calls this function and then parses cmd-line arguments + into an argparse.Namespace object which is then passed to the Trainer constructor + via the params argument. 
+ + See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ + for a list and description of all cmd-line arguments. + + Args: + learning_rate_decay: Defaults to False. When True, parses learning rate decay arguments. + + Returns: + argparse.ArgumentParser instance with some useful args already added. """ - self._DEFAULT_SUBPARSER = name + # define the parser + parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) + + parser.add_argument( + "--save_dir", + type=str, + default=tempfile.mkdtemp(), + help="Path to the training result directory." + "supports local filesystem path and hdfs://default/ which requires " + "setting HDFS configuration via env variable HADOOP_CONF_DIR ", + ) + parser.add_argument( + "--export_dir", + type=str, + default=None, + help="Path to the directory to export a SavedModel for prediction servers.", + ) + parser.add_argument( + "--log_aggregation_app_id", + type=str, + default=None, + help="specify app_id for log aggregation. disabled by default.", + ) + parser.add_argument( + "--train.batch_size", + "--train_batch_size", + type=int, + default=32, + dest="train_batch_size", + help="number of samples per training batch", + ) + parser.add_argument( + "--eval.batch_size", + "--eval_batch_size", + type=int, + default=32, + dest="eval_batch_size", + help="number of samples per cross-validation batch. Defaults to train_batch_size", + ) + parser.add_argument( + "--train.learning_rate", + "--learning_rate", + type=float, + default=0.002, + dest="learning_rate", + help="learning rate. Scales the gradient update.", + ) + parser.add_argument( + "--train.steps", + "--train_steps", + type=int, + default=-1, + dest="train_steps", + help="number of training batches before running evaluation." + "Defaults to -1 (runs through entire dataset). " + "Only used for Trainer.[train,learn]. " + "For Trainer.train_and_evaluate, use train.max_steps instead. ", + ) + parser.add_argument( + "--eval.steps", + "--eval_steps", + type=int, + default=-1, + dest="eval_steps", + help="number of steps per evaluation. Each batch is a step." + "Defaults to -1 (runs through entire dataset). ", + ) + parser.add_argument( + "--eval.period", + "--eval_period", + type=int, + default=600, + dest="eval_period", + help="Trainer.train_and_evaluate waits for this long after each evaluation. " + "Defaults to 600 seconds (evaluate every ten minutes). " + "Note that anything lower than 10*60seconds is probably a bad idea because TF saves " + "checkpoints every 10mins by default. eval.delay is time to wait before doing first eval. " + "eval.period is time between successive evals.", + ) + parser.add_argument( + "--eval.delay", + "--eval_delay", + type=int, + default=120, + dest="eval_delay", + help="Trainer.train_and_evaluate waits for this long before performing the first evaluation" + "Defaults to 120 seconds (evaluate after first 2 minutes of training). " + "eval.delay is time to wait before doing first eval. " + "eval.period is time between successive evals.", + ) + parser.add_argument( + "--train.max_steps", + "--train_max_steps", + type=int, + default=None, + dest="train_max_steps", + help="Stop training after this many global steps. Each training batch is its own step." + "If set to None, step after one train()/evaluate() call. Useful when train.steps=-1." + "If set to a non-positive value, loop forever. 
Usually useful with early stopping.", + ) + parser.add_argument( + "--train.log_metrics", + dest="train_log_metrics", + action="store_true", + default=False, + help="Set this to true to see metrics during training. " + "WARNING: metrics during training does not represent model performance. " + "WARNING: use for debugging only as this slows down training.", + ) + parser.add_argument( + "--train.early_stop_patience", + "--early_stop_patience", + type=int, + default=-1, + dest="early_stop_patience", + help="max number of evaluations (epochs) to wait for an improvement in the early_stop_metric." + "Defaults to -1 (no early-stopping)." + "NOTE: This can not be enabled when --distributed is also set.", + ) + parser.add_argument( + "--train.early_stop_tolerance", + "--early_stop_tolerance", + type=float, + default=0, + dest="early_stop_tolerance", + help="a non-negative tolerance for comparing early_stop_metric." + "e.g. when maximizing the condition is current_metric > best_metric + tolerance." + "Defaults to 0.", + ) + parser.add_argument( + "--train.dataset_shards", + "--train_dataset_shards", + dest="train_dataset_shards", + type=int, + default=None, + help="An int value that indicates the number of partitions (shards) for the dataset. This is" + " useful for codistillation and other techniques that require each worker to train on disjoint" + " partitions of the dataset.", + ) + parser.add_argument( + "--train.dataset_shard_index", + "--train_dataset_shard_index", + dest="train_dataset_shard_index", + type=int, + default=None, + help="An int value (starting at zero) that indicates which partition (shard) of the dataset" + " to use if --train.dataset_shards is set.", + ) + parser.add_argument( + "--continue_from_checkpoint", + dest="continue_from_checkpoint", + action="store_true", + help="DEPRECATED. This option is currently a no-op." + " Continuing from the provided checkpoint is now the default." + " Use --overwrite_save_dir if you would like to override it instead" + " and restart training from scratch.", + ) + parser.add_argument( + "--overwrite_save_dir", + dest="overwrite_save_dir", + action="store_true", + help="Delete the contents of the current save_dir if it exists", + ) + parser.add_argument( + "--data_threads", + "--num_threads", + type=int, + default=2, + dest="num_threads", + help="Number of threads to use for loading the dataset. " + "num_threads is deprecated and to be removed in future versions. 
Use data_threads.", + ) + parser.add_argument( + "--max_duration", + "--max_duration", + type=float, + default=None, + dest="max_duration", + help="Maximum duration (in secs) that training/validation will be allowed to run for before being automatically terminated.", + ) + parser.add_argument( + "--num_workers", + type=int, + default=None, + help="Number of workers to use when training in hogwild manner on a single node.", + ) + parser.add_argument( + "--distributed", + dest="distributed", + action="store_true", + help="Pass this flag to use train_and_evaluate to train in a distributed fashion" + "NOTE: You can not use early stopping when --distributed is enabled", + ) + parser.add_argument( + "--distributed_training_cleanup", + dest="distributed_training_cleanup", + action="store_true", + help="Set if using distributed training on GKE to stop TwitterSetDeployment" + "from continuing training upon restarts (will be deprecated once we migrate off" + "TwitterSetDeployment for distributed training on GKE).", + ) + parser.add_argument( + "--disable_auto_ps_shutdown", + default=False, + action="store_true", + help="Disable the functionality of automatically shutting down parameter server after " + "distributed training complete (either succeed or failed).", + ) + parser.add_argument( + "--disable_tensorboard", + default=False, + action="store_true", + help="Do not start the TensorBoard server.", + ) + parser.add_argument( + "--tensorboard_port", + type=int, + default=None, + help="Port for tensorboard to run on. Ignored if --disable_tensorboard is set.", + ) + parser.add_argument( + "--health_port", + type=int, + default=None, + help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." + "Not user-facing as it is set automatically by the twml_cli.", + ) + parser.add_argument( + "--stats_port", + type=int, + default=None, + help="Port to listen on for stats endpoints", + ) + parser.add_argument( + "--experiment_tracking_path", + dest="experiment_tracking_path", + type=str, + default=None, + help="The tracking path of this experiment. Format: \ + user_name:project_name:experiment_name:run_name. The path is used to track and display \ + a record of this experiment on ML Dashboard. Note: this embedded experiment tracking is \ + disabled when the deprecated Model Repo TrackRun is used in your model config. ", + ) + parser.add_argument( + "--disable_experiment_tracking", + dest="disable_experiment_tracking", + action="store_true", + help="Whether experiment tracking should be disabled.", + ) + parser.add_argument( + "--config.save_checkpoints_secs", + "--save_checkpoints_secs", + type=int, + default=600, + dest="save_checkpoints_secs", + help="Configures the tf.estimator.RunConfig.save_checkpoints_secs attribute. " + "Specifies how often checkpoints are saved in seconds. Defaults to 10*60 seconds.", + ) + parser.add_argument( + "--config.keep_checkpoint_max", + "--keep_checkpoint_max", + type=int, + default=20, + dest="keep_checkpoint_max", + help="Configures the tf.estimator.RunConfig.keep_checkpoint_max attribute. " + "Specifies how many checkpoints to keep. Defaults to 20.", + ) + parser.add_argument( + "--config.tf_random_seed", + "--tf_random_seed", + type=int, + default=None, + dest="tf_random_seed", + help="Configures the tf.estimator.RunConfig.tf_random_seed attribute. " + "Specifies the seed to use. 
Defaults to None.", + ) + parser.add_argument( + "--optimizer", + type=str, + default="SGD", + help="Optimizer to use: SGD (Default), Adagrad, Adam, Ftrl, Momentum, RMSProp, LazyAdam, DGC.", + ) + parser.add_argument( + "--gradient_noise_scale", + type=float, + default=None, + help="adds 0-mean normal noise scaled by this value. Defaults to None.", + ) + parser.add_argument( + "--clip_gradients", + type=float, + default=None, + help="If specified, a global clipping is applied to prevent " + "the norm of the gradient to exceed this value. Defaults to None.", + ) + parser.add_argument( + "--dgc.density", + "--dgc_density", + type=float, + default=0.1, + dest="dgc_density", + help="Specifies gradient density level when using deep gradient compression optimizer." + "E.g., default value being 0.1 means that only top 10%% most significant rows " + "(based on absolute value sums) are kept.", + ) + parser.add_argument( + "--dgc.density_decay", + "--dgc_density_decay", + type=bool, + default=True, + dest="dgc_density_decay", + help="Specifies whether to (exponentially) decay the gradient density level when" + " doing gradient compression. If set 'False', the 'density_decay_steps', " + "'density_decay_rate' and 'min_density' arguments will be ignored.", + ) + parser.add_argument( + "--dgc.density_decay_steps", + "--dgc_density_decay_steps", + type=int, + default=10000, + dest="dgc_density_decay_steps", + help="Specifies the step interval to perform density decay.", + ) + parser.add_argument( + "--dgc.density_decay_rate", + "--dgc_density_decay_rate", + type=float, + default=0.5, + dest="dgc_density_decay_rate", + help="Specifies the decay rate when perfoming density decay.", + ) + parser.add_argument( + "--dgc.min_density", + "--dgc_min_density", + type=float, + default=0.1, + dest="dgc_min_density", + help="Specifies the minimum density level when perfoming density decay.", + ) + parser.add_argument( + "--dgc.accumulation", + "--dgc_accumulation", + type=bool, + default=False, + dest="dgc_accumulation", + help="Specifies whether to accumulate small gradients when using deep gradient compression " + "optimizer.", + ) + parser.add_argument( + "--show_optimizer_summaries", + dest="show_optimizer_summaries", + action="store_true", + help="When specified, displays gradients and learning rate in tensorboard." + "Turning it on has 10-20%% performance hit. Enable for debugging only", + ) + + parser.add_argument( + "--num_mkl_threads", + dest="num_mkl_threads", + default=1, + type=int, + help="Specifies how many threads to use for MKL" + "inter_op_ parallelism_threds is set to TWML_NUM_CPUS / num_mkl_threads." + "intra_op_parallelism_threads is set to num_mkl_threads.", + ) - def _parse_known_args(self, arg_strings, *args, **kwargs): + parser.add_argument( + "--verbosity", + type=_set_log_level, + choices=LOG_LEVELS.keys(), + default=None, + help="Sets log level to a given verbosity.", + ) + + parser.add_argument( + "--feature_importance.algorithm", + dest="feature_importance_algorithm", + type=str, + default=TREE, + choices=[SERIAL, TREE], + help=""" + There are two algorithms that the module supports, `serial` and `tree`. + The `serial` algorithm computes feature importance for each feature, and + the `tree` algorithm groups features by feature name prefix, computes feature + importance for groups of features, and then only 'zooms-in' on a group when the + importance is greater than the `--feature_importance.sensitivity` value. 
The `tree` algorithm
+ will usually run faster, but for relatively unimportant features it will only compute an
+ upper bound rather than an exact importance value. We suggest that users generally stick
+ to the `tree` algorithm, unless they have a very small number of features or
+ near-random model performance.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.sensitivity",
+ dest="feature_importance_sensitivity",
+ type=float,
+ default=0.03,
+ help="""
+ The maximum amount that permuting a feature group can cause the model performance (determined
+ by `feature_importance.metric`) to drop before the algorithm decides to not expand the feature
+ group. This is only used for the `tree` algorithm.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.dont_build_tree",
+ dest="dont_build_tree",
+ action="store_true",
+ default=False,
+ help="""
+ If True, don't build the feature trie for the tree algorithm and only use the extra_groups.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.split_feature_group_on_period",
+ dest="split_feature_group_on_period",
+ action="store_true",
+ default=False,
+ help="If true, split feature groups by the period rather than the optimal prefix. Only used for the TREE algorithm.",
+ )
+
+ parser.add_argument(
+ "--feature_importance.example_count",
+ dest="feature_importance_example_count",
+ type=int,
+ default=10000,
+ help="""
+ The number of examples used to compute feature importance.
+ Larger values yield more reliable results, but also take longer to compute.
+ These records are loaded into memory. This number is agnostic to batch size.
+ """,
+ )
+
+ parser.add_argument(
+ "--feature_importance.data_dir",
+ dest="feature_importance_data_dir",
+ type=str,
+ default=None,
+ help="Path to the dataset used to compute feature importance. "
+ "Supports local filesystem paths and hdfs://default/, which requires "
+ "setting HDFS configuration via the env variable HADOOP_CONF_DIR. "
+ "Defaults to eval_data_dir.",
+ )
+
+ parser.add_argument(
+ "--feature_importance.metric",
+ dest="feature_importance_metric",
+ type=str,
+ default="roc_auc",
+ help="The metric used to determine when to stop expanding the feature importance tree. This is only used for the `tree` algorithm.",
+ )
+
+ parser.add_argument(
+ "--feature_importance.is_metric_larger_the_better",
+ dest="feature_importance_is_metric_larger_the_better",
+ action="store_true",
+ default=False,
+ help="If true, interpret `--feature_importance.metric` to be a metric where larger values are better (e.g. ROC_AUC)",
+ )
+
+ parser.add_argument(
+ "--feature_importance.is_metric_smaller_the_better",
+ dest="feature_importance_is_metric_smaller_the_better",
+ action="store_true",
+ default=False,
+ help="If true, interpret `--feature_importance.metric` to be a metric where smaller values are better (e.g. LOSS)",
+ )
+
+ subparsers = parser.add_subparsers(
+ help="Learning Rate Decay Functions. Only one can be passed. "
+ "It should be specified after all the optional arguments "
+ "and followed by its specific args, "
+ "e.g. --learning_rate 0.01 inverse_learning_rate_decay"
+ " --decay_rate 0.0004 --min_learning_rate 0.001",
+ dest="learning_rate_decay",
+ )
+
+ # Create the parser for the "exponential_learning_rate_decay_fn"
+ parser_exponential = subparsers.add_parser(
+ "exponential_learning_rate_decay",
+ help="Exponential learning rate decay. 
" + "Exponential decay implements:" + "decayed_learning_rate = learning_rate * " + "exponential_decay_rate ^ " + "(global_step / decay_steps", + ) + parser_exponential.add_argument( + "--decay_steps", + type=float, + default=None, + help="Required for 'exponential' learning_rate_decay.", + ) + parser_exponential.add_argument( + "--exponential_decay_rate", + type=float, + default=None, + help="Required for 'exponential' learning_rate_decay. Must be positive. ", + ) + + # Create the parser for the "polynomial_learning_rate_decay_fn" + parser_polynomial = subparsers.add_parser( + "polynomial_learning_rate_decay", + help="Polynomial learning rate decay. " + "Polynomial decay implements: " + "global_step = min(global_step, decay_steps)" + "decayed_learning_rate = " + "(learning_rate - end_learning_rate) * " + "(1 - global_step / decay_steps) ^ " + "(polynomial_power) + end_learning_rate" + "So for linear decay you can use a " + "polynomial_power=1 (the default)", + ) + parser_polynomial.add_argument( + "--end_learning_rate", + type=float, + default=0.0001, + help="Required for 'polynomial' learning_rate_decay (ignored otherwise).", + ) + parser_polynomial.add_argument( + "--polynomial_power", + type=float, + default=0.0001, + help="Required for 'polynomial' learning_rate_decay." + "The power of the polynomial. Defaults to linear, 1.0.", + ) + parser_polynomial.add_argument( + "--decay_steps", + type=float, + default=None, + help="Required for 'polynomial' learning_rate_decay. ", + ) + + # Create the parser for the "piecewise_constant_learning_rate_decay_fn" + parser_piecewise_constant = subparsers.add_parser( + "piecewise_constant_learning_rate_decay", + help="Piecewise Constant " + "learning rate decay. " + "For piecewise_constant, " + "consider this example: " + "We want to use a learning rate " + "that is 1.0 for" + "the first 100000 steps," + "0.5 for steps 100001 to 110000, " + "and 0.1 for any additional steps. " + "To do so, specify " + "--piecewise_constant_boundaries=100000,110000" + "--piecewise_constant_values=1.0,0.5,0.1", + ) + parser_piecewise_constant.add_argument( + "--piecewise_constant_values", + action=parse_comma_separated_list(element_type=float), + default=None, + help="Required for 'piecewise_constant_values' learning_rate_decay. " + "A list of comma seperated floats or ints that specifies the values " + "for the intervals defined by boundaries. It should have one more " + "element than boundaries.", + ) + parser_piecewise_constant.add_argument( + "--piecewise_constant_boundaries", + action=parse_comma_separated_list(element_type=int), + default=None, + help="Required for 'piecewise_constant_values' learning_rate_decay. " + "A list of comma seperated integers, with strictly increasing entries.", + ) + + # Create the parser for the "inverse_learning_rate_decay_fn" + parser_inverse = subparsers.add_parser( + "inverse_learning_rate_decay", + help="Inverse Leaning rate decay. " + "Inverse implements:" + "decayed_lr = max(lr /(1 + decay_rate * " + "floor(global_step /decay_step))," + " min_learning_rate)" + "When decay_step=1 this mimics the behaviour" + "of the default learning rate decay" + "of DeepBird v1.", + ) + + parser_inverse.add_argument( + "--decay_rate", + type=float, + default=None, + help="Required for 'inverse' learning_rate_decay. 
Rate at which we decay the learning rate.",
+ )
+ parser_inverse.add_argument(
+ "--min_learning_rate",
+ type=float,
+ default=None,
+ help="Required for 'inverse' learning_rate_decay. Minimum possible learning_rate.",
+ )
+ parser_inverse.add_argument(
+ "--decay_steps",
+ type=float,
+ default=1,
+ help="Required for 'inverse' learning_rate_decay.",
+ )
+
+ # Create the parser for the "cosine_learning_rate_decay_fn"
+ parser_cosine = subparsers.add_parser(
+ "cosine_learning_rate_decay",
+ help="Cosine learning rate decay. "
+ "Cosine implements: "
+ "decayed_lr = 0.5 * (1 + cos(pi *\
+ global_step / decay_steps)) * lr",
+ )
+
+ parser_cosine.add_argument(
+ "--alpha",
+ type=float,
+ default=0,
+ help="A scalar float32 or float64 Tensor or a Python number.\
+ Minimum learning rate value as a fraction of learning_rate.",
+ )
+ parser_cosine.add_argument(
+ "--decay_steps", type=float, help="Required for 'cosine' learning_rate_decay."
+ )
+
+ # Create the parser for the "cosine_restart_learning_rate_decay_fn"
+ parser_cosine_restart = subparsers.add_parser(
+ "cosine_restarts_learning_rate_decay",
+ help="Applies cosine decay with restarts \
+ to the learning rate. "
+ "See [Loshchilov & Hutter, ICLR2016],\
+ SGDR: Stochastic "
+ "Gradient Descent with Warm Restarts. "
+ "https://arxiv.org/abs/1608.03983",
+ )
+ parser_cosine_restart.add_argument(
+ "--first_decay_steps",
+ type=float,
+ help="Required for 'cosine_restarts' learning_rate_decay.",
+ )
+ parser_cosine_restart.add_argument(
+ "--alpha",
+ type=float,
+ default=0,
+ help="A scalar float32 or float64 Tensor or a Python number. \
+ Minimum learning rate value as a fraction of learning_rate.",
+ )
+ parser_cosine_restart.add_argument(
+ "--t_mul",
+ type=float,
+ default=2,
+ help="A scalar float32 or float64 Tensor or a Python number. \
+ Used to derive the number of iterations in the i-th period.",
+ )
+ parser_cosine_restart.add_argument(
+ "--m_mul",
+ type=float,
+ default=1,
+ help="A scalar float32 or float64 Tensor or a Python number. \
+ Used to derive the initial learning rate of the i-th period.",
+ )
+
+ # Create dummy parser for None, which is the default. 
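(For orientation, a minimal, self-contained sketch of how these decay subcommands are invoked; the parser here is an illustrative stand-alone example, not the real twml entry point.)

import argparse

demo = argparse.ArgumentParser()
demo.add_argument("--learning_rate", type=float, default=0.01)
decay = demo.add_subparsers(dest="learning_rate_decay")
inverse = decay.add_parser("inverse_learning_rate_decay")
inverse.add_argument("--decay_rate", type=float, default=None)
inverse.add_argument("--min_learning_rate", type=float, default=None)

# Decay-specific flags must follow the subcommand name, as the subparsers help above explains:
args = demo.parse_args(
    ["--learning_rate", "0.01", "inverse_learning_rate_decay",
     "--decay_rate", "0.0004", "--min_learning_rate", "0.001"]
)
assert args.learning_rate_decay == "inverse_learning_rate_decay"
assert args.min_learning_rate == 0.001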
+ parser_default = subparsers.add_parser(
+ "no_learning_rate_decay", help="No learning rate decay"
+ ) # noqa: F841
+
+ parser.set_default_subparser("no_learning_rate_decay")
+
+ return parser
+
+
+class DefaultSubcommandArgParse(argparse.ArgumentParser):
 """
- Overwrites _parse_known_args
+ Subclass of argparse.ArgumentParser that supports a default subparser
 """
- in_args = set(arg_strings)
- d_sp = self._DEFAULT_SUBPARSER
- if d_sp is not None and not {'-h', '--help'}.intersection(in_args):
- for x_val in self._subparsers._actions:
- subparser_found = (
- isinstance(x_val, argparse._SubParsersAction) and
- in_args.intersection(x_val._name_parser_map.keys())
+
+ _DEFAULT_SUBPARSER = None
+
+ def set_default_subparser(self, name: str) -> None:
+ """
+ Sets the default subparser.
+ """
+ self._DEFAULT_SUBPARSER = name
+
+ def _parse_known_args(
+ self, arg_strings: List[str], *args, **kwargs
+ ):
+ """
+ Overwrites _parse_known_args
+ """
+ in_args = set(arg_strings)
+ d_sp = self._DEFAULT_SUBPARSER
+ if d_sp is not None and not {"-h", "--help"}.intersection(in_args):
+ for x_val in self._subparsers._actions:
+ subparser_found = isinstance(
+ x_val, argparse._SubParsersAction
+ ) and in_args.intersection(x_val._name_parser_map.keys())
+ if subparser_found:
+ break
+ else:
+ # insert default in first position, this implies no
+ # global options without a sub_parsers specified
+ arg_strings = arg_strings + [d_sp]
+ return super(DefaultSubcommandArgParse, self)._parse_known_args(
+ arg_strings, *args, **kwargs
 )
- if subparser_found:
- break
- else:
- # insert default in first position, this implies no
- # global options without a sub_parsers specified
- arg_strings = arg_strings + [d_sp]
- return super(DefaultSubcommandArgParse, self)._parse_known_args(
- arg_strings, *args, **kwargs
- )
-
- def _check_value(self, action, value):
- try:
- super(DefaultSubcommandArgParse, self)._check_value(
- action, value
- )
- except ArgumentError as error:
- error.message += ("\nERROR: Deepbird is trying to interpret \"{}\" as a value of {}. If this is not what you expected, "
- "then most likely one of the following two things are happening: Either one of your cli arguments are not recognized, "
- "probably {} or whichever argument you are passing {} as a value to OR you are passing in an argument after "
- "the `learning_rate_decay` argument.\n").format(value, action.dest, value, value)
- raise error
-
-
-def parse_comma_separated_list(element_type=str):
- """
- Generates an argparse.Action that converts a string representing a comma separated list to a
- list and converts each element to a specified type.
- """
-
- # pylint: disable-msg=too-few-public-methods
- class _ParseCommaSeparatedList(argparse.Action):
+
+ def _check_value(self, action: argparse.Action, value: str) -> None:
+ try:
+ super(DefaultSubcommandArgParse, self)._check_value(action, value)
+ except ArgumentError as error:
+ error.message += (
+ '\nERROR: Deepbird is trying to interpret "{}" as a value of {}. 
If this is not what you expected, '
+ "then most likely one of the following two things is happening: either one of your cli arguments is not recognized, "
+ "probably {} or whichever argument you are passing {} as a value to, OR you are passing in an argument after "
+ "the `learning_rate_decay` argument.\n"
+ ).format(value, action.dest, value, value)
+ raise error
+
+
+def parse_comma_separated_list(element_type=str) -> argparse.Action:
 """
- Converts a string representing a comma separated list to a list and converts each element to a
- specified type.
+ Generates an argparse.Action that converts a string representing a comma separated list to a
+ list and converts each element to a specified type.
 """
- def __call__(self, parser, namespace, values, option_string=None):
- if values is not None:
- values = [element_type(v) for v in values.split(',')]
- setattr(namespace, self.dest, values)
-
- return _ParseCommaSeparatedList
+ # pylint: disable-msg=too-few-public-methods
+ class _ParseCommaSeparatedList(argparse.Action):
+ """
+ Converts a string representing a comma separated list to a list and converts each element to a
+ specified type.
+ """
+
+ def __call__(
+ self,
+ parser: argparse.ArgumentParser,
+ namespace: argparse.Namespace,
+ values: str,
+ option_string: str = None,
+ ) -> None: # pylint: disable=unused-argument
+ if values is not None:
+ values = [element_type(v) for v in values.split(",")]
+ setattr(namespace, self.dest, values)
+
+ return _ParseCommaSeparatedList
diff --git a/twml/twml/array.py b/twml/twml/array.py
index a8524a06d..3b7adea93 100644
--- a/twml/twml/array.py
+++ b/twml/twml/array.py
@@ -2,100 +2,102 @@
 import ctypes as ct
+import numpy as np
 from absl import logging
 from libtwml import CLIB
-import numpy as np
-
 _NP_TO_TWML_TYPE = {
- 'float32': ct.c_int(1),
- 'float64': ct.c_int(2),
- 'int32': ct.c_int(3),
- 'int64': ct.c_int(4),
- 'int8': ct.c_int(5),
- 'uint8': ct.c_int(6),
+ "float32": ct.c_int(1),
+ "float64": ct.c_int(2),
+ "int32": ct.c_int(3),
+ "int64": ct.c_int(4),
+ "int8": ct.c_int(5),
+ "uint8": ct.c_int(6),
 }
 class Array(object):
- """
- Wrapper class to allow numpy arrays to work with twml functions.
- """
-
- def __init__(self, array):
- """
- Wraps numpy array and creates a handle that can be passed to C functions from libtwml.
-
- array: Numpy array
- """
- if not isinstance(array, np.ndarray):
- raise TypeError("Input must be a numpy array")
-
- try:
- ttype = _NP_TO_TWML_TYPE[array.dtype.name]
- except KeyError as err:
- logging.error("Unsupported numpy type")
- raise err
-
- handle = ct.c_void_p(0)
- ndim = ct.c_int(array.ndim)
- dims = array.ctypes.get_shape()
- isize = array.dtype.itemsize
-
- strides_t = ct.c_size_t * array.ndim
- strides = strides_t(*[n // isize for n in array.strides])
-
- err = CLIB.twml_tensor_create(ct.pointer(handle),
- array.ctypes.get_as_parameter(),
- ndim, dims, strides, ttype)
-
- if err != 1000:
- raise RuntimeError("Error from libtwml")
-
- # Store the numpy array to ensure it isn't deleted before self
- self._array = array
-
- self._handle = handle
-
- self._type = ttype
-
- @property
- def handle(self):
- """
- Return the twml handle
- """
- return self._handle
-
- @property
- def shape(self):
 """
- Return the shape
+ Wrapper class to allow numpy arrays to work with twml functions. 
""" - return self._array.shape - @property - def ndim(self): - """ - Return the shape - """ - return self._array.ndim - - @property - def array(self): - """ - Return the numpy array - """ - return self._array - - @property - def dtype(self): - """ - Return numpy dtype - """ - return self._array.dtype - - def __del__(self): - """ - Delete the handle - """ - CLIB.twml_tensor_delete(self._handle) + def __init__(self, array: np.ndarray): + """ + Wraps numpy array and creates a handle that can be passed to C functions from libtwml. + + array: Numpy array + """ + if not isinstance(array, np.ndarray): + raise TypeError("Input must be a numpy array") + + try: + ttype = _NP_TO_TWML_TYPE[array.dtype.name] + except KeyError as err: + logging.error("Unsupported numpy type") + raise err + + handle = ct.c_void_p(0) + ndim = ct.c_int(array.ndim) + dims = array.ctypes.get_shape() + isize = array.dtype.itemsize + + strides_t = ct.c_size_t * array.ndim + strides = strides_t(*[n // isize for n in array.strides]) + + err = CLIB.twml_tensor_create( + ct.pointer(handle), + array.ctypes.get_as_parameter(), + ndim, + dims, + strides, + ttype, + ) + + if err != 1000: + raise RuntimeError("Error from libtwml") + + # Store the numpy array to ensure it isn't deleted before self + self._array = array + self._handle = handle + self._type = ttype + + @property + def handle(self) -> ct.c_void_p: + """ + Return the twml handle + """ + return self._handle + + @property + def shape(self) -> tuple: + """ + Return the shape + """ + return self._array.shape + + @property + def ndim(self) -> int: + """ + Return the shape + """ + return self._array.ndim + + @property + def array(self) -> np.ndarray: + """ + Return the numpy array + """ + return self._array + + @property + def dtype(self) -> np.dtype: + """ + Return numpy dtype + """ + return self._array.dtype + + def __del__(self) -> None: + """ + Delete the handle + """ + CLIB.twml_tensor_delete(self._handle) diff --git a/twml/twml/block_format_writer.py b/twml/twml/block_format_writer.py index 9c4a9b6a8..8132b456c 100644 --- a/twml/twml/block_format_writer.py +++ b/twml/twml/block_format_writer.py @@ -5,61 +5,61 @@ class BlockFormatWriter(object): - """ - Class to write block format file. - """ + """ + Class to write block format file. 
+ """ - def __init__(self, file_name, records_per_block=100): - file_name = file_name - if not isinstance(file_name, str): - raise ValueError("file_name has to be of type str") + def __init__(self, file_name: str, records_per_block: int = 100): + file_name = file_name + if not isinstance(file_name, str): + raise ValueError("file_name has to be of type str") - self.file_name = ct.c_char_p(file_name.encode()) - self.records_per_block = ct.c_int(int(records_per_block)) - handle = ct.c_void_p(0) - err = CLIB.block_format_writer_create(ct.pointer(handle), - self.file_name, - self.records_per_block) - self._handle = None - # 1000 means TWML_ERR_NONE - if err != 1000: - raise RuntimeError("Error from libtwml") - self._handle = handle + self.file_name = ct.c_char_p(file_name.encode()) + self.records_per_block = ct.c_int(int(records_per_block)) + handle = ct.c_void_p(0) + err = CLIB.block_format_writer_create( + ct.pointer(handle), self.file_name, self.records_per_block + ) + self._handle = None + # 1000 means TWML_ERR_NONE + if err != 1000: + raise RuntimeError("Error from libtwml") + self._handle = handle - @property - def handle(self): - """ - Return the handle - """ - return self._handle + @property + def handle(self) -> ct.c_void_p: + """ + Return the handle + """ + return self._handle - def write(self, class_name, record): - """ - Write a record. + def write(self, class_name: str, record: bytes) -> None: + """ + Write a record. - Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. - """ - if not isinstance(class_name, str): - raise ValueError("class_name has to be of type str") + Note: `record` needs to be in a format that can be converted to ctypes.c_char_p. + """ + if not isinstance(class_name, str): + raise ValueError("class_name has to be of type str") - record_len = len(record) - class_name = ct.c_char_p(class_name.encode()) - record = ct.c_char_p(record) - err = CLIB.block_format_write(self._handle, class_name, record, record_len) - if err != 1000: - raise RuntimeError("Error from libtwml") + record_len = len(record) + class_name = ct.c_char_p(class_name.encode()) + record = ct.c_char_p(record) + err = CLIB.block_format_write(self._handle, class_name, record, record_len) + if err != 1000: + raise RuntimeError("Error from libtwml") - def flush(self): - """ - Flush records in buffer to outputfile. - """ - err = CLIB.block_format_flush(self._handle) - if err != 1000: - raise RuntimeError("Error from libtwml") + def flush(self) -> None: + """ + Flush records in buffer to outputfile. 
+ """ + err = CLIB.block_format_flush(self._handle) + if err != 1000: + raise RuntimeError("Error from libtwml") - def __del__(self): - """ - Delete the handle - """ - if self._handle: - CLIB.block_format_writer_delete(self._handle) + def __del__(self) -> None: + """ + Delete the handle + """ + if self._handle: + CLIB.block_format_writer_delete(self._handle) diff --git a/twml/twml/constants.py b/twml/twml/constants.py index c6c726eed..8d71c4210 100644 --- a/twml/twml/constants.py +++ b/twml/twml/constants.py @@ -1,11 +1,11 @@ # These should coincide with 'enum class DecodeMode' values in HashedDataRecordReader.h +from twitter.deepbird.io.legacy.constants import DECODE_MODES # noqa: F401 +from twitter.deepbird.io.legacy.constants import DEFAULT_DECODE_MODE # noqa: F401 +from twitter.deepbird.io.legacy.constants import DEFAULT_ZOOKEEPER_HOST # noqa: F401 +from twitter.deepbird.io.legacy.constants import HASH_FNAME_AND_VALNAME # noqa: F401 +from twitter.deepbird.io.legacy.constants import HASH_VALNAME # noqa: F401 +from twitter.deepbird.io.legacy.constants import HashingDiscretizerOptions # noqa: F401 from twitter.deepbird.io.legacy.constants import ( - DECODE_MODES, # noqa: F401 - DEFAULT_DECODE_MODE, # noqa: F401 - HASH_FNAME_AND_VALNAME, # noqa: F401 - HASH_VALNAME, # noqa: F401 - HashingDiscretizerOptions, # noqa: F401 - DEFAULT_ZOOKEEPER_BASE_ZNODE, # noqa: F401 - DEFAULT_ZOOKEEPER_HOST, # noqa: F401 -) + DEFAULT_ZOOKEEPER_BASE_ZNODE, +) # noqa: F401 diff --git a/twml/twml/contrib/__init__.py b/twml/twml/contrib/__init__.py index 1a5e8efe4..2860971b6 100644 --- a/twml/twml/contrib/__init__.py +++ b/twml/twml/contrib/__init__.py @@ -1,21 +1,21 @@ # pylint: disable=wildcard-import """ experimental and contributed modules """ -from . import layers # noqa: F401 -from . import feature_importances # noqa: F401 -from . import calibrators # noqa: F401 -from . import readers # noqa: F401 -from . import utils # noqa: F401 -from . import build_graphs_fns # noqa: F401 -from . import feature_config # noqa: F401 -from . import parsers # noqa: F401 -from . import initializers # noqa: F401 -from . import export # noqa: F401 -from . import feature_config_parsers # noqa: F401 - # These imports do not work with TF 2.x and are not needed either. # If you are using TF 2.x, use the modular targets under src/python/twitter/deepbird. import tensorflow -from . import trainers # noqa: F401 -from . import metrics # noqa: F401 + +from . import build_graphs_fns # noqa: F401 +from . import calibrators # noqa: F401 +from . import export # noqa: F401 +from . import feature_config # noqa: F401 +from . import feature_config_parsers # noqa: F401 +from . import feature_importances # noqa: F401 from . import hooks # noqa: F401 +from . import initializers # noqa: F401 +from . import layers # noqa: F401 +from . import metrics # noqa: F401 +from . import parsers # noqa: F401 +from . import readers # noqa: F401 +from . import trainers # noqa: F401 +from . import utils # noqa: F401 diff --git a/twml/twml/contrib/build_graphs_fns.py b/twml/twml/contrib/build_graphs_fns.py index 829f61512..108d9d4d1 100644 --- a/twml/twml/contrib/build_graphs_fns.py +++ b/twml/twml/contrib/build_graphs_fns.py @@ -1,32 +1,35 @@ # pylint: disable=unused-argument, missing-docstring -''' +""" Common build graphs that can be reused -''' +""" import tensorflow.compat.v1 as tf -def get_saved_modules_graph(input_graph_fn): - """ - Get common graph for stitching different saved modules for export. 
- This graph is used to save checkpoints; and then export the modules - as a unity. - Args: +def get_saved_modules_graph( + input_graph_fn: callable, params: dict, features: dict, mode: str = "train" +) -> dict: + """ + Get common graph for stitching different saved modules for export. + This graph is used to save checkpoints; and then export the modules + as a unity. + Args: features: - model features + model features params: - model params + model params input_graph_fn: - main logic for the stitching - Returns: - build_graph - """ - def build_graph(features, label, mode, params, config=None): + main logic for the stitching + mode: + the mode of the graph + Returns: + output of input_graph_fn + """ + output = input_graph_fn(features, params) # If mode is train, we just need to assign a dummy loss # and update the train op. This is done to save the graph to save_dir. - if mode == 'train': - loss = tf.constant(1) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss} + if mode == "train": + loss = tf.constant(1) + train_op = tf.assign_add(tf.train.get_global_step(), 1) + return {"train_op": train_op, "loss": loss} return output - return build_graph diff --git a/twml/twml/contrib/calibrators/__init__.py b/twml/twml/contrib/calibrators/__init__.py index 02181ed12..0f17fdf55 100644 --- a/twml/twml/contrib/calibrators/__init__.py +++ b/twml/twml/contrib/calibrators/__init__.py @@ -9,10 +9,13 @@ Ultimately, the ``Calibrator`` should produce an initialized layer via its ``to_layer()`` method. """ -from .common_calibrators import calibrate_discretizer_and_export, add_discretizer_arguments # noqa: F401 from .calibrator import Calibrator # noqa: F401 -from .mdl import MDLCalibrator # noqa: F401 +from .common_calibrators import add_discretizer_arguments # noqa: F401 +from .common_calibrators import calibrate_discretizer_and_export +from .hashed_percentile_discretizer import ( + HashedPercentileDiscretizerCalibrator, +) # noqa: F401 +from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 from .isotonic import IsotonicCalibrator # noqa: F401 +from .mdl import MDLCalibrator # noqa: F401 from .percentile_discretizer import PercentileDiscretizerCalibrator # noqa: F401 -from .hashed_percentile_discretizer import HashedPercentileDiscretizerCalibrator # noqa: F401 -from .hashing_discretizer import HashingDiscretizerCalibrator # noqa: F401 \ No newline at end of file diff --git a/twml/twml/contrib/calibrators/calibrator.py b/twml/twml/contrib/calibrators/calibrator.py index 7408412e0..aba4615e2 100644 --- a/twml/twml/contrib/calibrators/calibrator.py +++ b/twml/twml/contrib/calibrators/calibrator.py @@ -1,5 +1,5 @@ # pylint: disable=missing-docstring, unused-argument -''' Contains the base classes for CalibrationFeature and Calibrator ''' +""" Contains the base classes for CalibrationFeature and Calibrator """ from collections import defaultdict @@ -7,151 +7,155 @@ import numpy as np import tensorflow.compat.v1 as tf import tensorflow_hub as hub + import twml import twml.util class CalibrationFeature(object): - ''' - Accumulates values and weights for individual features. - Typically, each unique feature defined in the accumulated SparseTensor or Tensor - would have its own CalibrationFeature instance. - ''' - - def __init__(self, feature_id): - ''' Constructs a CalibrationFeature - - Arguments: - feature_id: - number identifying the feature. 
- ''' - self.feature_id = feature_id - self._calibrated = False - self._features_dict = defaultdict(list) - - def add_values(self, new_features): - ''' - Extends lists to contain the values in this batch - ''' - for key in new_features: - self._features_dict[key].append(new_features[key]) - - def _concat_arrays(self): - ''' - This class calls this function after you have added all the values. - It creates a dictionary with the concatanated arrays - ''' - self._features_dict.update((k, np.concatenate(v)) for k, v in self._features_dict.items()) - - def calibrate(self, *args, **kwargs): - raise NotImplementedError + """ + Accumulates values and weights for individual features. + Typically, each unique feature defined in the accumulated SparseTensor or Tensor + would have its own CalibrationFeature instance. + """ + + def __init__(self, feature_id: int): + """Constructs a CalibrationFeature + + Args: + feature_id: + number identifying the feature. + """ + self.feature_id = feature_id + self._calibrated = False + self._features_dict = defaultdict(list) + + def add_values(self, new_features: dict): + """Extends lists to contain the values in this batch""" + for key in new_features: + self._features_dict[key].append(new_features[key]) + + def _concat_arrays(self): + """ + This class calls this function after you have added all the values. + It creates a dictionary with the concatenated arrays + """ + for k, v in self._features_dict.items(): + self._features_dict[k] = np.concatenate(v) + + def calibrate(self, *args, **kwargs): + raise NotImplementedError class Calibrator(object): - ''' - Accumulates features and their respective values for Calibration - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()`` and; - 2. calibrate by calling ``calibrate()``; - 3. convert to a twml.layers layer by calling ``to_layer()``. - - Note you can only use one calibrator per Trainer. - ''' - - def __init__(self, calibrator_name=None, **kwargs): - ''' - Arguments: - calibrator_name. - Default: if set to None it will be the same as the class name. - Please be reminded that if in the model there are many calibrators - of the same type the calibrator_name should be changed to avoid confusion. - ''' - self._calibrated = False - if calibrator_name is None: - calibrator_name = twml.util.to_snake_case(self.__class__.__name__) - self._calibrator_name = calibrator_name - self._kwargs = kwargs - - @property - def is_calibrated(self): - return self._calibrated - - @property - def name(self): - return self._calibrator_name - - def accumulate(self, *args, **kwargs): - '''Accumulates features and their respective values for Calibration.''' - raise NotImplementedError - - def calibrate(self): - '''Calibrates after the accumulation has ended.''' - self._calibrated = True - - def to_layer(self, name=None): - ''' - Returns a twml.layers.Layer instance with the result of calibrator. - - Arguments: - name: - name-scope of the layer - ''' - raise NotImplementedError - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - - Returns: - dictionary of Layer constructor arguments to initialize the - layer Variables. Typically, this should contain enough information - to initialize empty layer Variables of the correct size, which will then - be filled with the right data using init_map. 
- ''' - raise NotImplementedError - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory. Default (string): "default". - name: - name for the calibrator. - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - output = calibrator_layer(inputs) - # creates the signature to the calibrator module - hub.add_signature(inputs=inputs, outputs=output, name=name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - def write_summary(self, writer, sess=None): """ - This method is called by save() to write tensorboard summaries to disk. - See MDLCalibrator.write_summary for an example. - By default, the method does nothing. It can be overloaded by child-classes. - - Arguments: - writer: - `tf.summary.FilteWriter - `_ - instance. - The ``writer`` is used to add summaries to event files for inclusion in tensorboard. - sess (optional): - `tf.Session `_ - instance. The ``sess`` is used to produces summaries for the writer. + Accumulates features and their respective values for Calibration + The steps for calibration are typically as follows: + + 1. accumulate feature values from batches by calling ``accumulate()`` and; + 2. calibrate by calling ``calibrate()``; + 3. convert to a twml.layers layer by calling ``to_layer()``. + + Note you can only use one calibrator per Trainer. """ + + def __init__(self, calibrator_name: str = None, **kwargs): + """ + Args: + calibrator_name (str): + Default: if set to None it will be the same as the class name. + Please be reminded that if in the model there are many calibrators + of the same type the calibrator_name should be changed to avoid confusion. + """ + self._calibrated = False + if calibrator_name is None: + calibrator_name = twml.util.to_snake_case(self.__class__.__name__) + self._calibrator_name = calibrator_name + self._kwargs = kwargs + + @property + def is_calibrated(self) -> bool: + return self._calibrated + + @property + def name(self) -> str: + return self._calibrator_name + + def accumulate(self, *args, **kwargs): + """Accumulates features and their respective values for Calibration.""" + raise NotImplementedError + + def calibrate(self): + """Calibrates after the accumulation has ended.""" + self._calibrated = True + + def to_layer(self, name: str = None): + """ + Returns a twml.layers.Layer instance with the result of calibrator. + + Args: + name (str): + name-scope of the layer + """ + raise NotImplementedError + + def get_layer_args(self): + """ + Returns layer arguments required to implement multi-phase training. + + Returns: + dictionary of Layer constructor arguments to initialize the + layer Variables. Typically, this should contain enough information + to initialize empty layer Variables of the correct size, which will then + be filled with the right data using init_map. 
+ """ + raise NotImplementedError + + def save( + self, save_dir: str, name: str = "default", verbose: bool = False + ): # pylint: disable=unused-argument + """Save the calibrator into the given save_directory. + Args: + save_dir (str): + name of the saving directory. + name (str): + name for the calibrator. Default (string): "default". + """ + if not self._calibrated: + raise RuntimeError( + "Expecting prior call to calibrate().Cannot save() prior to calibrate()" + ) + + # This module allows for the calibrator to save be saved as part of + # Tensorflow Hub (this will allow it to be used in further steps) + def calibrator_module(): + # Note that this is usually expecting a sparse_placeholder + inputs = tf.sparse_placeholder(tf.float32) + calibrator_layer = self.to_layer() + output = calibrator_layer(inputs) + # creates the signature to the calibrator module + hub.add_signature(inputs=inputs, outputs=output, name=name) + + # exports the module to the save_dir + spec = hub.create_module_spec(calibrator_module) + with tf.Graph().as_default(): + module = hub.Module(spec) + with tf.Session() as session: + module.export(save_dir, session) + + def write_summary(self, writer: tf.summary.FileWriter, sess=None): + """ + This method is called by save() to write tensorboard summaries to disk. + See MDLCalibrator.write_summary for an example. + By default, the method does nothing. It can be overloaded by child-classes. + + Args: + writer: + `tf.summary.FileWriter + `_ + instance. + The ``writer`` is used to add summaries to event files for inclusion in tensorboard. + sess (optional): + `tf.Session `_ + instance. The ``sess`` is used to produces summaries for the writer. + """ diff --git a/twml/twml/contrib/calibrators/common_calibrators.py b/twml/twml/contrib/calibrators/common_calibrators.py index 5301901e4..f554fceb5 100644 --- a/twml/twml/contrib/calibrators/common_calibrators.py +++ b/twml/twml/contrib/calibrators/common_calibrators.py @@ -9,699 +9,952 @@ # TODO: many of these functions aren't common at all. # For example, Discretizer functions should be moved to PercentileDiscretizer. +import argparse import copy import os import time +from typing import Callable -from absl import logging import tensorflow.compat.v1 as tf import tensorflow_hub as hub +from absl import logging + import twml from twml.argument_parser import SortingHelpFormatter +from twml.contrib.calibrators.isotonic import IsotonicCalibrator from twml.input_fns import data_record_input_fn +from twml.twml.feature_config import FeatureConfig +from twml.twml.trainers.trainer import Trainer from twml.util import list_files_by_datetime, sanitize_hdfs_path -from twml.contrib.calibrators.isotonic import IsotonicCalibrator -def calibrator_arguments(parser): - """ - Calibrator Parameters to add to relevant parameters to the DataRecordTrainerParser. - Otherwise, if alone in a file, it just creates its own default parser. 
- Arguments: - parser: - Parser with the options to the model - """ - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", - help="Path to save or load calibrator calibration") - parser.add_argument("--calibrator_batch_size", type=int, default=128, - dest="calibrator_batch_size", - help="calibrator batch size") - parser.add_argument("--calibrator_parts_downsampling_rate", type=float, default=1, - dest="calibrator_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--calibrator_max_steps", type=int, default=None, - dest="calibrator_max_steps", - help="Max Steps taken by calibrator to accumulate samples") - parser.add_argument("--calibrator_num_bins", type=int, default=22, - dest="calibrator_num_bins", - help="Num bins of calibrator") - parser.add_argument("--isotonic_calibrator", dest='isotonic_calibrator', action='store_true', - help="Isotonic Calibrator present") - parser.add_argument("--calibrator_keep_rate", type=float, default=1.0, - dest="calibrator_keep_rate", - help="Keep rate") - return parser +def calibrator_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + """ + Calibrator Parameters to add to relevant parameters to the DataRecordTrainerParser. + Otherwise, if alone in a file, it just creates its own default parser. + + Args: + parser: + Parser with the options to the model + """ + parser.add_argument( + "--calibrator.save_dir", + type=str, + dest="calibrator_save_dir", + help="Path to save or load calibrator calibration", + ) + parser.add_argument( + "--calibrator_batch_size", + type=int, + default=128, + dest="calibrator_batch_size", + help="calibrator batch size", + ) + parser.add_argument( + "--calibrator_parts_downsampling_rate", + type=float, + default=1, + dest="calibrator_parts_downsampling_rate", + help="Parts downsampling rate", + ) + parser.add_argument( + "--calibrator_max_steps", + type=int, + default=None, + dest="calibrator_max_steps", + help="Max Steps taken by calibrator to accumulate samples", + ) + parser.add_argument( + "--calibrator_num_bins", + type=int, + default=22, + dest="calibrator_num_bins", + help="Num bins of calibrator", + ) + parser.add_argument( + "--isotonic_calibrator", + dest="isotonic_calibrator", + action="store_true", + help="Isotonic Calibrator present", + ) + parser.add_argument( + "--calibrator_keep_rate", + type=float, + default=1.0, + dest="calibrator_keep_rate", + help="Keep rate", + ) + return parser def _generate_files_by_datetime(params): + files = list_files_by_datetime( + base_path=sanitize_hdfs_path(params.train_data_dir), + start_datetime=params.train_start_datetime, + end_datetime=params.train_end_datetime, + datetime_prefix_format=params.datetime_format, + extension="lzo", + parallelism=1, + hour_resolution=params.hour_resolution, + sort=True, + ) + + return files + + +def get_calibrate_input_fn(parse_fn: callable, params: argparse.Namespace) -> callable: + """ + Default input function used for the calibrator. 
+ Args:
+ parse_fn:
+ Parse function used to decode the input data
+ params:
+ Parameters
+ Returns:
+ input_fn
+ """
+
+ return lambda: data_record_input_fn(
+ files=_generate_files_by_datetime(params),
+ batch_size=params.calibrator_batch_size,
+ parse_fn=parse_fn,
+ num_threads=1,
+ repeat=False,
+ keep_rate=params.calibrator_keep_rate,
+ parts_downsampling_rate=params.calibrator_parts_downsampling_rate,
+ shards=None,
+ shard_index=None,
+ shuffle=True,
+ shuffle_files=True,
+ interleave=True,
+ )
- files = list_files_by_datetime(
- base_path=sanitize_hdfs_path(params.train_data_dir),
- start_datetime=params.train_start_datetime,
- end_datetime=params.train_end_datetime,
- datetime_prefix_format=params.datetime_format,
- extension="lzo",
- parallelism=1,
- hour_resolution=params.hour_resolution,
- sort=True)
-
- return files
-
-
-def get_calibrate_input_fn(parse_fn, params):
- """
- Default input function used for the calibrator.
- Arguments:
- parse_fn:
- Parse_fn
- params:
- Parameters
- Returns:
- input_fn
- """
-
- return lambda: data_record_input_fn(
- files=_generate_files_by_datetime(params),
- batch_size=params.calibrator_batch_size,
- parse_fn=parse_fn,
- num_threads=1,
- repeat=False,
- keep_rate=params.calibrator_keep_rate,
- parts_downsampling_rate=params.calibrator_parts_downsampling_rate,
- shards=None,
- shard_index=None,
- shuffle=True,
- shuffle_files=True,
- interleave=True)
-
-
-def get_discretize_input_fn(parse_fn, params):
- """
- Default input function used for the calibrator.
- Arguments:
- parse_fn:
- Parse_fn
- params:
- Parameters
- Returns:
- input_fn
- """
-
- return lambda: data_record_input_fn(
- files=_generate_files_by_datetime(params),
- batch_size=params.discretizer_batch_size,
- parse_fn=parse_fn,
- num_threads=1,
- repeat=False,
- keep_rate=params.discretizer_keep_rate,
- parts_downsampling_rate=params.discretizer_parts_downsampling_rate,
- shards=None,
- shard_index=None,
- shuffle=True,
- shuffle_files=True,
- interleave=True)
-
-
-def discretizer_arguments(parser=None):
- """
- Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser.
- Otherwise, if alone in a file, it just creates its own default parser.
- Arguments:
- parser:
- Parser with the options to the model. Defaults to None
- """
-
- if parser is None:
- parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter)
+
+def get_discretize_input_fn(parse_fn: callable, params: argparse.Namespace) -> callable:
+ """
+ Default input function used for the discretizer.
+ Args:
+ parse_fn:
+ Parse function used to decode the input data
+ params:
+ Parameters
+ Returns:
+ input_fn
+ """
+
+ return lambda: data_record_input_fn(
+ files=_generate_files_by_datetime(params),
+ batch_size=params.discretizer_batch_size,
+ parse_fn=parse_fn,
+ num_threads=1,
+ repeat=False,
+ keep_rate=params.discretizer_keep_rate,
+ parts_downsampling_rate=params.discretizer_parts_downsampling_rate,
+ shards=None,
+ shard_index=None,
+ shuffle=True,
+ shuffle_files=True,
+ interleave=True,
+ )
+
+
+def discretizer_arguments(parser: argparse.ArgumentParser = None):
+ """
+ Discretizer Parameters to add to relevant parameters to the DataRecordTrainerParser.
+ Otherwise, if alone in a file, it just creates its own default parser.
+
+ Args:
+ parser:
+ Parser with the options to the model. 
Defaults to None + """ + + if parser is None: + parser = twml.DefaultSubcommandArgParse(formatter_class=SortingHelpFormatter) + parser.add_argument( + "--overwrite_save_dir", + dest="overwrite_save_dir", + action="store_true", + help="Delete the contents of the current save_dir if it exists", + ) + parser.add_argument( + "--train.data_dir", + "--train_data_dir", + type=str, + default=None, + dest="train_data_dir", + help="Path to the training data directory." + "Supports local and HDFS (hdfs://default/ ) paths.", + ) + parser.add_argument( + "--train.start_date", + "--train_start_datetime", + type=str, + default=None, + dest="train_start_datetime", + help="Starting date for training inside the train data dir." + "The start datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--train.end_date", + "--train_end_datetime", + type=str, + default=None, + dest="train_end_datetime", + help="Ending date for training inside the train data dir." + "The end datetime is inclusive." + "e.g. 2019/01/15", + ) + parser.add_argument( + "--datetime_format", + type=str, + default="%Y/%m/%d", + help="Date format for training and evaluation datasets." + "Has to be a format that is understood by python datetime." + "e.g. %Y/%m/%d for 2019/01/15." + "Used only if {train/eval}.{start/end}_date are provided.", + ) + parser.add_argument( + "--hour_resolution", + type=int, + default=None, + help="Specify the hourly resolution of the stored data.", + ) + parser.add_argument( + "--tensorboard_port", + type=int, + default=None, + help="Port for tensorboard to run on.", + ) + parser.add_argument( + "--stats_port", + type=int, + default=None, + help="Port for stats server to run on.", + ) + parser.add_argument( + "--health_port", + type=int, + default=None, + help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." + "Not user-facing as it is set automatically by the twml_cli.", + ) + parser.add_argument( + "--data_spec", + type=str, + default=None, + help="Path to data specification JSON file. This file is used to decode DataRecords", + ) parser.add_argument( - "--overwrite_save_dir", dest="overwrite_save_dir", action="store_true", - help="Delete the contents of the current save_dir if it exists") + "--discretizer.save_dir", + type=str, + dest="discretizer_save_dir", + help="Path to save or load discretizer calibration", + ) parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local and HDFS (hdfs://default/ ) paths.") + "--discretizer_batch_size", + type=int, + default=128, + dest="discretizer_batch_size", + help="Discretizer batch size", + ) parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") + "--discretizer_keep_rate", + type=float, + default=0.0008, + dest="discretizer_keep_rate", + help="Keep rate", + ) parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 
2019/01/15") + "--discretizer_parts_downsampling_rate", + type=float, + default=0.2, + dest="discretizer_parts_downsampling_rate", + help="Parts downsampling rate", + ) parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %Y/%m/%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") + "--discretizer_max_steps", + type=int, + default=None, + dest="discretizer_max_steps", + help="Max Steps taken by discretizer to accumulate samples", + ) + return parser + + +def calibrate( + trainer: Trainer, + params: argparse.Namespace, + build_graph: callable, + input_fn: callable, + debug: bool = False, +): + """ + Calibrate Isotonic Calibration + Args: + trainer: + Trainer + params: + Parameters + build_graph: + Build Graph used to be the input to the calibrator + input_fn: + Input Function specified by the user + debug: + Defaults to False. Returns the calibrator + """ + + if trainer._estimator.config.is_chief: + # overwrite the current save_dir + if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): + logging.info( + "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" + % params.calibrator_save_dir + ) + tf.io.gfile.rmtree(params.calibrator_save_dir) + + calibrator = IsotonicCalibrator(params.calibrator_num_bins) + + # chief trains discretizer + logging.info("Chief training calibrator") + + # Accumulate the features for each calibrator + features, labels = input_fn() + if "weights" not in features: + raise ValueError("Weights need to be returned as part of the parse_fn") + weights = features.pop("weights") + + preds = build_graph( + features=features, label=None, mode="infer", params=params, config=None + ) + init = tf.global_variables_initializer() + table_init = tf.tables_initializer() + with tf.Session() as sess: + sess.run(init) + sess.run(table_init) + count = 0 + max_steps = params.calibrator_max_steps or -1 + while max_steps <= 0 or count <= max_steps: + try: + weights_vals, labels_vals, preds_vals = sess.run( + [weights, labels, preds["output"]] + ) + calibrator.accumulate( + preds_vals, labels_vals, weights_vals.flatten() + ) + except tf.errors.OutOfRangeError: + break + count += 1 + + calibrator.calibrate() + calibrator.save(params.calibrator_save_dir) + trainer.estimator._params.isotonic_calibrator = True + + if debug: + return calibrator + + else: + calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + calibrator_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) + time.sleep(60) + + +def discretize( + params: argparse.Namespace, + feature_config: dict, + input_fn: callable, + debug: bool = False, +): + """ + Discretizes continuous features + + Args: + params (argparse.Namespace): + Parameters + feature_config (dict): + Feature Config + input_fn (callable): + Input Function specified by the user + debug (bool): + Defaults to False. 
Returns the calibrator + """ + + if ( + os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" + or "num_workers" not in params + or params.num_workers is None + ): + # overwrite the current save_dir + if params.overwrite_save_dir and tf.io.gfile.exists( + params.discretizer_save_dir + ): + logging.info( + "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" + % params.discretizer_save_dir + ) + tf.io.gfile.rmtree(params.discretizer_save_dir) + + config_map = feature_config() + discretize_dict = config_map["discretize_config"] + + # chief trains discretizer + logging.info("Chief training discretizer") + + batch = input_fn() + # Accumulate the features for each calibrator + with tf.Session() as sess: + count = 0 + max_steps = params.discretizer_max_steps or -1 + while max_steps <= 0 or count <= max_steps: + try: + inputs = sess.run(batch) + for name, clbrt in discretize_dict.items(): + clbrt.accumulate_features(inputs[0], name) + except tf.errors.OutOfRangeError: + break + count += 1 + + # This module allows for the calibrator to save be saved as part of + # Tensorflow Hub (this will allow it to be used in further steps) + def calibrator_module(): + # Note that this is usually expecting a sparse_placeholder + for name, clbrt in discretize_dict.items(): + clbrt.calibrate() + clbrt.add_hub_signatures(name) + + # exports the module to the save_dir + spec = hub.create_module_spec(calibrator_module) + with tf.Graph().as_default(): + module = hub.Module(spec) + with tf.Session() as session: + module.export(params.discretizer_save_dir, session) + + for name, clbrt in discretize_dict.items(): + clbrt.write_summary_json(params.discretizer_save_dir, name) + + if debug: + return discretize_dict + + else: + # wait for the file to be removed (if necessary) + # should be removed after an actual fix applied + time.sleep(60) + discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + discretizer_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) + time.sleep(60) + + +def add_discretizer_arguments(parser): + """ + Add discretizer-specific command-line arguments to a Trainer parser. + + Args: + parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser + + Returns: + argparse.ArgumentParser instance with discretizer-specific arguments added + """ + parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") + "--discretizer.save_dir", + type=str, + dest="discretizer_save_dir", + help="Path to save or load discretizer calibration", + ) parser.add_argument( - "--tensorboard_port", type=int, default=None, - help="Port for tensorboard to run on.") + "--discretizer.batch_size", + type=int, + default=128, + dest="discretizer_batch_size", + help="Discretizer batch size", + ) + parser.add_argument( + "--discretizer.keep_rate", + type=float, + default=0.0008, + dest="discretizer_keep_rate", + help="Keep rate", + ) parser.add_argument( - "--stats_port", type=int, default=None, - help="Port for stats server to run on.") + "--discretizer.parts_downsampling_rate", + type=float, + default=0.2, + dest="discretizer_parts_downsampling_rate", + help="Parts downsampling rate", + ) parser.add_argument( - "--health_port", type=int, default=None, - help="Port to listen on for health-related endpoints (e.g. graceful shutdown)." 
- "Not user-facing as it is set automatically by the twml_cli." + "--discretizer.num_bins", + type=int, + default=20, + dest="discretizer_num_bins", + help="Number of bins per feature", ) parser.add_argument( - "--data_spec", type=str, default=None, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer_batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer_keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer_parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer_max_steps", type=int, default=None, - dest="discretizer_max_steps", - help="Max Steps taken by discretizer to accumulate samples") - return parser - - -def calibrate(trainer, params, build_graph, input_fn, debug=False): - """ - Calibrate Isotonic Calibration - Arguments: - trainer: - Trainer - params: - Parameters - build_graph: - Build Graph used to be the input to the calibrator - input_fn: - Input Function specified by the user - debug: - Defaults to False. Returns the calibrator - """ - - if trainer._estimator.config.is_chief: - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.calibrator_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.calibrator_save_dir) - tf.io.gfile.rmtree(params.calibrator_save_dir) - - calibrator = IsotonicCalibrator(params.calibrator_num_bins) - - # chief trains discretizer - logging.info("Chief training calibrator") - - # Accumulate the features for each calibrator - features, labels = input_fn() - if 'weights' not in features: - raise ValueError("Weights need to be returned as part of the parse_fn") - weights = features.pop('weights') - - preds = build_graph(features=features, label=None, mode='infer', params=params, config=None) - init = tf.global_variables_initializer() - table_init = tf.tables_initializer() - with tf.Session() as sess: - sess.run(init) - sess.run(table_init) - count = 0 - max_steps = params.calibrator_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - weights_vals, labels_vals, preds_vals = sess.run([weights, labels, preds['output']]) - calibrator.accumulate(preds_vals, labels_vals, weights_vals.flatten()) - except tf.errors.OutOfRangeError: - break - count += 1 - - calibrator.calibrate() - calibrator.save(params.calibrator_save_dir) - trainer.estimator._params.isotonic_calibrator = True - - if debug: - return calibrator - - else: - calibrator_save_dir = twml.util.sanitize_hdfs_path(params.calibrator_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(calibrator_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % calibrator_save_dir) - time.sleep(60) - - -def discretize(params, feature_config, input_fn, debug=False): - """ - Discretizes continuous features - Arguments: - params: - Parameters - input_fn: - Input Function specified by the user - debug: - Defaults to False. 
Returns the calibrator - """ - - if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or - params.num_workers is None): - - # overwrite the current save_dir - if params.overwrite_save_dir and tf.io.gfile.exists(params.discretizer_save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % params.discretizer_save_dir) - tf.io.gfile.rmtree(params.discretizer_save_dir) - - config_map = feature_config() - discretize_dict = config_map['discretize_config'] - - # chief trains discretizer - logging.info("Chief training discretizer") - - batch = input_fn() - # Accumulate the features for each calibrator - with tf.Session() as sess: - count = 0 - max_steps = params.discretizer_max_steps or -1 - while max_steps <= 0 or count <= max_steps: - try: - inputs = sess.run(batch) - for name, clbrt in discretize_dict.items(): - clbrt.accumulate_features(inputs[0], name) - except tf.errors.OutOfRangeError: - break - count += 1 - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - for name, clbrt in discretize_dict.items(): - clbrt.calibrate() - clbrt.add_hub_signatures(name) - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(params.discretizer_save_dir, session) - - for name, clbrt in discretize_dict.items(): - clbrt.write_summary_json(params.discretizer_save_dir, name) - - if debug: - return discretize_dict - - else: - # wait for the file to be removed (if necessary) - # should be removed after an actual fix applied - time.sleep(60) - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) + "--discretizer.output_size_bits", + type=int, + default=22, + dest="discretizer_output_size_bits", + help="Number of bits allocated to the output size", + ) + return parser -def add_discretizer_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. 
- - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument("--discretizer.save_dir", type=str, - dest="discretizer_save_dir", - help="Path to save or load discretizer calibration") - parser.add_argument("--discretizer.batch_size", type=int, default=128, - dest="discretizer_batch_size", - help="Discretizer batch size") - parser.add_argument("--discretizer.keep_rate", type=float, default=0.0008, - dest="discretizer_keep_rate", - help="Keep rate") - parser.add_argument("--discretizer.parts_downsampling_rate", type=float, default=0.2, - dest="discretizer_parts_downsampling_rate", - help="Parts downsampling rate") - parser.add_argument("--discretizer.num_bins", type=int, default=20, - dest="discretizer_num_bins", - help="Number of bins per feature") - parser.add_argument("--discretizer.output_size_bits", type=int, default=22, - dest="discretizer_output_size_bits", - help="Number of bits allocated to the output size") - return parser - - -def add_isotonic_calibrator_arguments(parser): - """ - Add discretizer-specific command-line arguments to a Trainer parser. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - parser.add_argument("--calibrator.num_bins", type=int, - default=25000, dest="calibrator_num_bins", - help="number of bins for isotonic calibration") - parser.add_argument("--calibrator.parts_downsampling_rate", type=float, default=0.1, - dest="calibrator_parts_downsampling_rate", help="Parts downsampling rate") - parser.add_argument("--calibrator.save_dir", type=str, - dest="calibrator_save_dir", help="Path to save or load calibrator output") - parser.add_argument("--calibrator.load_tensorflow_module", type=str, default=None, - dest="calibrator_load_tensorflow_module", - help="Location from where to load a pretrained graph from. \ - Typically, this is where the MLP graph is saved") - parser.add_argument("--calibrator.export_mlp_module_name", type=str, default='tf_hub_mlp', - help="Name for loaded hub signature", - dest="export_mlp_module_name") - parser.add_argument("--calibrator.export_isotonic_module_name", - type=str, default="tf_hub_isotonic", - dest="calibrator_export_module_name", - help="export module name") - parser.add_argument("--calibrator.final_evaluation_steps", type=int, - dest="calibrator_final_evaluation_steps", default=None, - help="number of steps for final evaluation") - parser.add_argument("--calibrator.train_steps", type=int, default=-1, - dest="calibrator_train_steps", - help="number of steps for calibration") - parser.add_argument("--calibrator.batch_size", type=int, default=1024, - dest="calibrator_batch_size", - help="Calibrator batch size") - parser.add_argument("--calibrator.is_calibrating", action='store_true', - dest="is_calibrating", - help="Dummy argument to allow running in chief worker") - return parser - - -def calibrate_calibrator_and_export(name, calibrator, build_graph_fn, params, feature_config, - run_eval=True, input_fn=None, metric_fn=None, - export_task_type_overrider=None): - """ - Pre-set `isotonic calibrator` calibrator. - Args: - name: - scope name used for the calibrator - calibrator: - calibrator that will be calibrated and exported. 
- build_graph_fn: - build graph function for the calibrator - params: - params passed to the calibrator - feature_config: - feature config which will be passed to the trainer - export_task_type_overrider: - the task type for exporting the calibrator - if specified, this will override the default export task type in trainer.hub_export(..) - """ - - # create calibrator params - params_c = copy.deepcopy(params) - params_c.data_threads = 1 - params_c.num_workers = 1 - params_c.continue_from_checkpoint = True - params_c.overwrite_save_dir = False - params_c.stats_port = None - - # Automatically load from the saved Tensorflow Hub module if not specified. - if params_c.calibrator_load_tensorflow_module is None: - path_saved_tensorflow_model = os.path.join(params.save_dir, params.export_mlp_module_name) - params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model - - if "calibrator_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.calibrator_parts_downsampling_rate - if "calibrator_save_dir" in params_c: - params_c.save_dir = params_c.calibrator_save_dir - if "calibrator_batch_size" in params_c: - params_c.train_batch_size = params_c.calibrator_batch_size - params_c.eval_batch_size = params_c.calibrator_batch_size - # TODO: Deprecate this option. It is not actually used. Calibrator - # simply iterates until the end of input_fn. - if "calibrator_train_steps" in params_c: - params_c.train_steps = params_c.calibrator_train_steps - - if metric_fn is None: - metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) - - # Common Trainer which will also be used by all workers - trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - feature_config=feature_config, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, - metric_fn=metric_fn - ) - - if trainer._estimator.config.is_chief: - - # Chief trains calibrator - logging.info("Chief training calibrator") - - # Disregard hogwild config - os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") - os.environ["TWML_HOGWILD_PORTS"] = "" - - hooks = None - if params_c.calibrator_train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] - - def parse_fn(input_x): - fc_parse_fn = feature_config.get_parse_fn() - features, labels = fc_parse_fn(input_x) - features['labels'] = labels - return features, labels - - if input_fn is None: - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - - # Calibrate stage - trainer.estimator._params.mode = 'calibrate' - trainer.calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=params_c.calibrator_train_steps, - hooks=hooks) - - # Save Checkpoint - # We need to train for 1 step, to save the graph to checkpoint. - # This is done just by the chief. 
-    # We need to set the mode to evaluate to save the graph that will be consumed
-    # In the final evaluation
-    trainer.estimator._params.mode = 'evaluate'
-    trainer.train(input_fn=input_fn, steps=1)
-
-    # Restore hogwild setup
-    if os_twml_hogwild_ports is not None:
-      os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports
-  else:
-    # Workers wait for calibration to be ready
-    final_calibrator_path = os.path.join(params_c.calibrator_save_dir,
-                                         params_c.calibrator_export_module_name)
-
-    final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path)
-
-    while not tf.io.gfile.exists(final_calibrator_path + os.path.sep + "tfhub_module.pb"):
-      logging.info("Worker waiting for calibration at %s" % final_calibrator_path)
-      time.sleep(60)
-
-  # Evaluate stage
-  if run_eval:
-    trainer.estimator._params.mode = 'evaluate'
-    # This will allow the Evaluate method to be run in Hogwild
-    # trainer.estimator._params.continue_from_checkpoint = True
-    trainer.evaluate(name='test', input_fn=input_fn, steps=params_c.calibrator_final_evaluation_steps)
-
-  trainer.hub_export(name=params_c.calibrator_export_module_name,
-                     export_task_type_overrider=export_task_type_overrider,
-                     serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn())
-
-  return trainer
-
-
-def calibrate_discretizer_and_export(name, calibrator, build_graph_fn, params, feature_config):
-  """
-  Pre-set percentile discretizer calibrator.
-  Args:
-    name:
-      scope name used for the calibrator
-    calibrator:
-      calibrator that will be calibrated and exported.
-    build_graph_fn:
-      build graph function for the calibrator
-    params:
-      params passed to the calibrator
-    feature_config:
-      feature config or input_fn which will be passed to the trainer.
-  """
-
-  if (os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" or "num_workers" not in params or
-      params.num_workers is None):
-
-    # chief trains discretizer
-    logging.info("Chief training discretizer")
-
-    # disregard hogwild config
-    os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS")
-    os.environ["TWML_HOGWILD_PORTS"] = ""
-
-    # create discretizer params
+def add_isotonic_calibrator_arguments(
+    parser: argparse.ArgumentParser,
+) -> argparse.ArgumentParser:
+    """
+    Add isotonic-calibrator-specific command-line arguments to a Trainer parser.
+
+    Args:
+      parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser
+
+    Returns:
+      argparse.ArgumentParser instance with isotonic-calibrator-specific arguments added
+    """
+    parser.add_argument(
+        "--calibrator.num_bins",
+        type=int,
+        default=25000,
+        dest="calibrator_num_bins",
+        help="number of bins for isotonic calibration",
+    )
+    parser.add_argument(
+        "--calibrator.parts_downsampling_rate",
+        type=float,
+        default=0.1,
+        dest="calibrator_parts_downsampling_rate",
+        help="Parts downsampling rate",
+    )
+    parser.add_argument(
+        "--calibrator.save_dir",
+        type=str,
+        dest="calibrator_save_dir",
+        help="Path to save or load calibrator output",
+    )
+    parser.add_argument(
+        "--calibrator.load_tensorflow_module",
+        type=str,
+        default=None,
+        dest="calibrator_load_tensorflow_module",
+        help="Location from which to load a pretrained graph. 
Typically, this is where the MLP graph is saved", + ) + parser.add_argument( + "--calibrator.export_mlp_module_name", + type=str, + default="tf_hub_mlp", + help="Name for loaded hub signature", + dest="export_mlp_module_name", + ) + parser.add_argument( + "--calibrator.export_isotonic_module_name", + type=str, + default="tf_hub_isotonic", + dest="calibrator_export_module_name", + help="export module name", + ) + parser.add_argument( + "--calibrator.final_evaluation_steps", + type=int, + dest="calibrator_final_evaluation_steps", + default=None, + help="number of steps for final evaluation", + ) + parser.add_argument( + "--calibrator.train_steps", + type=int, + default=-1, + dest="calibrator_train_steps", + help="number of steps for calibration", + ) + parser.add_argument( + "--calibrator.batch_size", + type=int, + default=1024, + dest="calibrator_batch_size", + help="Calibrator batch size", + ) + parser.add_argument( + "--calibrator.is_calibrating", + action="store_true", + dest="is_calibrating", + help="Dummy argument to allow running in chief worker", + ) + return parser + + +def calibrate_calibrator_and_export( + name: str, + calibrator: tf.estimator.Estimator, + build_graph_fn: Callable, + params: tf.contrib.training.HParams, + feature_config: FeatureConfig, + run_eval: bool = True, + input_fn: Callable = None, + metric_fn: Callable = None, + export_task_type_overrider: str = None, +): + """ + Pre-set `isotonic calibrator` calibrator. + Args: + name (str): + scope name used for the calibrator + calibrator (tf.estimator.Estimator): + calibrator that will be calibrated and exported. + build_graph_fn (Callable): + build graph function for the calibrator + params (tf.contrib.training.HParams): + params passed to the calibrator + feature_config (FeatureConfig): + feature config which will be passed to the trainer + run_eval (bool): + whether to run evaluation after calibration. Default is True. + input_fn (Callable): + input function for the calibrator. If not specified, the default input function will be used. + metric_fn (Callable): + metric function for the calibrator. If not specified, the default metric function will be used. + export_task_type_overrider: + the task type for exporting the calibrator + if specified, this will override the default export task type in trainer.hub_export(..) + """ + + # create calibrator params params_c = copy.deepcopy(params) params_c.data_threads = 1 - params_c.train_steps = -1 - params_c.train_max_steps = None - params_c.eval_steps = -1 params_c.num_workers = 1 - params_c.tensorboard_port = None + params_c.continue_from_checkpoint = True + params_c.overwrite_save_dir = False params_c.stats_port = None - if "discretizer_batch_size" in params_c: - params_c.train_batch_size = params_c.discretizer_batch_size - params_c.eval_batch_size = params_c.discretizer_batch_size - if "discretizer_keep_rate" in params_c: - params_c.train_keep_rate = params_c.discretizer_keep_rate - if "discretizer_parts_downsampling_rate" in params_c: - params_c.train_parts_downsampling_rate = params_c.discretizer_parts_downsampling_rate - if "discretizer_save_dir" in params_c: - params_c.save_dir = params_c.discretizer_save_dir - - # train discretizer + # Automatically load from the saved Tensorflow Hub module if not specified. 
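+    # For example (illustrative values only, not from the original code): with
+    # save_dir="/tmp/my_model" and export_mlp_module_name="tf_hub_mlp", the
+    # join below resolves to "/tmp/my_model/tf_hub_mlp".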
+ if params_c.calibrator_load_tensorflow_module is None: + path_saved_tensorflow_model = os.path.join( + params.save_dir, params.export_mlp_module_name + ) + params_c.calibrator_load_tensorflow_module = path_saved_tensorflow_model + + if "calibrator_parts_downsampling_rate" in params_c: + params_c.train_parts_downsampling_rate = ( + params_c.calibrator_parts_downsampling_rate + ) + + if "calibrator_save_dir" in params_c: + params_c.save_dir = params_c.calibrator_save_dir + + if "calibrator_batch_size" in params_c: + params_c.train_batch_size = params_c.calibrator_batch_size + params_c.eval_batch_size = params_c.calibrator_batch_size + + # TODO: Deprecate this option. It is not actually used. Calibrator simply iterates until the end of input_fn. + if "calibrator_train_steps" in params_c: + params_c.train_steps = params_c.calibrator_train_steps + + if metric_fn is None: + metric_fn = twml.metrics.get_multi_binary_class_metric_fn(None) + + # Common Trainer which will also be used by all workers trainer = twml.trainers.DataRecordTrainer( - name=name, - params=params_c, - build_graph_fn=build_graph_fn, - save_dir=params_c.save_dir, + name=name, + params=params_c, + feature_config=feature_config, + build_graph_fn=build_graph_fn, + save_dir=params_c.save_dir, + metric_fn=metric_fn, + ) + + if trainer._estimator.config.is_chief: + # Chief trains calibrator + logging.info("Chief training calibrator") + + # Disregard hogwild config + os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") + os.environ["TWML_HOGWILD_PORTS"] = "" + + hooks = None + if params_c.calibrator_train_steps > 0: + hooks = [twml.hooks.StepProgressHook(params_c.calibrator_train_steps)] + + def parse_fn(input_x): + fc_parse_fn = feature_config.get_parse_fn() + features, labels = fc_parse_fn(input_x) + features["labels"] = labels + return features, labels + + if input_fn is None: + input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) + + # Calibrate stage + trainer.estimator._params.mode = "calibrate" + trainer.calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=params_c.calibrator_train_steps, + hooks=hooks, + ) + + # Save Checkpoint + # We need to train for 1 step, to save the graph to checkpoint. + # This is done just by the chief. 
+ # We need to set the mode to evaluate to save the graph that will be consumed + # In the final evaluation + trainer.estimator._params.mode = "evaluate" + trainer.train(input_fn=input_fn, steps=1) + + # Restore hogwild setup + if os_twml_hogwild_ports is not None: + os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports + else: + # Workers wait for calibration to be ready + final_calibrator_path = os.path.join( + params_c.calibrator_save_dir, params_c.calibrator_export_module_name + ) + + final_calibrator_path = twml.util.sanitize_hdfs_path(final_calibrator_path) + + while not tf.io.gfile.exists( + final_calibrator_path + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % final_calibrator_path) + time.sleep(60) + + # Evaluate stage + if run_eval: + trainer.estimator._params.mode = "evaluate" + # This will allow the Evaluate method to be run in Hogwild + # trainer.estimator._params.continue_from_checkpoint = True + trainer.evaluate( + name="test", + input_fn=input_fn, + steps=params_c.calibrator_final_evaluation_steps, + ) + + trainer.hub_export( + name=params_c.calibrator_export_module_name, + export_task_type_overrider=export_task_type_overrider, + serving_input_receiver_fn=feature_config.get_serving_input_receiver_fn(), ) - if isinstance(feature_config, twml.feature_config.FeatureConfig): - parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) - input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) - elif callable(feature_config): - input_fn = feature_config + return trainer + + +def calibrate_discretizer_and_export( + name: str, + calibrator: twml.calibrators.Calibrator, + build_graph_fn: callable, + params: twml.params.TrainParams, + feature_config: FeatureConfig, +): + """ + Pre-set percentile discretizer calibrator. + Args: + name (str): + scope name used for the calibrator + calibrator (twml.calibrators.Calibrator): + calibrator that will be calibrated and exported. + build_graph_fn (function): + build graph function for the calibrator + params (twml.params.TrainParams): + params passed to the calibrator + feature_config (twml.feature_config.FeatureConfig): + feature config or input_fn which will be passed to the trainer. 
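+
+    Example (illustrative sketch; ``my_calibrator``, ``my_params`` and
+    ``my_feature_config`` are caller-supplied placeholders, not values from
+    this module):
+
+        calibrate_discretizer_and_export(
+            name="percentile_discretizer",
+            calibrator=my_calibrator,  # e.g. an MDLCalibrator instance
+            build_graph_fn=build_percentile_discretizer_graph,
+            params=my_params,
+            feature_config=my_feature_config,  # FeatureConfig or an input_fn callable
+        )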
+ """ + + if ( + os.environ.get("TWML_HOGWILD_TASK_TYPE") == "chief" + or "num_workers" not in params + or params.num_workers is None + ): + # chief trains discretizer + logging.info("Chief training discretizer") + + # disregard hogwild config + os_twml_hogwild_ports = os.environ.get("TWML_HOGWILD_PORTS") + os.environ["TWML_HOGWILD_PORTS"] = "" + + # create discretizer params + params_c = copy.deepcopy(params) + params_c.data_threads = 1 + params_c.train_steps = -1 + params_c.train_max_steps = None + params_c.eval_steps = -1 + params_c.num_workers = 1 + params_c.tensorboard_port = None + params_c.stats_port = None + + if "discretizer_batch_size" in params_c: + params_c.train_batch_size = params_c.discretizer_batch_size + params_c.eval_batch_size = params_c.discretizer_batch_size + if "discretizer_keep_rate" in params_c: + params_c.train_keep_rate = params_c.discretizer_keep_rate + if "discretizer_parts_downsampling_rate" in params_c: + params_c.train_parts_downsampling_rate = ( + params_c.discretizer_parts_downsampling_rate + ) + if "discretizer_save_dir" in params_c: + params_c.save_dir = params_c.discretizer_save_dir + + # train discretizer + trainer = twml.trainers.DataRecordTrainer( + name=name, + params=params_c, + build_graph_fn=build_graph_fn, + save_dir=params_c.save_dir, + ) + + if isinstance(feature_config, twml.feature_config.FeatureConfig): + parse_fn = twml.parsers.get_continuous_parse_fn(feature_config) + input_fn = trainer.get_train_input_fn(parse_fn=parse_fn, repeat=False) + elif callable(feature_config): + input_fn = feature_config + else: + got_type = type(feature_config).__name__ + raise ValueError( + "Expecting feature_config to be FeatureConfig or function got %s" + % got_type + ) + + hooks = None + if params_c.train_steps > 0: + hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] + + trainer.calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=params_c.train_steps, + hooks=hooks, + ) + # restore hogwild setup + if os_twml_hogwild_ports is not None: + os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports else: - got_type = type(feature_config).__name__ - raise ValueError( - "Expecting feature_config to be FeatureConfig or function got %s" % got_type) - - hooks = None - if params_c.train_steps > 0: - hooks = [twml.hooks.StepProgressHook(params_c.train_steps)] - - trainer.calibrate(calibrator=calibrator, input_fn=input_fn, - steps=params_c.train_steps, hooks=hooks) - # restore hogwild setup - if os_twml_hogwild_ports is not None: - os.environ["TWML_HOGWILD_PORTS"] = os_twml_hogwild_ports - else: - discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) - # workers wait for calibration to be ready - while not tf.io.gfile.exists(discretizer_save_dir + os.path.sep + "tfhub_module.pb"): - logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) - time.sleep(60) + discretizer_save_dir = twml.util.sanitize_hdfs_path(params.discretizer_save_dir) + # workers wait for calibration to be ready + while not tf.io.gfile.exists( + discretizer_save_dir + os.path.sep + "tfhub_module.pb" + ): + logging.info("Worker waiting for calibration at %s" % discretizer_save_dir) + time.sleep(60) def build_percentile_discretizer_graph(features, label, mode, params, config=None): - """ - Pre-set Percentile Discretizer Build Graph - Follows the same signature as build_graph - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], 
[1])) - if isinstance(sparse_tf, tf.SparseTensor): - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - elif isinstance(sparse_tf, twml.SparseTensor): - indices = sparse_tf.indices - ids = sparse_tf.ids - - # Return weights, feature_ids, feature_values - weights = tf.gather(params=weights, indices=ids) - feature_ids = indices - feature_values = sparse_tf.values - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'feature_ids': feature_ids, 'feature_values': feature_values, 'weights': weights} + """ + Pre-set Percentile Discretizer Build Graph + Follows the same signature as build_graph + """ + sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) + weights = tf.reshape(features["weights"], tf.reshape(features["batch_size"], [1])) + if isinstance(sparse_tf, tf.SparseTensor): + indices = sparse_tf.indices[:, 1] + ids = sparse_tf.indices[:, 0] + elif isinstance(sparse_tf, twml.SparseTensor): + indices = sparse_tf.indices + ids = sparse_tf.ids + + # Return weights, feature_ids, feature_values + weights = tf.gather(params=weights, indices=ids) + feature_ids = indices + feature_values = sparse_tf.values + # Update train_op and assign dummy_loss + train_op = tf.assign_add(tf.train.get_global_step(), 1) + loss = tf.constant(1) + if mode == "train": + return {"train_op": train_op, "loss": loss} + return { + "feature_ids": feature_ids, + "feature_values": feature_values, + "weights": weights, + } def isotonic_module(mode, params): - """ - Common Isotonic Calibrator module for Hub Export - """ - inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") - mlp = hub.Module(params.calibrator_load_tensorflow_module) - logits = mlp(inputs, signature=params.export_mlp_module_name) - isotonic_calibrator = hub.Module(params.save_dir) - output = isotonic_calibrator(logits, signature="isotonic_calibrator") - hub.add_signature(inputs={"sparse_input": inputs}, - outputs={"default": output}, - name=params.calibrator_export_module_name) - - -def build_isotonic_graph_from_inputs(inputs, features, label, mode, params, config=None, isotonic_fn=None): - """ - Helper function to build_isotonic_graph - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - """ - if params.mode == 'calibrate': + """ + Common Isotonic Calibrator module for Hub Export + """ + inputs = tf.sparse_placeholder(tf.float32, name="sparse_input") mlp = hub.Module(params.calibrator_load_tensorflow_module) logits = mlp(inputs, signature=params.export_mlp_module_name) - weights = tf.reshape(features['weights'], tf.reshape(features['batch_size'], [1])) - # Update train_op and assign dummy_loss - train_op = tf.assign_add(tf.train.get_global_step(), 1) - loss = tf.constant(1) - if mode == 'train': - return {'train_op': train_op, 'loss': loss} - return {'predictions': logits, 'targets': features['labels'], 'weights': weights} - else: - if isotonic_fn is None: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_module, mode=mode, params=params) + isotonic_calibrator = hub.Module(params.save_dir) + output = isotonic_calibrator(logits, signature="isotonic_calibrator") + hub.add_signature( + inputs={"sparse_input": inputs}, + outputs={"default": output}, + name=params.calibrator_export_module_name, + ) + + +def build_isotonic_graph_from_inputs( + inputs, features, label, mode, params, config=None, isotonic_fn=None +): + """ + 
Helper function to build_isotonic_graph + Pre-set Isotonic Calibrator Build Graph + Follows the same signature as build_graph + """ + if params.mode == "calibrate": + mlp = hub.Module(params.calibrator_load_tensorflow_module) + logits = mlp(inputs, signature=params.export_mlp_module_name) + weights = tf.reshape( + features["weights"], tf.reshape(features["batch_size"], [1]) + ) + # Update train_op and assign dummy_loss + train_op = tf.assign_add(tf.train.get_global_step(), 1) + loss = tf.constant(1) + if mode == "train": + return {"train_op": train_op, "loss": loss} + return { + "predictions": logits, + "targets": features["labels"], + "weights": weights, + } else: - isotonic_spec = twml.util.create_module_spec(mlp_fn=isotonic_fn, mode=mode, params=params) - output_hub = hub.Module(isotonic_spec, - name=params.calibrator_export_module_name) - hub.register_module_for_export(output_hub, params.calibrator_export_module_name) - output = output_hub(inputs, signature=params.calibrator_export_module_name) - output = tf.clip_by_value(output, 0, 1) - loss = tf.reduce_sum(tf.stop_gradient(output)) - train_op = tf.assign_add(tf.train.get_global_step(), 1) - return {'train_op': train_op, 'loss': loss, 'output': output} - - -def build_isotonic_graph(features, label, mode, params, config=None, export_discretizer=True): - """ - Pre-set Isotonic Calibrator Build Graph - Follows the same signature as build_graph - This assumes that MLP already contains all modules (include percentile - discretizer); if export_discretizer is set - then it does not export the MDL phase. - """ - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) - if export_discretizer: - return build_isotonic_graph_from_inputs(sparse_tf, features, label, mode, params, config) - discretizer = hub.Module(params.discretizer_path) - - if params.discretizer_signature is None: - discretizer_signature = "percentile_discretizer_calibrator" - else: - discretizer_signature = params.discretizer_signature - input_sparse = discretizer(sparse_tf, signature=discretizer_signature) - return build_isotonic_graph_from_inputs(input_sparse, features, label, mode, params, config) + if isotonic_fn is None: + isotonic_spec = twml.util.create_module_spec( + mlp_fn=isotonic_module, mode=mode, params=params + ) + else: + isotonic_spec = twml.util.create_module_spec( + mlp_fn=isotonic_fn, mode=mode, params=params + ) + output_hub = hub.Module( + isotonic_spec, name=params.calibrator_export_module_name + ) + hub.register_module_for_export(output_hub, params.calibrator_export_module_name) + output = output_hub(inputs, signature=params.calibrator_export_module_name) + output = tf.clip_by_value(output, 0, 1) + loss = tf.reduce_sum(tf.stop_gradient(output)) + train_op = tf.assign_add(tf.train.get_global_step(), 1) + return {"train_op": train_op, "loss": loss, "output": output} + + +def build_isotonic_graph( + features, label, mode, params, config=None, export_discretizer=True +): + """ + Pre-set Isotonic Calibrator Build Graph + Follows the same signature as build_graph + This assumes that MLP already contains all modules (include percentile + discretizer); if export_discretizer is set + then it does not export the MDL phase. 
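+
+    Example (illustrative sketch; this function can be passed as the
+    ``build_graph_fn`` of ``calibrate_calibrator_and_export``, with
+    ``my_calibrator``, ``my_params`` and ``my_feature_config`` supplied by the
+    caller):
+
+        calibrate_calibrator_and_export(
+            name="isotonic",
+            calibrator=my_calibrator,
+            build_graph_fn=build_isotonic_graph,
+            params=my_params,
+            feature_config=my_feature_config,
+        )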
+    """
+    sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits)
+    if export_discretizer:
+        return build_isotonic_graph_from_inputs(
+            sparse_tf, features, label, mode, params, config
+        )
+    discretizer = hub.Module(params.discretizer_path)
+
+    if params.discretizer_signature is None:
+        discretizer_signature = "percentile_discretizer_calibrator"
+    else:
+        discretizer_signature = params.discretizer_signature
+    input_sparse = discretizer(sparse_tf, signature=discretizer_signature)
+    return build_isotonic_graph_from_inputs(
+        input_sparse, features, label, mode, params, config
+    )
diff --git a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py
index e14f62303..09a70d94b 100644
--- a/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py
+++ b/twml/twml/contrib/calibrators/hashed_percentile_discretizer.py
@@ -1,22 +1,34 @@
 # pylint: disable=arguments-differ,no-member,too-many-statements
-''' Contains HashedPercentileDiscretizerCalibrator used for calibration '''
-from .percentile_discretizer import PercentileDiscretizerCalibrator
+""" Contains HashedPercentileDiscretizerCalibrator used for calibration """
+import numpy as np
 
 import twml
 
+from .percentile_discretizer import PercentileDiscretizerCalibrator
+
 
 class HashedPercentileDiscretizerCalibrator(PercentileDiscretizerCalibrator):
-  ''' Accumulates features and their respective values for HashedPercentileDiscretizer calibration.
-  This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's
-  `to_layer` method returns a HashedPercentileDiscretizer instead.
-  '''
+    """Accumulates features and their respective values for HashedPercentileDiscretizer calibration.
+    This calibrator performs the same actions as PercentileDiscretizerCalibrator but its
+    `to_layer` method returns a HashedPercentileDiscretizer instead.
+    """
 
-  def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values,
-                                feature_offsets, name):
-    return twml.contrib.layers.HashedPercentileDiscretizer(
-      n_feature=n_feature, n_bin=self._n_bin,
-      name=name, out_bits=self._out_bits,
-      hash_keys=hash_map_keys, hash_values=hash_map_values,
-      bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(),
-      feature_offsets=feature_offsets
-    )
+    def _create_discretizer_layer(
+        self,
+        n_feature: int,
+        hash_map_keys: np.ndarray,
+        hash_map_values: np.ndarray,
+        feature_offsets: np.ndarray,
+        name: str,
+    ):
+        return twml.contrib.layers.HashedPercentileDiscretizer(
+            n_feature=n_feature,
+            n_bin=self._n_bin,
+            name=name,
+            out_bits=self._out_bits,
+            hash_keys=hash_map_keys,
+            hash_values=hash_map_values,
+            bin_ids=self._bin_ids.flatten(),
+            bin_values=self._bin_vals.flatten(),
+            feature_offsets=feature_offsets,
+        )
diff --git a/twml/twml/contrib/calibrators/hashing_discretizer.py b/twml/twml/contrib/calibrators/hashing_discretizer.py
index 965ced934..6f1bfb11b 100644
--- a/twml/twml/contrib/calibrators/hashing_discretizer.py
+++ b/twml/twml/contrib/calibrators/hashing_discretizer.py
@@ -1,35 +1,42 @@
 # pylint: disable=arguments-differ,no-member,too-many-statements
-''' Contains HashedPercentileDiscretizerCalibrator used for calibration '''
-from .percentile_discretizer import PercentileDiscretizerCalibrator
-
+""" Contains HashingDiscretizerCalibrator used for calibration """
 import numpy as np
+
 import twml
 
+from .percentile_discretizer import PercentileDiscretizerCalibrator
+
 
 class HashingDiscretizerCalibrator(PercentileDiscretizerCalibrator):
-  ''' Accumulates features and their respective values for HashingDiscretizer calibration.
-  This calibrator perfoms the same actions as PercentileDiscretizerCalibrator but it's
-  `to_layer` method returns a HashingDiscretizer instead.
-  '''
+    """Accumulates features and their respective values for HashingDiscretizer calibration.
+    This calibrator performs the same actions as PercentileDiscretizerCalibrator but its
+    `to_layer` method returns a HashingDiscretizer instead.
+    """
 
-  def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values,
-                                feature_offsets, name):
-    # Need to sort hash_map_keys according to hash_map_values
-    # just in case they're not in order of being put in the dict
-    # hash_map_values is already 0 through len(hash_map_values)-1
-    hash_map_keys = hash_map_keys.flatten()
-    # why is this float32 in PercentileDiscretizerCalibrator.to_layer ????
-    # need int for indexing
-    hash_map_values = hash_map_values.flatten().astype(np.int32)
-    feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64)
-    for idx in range(len(hash_map_keys)):
-      feature_ids[hash_map_values[idx]] = hash_map_keys[idx]
+    def _create_discretizer_layer(
+        self,
+        n_feature: int,
+        hash_map_keys: np.ndarray,
+        hash_map_values: np.ndarray,
+        feature_offsets: np.ndarray,
+        name: str,
+    ) -> twml.contrib.layers.HashingDiscretizer:
+        # Need to sort hash_map_keys according to hash_map_values
+        # just in case they're not in order of being put in the dict
+        # hash_map_values is already 0 through len(hash_map_values)-1
+        hash_map_keys = hash_map_keys.flatten()
+        # why is this float32 in PercentileDiscretizerCalibrator.to_layer ????
+ # need int for indexing + hash_map_values = hash_map_values.flatten().astype(np.int32) + feature_ids = np.zeros((len(hash_map_keys),), dtype=np.int64) + for idx in range(len(hash_map_keys)): + feature_ids[hash_map_values[idx]] = hash_map_keys[idx] - return twml.contrib.layers.HashingDiscretizer( - feature_ids=feature_ids, - bin_vals=self._bin_vals.flatten(), - n_bin=self._n_bin + 1, # (self._n_bin + 1) bin_vals for each feature_id - out_bits=self._out_bits, - cost_per_unit=500, - name=name - ) + return twml.contrib.layers.HashingDiscretizer( + feature_ids=feature_ids, + bin_vals=self._bin_vals.flatten(), + n_bin=self._n_bin + 1, # (self._n_bin + 1) bin_vals for each feature_id + out_bits=self._out_bits, + cost_per_unit=500, + name=name, + ) diff --git a/twml/twml/contrib/calibrators/isotonic.py b/twml/twml/contrib/calibrators/isotonic.py index d03a75ff8..d8a325613 100644 --- a/twml/twml/contrib/calibrators/isotonic.py +++ b/twml/twml/contrib/calibrators/isotonic.py @@ -1,317 +1,370 @@ # pylint: disable=arguments-differ, unused-argument -''' Contains Isotonic Calibration''' +""" Contains Isotonic Calibration""" -from .calibrator import CalibrationFeature, Calibrator +from typing import Dict, Optional, Tuple -from absl import logging import numpy as np -from sklearn.isotonic import isotonic_regression import tensorflow.compat.v1 as tf import tensorflow_hub as hub +from absl import logging +from sklearn.isotonic import isotonic_regression + import twml import twml.layers +from .calibrator import CalibrationFeature, Calibrator DEFAULT_SAMPLE_WEIGHT = 1 -def sort_values(inputs, target, weight, ascending=True): - ''' - Sorts arrays based on the first array. - - Arguments: - inputs: - 1D array which will dictate the order which the remainder 2 arrays will be sorted - target: - 1D array - weight: - 1D array - ascending: - Boolean. If set to True (the default), sorts values in ascending order. - - Returns: - sorted inputs: - 1D array sorted by the order of `ascending` - sorted targets: - 1D array - sorted weight: - 1D array - ''' - # assert that the length of inputs and target are the same - if len(inputs) != len(target): - raise ValueError('Expecting inputs and target sizes to match') - # assert that the length of inputs and weight are the same - if len(inputs) != len(weight): - raise ValueError('Expecting inputs and weight sizes to match') - inds = inputs.argsort() - if not ascending: - inds = inds[::-1] - return inputs[inds], target[inds], weight[inds] - - -class IsotonicFeature(CalibrationFeature): - ''' - IsotonicFeature adds values, weights and targets to each feature and then runs - isotonic regression by calling `sklearn.isotonic.isotonic_regression - `_ - ''' - - def _get_bin_boundaries(self, n_samples, bins, similar_bins): +def sort_values( + inputs: np.ndarray, + target: np.ndarray, + weight: np.ndarray, + ascending: bool = True, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ - Calculates the sample indices that define bin boundaries - - Arguments: - n_samples: - (int) number of samples - bins: - (int) number of bins. Needs to be smaller or equal than n_samples. - similar_bins: - (bool) If True, samples will be distributed in bins of equal size (up to one sample). - If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. - Note that equal_bins=False can create a last bins with a very large number of samples. + Sorts arrays based on the first array. 
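+
+    Example (illustrative doctest-style sketch with toy arrays):
+
+        >>> inputs = np.array([3.0, 1.0, 2.0])
+        >>> target = np.array([30.0, 10.0, 20.0])
+        >>> weight = np.array([0.3, 0.1, 0.2])
+        >>> sort_values(inputs, target, weight)
+        (array([1., 2., 3.]), array([10., 20., 30.]), array([0.1, 0.2, 0.3]))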
+
+    Args:
+      inputs:
+        1D array which will dictate the order in which the remaining two arrays will be sorted
+      target:
+        1D array
+      weight:
+        1D array
+      ascending:
+        Boolean. If set to True (the default), sorts values in ascending order.
 
     Returns:
-      (list[int]) List of sample indices defining bin boundaries
+      sorted inputs:
+        1D array sorted by the order of `ascending`
+      sorted targets:
+        1D array
+      sorted weight:
+        1D array
     """
+    # assert that the length of inputs and target are the same
+    if len(inputs) != len(target):
+        raise ValueError("Expecting inputs and target sizes to match")
+    # assert that the length of inputs and weight are the same
+    if len(inputs) != len(weight):
+        raise ValueError("Expecting inputs and weight sizes to match")
 
-    if bins > n_samples:
-      raise ValueError(
-        "The number of bins needs to be less than or equal to the number of samples. "
-        "Currently bins={0} and n_samples={1}.".format(bins, n_samples)
-      )
-
-    step = n_samples // bins
+    inds = inputs.argsort()
+    if not ascending:
+        inds = inds[::-1]
+    return inputs[inds], target[inds], weight[inds]
 
-    if similar_bins:
-      # dtype=int will floor the linspace
-      bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int)
-    else:
-      bin_boundaries = range(0, step * bins, step)
-    bin_boundaries = np.append(bin_boundaries, n_samples)
-
-    return bin_boundaries
-
-  def calibrate(self, bins, similar_bins=False, debug=False):
-    '''Calibrates the IsotonicFeature into calibrated weights and bias.
-
-    1. Sorts the values of the feature class, based on the order of values
-    2. Performs isotonic regression using sklearn.isotonic.isotonic_regression
-    3. Performs the binning of the samples, in order to obtain the final weight and bias
-    which will be used for inference
-
-    Note that this method can only be called once.
-
-    Arguments:
-      bins:
-        number of bins.
-      similar_bins:
-        If True, samples will be distributed in bins of equal size (up to one sample).
-        If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples.
-        Note that equal_bins=False can create a last bins with a very large number of samples.
-      debug:
-        Defaults to False. If debug is set to true, output other parameters useful for debugging.
+class IsotonicFeature(CalibrationFeature):
+    """
+    IsotonicFeature adds values, weights and targets to each feature and then runs
+    isotonic regression by calling `sklearn.isotonic.isotonic_regression
+    `_
+    """
 
-    Returns:
-      [calibrated weight, calibrated bias]
-    '''
-    if self._calibrated:
-      raise RuntimeError("Can only calibrate once")
-    # parse through the dict to obtain the targets, weights and values
-    self._concat_arrays()
-    feature_targets = self._features_dict['targets']
-    feature_values = self._features_dict['values']
-    feature_weights = self._features_dict['weights']
-    srtd_feature_values, srtd_feature_targets, srtd_feature_weights = sort_values(
-      inputs=feature_values,
-      target=feature_targets,
-      weight=feature_weights
-    )
-    calibrated_feature_values = isotonic_regression(
-      srtd_feature_targets, sample_weight=srtd_feature_weights)
-    # create the final outputs for the prediction of each class
-    bpreds = []
-    btargets = []
-    bweights = []
-    rpreds = []
-
-    # Create bin boundaries
-    bin_boundaries = self._get_bin_boundaries(
-      len(calibrated_feature_values), bins, similar_bins=similar_bins)
-
-    for sidx, eidx in zip(bin_boundaries, bin_boundaries[1:]):
-      # separate each one of the arrays based on their respective bins
-      lpreds = srtd_feature_values[int(sidx):int(eidx)]
-      lrpreds = calibrated_feature_values[int(sidx):int(eidx)]
-      ltargets = srtd_feature_targets[int(sidx):int(eidx)]
-      lweights = srtd_feature_weights[int(sidx):int(eidx)]
-
-      # calculate the outputs (including the bpreds and rpreds)
-      bpreds.append(np.sum(lpreds * lweights) / (np.squeeze(np.sum(lweights))))
-      rpreds.append(np.sum(lrpreds * lweights) / (np.squeeze(np.sum(lweights))))
-      btargets.append(np.sum(ltargets * lweights) / (np.squeeze(np.sum(lweights))))
-      bweights.append(np.squeeze(np.sum(lweights)))
-    # transposing the bpreds and rpreds which will be used as input to the inference step
-    bpreds = np.asarray(bpreds).T
-    rpreds = np.asarray(rpreds).T
-    btargets = np.asarray(btargets).T
-    bweights = np.asarray(bweights).T
-    # setting _calibrated to be True which is necessary in order to prevent it to re-calibrate
-    self._calibrated = True
-    if debug:
-      return bpreds, rpreds, btargets, bweights
-    return bpreds, rpreds
+    def _get_bin_boundaries(
+        self, n_samples: int, bins: int, similar_bins: bool
+    ) -> np.ndarray:
+        """
+        Calculates the sample indices that define bin boundaries
+
+        Args:
+          n_samples:
+            (int) number of samples
+          bins:
+            (int) number of bins. Needs to be smaller than or equal to n_samples.
+          similar_bins:
+            (bool) If True, samples will be distributed in bins of equal size (up to one sample).
+            If False, bins will be filled with step = N_samples//bins, and the last bin will contain all remaining samples.
+            Note that similar_bins=False can create a last bin with a very large number of samples.
+
+        Returns:
+          (np.ndarray) List of sample indices defining bin boundaries
+        """
+
+        if bins > n_samples:
+            raise ValueError(
+                "The number of bins needs to be less than or equal to the number of samples. "
" + f"Currently bins={bins} and n_samples={n_samples}" + ) + + step = n_samples // bins + + if similar_bins: + # dtype=int will floor the linspace + bin_boundaries = np.linspace(0, n_samples - step, num=bins, dtype=int) + else: + bin_boundaries = range(0, step * bins, step) + + bin_boundaries = np.append(bin_boundaries, n_samples) + + return bin_boundaries + + def calibrate( + self, + bins: int, + similar_bins: bool = False, + debug: bool = False, + ) -> Tuple[np.ndarray, np.ndarray]: + """Calibrates the IsotonicFeature into calibrated weights and bias. + + 1. Sorts the values of the feature class, based on the order of values + 2. Performs isotonic regression using sklearn.isotonic.isotonic_regression + 3. Performs the binning of the samples, in order to obtain the final weight and bias + which will be used for inference + + Note that this method can only be called once. + + Args: + bins (int): + number of bins. + similar_bins (bool): + If True, samples will be distributed in bins of equal size (up to one sample). + If False bins will be filled with step = N_samples//bins, and last bin will contain all remaining samples. + Note that equal_bins=False can create a last bins with a very large number of samples. + debug (bool): + Defaults to False. If debug is set to true, output other parameters useful for debugging. + + Returns: + [calibrated weight, calibrated bias] + """ + + if self._calibrated: + raise RuntimeError("Can only calibrate once") + + # parse through the dict to obtain the targets, weights and values + self._concat_arrays() + feature_targets = self._features_dict["targets"] + feature_values = self._features_dict["values"] + feature_weights = self._features_dict["weights"] + srtd_feature_values, srtd_feature_targets, srtd_feature_weights = sort_values( + inputs=feature_values, + target=feature_targets, + weight=feature_weights, + ) + calibrated_feature_values = isotonic_regression( + srtd_feature_targets, + sample_weight=srtd_feature_weights, + ) + # create the final outputs for the prediction of each class + bpreds = [] + btargets = [] + bweights = [] + rpreds = [] + + # Create bin boundaries + bin_boundaries = self._get_bin_boundaries( + len(calibrated_feature_values), + bins, + similar_bins=similar_bins, + ) + + for sidx, eidx in zip(bin_boundaries, bin_boundaries[1:]): + # separate each one of the arrays based on their respective bins + lpreds = srtd_feature_values[int(sidx) : int(eidx)] + lrpreds = calibrated_feature_values[int(sidx) : int(eidx)] + ltargets = srtd_feature_targets[int(sidx) : int(eidx)] + lweights = srtd_feature_weights[int(sidx) : int(eidx)] + + # calculate the outputs (including the bpreds and rpreds) + bpreds.append(np.sum(lpreds * lweights) / (np.squeeze(np.sum(lweights)))) + rpreds.append(np.sum(lrpreds * lweights) / (np.squeeze(np.sum(lweights)))) + btargets.append( + np.sum(ltargets * lweights) / (np.squeeze(np.sum(lweights))) + ) + bweights.append(np.squeeze(np.sum(lweights))) + # transposing the bpreds and rpreds which will be used as input to the inference step + bpreds = np.asarray(bpreds).T + rpreds = np.asarray(rpreds).T + btargets = np.asarray(btargets).T + bweights = np.asarray(bweights).T + # setting _calibrated to be True which is necessary in order to prevent it to re-calibrate + self._calibrated = True + if debug: + return bpreds, rpreds, btargets, bweights + + return bpreds, rpreds class IsotonicCalibrator(Calibrator): - ''' Accumulates features and their respective values for isotonic calibration. 
- Internally, each feature's values is accumulated via its own isotonicFeature object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and - 3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``. - - ''' - - def __init__(self, n_bin, similar_bins=False, **kwargs): - ''' Constructs an isotonicCalibrator instance. - - Arguments: - n_bin: - the number of bins per feature to use for isotonic. - Note that each feature actually maps to ``n_bin+1`` output IDs. - ''' - super(IsotonicCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._similar_bins = similar_bins - self._ys_input = [] - self._xs_input = [] - self._isotonic_feature_dict = {} - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: output of prediction of build_graph for calibrator - ''' - weights = output['weights'] if 'weights' in output else None - return self.accumulate(output['predictions'], output['targets'], weights) - - def accumulate(self, predictions, targets, weights=None): - ''' - Accumulate a single batch of class predictions, class targets and class weights. - These are accumulated until calibrate() is called. - - Arguments: - predictions: - float matrix of class values. Each dimension corresponds to a different class. - Shape is ``[n, d]``, where d is the number of classes. - targets: - float matrix of class targets. Each dimension corresponds to a different class. - Shape ``[n, d]``, where d is the number of classes. - weights: - Defaults to weights of 1. - 1D array containing the weights of each prediction. - ''' - if predictions.shape != targets.shape: - raise ValueError( - 'Expecting predictions.shape == targets.shape, got %s and %s instead' % - (str(predictions.shape), str(targets.shape))) - if weights is not None: - if weights.ndim != 1: - raise ValueError('Expecting 1D weight, got %dD instead' % weights.ndim) - elif weights.size != predictions.shape[0]: - raise ValueError( - 'Expecting predictions.shape[0] == weights.size, got %d != %d instead' % - (predictions.shape[0], weights.size)) - # iterate through the rows of predictions and sets one class to each row - if weights is None: - weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT) - for class_key in range(predictions.shape[1]): - # gets the predictions and targets for that class - class_predictions = predictions[:, class_key] - class_targets = targets[:, class_key] - if class_key not in self._isotonic_feature_dict: - isotonic_feature = IsotonicFeature(class_key) - self._isotonic_feature_dict[class_key] = isotonic_feature - else: - isotonic_feature = self._isotonic_feature_dict[class_key] - isotonic_feature.add_values({'values': class_predictions, 'weights': weights, - 'targets': class_targets}) - - def calibrate(self, debug=False): - ''' - Calibrates each IsotonicFeature after accumulation is complete. - Results are stored in ``self._ys_input`` and ``self._xs_input`` - - Arguments: - debug: - Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``. 
-    '''
-    super(IsotonicCalibrator, self).calibrate()
-    bias_temp = []
-    weight_temp = []
-    logging.info("Beginning isotonic calibration.")
-    isotonic_features_dict = self._isotonic_feature_dict
-    for class_id in isotonic_features_dict:
-      bpreds, rpreds = isotonic_features_dict[class_id].calibrate(bins=self._n_bin, similar_bins=self._similar_bins)
-      weight_temp.append(bpreds)
-      bias_temp.append(rpreds)
-    # save isotonic results onto a matrix
-    self._xs_input = np.array(weight_temp, dtype=np.float32)
-    self._ys_input = np.array(bias_temp, dtype=np.float32)
-    logging.info("Isotonic calibration finished.")
-    if debug:
-      return np.array(weight_temp), np.array(bias_temp)
-    return None
-
-  def save(self, save_dir, name="default", verbose=False):
-    '''Save the calibrator into the given save_directory.
-    Arguments:
-      save_dir:
-        name of the saving directory. Default (string): "default".
-    '''
-    if not self._calibrated:
-      raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()")
-
-    # This module allows for the calibrator to save be saved as part of
-    # Tensorflow Hub (this will allow it to be used in further steps)
-    logging.info("You probably do not need to save the isotonic layer. \
-      So feel free to set save to False in the Trainer. \
-      Additionally this only saves the layer not the whole graph.")
-
-    def calibrator_module():
-      '''
-      Way to save Isotonic layer
-      '''
-      # The input to isotonic is a dense layer
-      inputs = tf.placeholder(tf.float32)
-      calibrator_layer = self.to_layer()
-      output = calibrator_layer(inputs)
-      # creates the signature to the calibrator module
-      hub.add_signature(inputs=inputs, outputs=output, name=name)
-
-    # exports the module to the save_dir
-    spec = hub.create_module_spec(calibrator_module)
-    with tf.Graph().as_default():
-      module = hub.Module(spec)
-      with tf.Session() as session:
-        module.export(save_dir, session)
-
-  def to_layer(self):
-    """ Returns a twml.layers.Isotonic Layer that can be used for feature discretization.
-    """
-    if not self._calibrated:
-      raise RuntimeError("Expecting prior call to calibrate()")
-
-    isotonic_layer = twml.layers.Isotonic(
-      n_unit=self._xs_input.shape[0], n_bin=self._xs_input.shape[1],
-      xs_input=self._xs_input, ys_input=self._ys_input,
-      **self._kwargs)
+    """Accumulates features and their respective values for isotonic calibration.
+    Internally, each feature's values are accumulated via its own IsotonicFeature object.
+    The steps for calibration are typically as follows:
+    1. accumulate feature values from batches by calling ``accumulate()``;
+    2. calibrate all features into Isotonic ``bpreds``, ``rpreds`` by calling ``calibrate()``; and
+    3. convert to a ``twml.layers.Isotonic`` layer by calling ``to_layer()``.
 
-    return isotonic_layer
+    """
 
-  def get_layer_args(self, name=None):
-    """ Returns layer args. See ``Calibrator.get_layer_args`` for more detailed documentation """
-    return {'n_unit': self._xs_input.shape[0], 'n_bin': self._xs_input.shape[1]}
+    def __init__(self, n_bin: int, similar_bins: bool = False, **kwargs):
+        """Constructs an IsotonicCalibrator instance.
+
+        Args:
+          n_bin:
+            the number of bins per feature to use for isotonic.
+            Note that each feature actually maps to ``n_bin+1`` output IDs.
+          similar_bins:
+            If True, samples will be distributed in bins of equal size (up to one sample).
+            Defaults to False. If False, bins will be filled with step = N_samples//bins, and the last bin will contain all remaining samples.
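+
+        Example (illustrative sketch; ``preds``, ``targets`` and the bin count
+        are caller-supplied placeholders):
+
+            calibrator = IsotonicCalibrator(n_bin=25000)
+            calibrator.accumulate(preds, targets)  # repeat for each batch
+            calibrator.calibrate()
+            isotonic_layer = calibrator.to_layer()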
+ """ + super(IsotonicCalibrator, self).__init__(**kwargs) + self._n_bin = n_bin + self._similar_bins = similar_bins + self._ys_input = [] + self._xs_input = [] + self._isotonic_feature_dict = {} + + def accumulate_feature(self, output: Dict[str, np.ndarray]) -> None: + """ + Wrapper around accumulate for trainer API. + Args: + output (dict): + output of prediction of build_graph for calibrator + """ + weights = output["weights"] if "weights" in output else None + self.accumulate(output["predictions"], output["targets"], weights) + + def accumulate( + self, predictions: np.ndarray, targets: np.ndarray, weights: np.ndarray = None + ) -> None: + """ + Accumulate a single batch of class predictions, class targets and class weights. + These are accumulated until calibrate() is called. + + Args: + predictions (np.ndarray): + float matrix of class values. Each dimension corresponds to a different class. + Shape is ``[n, d]``, where d is the number of classes. + targets (np.ndarray): + float matrix of class targets. Each dimension corresponds to a different class. + Shape ``[n, d]``, where d is the number of classes. + weights (np.ndarray): + Defaults to weights of 1. + 1D array containing the weights of each prediction. + """ + if predictions.shape != targets.shape: + raise ValueError( + "Expecting predictions.shape == targets.shape, got %s and %s instead" + % (str(predictions.shape), str(targets.shape)) + ) + if weights is not None: + if weights.ndim != 1: + raise ValueError("Expecting 1D weight, got %dD instead" % weights.ndim) + elif weights.size != predictions.shape[0]: + raise ValueError( + "Expecting predictions.shape[0] == weights.size, got %d != %d instead" + % (predictions.shape[0], weights.size) + ) + # iterate through the rows of predictions and sets one class to each row + if weights is None: + weights = np.full(predictions.shape[0], fill_value=DEFAULT_SAMPLE_WEIGHT) + for class_key in range(predictions.shape[1]): + # gets the predictions and targets for that class + class_predictions = predictions[:, class_key] + class_targets = targets[:, class_key] + if class_key not in self._isotonic_feature_dict: + isotonic_feature = IsotonicFeature(class_key) + self._isotonic_feature_dict[class_key] = isotonic_feature + else: + isotonic_feature = self._isotonic_feature_dict[class_key] + isotonic_feature.add_values( + { + "values": class_predictions, + "weights": weights, + "targets": class_targets, + } + ) + + def calibrate(self, debug: bool = False) -> Optional[Tuple[np.ndarray, np.ndarray]]: + """ + Calibrates each IsotonicFeature after accumulation is complete. + Results are stored in ``self._ys_input`` and ``self._xs_input`` + + Args: + debug: + Defaults to False. If set to true, returns the ``xs_input`` and ``ys_input``. + + Returns: + If debug is set to True, returns the ``xs_input`` and ``ys_input``. 
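+
+        Example (illustrative sketch; assumes ``accumulate()`` has been called
+        with at least one batch):
+
+            xs, ys = calibrator.calibrate(debug=True)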
+        """
+        super(IsotonicCalibrator, self).calibrate()
+        bias_temp = []
+        weight_temp = []
+        logging.info("Beginning isotonic calibration.")
+        isotonic_features_dict = self._isotonic_feature_dict
+        for class_id in isotonic_features_dict:
+            bpreds, rpreds = isotonic_features_dict[class_id].calibrate(
+                bins=self._n_bin, similar_bins=self._similar_bins
+            )
+            weight_temp.append(bpreds)
+            bias_temp.append(rpreds)
+        # save isotonic results onto a matrix
+        self._xs_input = np.array(weight_temp, dtype=np.float32)
+        self._ys_input = np.array(bias_temp, dtype=np.float32)
+        logging.info("Isotonic calibration finished.")
+        if debug:
+            return np.array(weight_temp), np.array(bias_temp)
+
+    def save(
+        self,
+        save_dir: str,
+        name: str = "default",
+        verbose: bool = False,
+    ):  # pylint: disable=unused-argument
+        """Save the calibrator into the given save_directory.
+        Args:
+          save_dir (str):
+            name of the saving directory.
+          name (str):
+            name of the calibrator. Default (string): "default".
+        """
+        if not self._calibrated:
+            raise RuntimeError(
+                "Expecting prior call to calibrate(). Cannot save() prior to calibrate()"
+            )
+
+        # This module allows the calibrator to be saved as part of
+        # Tensorflow Hub (this will allow it to be used in further steps)
+        logging.info(
+            "You probably do not need to save the isotonic layer. \
+            So feel free to set save to False in the Trainer. \
+            Additionally this only saves the layer not the whole graph."
+        )
+
+        def calibrator_module():
+            """Way to save Isotonic layer"""
+
+            # The input to isotonic is a dense layer
+            inputs = tf.placeholder(tf.float32)
+            calibrator_layer = self.to_layer()
+            output = calibrator_layer(inputs)
+            # creates the signature to the calibrator module
+            hub.add_signature(inputs=inputs, outputs=output, name=name)
+
+        # exports the module to the save_dir
+        spec = hub.create_module_spec(calibrator_module)
+        with tf.Graph().as_default():
+            module = hub.Module(spec)
+            with tf.Session() as session:
+                module.export(save_dir, session)
+
+    def to_layer(self) -> twml.layers.Isotonic:
+        """Returns a twml.layers.Isotonic Layer that can be used for feature discretization."""
+        if not self._calibrated:
+            raise RuntimeError("Expecting prior call to calibrate()")
+
+        isotonic_layer = twml.layers.Isotonic(
+            n_unit=self._xs_input.shape[0],
+            n_bin=self._xs_input.shape[1],
+            xs_input=self._xs_input,
+            ys_input=self._ys_input,
+            **self._kwargs,
+        )
+
+        return isotonic_layer
+
+    def get_layer_args(self, name: str = None) -> Dict[str, int]:
+        """Returns layer args.
See ``Calibrator.get_layer_args`` for more detailed documentation""" + return {"n_unit": self._xs_input.shape[0], "n_bin": self._xs_input.shape[1]} diff --git a/twml/twml/contrib/calibrators/mdl.py b/twml/twml/contrib/calibrators/mdl.py index 0fe3265a4..66d5d5512 100644 --- a/twml/twml/contrib/calibrators/mdl.py +++ b/twml/twml/contrib/calibrators/mdl.py @@ -1,118 +1,132 @@ # pylint: disable=arguments-differ,no-member,too-many-statements -''' Contains MDLFeature and MDLCalibrator used for MDL calibration ''' +""" Contains MDLFeature and MDLCalibrator used for MDL calibration """ import os -from .percentile_discretizer import PercentileDiscretizerCalibrator, PercentileDiscretizerFeature - -from absl import logging import numpy as np import tensorflow.compat.v1 as tf +from absl import logging + import twml import twml.layers +from .percentile_discretizer import ( + PercentileDiscretizerCalibrator, + PercentileDiscretizerFeature, +) DEFAULT_SAMPLE_WEIGHT = 1 class MDLFeature(PercentileDiscretizerFeature): - ''' Accumulates and calibrates a single sparse MDL feature. ''' + """Accumulates and calibrates a single sparse MDL feature.""" class MDLCalibrator(PercentileDiscretizerCalibrator): - ''' Accumulates features and their respective values for MDL calibration. - Internally, each feature's values is accumulated via its own ``MDLFeature`` object. - The steps for calibration are typically as follows: - - 1. accumulate feature values from batches by calling ``accumulate()``; - 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and - 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. - - ''' - - def to_layer(self, name=None): + """Accumulates features and their respective values for MDL calibration. + Internally, each feature's values is accumulated via its own ``MDLFeature`` object. + The steps for calibration are typically as follows: + 1. accumulate feature values from batches by calling ``accumulate()``; + 2. calibrate all feature into MDL bin_vals by calling ``calibrate()``; and + 3. convert to a twml.layers.MDL layer by calling ``to_layer()``. """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. - Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is + def to_layer(self, name: str = None): + """ + Returns a twml.layers.PercentileDiscretizer Layer + that can be used for feature discretization. 
+ + Args: + name (str): + name-scope of the PercentileDiscretizer layer + """ + n_feature = len(self._discretizer_feature_dict) + max_discretizer_feature = n_feature * (self._n_bin + 1) + + if not self._calibrated: + raise RuntimeError("Expecting prior call to calibrate()") + + if self._bin_ids.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_ids.shape[0] != len(self._discretizer_feature_dict)" + ) + if self._bin_vals.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_vals.shape[0] != len(self._discretizer_feature_dict)" + ) + + # can add at most #features * (n_bin+1) new feature ids + if (1 << self._out_bits) <= max_discretizer_feature: + raise ValueError( + """Maximum number of features created by discretizer is %d but requested that the output be limited to %d values (%d bits), which is smaller than that. Please ensure the output has enough bits to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = twml.layers.MDL( - n_feature=n_feature, n_bin=self._n_bin, - name=name, out_bits=self._out_bits, - hash_keys=hash_map_keys, hash_values=hash_map_values, - bin_ids=self._bin_ids.flatten(), bin_values=self._bin_vals.flatten(), - feature_offsets=feature_offsets, - **self._kwargs - ) - - return discretizer - - def save(self, save_dir, name='calibrator', verbose=False): - '''Save the calibrator into the given save_directory. - Arguments: - save_dir: - name of the saving directory - name: - name for the graph scope. Passed to to_layer(name=name) to set - scope of layer. 
- ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - layer_args = self.get_layer_args() - - calibrator_filename = os.path.join(save_dir, name + '.json.tf') - calibrator_dict = { - 'layer_args': layer_args, - 'saved_layer_scope': name + '/', - } - twml.write_file(calibrator_filename, calibrator_dict, encode='json') - - if verbose: - logging.info("The layer graph and other information necessary ") - logging.info("for multi-phase training is saved in directory:") - logging.info(save_dir) - logging.info("This directory can be specified as --init_from_dir argument.") - logging.info("") - logging.info("Other information is available in: %s.json.tf", name) - logging.info("This file can be loaded with twml.read_file(decode='json) to obtain ") - logging.info("layer_args, saved_layer_scope and variable_names") - - graph = tf.Graph() - # save graph for tensorboard as well - writer = tf.summary.FileWriter(logdir=save_dir, graph=graph) - - with tf.Session(graph=graph) as sess: - self.write_summary(writer, sess) - writer.flush() + % (max_discretizer_feature, (1 << self._out_bits), self._out_bits) + ) + + # build feature_offsets, hash_map_keys, hash_map_values + feature_offsets = np.arange( + 0, max_discretizer_feature, self._n_bin + 1, dtype="int64" + ) + hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) + hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) + + discretizer = twml.layers.MDL( + n_feature=n_feature, + n_bin=self._n_bin, + name=name, + out_bits=self._out_bits, + hash_keys=hash_map_keys, + hash_values=hash_map_values, + bin_ids=self._bin_ids.flatten(), + bin_values=self._bin_vals.flatten(), + feature_offsets=feature_offsets, + **self._kwargs + ) + + return discretizer + + def save(self, save_dir, name="calibrator", verbose=False): + """Save the calibrator into the given save_directory. + Args: + save_dir: + name of the saving directory + name: + name for the graph scope. Passed to to_layer(name=name) to set + scope of layer. 
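+
+        Example (an illustrative sketch; the path is hypothetical and
+        ``calibrate()`` must already have been called)::
+
+            calibrator.save("/tmp/mdl", name="calibrator", verbose=True)
+            # writes /tmp/mdl/calibrator.json.tf (layer args + scope) and a
+            # TensorBoard summary via write_summary()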
+        """
+        if not self._calibrated:
+            raise RuntimeError(
+                "Expecting prior call to calibrate(). Cannot save() prior to calibrate()"
+            )
+
+        layer_args = self.get_layer_args()
+
+        calibrator_filename = os.path.join(save_dir, name + ".json.tf")
+        calibrator_dict = {
+            "layer_args": layer_args,
+            "saved_layer_scope": name + "/",
+        }
+        twml.write_file(calibrator_filename, calibrator_dict, encode="json")
+
+        if verbose:
+            logging.info("The layer graph and other information necessary ")
+            logging.info("for multi-phase training is saved in directory:")
+            logging.info(save_dir)
+            logging.info("This directory can be specified as --init_from_dir argument.")
+            logging.info("")
+            logging.info("Other information is available in: %s.json.tf", name)
+            logging.info(
+                "This file can be loaded with twml.read_file(decode='json') to obtain "
+            )
+            logging.info("layer_args, saved_layer_scope and variable_names")
+
+        graph = tf.Graph()
+        # save graph for tensorboard as well
+        writer = tf.summary.FileWriter(logdir=save_dir, graph=graph)
+
+        with tf.Session(graph=graph) as sess:
+            self.write_summary(writer, sess)
+            writer.flush()
diff --git a/twml/twml/contrib/calibrators/percentile_discretizer.py b/twml/twml/contrib/calibrators/percentile_discretizer.py
index eefce62c2..1e65c31e3 100644
--- a/twml/twml/contrib/calibrators/percentile_discretizer.py
+++ b/twml/twml/contrib/calibrators/percentile_discretizer.py
@@ -1,577 +1,613 @@
 # pylint: disable=arguments-differ,no-member,too-many-statements
-''' Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \
-    for PercentileDiscretizer calibration '''
+""" Contains PercentileDiscretizerFeature and PercentileDiscretizerCalibrator used \
+    for PercentileDiscretizer calibration """
-
-from .calibrator import CalibrationFeature, Calibrator
-
 import os
+from typing import Any, Dict, Optional, Tuple
+
 import numpy as np
 import tensorflow.compat.v1 as tf
 import tensorflow_hub as hub
+
 import twml
 import twml.layers
+from .calibrator import CalibrationFeature, Calibrator
 
 DEFAULT_SAMPLE_WEIGHT = 1
 
 
 class PercentileDiscretizerFeature(CalibrationFeature):
-  ''' Accumulates and calibrates a single sparse PercentileDiscretizer feature. '''
-
-  @staticmethod
-  def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer):
-    '''
-    Determine how many training values fell into a given bin during calibration.
-    This is calculated by finding the index of the first appearance of each bin
-    boundary in values (values may repeat, so that isn't trivially in indices.)
-    Subtracting each bin boundary index from the next tells you how many values fall in
-    that bin.
-    To get this to calculate the last bin correctly, len(values) is appended to the
-    list of bound indices.
-
-    This assumes that ``bin_vals`` excludes np.inf bin boundaries when
-    PercentileDiscretizer was calibrated
-    with fewer values than bins.
-
-    Arguments:
-      values:
-        1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending
-      indices:
-        1D int32 ndarray of the indices (in values) of the bin boundaries
-      bin_vals:
-        1D ndarray containing the bin boundaries
-      bin_counts_buffer:
-        ndarray buffer for returning the PercentileDiscretizer histogram
-    '''
-    # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t.
x[i] != x[i+1] - # append index of the last bin since that cannot be empty with how - # PercentileDiscretizer is implemented - nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) - bin_start_indices = indices.take(nonempty_bins) - - # if multiples of a bin's lower bound value exist, find the first one - for (i, idx) in enumerate(bin_start_indices): - cur_idx = idx - while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: - bin_start_indices[i] = cur_idx = cur_idx - 1 - - # the end of each bin is the start of the next bin, - # until the last, which is the end of the array - # broadcast the counts to the nonempty bins, 0 otherwise - bin_counts_buffer[:] = 0 - bin_counts_buffer[nonempty_bins] = np.diff(np.append(bin_start_indices, values.size)) - - def calibrate( - self, - bin_vals, percentiles, percentile_indices, - bin_counts_buffer=None): - '''Calibrates the PercentileDiscretizerFeature into bin values for - use in PercentileDiscretizerCalibrator. - Note that this method can only be called once. - - Arguments: - bin_vals: - Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. - Will be updated with the results of the calibration. - A 1D ndarray. - percentiles: - 1D array of size n_bin with values ranging from 0 to 1. - For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` - percentile_indices: - Empty 1D array of size n_bin used to store intermediate results when - calling twml.twml_optim_nearest_interpolation(). - For example, np.empty(self._n_bin + 1, dtype=np.float32). - bin_counts_buffer: - optional ndarray buffer used for retaining count of values per PercentileDiscretizer - bucket (for debug and feature exploration purposes) - - Returns: - calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` - ''' - if self._calibrated: - raise RuntimeError("Can only calibrate once") - if bin_vals.ndim != 1: - raise RuntimeError("Expecting bin_vals row") - - # # concatenate values and weights buffers - self._concat_arrays() - feature_values = self._features_dict['values'] - feature_weights = self._features_dict['weights'] - - # get features ready for the bins, order array indices by feature values. 
- indices = np.argsort(feature_values) - - # get ordered values and weights using array indices - values = feature_values.take(indices) - weights = feature_weights.take(indices) - - # Normalizes the sum of weights to be between 0 and 1 - weights = np.cumsum(weights, out=feature_weights) - weights -= weights[0] - if weights[-1] > 0: # prevent zero-division - weights /= weights[-1] - - # Check if we have less values than bin_vals - if values.size < bin_vals.size: - # Fills all the bins with a value that won't ever be reached - bin_vals.fill(np.inf) - # Forces the first to be -inf - bin_vals[0] = -np.inf - # Copies the values as boundaries - bin_vals[1:values.size + 1] = values - - if bin_counts_buffer is not None: - # slice out bins with +/-np.inf boundary -- their count will be zero anyway - # we can't just assume all other bins will have 1 value since there can be dups - short_indices = np.arange(values.size, dtype=np.int32) - bin_counts_buffer.fill(0) - self._gather_debug_info( - values, short_indices, bin_vals[1:values.size + 1], - bin_counts_buffer[1:values.size + 1]) - - else: - # Gets the indices for the values that define the boundary for the bins - indices_float = np.arange(0, weights.size, dtype=np.float32) - - # Gets things in the correct shape for the linear interpolation - weights = weights.reshape(1, weights.size) - indices_float = indices_float.reshape(1, weights.size) - - # wrap ndarrays into twml.Array - percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1)) - weights_tarray = twml.Array(weights) - indices_float_tarray = twml.Array(indices_float) - percentile_indices_tarray = twml.Array(percentile_indices.reshape(percentiles.size, 1)) - - # Performs the binary search to find the indices corresponding to the percentiles - err = twml.CLIB.twml_optim_nearest_interpolation( - percentile_indices_tarray.handle, percentiles_tarray.handle, # output, input - weights_tarray.handle, indices_float_tarray.handle # xs, ys - ) - if err != 1000: - raise ValueError("""twml.CLIB.twml_optim_nearest_interpolation - caught an error (see previous stdout). Error code: """ % err) - - indices = indices[:bin_vals.size] - indices[:] = percentile_indices - indices[0] = 0 - indices[-1] = weights.size - 1 - - # Gets the values at those indices and copies them into bin_vals - values.take(indices, out=bin_vals) - - # get # of values per bucket - if bin_counts_buffer is not None: - self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer) - - self._calibrated = True + """Accumulates and calibrates a single sparse PercentileDiscretizer feature.""" + + @staticmethod + def _gather_debug_info(values, indices, bin_vals, bin_counts_buffer): + """ + Determine how many training values fell into a given bin during calibration. + This is calculated by finding the index of the first appearance of each bin + boundary in values (values may repeat, so that isn't trivially in indices.) + Subtracting each bin boundary index from the next tells you how many values fall in + that bin. + To get this to calculate the last bin correctly, len(values) is appended to the + list of bound indices. + + This assumes that ``bin_vals`` excludes np.inf bin boundaries when + PercentileDiscretizer was calibrated + with fewer values than bins. 
+ + Args: + values: + 1D ndarray of the PercentileDiscretizerFeature's accumulated values, sorted ascending + indices: + 1D int32 ndarray of the indices (in values) of the bin boundaries + bin_vals: + 1D ndarray containing the bin boundaries + bin_counts_buffer: + ndarray buffer for returning the PercentileDiscretizer histogram + """ + # np.flatnonzero(np.diff(x)) gives you the indices i in x s.t. x[i] != x[i+1] + # append index of the last bin since that cannot be empty with how + # PercentileDiscretizer is implemented + nonempty_bins = np.append(np.flatnonzero(np.diff(bin_vals)), len(bin_vals) - 1) + bin_start_indices = indices.take(nonempty_bins) + + # if multiples of a bin's lower bound value exist, find the first one + for i, idx in enumerate(bin_start_indices): + cur_idx = idx + while cur_idx > 0 and values[cur_idx] == values[cur_idx - 1]: + bin_start_indices[i] = cur_idx = cur_idx - 1 + + # the end of each bin is the start of the next bin, + # until the last, which is the end of the array + # broadcast the counts to the nonempty bins, 0 otherwise + bin_counts_buffer[:] = 0 + bin_counts_buffer[nonempty_bins] = np.diff( + np.append(bin_start_indices, values.size) + ) + + def calibrate( + self, bin_vals, percentiles, percentile_indices, bin_counts_buffer=None + ): + """Calibrates the PercentileDiscretizerFeature into bin values for + use in PercentileDiscretizerCalibrator. + Note that this method can only be called once. + + Args: + bin_vals: + Row in the PercentileDiscretizerCalibrator.bin_vals matrix corresponding to this feature. + Will be updated with the results of the calibration. + A 1D ndarray. + percentiles: + 1D array of size n_bin with values ranging from 0 to 1. + For example, ``percentiles = np.linspace(0, 1, num=self._n_bin+1, dtype=np.float32)`` + percentile_indices: + Empty 1D array of size n_bin used to store intermediate results when + calling twml.twml_optim_nearest_interpolation(). + For example, np.empty(self._n_bin + 1, dtype=np.float32). + bin_counts_buffer: + optional ndarray buffer used for retaining count of values per PercentileDiscretizer + bucket (for debug and feature exploration purposes) + + Returns: + calibrated bin_vals for use by ``PercentileDiscretizerCalibrator`` + """ + if self._calibrated: + raise RuntimeError("Can only calibrate once") + if bin_vals.ndim != 1: + raise RuntimeError("Expecting bin_vals row") + + # # concatenate values and weights buffers + self._concat_arrays() + feature_values = self._features_dict["values"] + feature_weights = self._features_dict["weights"] + + # get features ready for the bins, order array indices by feature values. 
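+        # illustrative example (not from the original source): for
+        # feature_values == [0.3, 0.1, 0.2], np.argsort returns [1, 2, 0], so
+        # feature_values.take(indices) below yields the ascending [0.1, 0.2, 0.3]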
+        indices = np.argsort(feature_values)
+
+        # get ordered values and weights using array indices
+        values = feature_values.take(indices)
+        weights = feature_weights.take(indices)
+
+        # Normalizes the sum of weights to be between 0 and 1
+        weights = np.cumsum(weights, out=feature_weights)
+        weights -= weights[0]
+        if weights[-1] > 0:  # prevent zero-division
+            weights /= weights[-1]
+
+        # Check if we have fewer values than bin_vals
+        if values.size < bin_vals.size:
+            # Fills all the bins with a value that won't ever be reached
+            bin_vals.fill(np.inf)
+            # Forces the first to be -inf
+            bin_vals[0] = -np.inf
+            # Copies the values as boundaries
+            bin_vals[1 : values.size + 1] = values
+
+            if bin_counts_buffer is not None:
+                # slice out bins with +/-np.inf boundary -- their count will be zero anyway
+                # we can't just assume all other bins will have 1 value since there can be dups
+                short_indices = np.arange(values.size, dtype=np.int32)
+                bin_counts_buffer.fill(0)
+                self._gather_debug_info(
+                    values,
+                    short_indices,
+                    bin_vals[1 : values.size + 1],
+                    bin_counts_buffer[1 : values.size + 1],
+                )
+
+        else:
+            # Gets the indices for the values that define the boundary for the bins
+            indices_float = np.arange(0, weights.size, dtype=np.float32)
+
+            # Gets things in the correct shape for the linear interpolation
+            weights = weights.reshape(1, weights.size)
+            indices_float = indices_float.reshape(1, weights.size)
+
+            # wrap ndarrays into twml.Array
+            percentiles_tarray = twml.Array(percentiles.reshape(percentiles.size, 1))
+            weights_tarray = twml.Array(weights)
+            indices_float_tarray = twml.Array(indices_float)
+            percentile_indices_tarray = twml.Array(
+                percentile_indices.reshape(percentiles.size, 1)
+            )
+
+            # Performs the binary search to find the indices corresponding to the percentiles
+            err = twml.CLIB.twml_optim_nearest_interpolation(
+                percentile_indices_tarray.handle,
+                percentiles_tarray.handle,  # output, input
+                weights_tarray.handle,
+                indices_float_tarray.handle,  # xs, ys
+            )
+            if err != 1000:
+                raise ValueError(
+                    """twml.CLIB.twml_optim_nearest_interpolation
+                    caught an error (see previous stdout). Error code: %d"""
+                    % err
+                )
+
+            indices = indices[: bin_vals.size]
+            indices[:] = percentile_indices
+            indices[0] = 0
+            indices[-1] = weights.size - 1
+
+            # Gets the values at those indices and copies them into bin_vals
+            values.take(indices, out=bin_vals)
+
+            # get # of values per bucket
+            if bin_counts_buffer is not None:
+                self._gather_debug_info(values, indices, bin_vals, bin_counts_buffer)
+
+        self._calibrated = True
 
 
 class PercentileDiscretizerCalibrator(Calibrator):
-  ''' Accumulates features and their respective values for PercentileDiscretizer calibration.
-  Internally, each feature's values is accumulated via its own
-  ``PercentileDiscretizerFeature`` object.
-  The steps for calibration are typically as follows:
-
-  1. accumulate feature values from batches by calling ``accumulate()``;
-  2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and
-  3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``.
-
-  '''
-
-  def __init__(self, n_bin, out_bits, bin_histogram=True,
-               allow_empty_calibration=False, **kwargs):
-    ''' Constructs an PercentileDiscretizerCalibrator instance.
-
-    Arguments:
-      n_bin:
-        the number of bins per feature to use for PercentileDiscretizer.
-        Note that each feature actually maps to n_bin+1 output IDs.
-      out_bits:
-        The maximum number of bits to use for the output IDs.
- 2**out_bits must be greater than bin_ids.size or an error is raised. - bin_histogram: - When True (the default), gathers information during calibration - to build a bin_histogram. - allow_empty_calibration: - allows operation where we might not calibrate any features. - Default False to error out if no features were calibrated. - Typically, values of uncalibrated features pass through discretizers - untouched (though the feature ids will be truncated to obey out_bits). - ''' - super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) - self._n_bin = n_bin - self._out_bits = out_bits - - self._bin_ids = None - self._bin_vals = np.empty(0, dtype=np.float32) # Note changed from 64 (v1) to 32 (v2) - - self._bin_histogram = bin_histogram - self._bin_histogram_dict = None - - self._hash_map_counter = 0 - self._hash_map = {} - - self._discretizer_feature_dict = {} - self._allow_empty_calibration = allow_empty_calibration - - @property - def bin_ids(self): - ''' - Gets bin_ids - ''' - return self._bin_ids - - @property - def bin_vals(self): - ''' - Gets bin_vals - ''' - return self._bin_vals - - @property - def hash_map(self): - ''' - Gets hash_map - ''' - return self._hash_map - - @property - def discretizer_feature_dict(self): - ''' - Gets feature_dict - ''' - return self._discretizer_feature_dict - - def accumulate_features(self, inputs, name): - ''' - Wrapper around accumulate for PercentileDiscretizer. - Arguments: - inputs: - batch that will be accumulated - name: - name of the tensor that will be accumulated - - ''' - sparse_tf = inputs[name] - indices = sparse_tf.indices[:, 1] - ids = sparse_tf.indices[:, 0] - weights = np.take(inputs["weights"], ids) - return self.accumulate(indices, sparse_tf.values, weights) - - def accumulate_feature(self, output): - ''' - Wrapper around accumulate for trainer API. - Arguments: - output: - output of prediction of build_graph for calibrator - ''' - return self.accumulate(output['feature_ids'], output['feature_values'], output['weights']) - - def accumulate(self, feature_keys, feature_vals, weights=None): - '''Accumulate a single batch of feature keys, values and weights. - - These are accumulate until ``calibrate()`` is called. - - Arguments: - feature_keys: - 1D int64 array of feature keys. - feature_vals: - 1D float array of feature values. Each element of this array - maps to the commensurate element in ``feature_keys``. - weights: - Defaults to weights of 1. - 1D array containing the weights of each feature key, value pair. - Typically, this is the weight of each sample (but you still need - to provide one weight per key,value pair). - Each element of this array maps to the commensurate element in feature_keys. 
- ''' - if feature_keys.ndim != 1: - raise ValueError('Expecting 1D feature_keys, got %dD' % feature_keys.ndim) - if feature_vals.ndim != 1: - raise ValueError('Expecting 1D feature_values, got %dD' % feature_vals.ndim) - if feature_vals.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == feature_values.size, got %d != %d' % - (feature_keys.size, feature_vals.size)) - if weights is not None: - weights = np.squeeze(weights) - if weights.ndim != 1: - raise ValueError('Expecting 1D weights, got %dD' % weights.ndim) - elif weights.size != feature_keys.size: - raise ValueError( - 'Expecting feature_keys.size == weights.size, got %d != %d' % - (feature_keys.size, weights.size)) - if weights is None: - weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT) - unique_keys = np.unique(feature_keys) - for feature_id in unique_keys: - idx = np.where(feature_keys == feature_id) - if feature_id not in self._discretizer_feature_dict: - self._hash_map[feature_id] = self._hash_map_counter - # unlike v1, the hash_map_counter is incremented AFTER assignment. - # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3 - self._hash_map_counter += 1 - # creates a new cache if we never saw the feature before - discretizer_feature = PercentileDiscretizerFeature(feature_id) - self._discretizer_feature_dict[feature_id] = discretizer_feature - else: - discretizer_feature = self._discretizer_feature_dict[feature_id] - discretizer_feature.add_values({'values': feature_vals[idx], 'weights': weights[idx]}) - - def calibrate(self, debug=False): - ''' - Calibrates each PercentileDiscretizer feature after accumulation is complete. - - Arguments: - debug: - Boolean to request debug info be returned by the method. - (see Returns section below) - - The calibration results are stored in two matrices: - bin_ids: - 2D array of size number of accumulate ``features x n_bin+1``. - Contains the new IDs generated by PercentileDiscretizer. Each row maps to a feature. - Each row maps to different value bins. The IDs - are in the range ``1 -> bin_ids.size+1`` - bin_vals: - 2D array of the same size as bin_ids. - Each row maps to a feature. Each row contains the bin boundaries. - These boundaries represent feature values. - - Returns: - if debug is True, the method returns - - - 1D int64 array of feature_ids - - 2D float32 array copy of bin_vals (the bin boundaries) for each feature - - 2D int64 array of bin counts corresponding to the bin boundaries - - ''' - n_feature = len(self._discretizer_feature_dict) - if n_feature == 0 and not self._allow_empty_calibration: - raise RuntimeError("Need to accumulate some features for calibration\n" - "Likely, the calibration data is empty. 
This can\n" - "happen if the dataset is small, or if the following\n" - "cli args are set too low:\n" - " --discretizer_keep_rate (default=0.0008)\n" - " --discretizer_parts_downsampling_rate (default=0.2)\n" - "Consider increasing the values of these args.\n" - "To allow empty calibration data (and degenerate discretizer),\n" - "use the allow_empty_calibration input of the constructor.") - - self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) - self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) - - self._bin_vals.resize(n_feature, self._n_bin + 1) - - # buffers shared by PercentileDiscretizerFeature.calibrate() - percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) - - # Tensor from 0 to 1 in the number of steps provided - percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) - - if debug or self._bin_histogram: - debug_feature_ids = np.empty(n_feature, dtype=np.int64) - bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) - - # progress bar for calibration phase - progress_bar = tf.keras.utils.Progbar(n_feature) - - discretizer_features_dict = self._discretizer_feature_dict - for i, feature_id in enumerate(discretizer_features_dict): - if debug or self._bin_histogram: - debug_feature_ids[self._hash_map[feature_id]] = feature_id - bin_counts_buffer = bin_counts[self._hash_map[feature_id]] - else: - bin_counts_buffer = None - - # calibrate each PercentileDiscretizer feature (puts results in bin_vals) - discretizer_features_dict[feature_id].calibrate( - self._bin_vals[self._hash_map[feature_id]], # Gets feature-values - percentiles, percentile_indices, - bin_counts_buffer=bin_counts_buffer - ) - - # update progress bar 20 times - if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): - progress_bar.update(i + 1) - - super(PercentileDiscretizerCalibrator, self).calibrate() - - if self._bin_histogram: - # save bin histogram data for later - self._bin_histogram_dict = { - 'feature_ids': debug_feature_ids, - 'bin_counts': bin_counts, - 'bin_vals': self._bin_vals, - 'out_bits': self._out_bits, - } - - if debug: - return debug_feature_ids, self._bin_vals.copy(), bin_counts - - return None - - def _create_discretizer_layer(self, n_feature, hash_map_keys, hash_map_values, - feature_offsets, name): - return twml.layers.PercentileDiscretizer( - n_feature=n_feature, - n_bin=self._n_bin, - out_bits=self._out_bits, - bin_values=self._bin_vals.flatten(), - hash_keys=hash_map_keys, - hash_values=hash_map_values.astype(np.int64), - bin_ids=self._bin_ids.flatten().astype(np.int64), - feature_offsets=feature_offsets, - name=name, - **self._kwargs - ) - - def to_layer(self, name=None): - """ - Returns a twml.layers.PercentileDiscretizer Layer - that can be used for feature discretization. + """Accumulates features and their respective values for PercentileDiscretizer calibration. + Internally, each feature's values is accumulated via its own + ``PercentileDiscretizerFeature`` object. 
+ The steps for calibration are typically as follows: - Arguments: - name: - name-scope of the PercentileDiscretizer layer - """ - n_feature = len(self._discretizer_feature_dict) - max_discretizer_feature = n_feature * (self._n_bin + 1) - - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate()") - - if self._bin_ids.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_ids.shape[0] \ - != len(self._discretizer_feature_dict)") - if self._bin_vals.shape[0] != n_feature: - raise RuntimeError("Expecting self._bin_vals.shape[0] \ - != len(self._discretizer_feature_dict)") - - # can add at most #features * (n_bin+1) new feature ids - if 2**self._out_bits <= max_discretizer_feature: - raise ValueError("""Maximum number of features created by discretizer is - %d but requested that the output be limited to %d values (%d bits), - which is smaller than that. Please ensure the output has enough bits - to represent at least the new features""" - % (max_discretizer_feature, 2**self._out_bits, self._out_bits)) - - # build feature_offsets, hash_map_keys, hash_map_values - feature_offsets = np.arange(0, max_discretizer_feature, - self._n_bin + 1, dtype='int64') - hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64) - hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32) - - discretizer = self._create_discretizer_layer(n_feature, hash_map_keys, - hash_map_values, feature_offsets, name) - - return discretizer - - def get_layer_args(self): - ''' - Returns layer arguments required to implement multi-phase training. - See twml.calibrator.Calibrator.get_layer_args for more detailed documentation. - ''' - layer_args = { - 'n_feature': len(self._discretizer_feature_dict), - 'n_bin': self._n_bin, - 'out_bits': self._out_bits, - } - - return layer_args - - def add_hub_signatures(self, name): - """ - Add Hub Signatures for each calibrator + 1. accumulate feature values from batches by calling ``accumulate()``; + 2. calibrate all feature into PercentileDiscretizer bin_vals by calling ``calibrate()``; and + 3. convert to a twml.layers.PercentileDiscretizer layer by calling ``to_layer()``. - Arguments: - name: - Calibrator name - """ - sparse_tf = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - hub.add_signature( - inputs=sparse_tf, - outputs=calibrator_layer(sparse_tf, keep_inputs=False), - name=name) - - def write_summary(self, writer, sess=None): - """ - This method is called by save() to write a histogram of - PercentileDiscretizer feature bins to disk. A histogram is included for each - feature. - - Arguments: - writer: - tf.summary.FilteWriter instance. - used to add summaries to event files for inclusion in tensorboard. - sess: - tf.Session instance. Used to produces summaries for the writer. """ - bin_counts_ph = tf.placeholder(tf.int64) - bin_counts = self._bin_histogram_dict['bin_counts'] - # Record that distribution into a histogram summary - histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph) - for i in range(bin_counts.shape[0]): - bin_counts_summary = sess.run(histo, feed_dict={bin_counts_ph: bin_counts[i]}) - writer.add_summary(bin_counts_summary, global_step=i) - - def write_summary_json(self, save_dir, name="default"): - """ - Export bin information to HDFS. - - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". 
- """ - # Since the size is small: (# of bins) * (# of features), we always dump the file. - discretizer_export_bin_filename = os.path.join(save_dir, name + '_bin.json') - discretizer_export_bin_dict = { - 'feature_ids': self._bin_histogram_dict['feature_ids'].tolist(), - 'bin_boundaries': self._bin_histogram_dict['bin_vals'].tolist(), - 'output_bits': self._bin_histogram_dict['out_bits'] - } - twml.write_file(discretizer_export_bin_filename, discretizer_export_bin_dict, encode='json') - - def save(self, save_dir, name="default", verbose=False): - '''Save the calibrator into the given save_directory using TF Hub. - Arguments: - save_dir: - name of the saving directory. - name: - prefix of the saved hub signature. Default (string): "default". - ''' - if not self._calibrated: - raise RuntimeError("Expecting prior call to calibrate().Cannot save() prior to calibrate()") - - # This module allows for the calibrator to save be saved as part of - # Tensorflow Hub (this will allow it to be used in further steps) - def calibrator_module(): - # Note that this is usually expecting a sparse_placeholder - inputs = tf.sparse_placeholder(tf.float32) - calibrator_layer = self.to_layer() - # creates the signature to the calibrator module - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=False), - name=name) - # and another signature for keep_inputs mode - hub.add_signature( - inputs=inputs, - outputs=calibrator_layer(inputs, keep_inputs=True), - name=name + '_keep_inputs') - - # exports the module to the save_dir - spec = hub.create_module_spec(calibrator_module) - with tf.Graph().as_default(): - module = hub.Module(spec) - with tf.Session() as session: - module.export(save_dir, session) - - self.write_summary_json(save_dir, name) + def __init__( + self, + n_bin: int, + out_bits: int, + bin_histogram: bool = True, + allow_empty_calibration: bool = False, + **kwargs + ): + """Constructs an PercentileDiscretizerCalibrator instance. + + Args: + n_bin: + the number of bins per feature to use for PercentileDiscretizer. + Note that each feature actually maps to n_bin+1 output IDs. + out_bits: + The maximum number of bits to use for the output IDs. + 2**out_bits must be greater than bin_ids.size or an error is raised. + bin_histogram: + When True (the default), gathers information during calibration + to build a bin_histogram. + allow_empty_calibration: + allows operation where we might not calibrate any features. + Default False to error out if no features were calibrated. + Typically, values of uncalibrated features pass through discretizers + untouched (though the feature ids will be truncated to obey out_bits). 
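+
+        Example (an illustrative sketch; the parameter values are arbitrary)::
+
+            calibrator = PercentileDiscretizerCalibrator(n_bin=10, out_bits=16)
+            # each calibrated feature maps to n_bin + 1 = 11 output IDs, so
+            # to_layer() requires 2**16 > n_feature * 11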
+ """ + super(PercentileDiscretizerCalibrator, self).__init__(**kwargs) + self._n_bin = n_bin + self._out_bits = out_bits + + self._bin_ids = None + self._bin_vals = np.empty( + 0, dtype=np.float32 + ) # Note changed from 64 (v1) to 32 (v2) + + self._bin_histogram = bin_histogram + self._bin_histogram_dict = None + + self._hash_map_counter = 0 + self._hash_map = {} + + self._discretizer_feature_dict = {} + self._allow_empty_calibration = allow_empty_calibration + + @property + def bin_ids(self) -> np.ndarray: + """Gets bin_ids""" + return self._bin_ids + + @property + def bin_vals(self) -> np.ndarray: + """Gets bin_vals""" + return self._bin_vals + + @property + def hash_map(self) -> Dict[str, int]: + """Gets hash_map""" + return self._hash_map + + @property + def discretizer_feature_dict(self) -> Dict[str, PercentileDiscretizerFeature]: + """Gets feature_dict""" + return self._discretizer_feature_dict + + def accumulate_features(self, inputs: Dict[str, Any], name: str) -> None: + """ + Wrapper around accumulate for PercentileDiscretizer. + Args: + inputs (dict): + batch that will be accumulated + name (str): + name of the tensor that will be accumulated + """ + sparse_tf = inputs[name] + indices = sparse_tf.indices[:, 1] + ids = sparse_tf.indices[:, 0] + weights = np.take(inputs["weights"], ids) + return self.accumulate(indices, sparse_tf.values, weights) + + def accumulate_feature(self, output: Dict[str, Any]) -> None: + """ + Wrapper around accumulate for trainer API. + Args: + output: + output of prediction of build_graph for calibrator + """ + return self.accumulate( + output["feature_ids"], output["feature_values"], output["weights"] + ) + + def accumulate( + self, + feature_keys: np.ndarray, + feature_vals: np.ndarray, + weights: np.ndarray = None, + ) -> None: + """Accumulate a single batch of feature keys, values and weights. + These are accumulate until ``calibrate()`` is called. + + Args: + feature_keys (np.ndarray): + 1D int64 array of feature keys. + feature_vals (np.ndarray): + 1D float array of feature values. Each element of this array + maps to the commensurate element in ``feature_keys``. + weights (np.ndarray, optional): + Defaults to weights of 1. + 1D array containing the weights of each feature key, value pair. + Typically, this is the weight of each sample (but you still need + to provide one weight per key,value pair). + Each element of this array maps to the commensurate element in feature_keys. 
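+
+        Example (an illustrative sketch with two feature keys and implicit
+        unit weights)::
+
+            feature_keys = np.array([101, 101, 202], dtype=np.int64)
+            feature_vals = np.array([0.5, 1.5, 2.0], dtype=np.float32)
+            calibrator.accumulate(feature_keys, feature_vals)
+            # values 0.5 and 1.5 accumulate under feature 101, and 2.0 under 202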
+        """
+        if feature_keys.ndim != 1:
+            raise ValueError("Expecting 1D feature_keys, got %dD" % feature_keys.ndim)
+        if feature_vals.ndim != 1:
+            raise ValueError("Expecting 1D feature_values, got %dD" % feature_vals.ndim)
+        if feature_vals.size != feature_keys.size:
+            raise ValueError(
+                "Expecting feature_keys.size == feature_values.size, got %d != %d"
+                % (feature_keys.size, feature_vals.size)
+            )
+        if weights is not None:
+            weights = np.squeeze(weights)
+            if weights.ndim != 1:
+                raise ValueError("Expecting 1D weights, got %dD" % weights.ndim)
+            elif weights.size != feature_keys.size:
+                raise ValueError(
+                    "Expecting feature_keys.size == weights.size, got %d != %d"
+                    % (feature_keys.size, weights.size)
+                )
+        if weights is None:
+            weights = np.full(feature_vals.size, fill_value=DEFAULT_SAMPLE_WEIGHT)
+        unique_keys = np.unique(feature_keys)
+        for feature_id in unique_keys:
+            idx = np.where(feature_keys == feature_id)
+            if feature_id not in self._discretizer_feature_dict:
+                self._hash_map[feature_id] = self._hash_map_counter
+                # unlike v1, the hash_map_counter is incremented AFTER assignment.
+                # This makes the hash_map features zero-indexed: 0, 1, 2 instead of 1, 2, 3
+                self._hash_map_counter += 1
+                # creates a new cache if we never saw the feature before
+                discretizer_feature = PercentileDiscretizerFeature(feature_id)
+                self._discretizer_feature_dict[feature_id] = discretizer_feature
+            else:
+                discretizer_feature = self._discretizer_feature_dict[feature_id]
+            discretizer_feature.add_values(
+                {"values": feature_vals[idx], "weights": weights[idx]}
+            )
+
+    def calibrate(
+        self, debug: bool = False
+    ) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
+        """
+        Calibrates each PercentileDiscretizer feature after accumulation is complete.
+
+        The calibration results are stored in two matrices:
+            bin_ids:
+                2D array of size number of accumulated ``features x n_bin+1``.
+                Contains the new IDs generated by PercentileDiscretizer. Each row
+                maps to a feature; the IDs are in the range ``1 -> bin_ids.size+1``.
+            bin_vals:
+                2D array of the same size as bin_ids. Each row maps to a feature
+                and contains the bin boundaries (feature values).
+
+        Args:
+            debug (bool):
+                Boolean to request debug info be returned by the method.
+                (see Returns section below)
+
+        Returns:
+            if debug is True, the method returns
+
+            - 1D int64 array of feature_ids
+            - 2D float32 array copy of bin_vals (the bin boundaries) for each feature
+            - 2D int64 array of bin counts corresponding to the bin boundaries
+        """
+        n_feature = len(self._discretizer_feature_dict)
+        if n_feature == 0 and not self._allow_empty_calibration:
+            raise RuntimeError(
+                "Need to accumulate some features for calibration\n"
+                "Likely, the calibration data is empty. This can\n"
+                "happen if the dataset is small, or if the following\n"
+                "cli args are set too low:\n"
+                "  --discretizer_keep_rate (default=0.0008)\n"
+                "  --discretizer_parts_downsampling_rate (default=0.2)\n"
+                "Consider increasing the values of these args.\n"
+                "To allow empty calibration data (and degenerate discretizer),\n"
+                "use the allow_empty_calibration input of the constructor."
+ ) + + self._bin_ids = np.arange(1, n_feature * (self._n_bin + 1) + 1) + self._bin_ids = self._bin_ids.reshape(n_feature, self._n_bin + 1) + + self._bin_vals.resize(n_feature, self._n_bin + 1) + + # buffers shared by PercentileDiscretizerFeature.calibrate() + percentile_indices = np.empty(self._n_bin + 1, dtype=np.float32) + + # Tensor from 0 to 1 in the number of steps provided + percentiles = np.linspace(0, 1, num=self._n_bin + 1, dtype=np.float32) + + if debug or self._bin_histogram: + debug_feature_ids = np.empty(n_feature, dtype=np.int64) + bin_counts = np.empty((n_feature, self._n_bin + 1), dtype=np.int64) + + # progress bar for calibration phase + progress_bar = tf.keras.utils.Progbar(n_feature) + + discretizer_features_dict = self._discretizer_feature_dict + for i, feature_id in enumerate(discretizer_features_dict): + if debug or self._bin_histogram: + debug_feature_ids[self._hash_map[feature_id]] = feature_id + bin_counts_buffer = bin_counts[self._hash_map[feature_id]] + else: + bin_counts_buffer = None + + # calibrate each PercentileDiscretizer feature (puts results in bin_vals) + discretizer_features_dict[feature_id].calibrate( + self._bin_vals[self._hash_map[feature_id]], # Gets feature-values + percentiles, + percentile_indices, + bin_counts_buffer=bin_counts_buffer, + ) + + # update progress bar 20 times + if (i % max(1.0, round(n_feature / 20)) == 0) or (i == n_feature - 1): + progress_bar.update(i + 1) + + super(PercentileDiscretizerCalibrator, self).calibrate() + + if self._bin_histogram: + # save bin histogram data for later + self._bin_histogram_dict = { + "feature_ids": debug_feature_ids, + "bin_counts": bin_counts, + "bin_vals": self._bin_vals, + "out_bits": self._out_bits, + } + + if debug: + return debug_feature_ids, self._bin_vals.copy(), bin_counts + + return None + + def _create_discretizer_layer( + self, + n_feature: int, + hash_map_keys: np.ndarray, + hash_map_values: np.ndarray, + feature_offsets: np.ndarray, + name: Optional[str] = None, + ): + return twml.layers.PercentileDiscretizer( + n_feature=n_feature, + n_bin=self._n_bin, + out_bits=self._out_bits, + bin_values=self._bin_vals.flatten(), + hash_keys=hash_map_keys, + hash_values=hash_map_values.astype(np.int64), + bin_ids=self._bin_ids.flatten().astype(np.int64), + feature_offsets=feature_offsets, + name=name, + **self._kwargs + ) + + def to_layer(self, name: str = None): + """ + Returns a twml.layers.PercentileDiscretizer Layer + that can be used for feature discretization. + + Args: + name: + name-scope of the PercentileDiscretizer layer + """ + n_feature = len(self._discretizer_feature_dict) + max_discretizer_feature = n_feature * (self._n_bin + 1) + + if not self._calibrated: + raise RuntimeError("Expecting prior call to calibrate()") + if self._bin_ids.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_ids.shape[0] != len(self._discretizer_feature_dict)" + ) + if self._bin_vals.shape[0] != n_feature: + raise RuntimeError( + "Expecting self._bin_vals.shape[0] != len(self._discretizer_feature_dict)" + ) + + # can add at most #features * (n_bin+1) new feature ids + if (1 << self._out_bits) <= max_discretizer_feature: + raise ValueError( + """Maximum number of features created by discretizer is + %d but requested that the output be limited to %d values (%d bits), + which is smaller than that. 
Please ensure the output has enough bits
+                to represent at least the new features"""
+                % (max_discretizer_feature, (1 << self._out_bits), self._out_bits)
+            )
+
+        # build feature_offsets, hash_map_keys, hash_map_values
+        feature_offsets = np.arange(
+            0, max_discretizer_feature, self._n_bin + 1, dtype="int64"
+        )
+        hash_map_keys = np.array(list(self._hash_map.keys()), dtype=np.int64)
+        hash_map_values = np.array(list(self._hash_map.values()), dtype=np.float32)
+
+        discretizer = self._create_discretizer_layer(
+            n_feature, hash_map_keys, hash_map_values, feature_offsets, name
+        )
+
+        return discretizer
+
+    def get_layer_args(self) -> Dict[str, int]:
+        """
+        Returns layer arguments required to implement multi-phase training.
+        See twml.calibrator.Calibrator.get_layer_args for more detailed documentation.
+        """
+        layer_args = {
+            "n_feature": len(self._discretizer_feature_dict),
+            "n_bin": self._n_bin,
+            "out_bits": self._out_bits,
+        }
+
+        return layer_args
+
+    def add_hub_signatures(self, name: str):
+        """
+        Add Hub Signatures for each calibrator
+
+        Args:
+            name:
+                Calibrator name
+        """
+        sparse_tf = tf.sparse_placeholder(tf.float32)
+        calibrator_layer = self.to_layer()
+        hub.add_signature(
+            inputs=sparse_tf,
+            outputs=calibrator_layer(sparse_tf, keep_inputs=False),
+            name=name,
+        )
+
+    def write_summary(self, writer: tf.summary.FileWriter, sess: tf.Session = None):
+        """
+        This method is called by save() to write a histogram of
+        PercentileDiscretizer feature bins to disk. A histogram is included for each
+        feature.
+
+        Args:
+            writer:
+                tf.summary.FileWriter instance.
+                used to add summaries to event files for inclusion in tensorboard.
+            sess:
+                tf.Session instance. Used to produce summaries for the writer.
+        """
+        bin_counts_ph = tf.placeholder(tf.int64)
+        bin_counts = self._bin_histogram_dict["bin_counts"]
+
+        # Record that distribution into a histogram summary
+        histo = tf.summary.histogram("discretizer_feature_bin_counts", bin_counts_ph)
+        for i in range(bin_counts.shape[0]):
+            bin_counts_summary = sess.run(
+                histo, feed_dict={bin_counts_ph: bin_counts[i]}
+            )
+            writer.add_summary(bin_counts_summary, global_step=i)
+
+    def write_summary_json(self, save_dir: str, name: str = "default"):
+        """
+        Export bin information to HDFS.
+
+        Args:
+            save_dir (str):
+                name of the saving directory.
+            name (str):
+                prefix of the saved hub signature. Default (string): "default".
+        """
+        # Since the size is small: (# of bins) * (# of features), we always dump the file.
+        discretizer_export_bin_filename = os.path.join(save_dir, name + "_bin.json")
+        discretizer_export_bin_dict = {
+            "feature_ids": self._bin_histogram_dict["feature_ids"].tolist(),
+            "bin_boundaries": self._bin_histogram_dict["bin_vals"].tolist(),
+            "output_bits": self._bin_histogram_dict["out_bits"],
+        }
+        twml.write_file(
+            discretizer_export_bin_filename, discretizer_export_bin_dict, encode="json"
+        )
+
+    def save(
+        self, save_dir: str, name: str = "default", verbose: bool = False
+    ):  # pylint: disable=unused-argument
+        """Save the calibrator into the given save_directory using TF Hub.
+        Args:
+            save_dir:
+                name of the saving directory.
+            name:
+                prefix of the saved hub signature. Default (string): "default".
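+
+        Example (an illustrative sketch; the directory is hypothetical)::
+
+            calibrator.save("/tmp/discretizer", name="default")
+            # exports a TF Hub module with "default" and "default_keep_inputs"
+            # signatures and dumps the bin boundaries to default_bin.json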
+        """
+        if not self._calibrated:
+            raise RuntimeError(
+                "Expecting prior call to calibrate(). Cannot save() prior to calibrate()"
+            )
+
+        # This module allows for the calibrator to be saved as part of
+        # Tensorflow Hub (this will allow it to be used in further steps)
+        def calibrator_module():
+            # Note that this is usually expecting a sparse_placeholder
+            inputs = tf.sparse_placeholder(tf.float32)
+            calibrator_layer = self.to_layer()
+            # creates the signature to the calibrator module
+            hub.add_signature(
+                inputs=inputs,
+                outputs=calibrator_layer(inputs, keep_inputs=False),
+                name=name,
+            )
+            # and another signature for keep_inputs mode
+            hub.add_signature(
+                inputs=inputs,
+                outputs=calibrator_layer(inputs, keep_inputs=True),
+                name=name + "_keep_inputs",
+            )
+
+        # exports the module to the save_dir
+        spec = hub.create_module_spec(calibrator_module)
+        with tf.Graph().as_default():
+            module = hub.Module(spec)
+            with tf.Session() as session:
+                module.export(save_dir, session)
+
+        self.write_summary_json(save_dir, name)
diff --git a/twml/twml/contrib/eventbus/input_fn.py b/twml/twml/contrib/eventbus/input_fn.py
index c184d9434..d9ae7ab54 100644
--- a/twml/twml/contrib/eventbus/input_fn.py
+++ b/twml/twml/contrib/eventbus/input_fn.py
@@ -1,7 +1,9 @@
-from reader import EventBusPipedBinaryRecordReader
+from typing import Callable, Generator, Optional
+
 import tensorflow.compat.v1 as tf
-import twml
+from reader import EventBusPipedBinaryRecordReader
+import twml
 
 """
 This module provides input function for DeepBird v2 training.
@@ -9,51 +11,74 @@
 """
 
 
-def get_eventbus_data_record_generator(eventbus_reader):
-  """
-  This module provides a data record generater from EventBus reader.
-
-  Args:
-    eventbus_reader: EventBus reader
-
-  Returns:
-    gen: Data record generater
-  """
-  eventbus_reader.initialize()
-  counter = [0]
-
-  def gen():
-    while True:
-      record = eventbus_reader.read()
-      if eventbus_reader.debug:
-        tf.logging.warn("counter: {}".format(counter[0]))
-        with open('tmp_record_{}.bin'.format(counter[0]), 'wb') as f:
-          f.write(record)
-        counter[0] = counter[0] + 1
-      yield record
-  return gen
-
-
-def get_eventbus_data_record_dataset(eventbus_reader, parse_fn, batch_size):
-  """
-  This module generates batch data for training from a data record generator.
-  """
-  dataset = tf.data.Dataset.from_generator(
-      get_eventbus_data_record_generator(eventbus_reader), tf.string, tf.TensorShape([]))
-  return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=4).prefetch(buffer_size=10)
-
-
-def get_train_input_fn(feature_config, params, parse_fn=None):
-  """
-  This module provides input function for DeepBird v2 training.
-  It gets batched training data from data record generator.
-  """
-  eventbus_reader = EventBusPipedBinaryRecordReader(
-      params.jar_file, params.num_eb_threads, params.subscriber_id,
-      filter_str=params.filter_str, debug=params.debug)
-
-  train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn(
-      feature_config, ["ids", "keys", "values", "batch_size", "weights"])
-
-  return lambda: get_eventbus_data_record_dataset(
-      eventbus_reader, train_parse_fn, params.train_batch_size)
+def get_eventbus_data_record_generator(
+    eventbus_reader: EventBusPipedBinaryRecordReader,
+) -> Generator[bytes, None, None]:
+    """
+    This function provides a data record generator from an EventBus reader.
+
+    Args:
+        eventbus_reader: EventBus reader
+
+    Returns:
+        gen: Data record generator
+    """
+    eventbus_reader.initialize()
+    counter = [0]
+
+    def gen() -> Generator[bytes, None, None]:
+        while True:
+            record = eventbus_reader.read()
+            if eventbus_reader.debug:
+                tf.logging.warn(f"counter: {counter[0]}")
+                with open(f"tmp_record_{counter[0]}.bin", "wb") as f:
+                    f.write(record)
+                counter[0] = counter[0] + 1
+            yield record
+
+    return gen
+
+
+def get_eventbus_data_record_dataset(
+    eventbus_reader: EventBusPipedBinaryRecordReader,
+    parse_fn: Callable[[tf.Tensor], tf.Tensor],
+    batch_size: int,
+) -> tf.data.Dataset:
+    """This function generates batched training data from a data record generator."""
+
+    dataset = tf.data.Dataset.from_generator(
+        get_eventbus_data_record_generator(eventbus_reader),
+        tf.string,
+        tf.TensorShape([]),
+    )
+    return (
+        dataset.batch(batch_size)
+        .map(parse_fn, num_parallel_calls=4)
+        .prefetch(buffer_size=10)
+    )
+
+
+def get_train_input_fn(
+    feature_config: dict,
+    params: twml.Params,
+    parse_fn: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+) -> Callable[[], tf.data.Dataset]:
+    """
+    This function provides the input function for DeepBird v2 training.
+    It gets batched training data from the data record generator.
+    """
+    eventbus_reader = EventBusPipedBinaryRecordReader(
+        params.jar_file,
+        params.num_eb_threads,
+        params.subscriber_id,
+        filter_str=params.filter_str,
+        debug=params.debug,
+    )
+
+    train_parse_fn = parse_fn or twml.parsers.get_sparse_parse_fn(
+        feature_config, ["ids", "keys", "values", "batch_size", "weights"]
+    )
+
+    return lambda: get_eventbus_data_record_dataset(
+        eventbus_reader, train_parse_fn, params.train_batch_size
+    )
diff --git a/twml/twml/contrib/eventbus/reader.py b/twml/twml/contrib/eventbus/reader.py
index 2f8e2749e..605cadd1a 100644
--- a/twml/twml/contrib/eventbus/reader.py
+++ b/twml/twml/contrib/eventbus/reader.py
@@ -2,6 +2,7 @@
 import logging
 import subprocess
 from threading import Lock
+from typing import Any, Optional
 
 """
 This module provides a binary data record reader for EventBus data.
@@ -12,108 +13,135 @@ class BinaryRecordReader(object): - def initialize(self): - pass + def initialize(self): + """Initialize the reader""" + pass - def read(self): - """Read raw bytes for one record - """ - raise NotImplementedError + def read(self): + """Read raw bytes for one record""" + raise NotImplementedError - def close(self): - pass + def close(self): + """Close the reader""" + pass class ReadableWrapper(object): - def __init__(self, internal): - self.internal = internal + def __init__(self, internal: io.BufferedReader): + self.internal = internal - def __getattr__(self, name): - return getattr(self.internal, name) + def __getattr__(self, name: str) -> Any: + return getattr(self.internal, name) - def readable(self): - return True + def readable(self) -> bool: + return True class EventBusPipedBinaryRecordReader(BinaryRecordReader): - - JAVA = '/usr/lib/jvm/java-11-twitter/bin/java' - RECORD_SEPARATOR_HEX = [ - 0x29, 0xd8, 0xd5, 0x06, 0x58, 0xcd, 0x4c, 0x29, - 0xb2, 0xbc, 0x57, 0x99, 0x21, 0x71, 0xbd, 0xff - ] - RECORD_SEPARATOR = ''.join([chr(i) for i in RECORD_SEPARATOR_HEX]) - RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) - CHUNK_SIZE = 8192 - - def __init__(self, jar_file, num_eb_threads, subscriber_id, - filter_str=None, buffer_size=32768, debug=False): - self.jar_file = jar_file - self.num_eb_threads = num_eb_threads - self.subscriber_id = subscriber_id - self.filter_str = filter_str if filter_str else '""' - self.buffer_size = buffer_size - self.lock = Lock() - self._pipe = None - self._buffered_reader = None - self._bytes_buffer = None - - self.debug = debug - - def initialize(self): - if not self._pipe: - self._pipe = subprocess.Popen( - [ - self.JAVA, '-jar', self.jar_file, - '-subscriberId', self.subscriber_id, - '-numThreads', str(self.num_eb_threads), - '-dataFilter', self.filter_str, - '-debug' if self.debug else '' - ], - stdout=subprocess.PIPE - ) - self._buffered_reader = io.BufferedReader( - ReadableWrapper(self._pipe.stdout), self.buffer_size) - self._bytes_buffer = io.BytesIO() - else: - logging.warning('Already initialized') - - def _find_next_record(self): - tail = [''] - while True: - chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE) - index = chunk.find(self.RECORD_SEPARATOR) - if index < 0: - self._bytes_buffer.write(chunk[:-self.RECORD_SEPARATOR_LENGTH]) - tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH:] - else: - self._bytes_buffer.write(chunk[:index]) - return chunk[(index + self.RECORD_SEPARATOR_LENGTH):] - - def _read(self): - with self.lock: - remaining = self._find_next_record() - record = self._bytes_buffer.getvalue() - # clean up buffer - self._bytes_buffer.close() - self._bytes_buffer = io.BytesIO() - self._bytes_buffer.write(remaining) - - return record - - def read(self): - while True: - try: - return self._read() - except Exception as e: - logging.error("Error reading bytes for next record: {}".format(e)) - if self.debug: - raise - - def close(self): - try: - self._bytes_buffer.close() - self._buffered_reader.close() - self._pipe.terminate() - except Exception as e: - logging.error("Error closing reader: {}".format(e)) + JAVA = "/usr/lib/jvm/java-11-twitter/bin/java" + RECORD_SEPARATOR_HEX = [ + 0x29, + 0xD8, + 0xD5, + 0x06, + 0x58, + 0xCD, + 0x4C, + 0x29, + 0xB2, + 0xBC, + 0x57, + 0x99, + 0x21, + 0x71, + 0xBD, + 0xFF, + ] + RECORD_SEPARATOR = "".join([chr(i) for i in RECORD_SEPARATOR_HEX]) + RECORD_SEPARATOR_LENGTH = len(RECORD_SEPARATOR) + CHUNK_SIZE = 8192 + + def __init__( + self, + jar_file: str, + num_eb_threads: int, + 
subscriber_id: str,
+        filter_str: Optional[str] = None,
+        buffer_size: int = 32768,
+        debug: bool = False,
+    ):
+        self.jar_file = jar_file
+        self.num_eb_threads = num_eb_threads
+        self.subscriber_id = subscriber_id
+        self.filter_str = filter_str if filter_str else '""'
+        self.buffer_size = buffer_size
+        self.lock = Lock()
+        self._pipe = None
+        self._buffered_reader = None
+        self._bytes_buffer = None
+        self.debug = debug
+
+    def initialize(self) -> None:
+        if not self._pipe:
+            self._pipe = subprocess.Popen(
+                [
+                    self.JAVA,
+                    "-jar",
+                    self.jar_file,
+                    "-subscriberId",
+                    self.subscriber_id,
+                    "-numThreads",
+                    str(self.num_eb_threads),
+                    "-dataFilter",
+                    self.filter_str,
+                    "-debug" if self.debug else "",
+                ],
+                stdout=subprocess.PIPE,
+            )
+            self._buffered_reader = io.BufferedReader(
+                ReadableWrapper(self._pipe.stdout), self.buffer_size
+            )
+            self._bytes_buffer = io.BytesIO()
+        else:
+            logging.warning("Already initialized")
+
+    def _find_next_record(self) -> Optional[bytes]:
+        tail = [""]
+        while True:
+            chunk = tail[0] + self._buffered_reader.read(self.CHUNK_SIZE)
+            index = chunk.find(self.RECORD_SEPARATOR)
+            if index < 0:
+                self._bytes_buffer.write(chunk[: -self.RECORD_SEPARATOR_LENGTH])
+                tail[0] = chunk[-self.RECORD_SEPARATOR_LENGTH :]
+            else:
+                self._bytes_buffer.write(chunk[:index])
+                return chunk[(index + self.RECORD_SEPARATOR_LENGTH) :]
+
+    def _read(self) -> bytes:
+        with self.lock:
+            remaining = self._find_next_record()
+            record = self._bytes_buffer.getvalue()
+
+            # clean up buffer
+            self._bytes_buffer.close()
+            self._bytes_buffer = io.BytesIO()
+            self._bytes_buffer.write(remaining)
+
+        return record
+
+    def read(self) -> bytes:
+        while True:
+            try:
+                return self._read()
+            except Exception as e:
+                logging.error(f"Error reading bytes for next record: {e}")
+                if self.debug:
+                    raise
+
+    def close(self) -> None:
+        try:
+            self._bytes_buffer.close()
+            self._buffered_reader.close()
+            self._pipe.terminate()
+        except Exception as e:
+            logging.error(f"Error closing reader: {e}")
diff --git a/twml/twml/contrib/export/__init__.py b/twml/twml/contrib/export/__init__.py
index 99892dcfa..2a6e0f86d 100644
--- a/twml/twml/contrib/export/__init__.py
+++ b/twml/twml/contrib/export/__init__.py
@@ -1,2 +1,2 @@
-from . import export_fn # noqa: F401
-from . import exporters # noqa: F401
+from . import export_fn  # noqa: F401
+from . import exporters  # noqa: F401
diff --git a/twml/twml/contrib/export/export_fn.py b/twml/twml/contrib/export/export_fn.py
index 6e59fff07..5a338aae9 100644
--- a/twml/twml/contrib/export/export_fn.py
+++ b/twml/twml/contrib/export/export_fn.py
@@ -1,263 +1,313 @@
 """
 Functions for exporting models for different modes.
 """
-from collections import OrderedDict
 import os
+from typing import List, Optional
 
 import tensorflow.compat.v1 as tf
-from tensorflow.python.estimator.export import export
-import twml
 import yaml
+from tensorflow.python.estimator.export import export
-
-def get_sparse_batch_supervised_input_receiver_fn(feature_config, keep_fields=None):
-  """Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors
-  with labels and weights as defined in feature_config.
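The reader above scans a chunked stream for a 16-byte separator, carrying the last len(SEP) bytes across reads so a separator split between two chunks is still found. A self-contained sketch of that scan over plain bytes (EOF handling is added for the toy stream; the real reader blocks on a live pipe instead):

import io

# Separator and chunk size mirrored from EventBusPipedBinaryRecordReader.
SEP = bytes([0x29, 0xD8, 0xD5, 0x06, 0x58, 0xCD, 0x4C, 0x29,
             0xB2, 0xBC, 0x57, 0x99, 0x21, 0x71, 0xBD, 0xFF])
CHUNK_SIZE = 8192

def split_records(stream):
    """Yield one record at a time from `stream`, splitting on SEP."""
    buf = io.BytesIO()
    tail = b""
    while True:
        data = stream.read(CHUNK_SIZE)
        chunk = tail + data
        index = chunk.find(SEP)
        while index >= 0:
            buf.write(chunk[:index])
            yield buf.getvalue()
            buf = io.BytesIO()
            chunk = chunk[index + len(SEP):]
            index = chunk.find(SEP)
        if not data:
            return  # EOF reached; a trailing partial record is discarded
        # Keep len(SEP) trailing bytes so a separator that straddles two
        # reads is still found on the next pass.
        buf.write(chunk[:-len(SEP)])
        tail = chunk[-len(SEP):]

print(list(split_records(io.BytesIO(b"alpha" + SEP + b"beta" + SEP))))
# -> [b'alpha', b'beta']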
- This input_receiver_fn is required for exporting models with 'train' mode to be trained with
- Java API
-
-  Args:
-    feature_config (FeatureConfig): deepbird v2 feature config object
-    keep_fields (list): list of fields to keep
-
-  Returns:
-    supervised_input_receiver_fn: input_receiver_fn used for train mode
-  """
-  def supervised_input_receiver_fn():
-    serialized_request = tf.placeholder(dtype=tf.uint8, name='request')
-    receiver_tensors = {'request': serialized_request}
-
-    bpr = twml.contrib.readers.HashedBatchPredictionRequest(serialized_request, feature_config)
-    features = bpr.get_sparse_features() if keep_fields is None else bpr.get_features(keep_fields)
-    features['weights'] = bpr.weights
-    labels = bpr.labels
-    features, labels = bpr.apply_filter(features, labels)
-
-    return export.SupervisedInputReceiver(features, labels, receiver_tensors)
-
-  return supervised_input_receiver_fn
-
-
-def update_build_graph_fn_for_train(build_graph_fn):
-  """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse
-  similar to the export_output_fns for serving.
-  The key difference here is that
-  1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of
-  creating an export_output object. This is because of the way estimators export model in 'train'
-  mode doesn't take custom export_output
-  2. We only do it when `mode == 'train'` to avoid altering the graph when exporting
-  for 'infer' mode
-
-  Args:
-    build_graph_fn (Callable): deepbird v2 build graph function
-
-  Returns:
-    new_build_graph_fn: An updated build_graph_fn that inserts serialized BatchPredictResponse
-    to graph output when in 'train' mode
-  """
-  def new_build_graph_fn(features, label, mode, params, config=None):
-    output = build_graph_fn(features, label, mode, params, config)
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      output.update(
-        twml.export_output_fns.batch_prediction_continuous_output_fn(output)[
-          tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs
-      )
-    return output
-  return new_build_graph_fn
+import twml
+from twml.feature_config import FeatureConfig
+
+
+def get_sparse_batch_supervised_input_receiver_fn(
+    feature_config: FeatureConfig, keep_fields: Optional[List[str]] = None
+) -> callable:
+    """
+    Gets supervised_input_receiver_fn that decodes a BatchPredictionRequest as sparse tensors
+    with labels and weights as defined in feature_config.
This input_receiver_fn is required + for exporting models with 'train' mode to be trained with Java API + + Args: + feature_config (FeatureConfig): + deepbird v2 feature config object + keep_fields (list): + list of fields to keep + + Returns: + supervised_input_receiver_fn: input_receiver_fn used for train mode + """ + + def supervised_input_receiver_fn(): + serialized_request = tf.placeholder(dtype=tf.uint8, name="request") + receiver_tensors = {"request": serialized_request} + + bpr = twml.contrib.readers.HashedBatchPredictionRequest( + serialized_request, feature_config + ) + features = ( + bpr.get_sparse_features() + if keep_fields is None + else bpr.get_features(keep_fields) + ) + features["weights"] = bpr.weights + labels = bpr.labels + features, labels = bpr.apply_filter(features, labels) + + return export.SupervisedInputReceiver(features, labels, receiver_tensors) + + return supervised_input_receiver_fn + + +def update_build_graph_fn_for_train(build_graph_fn: callable): + """Updates a build_graph_fn by inserting in graph output a serialized BatchPredictionResponse + similar to the export_output_fns for serving. + The key difference here is that + 1. We insert serialized BatchPredictionResponse in graph output with key 'prediction' instead of + creating an export_output object. This is because of the way estimators export model in 'train' + mode doesn't take custom export_output + 2. We only do it when `mode == 'train'` to avoid altering the graph when exporting for 'infer' mode + + Args: + build_graph_fn (Callable): + deepbird v2 build graph function + + Returns: + new_build_graph_fn: + An updated build_graph_fn that inserts serialized BatchPredictResponse to graph + output when in 'train' mode + """ + + def new_build_graph_fn(features, label, mode, params, config=None): + output = build_graph_fn(features, label, mode, params, config) + if mode == tf.estimator.ModeKeys.TRAIN: + output.update( + twml.export_output_fns.batch_prediction_continuous_output_fn(output)[ + tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + ].outputs + ) + return output + + return new_build_graph_fn def export_model_for_train_and_infer( - trainer, feature_config, keep_fields, export_dir, as_text=False): - """Function for exporting model with both 'train' and 'infer' mode. - - This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' - and the other with tag 'serve', and it can be loaded in Java API with either tag depending on - the use case - - Args: - trainer (DataRecordTrainer): deepbird v2 DataRecordTrainer - feature_config (FeatureConfig): deepbird v2 feature config - keep_fields (list of string): list of field keys, e.g. - ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') - export_dir (str): a directory (local or hdfs) to export model to - as_text (bool): if True, write 'saved_model.pb' as binary file, else write - 'saved_model.pbtxt' as human readable text file. 
Default False - """ - train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( - feature_config, keep_fields) - predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( - feature_config, keep_fields) - trainer._export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) - trainer._estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map={ - tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, - tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn - }, - as_text=as_text, - ) - - trainer.export_model_effects(export_dir) - - -def export_all_models_with_receivers(estimator, export_dir, - train_input_receiver_fn, - eval_input_receiver_fn, - predict_input_receiver_fn, - export_output_fn, - export_modes=('train', 'eval', 'predict'), - register_model_fn=None, - feature_spec=None, - checkpoint_path=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - estimator: - Should be of type tf.estimator.Estimator. - You can get this from trainer using trainer.estimator - export_dir: - Directory to export the model. - train_input_receiver_fn: - Input receiver for train interface. - eval_input_receiver_fn: - Input receiver for eval interface. - predict_input_receiver_fn: - Input receiver for predict interface. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". - Defaults to ["train", "eval", "predict"] - register_model_fn: - An optional function which is called with export_dir after models are exported. - Defaults to None. - Returns: - The timestamped directory the models are exported to. - """ - # TODO: Fix for hogwild / distributed training. - - if export_dir is None: - raise ValueError("export_dir can not be None") - export_dir = twml.util.sanitize_hdfs_path(export_dir) - input_receiver_fn_map = {} - - if "train" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn - - if "eval" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn - - if "predict" in export_modes: - input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn - - export_dir = estimator._export_all_saved_models( - export_dir_base=export_dir, - input_receiver_fn_map=input_receiver_fn_map, - checkpoint_path=checkpoint_path, - ) - - if register_model_fn is not None: - register_model_fn(export_dir, feature_spec, log_features) - - return export_dir - - -def export_all_models(trainer, - export_dir, - parse_fn, - serving_input_receiver_fn, - export_output_fn=None, - export_modes=('train', 'eval', 'predict'), - feature_spec=None, - checkpoint=None, - log_features=True): - """ - Function for exporting a model with train, eval, and infer modes. - - Args: - trainer: - An object of type twml.trainers.Trainer. - export_dir: - Directory to export the model. - parse_fn: - The parse function used parse the inputs for train and eval. - serving_input_receiver_fn: - The input receiver function used during serving. - export_output_fn: - export_output_fn to be used for serving. - export_modes: - A list to Specify what modes to export. Can be "train", "eval", "predict". 
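Stepping back to the receiver pattern used throughout this file: a SupervisedInputReceiver pairs receiver_tensors (the serialized-request placeholder) with decoded features and labels. A minimal, self-contained sketch with toy constants standing in for the BatchPredictionRequest decoding (the feature and label values are placeholders, not twml behavior):

import tensorflow.compat.v1 as tf
from tensorflow.python.estimator.export import export

def toy_supervised_input_receiver_fn():
    # One serialized request placeholder in; toy features/labels out.
    serialized_request = tf.placeholder(dtype=tf.uint8, name="request")
    receiver_tensors = {"request": serialized_request}
    features = {"weights": tf.constant([1.0])}
    labels = tf.constant([0.0])
    return export.SupervisedInputReceiver(features, labels, receiver_tensors)

with tf.Graph().as_default():
    receiver = toy_supervised_input_receiver_fn()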
- Defaults to ["train", "eval", "predict"] - feature_spec: - A dictionary obtained from FeatureConfig.get_feature_spec() to serialize - as feature_spec.yaml in export_dir. - Defaults to None - Returns: - The timestamped directory the models are exported to. - """ - # Only export from chief in hogwild or distributed modes. - if trainer.params.get('distributed', False) and not trainer.estimator.config.is_chief: - tf.logging.info("Trainer.export_model ignored due to instance not being chief.") - return - - if feature_spec is None: - if getattr(trainer, '_feature_config') is None: - raise ValueError("feature_spec is set to None." - "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function") - else: - feature_spec = trainer._feature_config.get_feature_spec() - - export_dir = twml.util.sanitize_hdfs_path(export_dir) - old_export_output_fn = trainer._export_output_fn - trainer._export_output_fn = export_output_fn - supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn(parse_fn) - if not checkpoint: - checkpoint = trainer.best_or_latest_checkpoint - - export_dir = export_all_models_with_receivers(estimator=trainer.estimator, - export_dir=export_dir, - train_input_receiver_fn=supervised_input_receiver_fn, - eval_input_receiver_fn=supervised_input_receiver_fn, - predict_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=export_output_fn, - export_modes=export_modes, - register_model_fn=trainer.export_model_effects, - feature_spec=feature_spec, - checkpoint_path=checkpoint, - log_features=log_features) - trainer._export_output_fn = old_export_output_fn - return export_dir - - -def export_feature_spec(dir_path, feature_spec_dict): - """ - Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml. - """ - def ordered_dict_representer(dumper, data): - return dumper.represent_mapping('tag:yaml.org,2002:map', data.items()) - - try: - # needed for Python 2 - yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) - yaml.add_representer(unicode, yaml.representer.SafeRepresenter.represent_unicode) - except NameError: - # 'unicode' type doesn't exist on Python 3 - # PyYAML handles unicode correctly in Python 3 - pass - - yaml.add_representer(OrderedDict, ordered_dict_representer) - - fbase = "feature_spec.yaml" - fname = fbase.encode('utf-8') if type(dir_path) != str else fbase - file_path = os.path.join(dir_path, fname) - with tf.io.gfile.GFile(file_path, mode='w') as f: - yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) - tf.logging.info("Exported feature spec to %s" % file_path) - - return file_path + trainer: twml.DataRecordTrainer, + feature_config: FeatureConfig, + keep_fields: List[str], + export_dir: str, + as_text: bool = False, +): + """Function for exporting model with both 'train' and 'infer' mode. + + This means the exported saved_model.pb will contain two meta graphs, one with tag 'train' + and the other with tag 'serve', and it can be loaded in Java API with either tag depending on + the use case + + Args: + trainer (DataRecordTrainer): + deepbird v2 DataRecordTrainer + feature_config (FeatureConfig): + deepbird v2 feature config + keep_fields (list[string]): + list of field keys, e.g. ('ids', 'keys', 'values', 'batch_size', 'total_size', 'codes') + export_dir (str): + a directory (local or hdfs) to export model to + as_text (bool): + if True, write 'saved_model.pb' as binary file, else write 'saved_model.pbtxt' as human readable text file. 
Default False + """ + train_input_receiver_fn = get_sparse_batch_supervised_input_receiver_fn( + feature_config, keep_fields + ) + predict_input_receiver_fn = twml.parsers.get_sparse_serving_input_receiver_fn( + feature_config, keep_fields + ) + trainer._export_output_fn = ( + twml.export_output_fns.batch_prediction_continuous_output_fn + ) + trainer._build_graph_fn = update_build_graph_fn_for_train(trainer._build_graph_fn) + trainer._estimator._export_all_saved_models( + export_dir_base=export_dir, + input_receiver_fn_map={ + tf.estimator.ModeKeys.TRAIN: train_input_receiver_fn, + tf.estimator.ModeKeys.PREDICT: predict_input_receiver_fn, + }, + as_text=as_text, + ) + + trainer.export_model_effects(export_dir) + + +def export_all_models_with_receivers( + estimator: tf.estimator.Estimator, + export_dir: str, + train_input_receiver_fn: callable, + eval_input_receiver_fn: callable, + predict_input_receiver_fn: callable, + export_output_fn: callable, + export_modes: List[str] = ["train", "eval", "predict"], + register_model_fn: callable = None, + feature_spec: dict = None, + checkpoint_path: str = None, + log_features: bool = True, +) -> str: + """ + Function for exporting a model with train, eval, and infer modes. + + Args: + estimator (tf.estimator.Estimator): + You can get this from trainer using trainer.estimator + export_dir (str): + Directory to export the model. + train_input_receiver_fn (Callable): + Input receiver for train interface. + eval_input_receiver_fn (Callable): + Input receiver for eval interface. + predict_input_receiver_fn (Callable): + Input receiver for predict interface. + export_output_fn (Callable): + export_output_fn to be used for serving. + export_modes (list[str]): + A list to Specify what modes to export. Can be "train", "eval", "predict". + Defaults to ["train", "eval", "predict"] + register_model_fn (Callable): + An optional function which is called with export_dir after models are exported. + Defaults to None. + feature_spec (dict): + An optional dict of feature names to tf.FixedLenFeature or tf.VarLenFeature. + Defaults to None. + checkpoint_path (str): + An optional path to a specific checkpoint to export. If None, the latest checkpoint + in export_dir is used. Defaults to None. + log_features (bool): + If True, log the features to the console. Defaults to True. + Returns: + The timestamped directory the models are exported to. + """ + # TODO: Fix for hogwild / distributed training. 
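The mode-to-receiver map assembled below is a plain dict from tf.estimator.ModeKeys to receiver functions. A standalone sketch of the same wiring (the helper name is illustrative), using a tuple default where the signatures above default to a shared mutable list:

import tensorflow.compat.v1 as tf
from typing import Callable, Dict, Tuple

def build_input_receiver_fn_map(
    train_fn: Callable,
    eval_fn: Callable,
    predict_fn: Callable,
    export_modes: Tuple[str, ...] = ("train", "eval", "predict"),
) -> Dict[str, Callable]:
    # Mirrors the map-building below; a tuple default is immutable, so the
    # default cannot be mutated and shared between calls.
    by_mode = {
        "train": (tf.estimator.ModeKeys.TRAIN, train_fn),
        "eval": (tf.estimator.ModeKeys.EVAL, eval_fn),
        "predict": (tf.estimator.ModeKeys.PREDICT, predict_fn),
    }
    return {key: fn for mode, (key, fn) in by_mode.items() if mode in export_modes}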
+ + if export_dir is None: + raise ValueError("export_dir can not be None") + export_dir = twml.util.sanitize_hdfs_path(export_dir) + input_receiver_fn_map = {} + + if "train" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.TRAIN] = train_input_receiver_fn + + if "eval" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.EVAL] = eval_input_receiver_fn + + if "predict" in export_modes: + input_receiver_fn_map[tf.estimator.ModeKeys.PREDICT] = predict_input_receiver_fn + + export_dir = estimator._export_all_saved_models( + export_dir_base=export_dir, + input_receiver_fn_map=input_receiver_fn_map, + checkpoint_path=checkpoint_path, + ) + + if register_model_fn is not None: + register_model_fn(export_dir, feature_spec, log_features) + + return export_dir + + +def export_all_models( + trainer: twml.trainers.Trainer, + export_dir: str, + parse_fn: callable, + serving_input_receiver_fn: callable, + export_output_fn: callable = None, + export_modes: List[str] = ["train", "eval", "predict"], + feature_spec: dict = None, + checkpoint: str = None, + log_features: bool = True, +) -> str: + """ + Function for exporting a model with train, eval, and infer modes. + + Args: + trainer: + An object of type twml.trainers.Trainer. + export_dir: + Directory to export the model. + parse_fn: + The parse function used parse the inputs for train and eval. + serving_input_receiver_fn: + The input receiver function used during serving. + export_output_fn: + export_output_fn to be used for serving. + export_modes: + A list to Specify what modes to export. Can be "train", "eval", "predict". + Defaults to ["train", "eval", "predict"] + feature_spec: + A dictionary obtained from FeatureConfig.get_feature_spec() to serialize + as feature_spec.yaml in export_dir. + Defaults to None + + Returns: + The timestamped directory the models are exported to. + """ + # Only export from chief in hogwild or distributed modes. + if ( + trainer.params.get("distributed", False) + and not trainer.estimator.config.is_chief + ): + tf.logging.info("Trainer.export_model ignored due to instance not being chief.") + return + + if feature_spec is None: + if getattr(trainer, "_feature_config") is None: + raise ValueError( + "feature_spec is set to None." 
+ "Please pass feature_spec=feature_config.get_feature_spec() to the export_all_model function" + ) + else: + feature_spec = trainer._feature_config.get_feature_spec() + + export_dir = twml.util.sanitize_hdfs_path(export_dir) + old_export_output_fn = trainer._export_output_fn + trainer._export_output_fn = export_output_fn + supervised_input_receiver_fn = twml.parsers.convert_to_supervised_input_receiver_fn( + parse_fn + ) + if not checkpoint: + checkpoint = trainer.best_or_latest_checkpoint + + export_dir = export_all_models_with_receivers( + estimator=trainer.estimator, + export_dir=export_dir, + train_input_receiver_fn=supervised_input_receiver_fn, + eval_input_receiver_fn=supervised_input_receiver_fn, + predict_input_receiver_fn=serving_input_receiver_fn, + export_output_fn=export_output_fn, + export_modes=export_modes, + register_model_fn=trainer.export_model_effects, + feature_spec=feature_spec, + checkpoint_path=checkpoint, + log_features=log_features, + ) + trainer._export_output_fn = old_export_output_fn + return export_dir + + +def export_feature_spec(dir_path: str, feature_spec_dict: dict) -> str: + """Exports a FeatureConfig.get_feature_spec() dict to /feature_spec.yaml""" + + def ordered_dict_representer(dumper, data: dict): + return dumper.represent_mapping("tag:yaml.org,2002:map", data.items()) + + try: + # needed for Python 2 + yaml.add_representer(str, yaml.representer.SafeRepresenter.represent_str) + yaml.add_representer( + unicode, yaml.representer.SafeRepresenter.represent_unicode + ) + except NameError: + pass + + yaml.add_representer(dict, ordered_dict_representer) + + fbase = "feature_spec.yaml" + fname = fbase.encode("utf-8") if type(dir_path) != str else fbase + file_path = os.path.join(dir_path, fname) + with tf.io.gfile.GFile(file_path, mode="w") as f: + yaml.dump(feature_spec_dict, f, default_flow_style=False, allow_unicode=True) + tf.logging.info("Exported feature spec to %s" % file_path) + + return file_path # Keep the alias for compatibility. diff --git a/twml/twml/contrib/export/exporters.py b/twml/twml/contrib/export/exporters.py index 122955cbc..f0a696a37 100644 --- a/twml/twml/contrib/export/exporters.py +++ b/twml/twml/contrib/export/exporters.py @@ -2,144 +2,202 @@ Wrappers around tf.estimator.Exporters to export models and save checkpoints. """ import os +from typing import List import tensorflow.compat.v1 as tf from tensorflow.python.estimator import exporter + import twml class _AllSavedModelsExporter(tf.estimator.Exporter): - """Internal exporter class to be used for exporting models for different modes.""" - - def __init__(self, - name, - input_receiver_fn_map, - backup_checkpoints, - assets_extra=None, - as_text=False): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - assets_extra: Additional assets to be included in the exported model. - as_text: Specifies if the exported model should be in a human readable text format. 
- """ - self._name = name - self._input_receiver_fn_map = input_receiver_fn_map - self._backup_checkpoints = backup_checkpoints - self._assets_extra = assets_extra - self._as_text = as_text - - @property - def name(self): - return self._name - - def export(self, estimator, export_path, checkpoint_path, eval_result, - is_the_final_export): - del is_the_final_export - - export_path = twml.util.sanitize_hdfs_path(export_path) - checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) - - if self._backup_checkpoints: - backup_path = os.path.join(export_path, "checkpoints") - # Ensure backup_path is created. makedirs passes if dir already exists. - tf.io.gfile.makedirs(backup_path) - twml.util.backup_checkpoint(checkpoint_path, backup_path, empty_backup=False) - - export_result = estimator.experimental_export_all_saved_models( - export_path, - self._input_receiver_fn_map, - assets_extra=self._assets_extra, - as_text=self._as_text, - checkpoint_path=checkpoint_path) - - return export_result + """Internal exporter class to be used for exporting models for different modes.""" + + def __init__( + self, + name: str, + input_receiver_fn_map: dict, + backup_checkpoints: bool, + assets_extra: List[str] = None, + as_text: bool = False, + ): + """ + Args: + name (str): + A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map (dict): + A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints (bool): + A flag to specify if backups of checkpoints need to be made. + assets_extra (list): + Additional assets to be included in the exported model. + as_text (bool): + Specifies if the exported model should be in a human readable text format. + """ + self._name = name + self._input_receiver_fn_map = input_receiver_fn_map + self._backup_checkpoints = backup_checkpoints + self._assets_extra = assets_extra + self._as_text = as_text + + @property + def name(self) -> str: + return self._name + + def export( + self, + estimator: tf.estimator.Estimator, + export_path: str, + checkpoint_path: str, + eval_result: dict, + is_the_final_export: bool = True, + ): # pylint: disable=unused-argument + del is_the_final_export + + export_path = twml.util.sanitize_hdfs_path(export_path) + checkpoint_path = twml.util.sanitize_hdfs_path(checkpoint_path) + + if self._backup_checkpoints: + backup_path = os.path.join(export_path, "checkpoints") + # Ensure backup_path is created. makedirs passes if dir already exists. + tf.io.gfile.makedirs(backup_path) + twml.util.backup_checkpoint( + checkpoint_path, backup_path, empty_backup=False + ) + + export_result = estimator.experimental_export_all_saved_models( + export_path, + self._input_receiver_fn_map, + assets_extra=self._assets_extra, + as_text=self._as_text, + checkpoint_path=checkpoint_path, + ) + + return export_result class BestExporter(tf.estimator.BestExporter): - """ - This class inherits from tf.estimator.BestExporter with the following differences: - - It also creates a backup of the best checkpoint. - - It can export the model for multiple modes. - - A backup / export is performed everytime the evaluated metric is better - than previous models. - """ - - def __init__(self, - name='best_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - event_file_pattern='eval/*.tfevents.*', - compare_fn=exporter._loss_smaller, - assets_extra=None, - as_text=False, - exports_to_keep=5): - """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. 
- input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) + This class inherits from tf.estimator.BestExporter with the following differences: + - It also creates a backup of the best checkpoint. + - It can export the model for multiple modes. - super(BestExporter, self).__init__( - name, serving_input_receiver_fn, event_file_pattern, compare_fn, - assets_extra, as_text, exports_to_keep) - - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") + A backup / export is performed every time the evaluated metric is better + than previous models. + """ - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) + def __init__( + self, + name: str = "best_exporter", + input_receiver_fn_map: dict = None, + backup_checkpoints: bool = True, + event_file_pattern: str = "eval/*.tfevents.*", + compare_fn: callable = exporter._loss_smaller, + assets_extra: List[str] = None, + as_text: bool = False, + exports_to_keep: int = 5, + ): + """ + Args: + name (str): + A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map (dict): + A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints (bool): + A flag to specify if backups of checkpoints need to be made. + event_file_pattern (str): + A glob pattern for the event files in the evaluation directory. + compare_fn (callable): + A function that takes two evaluation results and returns True if the first + one is better than the second one. + assets_extra (list): + Additional assets to be included in the exported model. + as_text (bool): + Specifies if the exported model should be in a human readable text format. + exports_to_keep (int): + The maximum number of exports to keep. Older exports are deleted. + Note: + Check the following documentation for more information about the remaining args: + https://www.tensorflow.org/api_docs/python/tf/estimator/BestExporter + """ + serving_input_receiver_fn = input_receiver_fn_map.get( + tf.estimator.ModeKeys.PREDICT + ) + + super(BestExporter, self).__init__( + name, + serving_input_receiver_fn, + event_file_pattern, + compare_fn, + assets_extra, + as_text, + exports_to_keep, + ) + + if not hasattr(self, "_saved_model_exporter"): + raise AttributeError( + "_saved_model_exporter needs to exist for this exporter to work." + " This is potentially broken because of an internal change in Tensorflow" + ) + + # Override the saved_model_exporter with SaveAllmodelsexporter + self._saved_model_exporter = _AllSavedModelsExporter( + name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text + ) class LatestExporter(tf.estimator.LatestExporter): - """ - This class inherits from tf.estimator.LatestExporter with the following differences: - - It also creates a backup of the latest checkpoint. - - It can export the model for multiple modes. 
- - A backup / export is performed everytime the evaluated metric is better - than previous models. - """ - - def __init__(self, - name='latest_exporter', - input_receiver_fn_map=None, - backup_checkpoints=True, - assets_extra=None, - as_text=False, - exports_to_keep=5): """ - Args: - name: A unique name to be used for the exporter. This is used in the export path. - input_receiver_fn_map: A map of tf.estimator.ModeKeys to input_receiver_fns. - backup_checkpoints: A flag to specify if backups of checkpoints need to be made. - - Note: - Check the following documentation for more information about the remaining args: - https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter - """ - serving_input_receiver_fn = input_receiver_fn_map.get(tf.estimator.ModeKeys.PREDICT) - - super(LatestExporter, self).__init__( - name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep) + This class inherits from tf.estimator.LatestExporter with the following differences: + - It also creates a backup of the latest checkpoint. + - It can export the model for multiple modes. - if not hasattr(self, "_saved_model_exporter"): - raise AttributeError( - "_saved_model_exporter needs to exist for this exporter to work." - " This is potentially broken because of an internal change in Tensorflow") + A backup / export is performed every time the evaluated metric is better + than previous models. + """ - # Override the saved_model_exporter with SaveAllmodelsexporter - self._saved_model_exporter = _AllSavedModelsExporter( - name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text) + def __init__( + self, + name: str = "latest_exporter", + input_receiver_fn_map: dict = None, + backup_checkpoints: bool = True, + assets_extra: List[str] = None, + as_text: bool = False, + exports_to_keep: int = 5, + ): + """ + Args: + name (str): + A unique name to be used for the exporter. This is used in the export path. + input_receiver_fn_map (dict): + A map of tf.estimator.ModeKeys to input_receiver_fns. + backup_checkpoints (bool): + A flag to specify if backups of checkpoints need to be made. + assets_extra (list[str]): + Additional assets to be included in the exported model. + as_text (bool): + Specifies if the exported model should be in a human readable text format. + exports_to_keep (int): + The number of exports to keep. + Note: + Check the following documentation for more information about the remaining args: + https://www.tensorflow.org/api_docs/python/tf/estimator/LatestExporter + """ + serving_input_receiver_fn = input_receiver_fn_map.get( + tf.estimator.ModeKeys.PREDICT + ) + + super(LatestExporter, self).__init__( + name, serving_input_receiver_fn, assets_extra, as_text, exports_to_keep + ) + + if not hasattr(self, "_saved_model_exporter"): + raise AttributeError( + "_saved_model_exporter needs to exist for this exporter to work." 
+ " This is potentially broken because of an internal change in Tensorflow" + ) + + # Override the saved_model_exporter with SaveAllmodelsexporter + self._saved_model_exporter = _AllSavedModelsExporter( + name, input_receiver_fn_map, backup_checkpoints, assets_extra, as_text + ) diff --git a/twml/twml/contrib/feature_config.py b/twml/twml/contrib/feature_config.py index 833695751..c29bb63a6 100644 --- a/twml/twml/contrib/feature_config.py +++ b/twml/twml/contrib/feature_config.py @@ -2,84 +2,83 @@ Feature configuration for DeepBird jobs returns dictionary of sparse and dense Features """ from twitter.deepbird.io.legacy.contrib import feature_config + import twml class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - - # Override the class in the spec. - doc["class"] = "twml.contrib.FeatureConfig" + def get_feature_spec(self) -> dict: + """Generates a serialization-friendly dict representing this FeatureConfig.""" - return doc + doc = super(FeatureConfig, self).get_feature_spec() + # Override the class in the spec. + doc["class"] = "twml.contrib.FeatureConfig" + return doc class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - # Overwrite self.build() to return twml.FeatureConfig instead - def build(self): - """ - Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder. - """ + # Overwrite self.build() to return twml.FeatureConfig instead + def build(self) -> FeatureConfig: + """Returns an instance of FeatureConfig with the features passed to the FeatureConfigBuilder.""" - ( - keep_tensors, - keep_sparse_tensors, - feature_map, - features_add, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() + ( + keep_tensors, + keep_sparse_tensors, + feature_map, + features_add, + feature_name_to_feature_parser, + feature_in_bq_name, + ) = self._build() - discretize_dict = {} - for config in self._sparse_extraction_configs: - if config.discretize_num_bins and config.discretize_output_size_bits: - if config.discretize_type == "percentile": - calibrator = twml.contrib.calibrators.PercentileDiscretizerCalibrator - elif config.discretize_type == "hashed_percentile": - calibrator = twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator - elif config.discretize_type == "hashing": - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator - else: - raise ValueError("Unsupported discretizer type: " + config.discretize_type) - discretize_dict[config.output_name] = calibrator( - config.discretize_num_bins, - config.discretize_output_size_bits, - allow_empty_calibration=config.allow_empty_calibration, - ) - elif config.discretize_num_bins or config.discretize_output_size_bits: - raise ValueError( - "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" - ) + discretize_dict = {} + for config in self._sparse_extraction_configs: + if config.discretize_num_bins and config.discretize_output_size_bits: + if config.discretize_type == "percentile": + calibrator = ( + twml.contrib.calibrators.PercentileDiscretizerCalibrator + ) + elif config.discretize_type == "hashed_percentile": + calibrator = ( + twml.contrib.calibrators.HashedPercentileDiscretizerCalibrator + ) + elif config.discretize_type == "hashing": + calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator + else: + raise ValueError( + "Unsupported discretizer type: " 
+ config.discretize_type + ) + discretize_dict[config.output_name] = calibrator( + config.discretize_num_bins, + config.discretize_output_size_bits, + allow_empty_calibration=config.allow_empty_calibration, + ) + elif config.discretize_num_bins or config.discretize_output_size_bits: + raise ValueError( + "Discretize_num_bins AND discretize_output_size_bits need to be in the FeatureConfig" + ) - return FeatureConfig( - features={}, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=keep_tensors, - sparse_tensor_types=keep_sparse_tensors, - feature_types=feature_map, - sparse_extraction_configs=self._sparse_extraction_configs, - feature_extraction_configs=self._feature_extraction_configs, - feature_group_extraction_configs=self._feature_group_extraction_configs, - image_configs=self._image_configs, - discretize_config=discretize_dict, - feature_ids=features_add, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=feature_name_to_feature_parser, - feature_in_bq_name=feature_in_bq_name, - ) + return FeatureConfig( + features={}, + labels=self._labels, + weight=self._weight, + filters=self._filter_features, + tensor_types=keep_tensors, + sparse_tensor_types=keep_sparse_tensors, + feature_types=feature_map, + sparse_extraction_configs=self._sparse_extraction_configs, + feature_extraction_configs=self._feature_extraction_configs, + feature_group_extraction_configs=self._feature_group_extraction_configs, + image_configs=self._image_configs, + discretize_config=discretize_dict, + feature_ids=features_add, + decode_mode=self._decode_mode, + legacy_sparse=self._legacy_sparse, + feature_name_to_feature_parser=feature_name_to_feature_parser, + feature_in_bq_name=feature_in_bq_name, + ) TensorExtractionConfig = feature_config.TensorExtractionConfig - FeatureGroupExtractionConfig = feature_config.FeatureGroupExtractionConfig - ImageExtractionConfig = feature_config.ImageExtractionConfig - _set_tensor_namedtuple = feature_config._set_tensor_namedtuple diff --git a/twml/twml/contrib/feature_config_parsers.py b/twml/twml/contrib/feature_config_parsers.py index 83c402e2e..2fb3dd4a3 100644 --- a/twml/twml/contrib/feature_config_parsers.py +++ b/twml/twml/contrib/feature_config_parsers.py @@ -1,224 +1,247 @@ """Utility functions to create FeatureConfig objects from feature_spec.yaml files""" import os import re +from typing import Dict import tensorflow.compat.v1 as tf import yaml -from twml.feature_config import FeatureConfigBuilder -from twml.contrib.feature_config import FeatureConfigBuilder as FeatureConfigBuilderV2 - -def _get_config_version(config_dict): - doc = config_dict - supported_classes = { - "twml.FeatureConfig": "v1", - "twml.contrib.FeatureConfig": "v2" - } - if "class" not in doc: - raise ValueError("'class' key not found") - if doc["class"] not in supported_classes.keys(): - raise ValueError("Class %s not supported. Supported clases are %s" - % (doc["class"], supported_classes.keys())) - return supported_classes[doc["class"]] - - -def _validate_config_dict_v1(config_dict): - """ - Validate spec exported by twml.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.FeatureConfig: Malformed feature_spec. 
%s" % msg) - - if doc["class"] != "twml.FeatureConfig": - malformed_error("'class' is not twml.FeatureConfig") - if "format" not in doc: - malformed_error("'format' key not found") - - # validate spec exported by twml.FeatureConfig - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - if "filters" not in doc: - malformed_error("'filters' key not found") - elif type(doc["filters"]) != list: - malformed_error("'filters' is not a list") - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _validate_config_dict_v2(config_dict): - """ - Validate spec exported by twml.contrib.FeatureConfig - """ - doc = config_dict - - def malformed_error(msg): - raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. %s" % msg) - - if doc["class"] != "twml.contrib.FeatureConfig": - malformed_error("'class' is not twml.contrib.FeatureConfig") - if "format" not in doc: - malformed_error("'format key not found'") - - # validate spec exported by twml.contrib.FeatureConfig (basic validation only) - if doc["format"] == "exported": - dict_keys = ["features", "labels", "weight", "tensors", "sparseTensors", "discretizeConfig"] - for key in dict_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != dict: - malformed_error("'%s' is not a dict" % key) - list_keys = ["sparseFeatureGroups", "denseFeatureGroups", "denseFeatures", "images", "filters"] - for key in list_keys: - if key not in doc: - malformed_error("'%s' key not found" % key) - if type(doc[key]) != list: - malformed_error("'%s' is not a list" % key) - - # validate spec provided by modeler - elif doc["format"] == "manual": - raise NotImplementedError("Manual config support not yet implemented") - else: - malformed_error("'format' must be 'exported' or 'manual'") - - -def _create_feature_config_v1(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilder(data_spec_path) - - if config_dict["format"] == "exported": - # add features - for feature_info in config_dict["features"].values(): - feature_name = re.escape(feature_info["featureName"]) - feature_group = feature_info["featureGroup"] - fc_builder.add_feature(feature_name, feature_group) - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - # feature filters - for feature_name in config_dict["filters"]: - fc_builder.add_filter(feature_name) - # weight - if config_dict["weight"]: - weight_feature = list(config_dict["weight"].values())[0]["featureName"] - fc_builder.define_weight(weight_feature) - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def _create_feature_config_v2(config_dict, data_spec_path): - fc_builder = FeatureConfigBuilderV2(data_spec_path) - - if config_dict["format"] == "exported": - # add sparse group extraction configs - for sparse_group in config_dict["sparseFeatureGroups"]: - fids = sparse_group["features"].keys() - fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_features_as_hashed_sparse( - feature_regexes=[re.escape(fname) for 
fname in fnames], - output_tensor_name=sparse_group["outputName"], - hash_space_size_bits=sparse_group["hashSpaceBits"], - discretize_num_bins=sparse_group["discretize"]["numBins"], - discretize_output_size_bits=sparse_group["discretize"]["outputSizeBits"], - discretize_type=sparse_group["discretize"]["type"], - type_filter=sparse_group["filterType"]) - - # add dense group extraction configs - for dense_group in config_dict["denseFeatureGroups"]: - fids = dense_group["features"].keys() - fnames = [dense_group["features"][fid]["featureName"] for fid in fids] - fc_builder.extract_feature_group( - feature_regexes=[re.escape(fname) for fname in fnames], - group_name=dense_group["outputName"], - type_filter=dense_group["filterType"], - default_value=dense_group["defaultValue"]) - - # add dense feature configs - for dense_features in config_dict["denseFeatures"]: - fids = dense_features["features"].keys() - fnames = [dense_features["features"][fid]["featureName"] for fid in fids] - default_value = dense_features["defaultValue"] - if len(fnames) == 1 and type(default_value) != dict: - fc_builder.extract_feature( - feature_name=re.escape(fnames[0]), - expected_shape=dense_features["expectedShape"], - default_value=dense_features["defaultValue"]) - else: - fc_builder.extract_features( - feature_regexes=[re.escape(fname) for fname in fnames], - default_value_map=dense_features["defaultValue"]) - - # add image feature configs - for image in config_dict["images"]: - fc_builder.extract_image( - feature_name=image["featureName"], - preprocess=image["preprocess"], - out_type=tf.as_dtype(image["outType"].lower()), - channels=image["channels"], - default_image=image["defaultImage"], - ) - - # add other tensor features (non-image) - tensor_fnames = [] - image_fnames = [img["featureName"] for img in config_dict["images"]] - for tensor_fname in config_dict["tensors"]: - if tensor_fname not in image_fnames: - tensor_fnames.append(tensor_fname) - for sparse_tensor_fname in config_dict["sparseTensors"]: - tensor_fnames.append(sparse_tensor_fname) - fc_builder.extract_tensors(tensor_fnames) - - # add labels - labels = [] - for label_info in config_dict["labels"].values(): - labels.append(label_info["featureName"]) - fc_builder.add_labels(labels) - - else: - raise ValueError("Format '%s' not implemented" % config_dict["format"]) - - return fc_builder.build() - - -def create_feature_config_from_dict(config_dict, data_spec_path): - """ - Create a FeatureConfig object from a feature spec dict. - """ - config_version = _get_config_version(config_dict) - if config_version == "v1": - _validate_config_dict_v1(config_dict) - feature_config = _create_feature_config_v1(config_dict, data_spec_path) - elif config_version == "v2": - _validate_config_dict_v2(config_dict) - feature_config = _create_feature_config_v2(config_dict, data_spec_path) - else: - raise ValueError("version not supported") - - return feature_config - - -def create_feature_config(config_path, data_spec_path): - """ - Create a FeatureConfig object from a feature_spec.yaml file. 
- """ - _, ext = os.path.splitext(config_path) - if ext not in ['.yaml', '.yml']: - raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") - - with tf.io.gfile.GFile(config_path, mode='r') as fs: - config_dict = yaml.safe_load(fs) - - return create_feature_config_from_dict(config_dict, data_spec_path) +from twml.contrib.feature_config import FeatureConfigBuilder as FeatureConfigBuilderV2 +from twml.feature_config import FeatureConfig, FeatureConfigBuilder + + +def _get_config_version(config_dict: dict) -> str: + """Returns the version of the feature spec""" + + doc = config_dict.copy() + supported_classes = {"twml.FeatureConfig": "v1", "twml.contrib.FeatureConfig": "v2"} + if "class" not in doc: + raise ValueError("'class' key not found") + if doc["class"] not in supported_classes.keys(): + raise ValueError( + "Class %s not supported. Supported clases are %s" + % (doc["class"], supported_classes.keys()) + ) + return supported_classes[doc["class"]] + + +def _validate_config_dict_v1(config_dict: dict) -> None: + """Validate spec exported by twml.FeatureConfig""" + + doc = config_dict + + def malformed_error(msg: str): + raise ValueError("twml.FeatureConfig: Malformed feature_spec. " + msg) + + if doc["class"] != "twml.FeatureConfig": + malformed_error("'class' is not twml.FeatureConfig") + if "format" not in doc: + malformed_error("'format' key not found") + + # validate spec exported by twml.FeatureConfig + if doc["format"] == "exported": + dict_keys = ["features", "labels", "weight", "tensors", "sparse_tensors"] + for key in dict_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + elif isinstance(doc[key], dict): + malformed_error("'%s' is not a dict" % key) + if "filters" not in doc: + malformed_error("'filters' key not found") + elif isinstance(doc["filters"], list): + malformed_error("'filters' is not a list") + # validate spec provided by modeler + elif doc["format"] == "manual": + raise NotImplementedError("Manual config support not yet implemented") + else: + malformed_error("'format' must be 'exported' or 'manual'") + + +def _validate_config_dict_v2(config_dict: dict) -> None: + """Validate spec exported by twml.contrib.FeatureConfig""" + + doc = config_dict + + def malformed_error(msg: str): + raise ValueError("twml.contrib.FeatureConfig: Malformed feature_spec. 
" + msg) + + if doc["class"] != "twml.contrib.FeatureConfig": + malformed_error("'class' is not twml.contrib.FeatureConfig") + if "format" not in doc: + malformed_error("'format key not found'") + + # validate spec exported by twml.contrib.FeatureConfig (basic validation only) + if doc["format"] == "exported": + dict_keys = [ + "features", + "labels", + "weight", + "tensors", + "sparseTensors", + "discretizeConfig", + ] + for key in dict_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + if isinstance(doc[key], dict): + malformed_error("'%s' is not a dict" % key) + list_keys = [ + "sparseFeatureGroups", + "denseFeatureGroups", + "denseFeatures", + "images", + "filters", + ] + for key in list_keys: + if key not in doc: + malformed_error("'%s' key not found" % key) + if type(doc[key]) != list: + malformed_error("'%s' is not a list" % key) + + # validate spec provided by modeler + elif doc["format"] == "manual": + raise NotImplementedError("Manual config support not yet implemented") + else: + malformed_error("'format' must be 'exported' or 'manual'") + + +def _create_feature_config_v1( + config_dict: Dict[str, str], data_spec_path: str +) -> FeatureConfig: + """Create a FeatureConfig object from a feature spec""" + + fc_builder = FeatureConfigBuilder(data_spec_path) + + if config_dict["format"] == "exported": + # add features + for feature_info in config_dict["features"].values(): + feature_name = re.escape(feature_info["featureName"]) + feature_group = feature_info["featureGroup"] + fc_builder.add_feature(feature_name, feature_group) + # add labels + labels = [] + for label_info in config_dict["labels"].values(): + labels.append(label_info["featureName"]) + fc_builder.add_labels(labels) + # feature filters + for feature_name in config_dict["filters"]: + fc_builder.add_filter(feature_name) + # weight + if config_dict["weight"]: + weight_feature = list(config_dict["weight"].values())[0]["featureName"] + fc_builder.define_weight(weight_feature) + else: + raise ValueError("Format '%s' not implemented" % config_dict["format"]) + + return fc_builder.build() + + +def _create_feature_config_v2(config_dict: dict, data_spec_path: str) -> FeatureConfig: + """Create a FeatureConfig object from a feature spec""" + + fc_builder = FeatureConfigBuilderV2(data_spec_path) + + if config_dict["format"] == "exported": + # add sparse group extraction configs + for sparse_group in config_dict["sparseFeatureGroups"]: + fids = sparse_group["features"].keys() + fnames = [sparse_group["features"][fid]["featureName"] for fid in fids] + fc_builder.extract_features_as_hashed_sparse( + feature_regexes=[re.escape(fname) for fname in fnames], + output_tensor_name=sparse_group["outputName"], + hash_space_size_bits=sparse_group["hashSpaceBits"], + discretize_num_bins=sparse_group["discretize"]["numBins"], + discretize_output_size_bits=sparse_group["discretize"][ + "outputSizeBits" + ], + discretize_type=sparse_group["discretize"]["type"], + type_filter=sparse_group["filterType"], + ) + + # add dense group extraction configs + for dense_group in config_dict["denseFeatureGroups"]: + fids = dense_group["features"].keys() + fnames = [dense_group["features"][fid]["featureName"] for fid in fids] + fc_builder.extract_feature_group( + feature_regexes=[re.escape(fname) for fname in fnames], + group_name=dense_group["outputName"], + type_filter=dense_group["filterType"], + default_value=dense_group["defaultValue"], + ) + + # add dense feature configs + for dense_features in config_dict["denseFeatures"]: + fids 
= dense_features["features"].keys() + fnames = [dense_features["features"][fid]["featureName"] for fid in fids] + default_value = dense_features["defaultValue"] + if len(fnames) == 1 and type(default_value) != dict: + fc_builder.extract_feature( + feature_name=re.escape(fnames[0]), + expected_shape=dense_features["expectedShape"], + default_value=dense_features["defaultValue"], + ) + else: + fc_builder.extract_features( + feature_regexes=[re.escape(fname) for fname in fnames], + default_value_map=dense_features["defaultValue"], + ) + + # add image feature configs + for image in config_dict["images"]: + fc_builder.extract_image( + feature_name=image["featureName"], + preprocess=image["preprocess"], + out_type=tf.as_dtype(image["outType"].lower()), + channels=image["channels"], + default_image=image["defaultImage"], + ) + + # add other tensor features (non-image) + tensor_fnames = [] + image_fnames = [img["featureName"] for img in config_dict["images"]] + for tensor_fname in config_dict["tensors"]: + if tensor_fname not in image_fnames: + tensor_fnames.append(tensor_fname) + for sparse_tensor_fname in config_dict["sparseTensors"]: + tensor_fnames.append(sparse_tensor_fname) + fc_builder.extract_tensors(tensor_fnames) + + # add labels + labels = [] + for label_info in config_dict["labels"].values(): + labels.append(label_info["featureName"]) + fc_builder.add_labels(labels) + + else: + raise ValueError("Format '%s' not implemented" % config_dict["format"]) + return fc_builder.build() + + +def create_feature_config_from_dict( + config_dict: dict, data_spec_path: str +) -> FeatureConfig: + """Create a FeatureConfig object from a feature spec dict.""" + + config_version = _get_config_version(config_dict) + if config_version == "v1": + _validate_config_dict_v1(config_dict) + feature_config = _create_feature_config_v1(config_dict, data_spec_path) + elif config_version == "v2": + _validate_config_dict_v2(config_dict) + feature_config = _create_feature_config_v2(config_dict, data_spec_path) + else: + raise ValueError("version not supported") + return feature_config + + +def create_feature_config(config_path: str, data_spec_path: str) -> FeatureConfig: + """Create a FeatureConfig object from a feature_spec.yaml file.""" + + _, ext = os.path.splitext(config_path) + if ext not in [".yaml", ".yml"]: + raise ValueError("create_feature_config_from_yaml: Only .yaml/.yml supported") + + with tf.io.gfile.GFile(config_path, mode="r") as fs: + config_dict = yaml.safe_load(fs) + + return create_feature_config_from_dict(config_dict, data_spec_path) diff --git a/twml/twml/contrib/feature_importances/feature_importances.py b/twml/twml/contrib/feature_importances/feature_importances.py index a8bfcc129..34e6b2228 100644 --- a/twml/twml/contrib/feature_importances/feature_importances.py +++ b/twml/twml/contrib/feature_importances/feature_importances.py @@ -2,25 +2,28 @@ import time from collections import defaultdict +from queue import Queue +from typing import Any, Dict, List, Tuple from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient from com.twitter.mlmetastore.modelrepo.core import FeatureImportance, FeatureNames +from requests.exceptions import HTTPError, RetryError +from tensorflow.compat.v1 import logging from twitter.deepbird.io.util import match_feature_regex_list -from twml.contrib.feature_importances.helpers import ( - _get_feature_name_from_config, - _get_feature_types_from_records, - _get_metrics_hook, - _expand_prefix, - longest_common_prefix, - write_list_to_hdfs_gfile) from 
twml.contrib.feature_importances.feature_permutation import PermutedInputFnFactory
+from twml.contrib.feature_importances.helpers import (
+    _expand_prefix,
+    _get_feature_name_from_config,
+    _get_feature_types_from_records,
+    _get_metrics_hook,
+    longest_common_prefix,
+    write_list_to_hdfs_gfile,
+)
 from twml.tracking import ExperimentTracker
-
-from tensorflow.compat.v1 import logging
-from requests.exceptions import HTTPError, RetryError
-from queue import Queue
-
+from twml import contrib
+from twml.trainers.data_record_trainer import DataRecordTrainer
+from twml.trainers.trainer import Trainer

 SERIAL = "serial"
 TREE = "tree"
@@ -31,384 +34,562 @@
 LOSS = "loss"


-def _repartition(feature_list_queue, fnames_ftypes, split_feature_group_on_period):
-  """
-  Iterate through letters to partition each feature by prefix, and then put each tuple
-  (prefix, feature_partition) into the feature_list_queue
-  Args:
-    prefix (str): The prefix shared by each feature in list_of_feature_types
-    feature_list_queue (Queue<(str, list<(str, str)>)>): The queue of feature groups
-    fnames_ftypes (list<(str, str)>): List of (fname, ftype) pairs. Each fname begins with prefix
-    split_feature_group_on_period (str): If true, require that feature groups end in a period
-  Returns:
-    Updated queue with each group in fnames_ftypes
-  """
-  assert len(fnames_ftypes) > 1
-
-  split_character = "." if split_feature_group_on_period else None
-  # Compute the longest prefix of the words
-  prefix = longest_common_prefix(
-    strings=[fname for fname, _ in fnames_ftypes], split_character=split_character)
-
-  # Separate the features by prefix
-  prefix_to_features = defaultdict(list)
-  for fname, ftype in fnames_ftypes:
-    assert fname.startswith(prefix)
-    new_prefix = _expand_prefix(fname=fname, prefix=prefix, split_character=split_character)
-    prefix_to_features[new_prefix].append((fname, ftype))
-
-  # Add all of the new partitions to the queue
-  for new_prefix, fname_ftype_list in prefix_to_features.items():
-    extended_new_prefix = longest_common_prefix(
-      strings=[fname for fname, _ in fname_ftype_list], split_character=split_character)
-    assert extended_new_prefix.startswith(new_prefix)
-    feature_list_queue.put((extended_new_prefix, fname_ftype_list))
-  return feature_list_queue
-
-
-def _infer_if_is_metric_larger_the_better(stopping_metric):
-  # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC), as opposed to
-  # larger numbers being worse (e.g. LOSS)
-  if stopping_metric is None:
-    raise ValueError("Error: Stopping Metric cannot be None")
-  elif stopping_metric.startswith(LOSS):
-    logging.info("Interpreting {} to be a metric where larger numbers are worse".format(stopping_metric))
-    is_metric_larger_the_better = False
-  else:
-    logging.info("Interpreting {} to be a metric where larger numbers are better".format(stopping_metric))
-    is_metric_larger_the_better = True
-  return is_metric_larger_the_better
-
-
-def _check_whether_tree_should_expand(baseline_performance, computed_performance, sensitivity, stopping_metric, is_metric_larger_the_better):
-  """
-  Returns True if
-  - the metric is positive (e.g. ROC_AUC) and computed_performance is nontrivially smaller than the baseline_performance
-  - the metric is negative (e.g. LOSS) and computed_performance is nontrivially larger than the baseline_performance
-  """
-  difference = ((baseline_performance[stopping_metric] - computed_performance[stopping_metric]) /
-                baseline_performance[stopping_metric])
-
-  if not is_metric_larger_the_better:
-    difference = -difference
-
-  logging.info(
-    "Found a {} difference of {}. Sensitivity is {}.".format("positive" if is_metric_larger_the_better else "negative", difference, sensitivity))
-  return difference > sensitivity
+def _repartition(
+    feature_list_queue: "Queue[Tuple[str, List[Tuple[str, str]]]]",
+    fnames_ftypes: List[Tuple[str, str]],
+    split_feature_group_on_period: bool,
+) -> "Queue[Tuple[str, List[Tuple[str, str]]]]":
+    """
+    Partition the features by prefix, and then put each tuple
+    (prefix, feature_partition) into the feature_list_queue
+
+    Args:
+      feature_list_queue (Queue<(str, list<(str, str)>)>):
+        The queue of feature groups
+      fnames_ftypes (list<(str, str)>):
+        List of (fname, ftype) pairs. Each fname begins with a shared prefix
+      split_feature_group_on_period (bool):
+        If true, require that feature groups end in a period
+
+    Returns:
+      Updated queue with each group in fnames_ftypes
+    """
+    assert len(fnames_ftypes) > 1
+
+    split_character = "." if split_feature_group_on_period else None
+    # Compute the longest prefix of the words
+    prefix = longest_common_prefix(
+        strings=[fname for fname, _ in fnames_ftypes], split_character=split_character
+    )
+
+    # Separate the features by prefix
+    prefix_to_features = defaultdict(list)
+    for fname, ftype in fnames_ftypes:
+        assert fname.startswith(prefix)
+        new_prefix = _expand_prefix(
+            fname=fname, prefix=prefix, split_character=split_character
+        )
+        prefix_to_features[new_prefix].append((fname, ftype))
+
+    # Add all of the new partitions to the queue
+    for new_prefix, fname_ftype_list in prefix_to_features.items():
+        extended_new_prefix = longest_common_prefix(
+            strings=[fname for fname, _ in fname_ftype_list],
+            split_character=split_character,
+        )
+        assert extended_new_prefix.startswith(new_prefix)
+        feature_list_queue.put((extended_new_prefix, fname_ftype_list))
+    return feature_list_queue
+
+
+def _infer_if_is_metric_larger_the_better(stopping_metric: str) -> bool:
+    # Infers whether a metric should be interpreted such that larger numbers are better (e.g. ROC_AUC),
+    # as opposed to larger numbers being worse (e.g. LOSS)
+    if stopping_metric is None:
+        raise ValueError("Error: Stopping Metric cannot be None")
+    elif stopping_metric.startswith(LOSS):
+        logging.info(
+            f"Interpreting {stopping_metric} to be a metric where larger numbers are worse"
+        )
+        return False
+    else:
+        logging.info(
+            f"Interpreting {stopping_metric} to be a metric where larger numbers are better"
+        )
+        return True
+
+
+def _check_whether_tree_should_expand(
+    baseline_performance: dict,
+    computed_performance: dict,
+    sensitivity: float,
+    stopping_metric: str,
+    is_metric_larger_the_better: bool,
+) -> bool:
+    """
+    Returns True if
+    - the metric is positive (e.g. ROC_AUC) and computed_performance is non-trivially smaller than the baseline_performance
+    - the metric is negative (e.g. LOSS) and computed_performance is non-trivially larger than the baseline_performance
+    """

+    difference = (
+        baseline_performance[stopping_metric] - computed_performance[stopping_metric]
+    ) / baseline_performance[stopping_metric]
+
+    if not is_metric_larger_the_better:
+        difference *= -1
+
+    logging.info(
+        f"Found a {'positive' if is_metric_larger_the_better else 'negative'} difference of {difference}. Sensitivity is {sensitivity}."
+    )
+    return difference > sensitivity


 def _compute_multiple_permuted_performances_from_trainer(
-  factory, fname_ftypes, trainer, parse_fn, record_count):
-  """Compute performances with fname and fype permuted
-  """
-  metrics_hook = _get_metrics_hook(trainer)
-  trainer._estimator.evaluate(
-    input_fn=factory.get_permuted_input_fn(
-      batch_size=trainer._params.eval_batch_size, parse_fn=parse_fn, fname_ftypes=fname_ftypes),
-    steps=(record_count + trainer._params.eval_batch_size) // trainer._params.eval_batch_size,
-    hooks=[metrics_hook],
-    checkpoint_path=trainer.best_or_latest_checkpoint)
-  return metrics_hook.metric_values
-
-
-def _get_extra_feature_group_performances(factory, trainer, parse_fn, extra_groups, feature_to_type, record_count):
-  """Compute performance differences for the extra feature groups
-  """
-  extra_group_feature_performance_results = {}
-  for group_name, raw_feature_regex_list in extra_groups.items():
-    start = time.time()
-    fnames = match_feature_regex_list(
-      features=feature_to_type.keys(),
-      feature_regex_list=[regex for regex in raw_feature_regex_list],
-      preprocess=False,
-      as_dict=False)
-
-    fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames]
-
-    logging.info("Extracted extra group {} with features {}".format(group_name, fnames_ftypes))
-    extra_group_feature_performance_results[group_name] = _compute_multiple_permuted_performances_from_trainer(
-      factory=factory, fname_ftypes=fnames_ftypes,
-      trainer=trainer, parse_fn=parse_fn, record_count=record_count)
-    logging.info("\n\nImportances computed for {} in {} seconds \n\n".format(
-      group_name, int(time.time() - start)))
-  return extra_group_feature_performance_results
+    factory: PermutedInputFnFactory,
+    fname_ftypes: List[Tuple[str, str]],
+    trainer: Trainer,
+    parse_fn: callable,
+    record_count: int,
+) -> dict:
+    """Compute performances with fname and ftype permuted"""
+    metrics_hook = _get_metrics_hook(trainer)
+    trainer._estimator.evaluate(
+        input_fn=factory.get_permuted_input_fn(
+            batch_size=trainer._params.eval_batch_size,
+            parse_fn=parse_fn,
+            fname_ftypes=fname_ftypes,
+        ),
+        steps=(record_count + trainer._params.eval_batch_size)
+        // trainer._params.eval_batch_size,
+        hooks=[metrics_hook],
+        checkpoint_path=trainer.best_or_latest_checkpoint,
+    )
+    return metrics_hook.metric_values
+
+
+def _get_extra_feature_group_performances(
+    factory: PermutedInputFnFactory,
+    trainer: Trainer,
+    parse_fn: callable,
+    extra_groups: dict,
+    feature_to_type: dict,
+    record_count: int,
+) -> dict:
+    """Compute performance differences for the extra feature groups"""
+    extra_group_feature_performance_results = {}
+    for group_name, raw_feature_regex_list in extra_groups.items():
+        start = time.time()
+        fnames = match_feature_regex_list(
+            features=feature_to_type.keys(),
+            feature_regex_list=[regex for regex in raw_feature_regex_list],
+            preprocess=False,
+            as_dict=False,
+        )
+
+        fnames_ftypes = [(fname, feature_to_type[fname]) for fname in fnames]
+
+        logging.info(
+            f"Extracted extra group {group_name} with features {fnames_ftypes}"
+        )
+
extra_group_feature_performance_results[ + group_name + ] = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fnames_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + f"\n\nImportance computed for {group_name} in {float(time.time() - start):.3f} seconds \n\n" + ) + return extra_group_feature_performance_results def _feature_importances_tree_algorithm( - data_dir, trainer, parse_fn, fnames, stopping_metric, file_list=None, datarecord_filter_fn=None, split_feature_group_on_period=True, - record_count=99999, is_metric_larger_the_better=None, sensitivity=0.025, extra_groups=None, dont_build_tree=False): - """Tree algorithm for feature and feature group importances. This algorithm build a prefix tree of - the feature names and then traverses the tree with a BFS. At each node (aka group of features with - a shared prefix) the algorithm computes the performance of the model when we permute all features - in the group. The algorithm only zooms-in on groups that impact the performance by more than - sensitivity. As a result, features that affect the model performance by less than sensitivity will - not have an exact importance. - Args: - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - trainer: (DataRecordTrainer): A DataRecordTrainer object - parse_fn: (function): The parse_fn used by eval_input_fn - fnames (list): The list of feature names - stopping_metric (str): The metric to use to determine when to stop expanding trees - file_list (list): The list of filenames. Exactly one of file_list and data_dir should be - provided - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - split_feature_group_on_period (boolean): If true, split feature groups by period rather than on - optimal prefix - record_count (int): The number of records to compute importances over - is_metric_larger_the_better (boolean): If true, assume that stopping_metric is a metric where larger - values are better (e.g. ROC-AUC) - sensitivity (float): The smallest change in performance to continue to expand the tree - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group. You should only supply a value for this argument if you have a set - of features that you want to evaluate as a group but don't share a prefix - dont_build_tree (boolean): If True, don't build the tree and only compute the extra_groups importances - Returns: - A dictionary that contains the individual and group feature importances - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - baseline_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=[], - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - out = {"None": baseline_performance} - - if stopping_metric not in baseline_performance: - raise ValueError("The stopping metric '{}' not found in baseline_performance. 
Metrics are {}".format( - stopping_metric, list(baseline_performance.keys()))) - - is_metric_larger_the_better = ( - is_metric_larger_the_better if is_metric_larger_the_better is not None else _infer_if_is_metric_larger_the_better(stopping_metric)) - logging.info("Using {} as the stopping metric for the tree algorithm".format(stopping_metric)) - - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - all_feature_types = list(feature_to_type.items()) - - individual_feature_performances = {} - feature_group_performances = {} - if dont_build_tree: - logging.info("Not building feature importance trie. Will only compute importances for the extra_groups") - else: - logging.info("Building feature importance trie") - # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where - # each feature in list_of_feature_type_pairs will have have the prefix "prefix" - feature_list_queue = _repartition( - feature_list_queue=Queue(), fnames_ftypes=all_feature_types, split_feature_group_on_period=split_feature_group_on_period) - - while not feature_list_queue.empty(): - # Pop the queue. We should never have an empty list in the queue - prefix, fnames_ftypes = feature_list_queue.get() - assert len(fnames_ftypes) > 0 - - # Compute performance from permuting all features in fname_ftypes - logging.info( - "\n\nComputing importances for {} ({}...). {} elements left in the queue \n\n".format( - prefix, fnames_ftypes[:5], feature_list_queue.qsize())) - start = time.time() - computed_performance = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fnames_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - prefix, int(time.time() - start))) - if len(fnames_ftypes) == 1: - individual_feature_performances[fnames_ftypes[0][0]] = computed_performance - else: - feature_group_performances[prefix] = computed_performance - # Dig deeper into the features in fname_ftypes only if there is more than one feature in the - # list and the performance drop is nontrivial - logging.info("Checking performance for {} ({}...)".format(prefix, fnames_ftypes[:5])) - check = _check_whether_tree_should_expand( - baseline_performance=baseline_performance, computed_performance=computed_performance, - sensitivity=sensitivity, stopping_metric=stopping_metric, is_metric_larger_the_better=is_metric_larger_the_better) - if len(fnames_ftypes) > 1 and check: - logging.info("Expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - feature_list_queue = _repartition( - feature_list_queue=feature_list_queue, fnames_ftypes=fnames_ftypes, split_feature_group_on_period=split_feature_group_on_period) - else: - logging.info("Not expanding {} ({}...)".format(prefix, fnames_ftypes[:5])) - - # Baseline performance is grouped in with individual_feature_importance_results - individual_feature_performance_results = dict( - out, **{k: v for k, v in individual_feature_performances.items()}) - group_feature_performance_results = {k: v for k, v in feature_group_performances.items()} - - if extra_groups is not None: - logging.info("Computing performances for extra groups {}".format(extra_groups.keys())) - for group_name, performances in _get_extra_feature_group_performances( + data_dir: str, + trainer: DataRecordTrainer, + parse_fn: callable, + fnames: List[str], + stopping_metric: str, + file_list: List[str] = None, + datarecord_filter_fn: callable = None, + 
split_feature_group_on_period: bool = True, + record_count: int = 99999, + is_metric_larger_the_better: bool = None, + sensitivity: float = 0.025, + extra_groups: Dict[str, List[str]] = None, + dont_build_tree: bool = False, +) -> Dict[str, Dict[str, Any]]: + """Tree algorithm for feature and feature group importance. This algorithm build a prefix tree of + the feature names and then traverses the tree with a BFS. At each node (aka group of features with + a shared prefix) the algorithm computes the performance of the model when we permute all features + in the group. The algorithm only zooms-in on groups that impact the performance by more than + sensitivity. As a result, features that affect the model performance by less than sensitivity will + not have an exact importance. + Args: + data_dir (str): + The location of the training or testing data to compute importance over. + If None, the trainer._eval_files are used + trainer (DataRecordTrainer): + A DataRecordTrainer object + parse_fn (function): + The parse_fn used by eval_input_fn + fnames (list): + The list of feature names + stopping_metric (str): + The metric to use to determine when to stop expanding trees + file_list (list): + The list of filenames. Exactly one of file_list and data_dir should be provided + datarecord_filter_fn (function): + a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format + and return a boolean value, to indicate if this data record should be kept in feature importance module or not. + split_feature_group_on_period (boolean): + If true, split feature groups by period rather than on optimal prefix + record_count (int): + The number of records to compute importance over + is_metric_larger_the_better (boolean): + If true, assume that stopping_metric is a metric where larger values are better (e.g. ROC-AUC) + sensitivity (float): + The smallest change in performance to continue to expand the tree + extra_groups (dict>): + A dictionary mapping the name of extra feature groups to the list of + the names of the features in the group. You should only supply a value for this argument if you have a set + of features that you want to evaluate as a group but don't share a prefix + dont_build_tree (boolean): + If True, don't build the tree and only compute the extra_groups importance + Returns: + A dictionary that contains the individual and group feature importance + """ + factory = PermutedInputFnFactory( + data_dir=data_dir, + record_count=record_count, + file_list=file_list, + datarecord_filter_fn=datarecord_filter_fn, + ) + baseline_performance = _compute_multiple_permuted_performances_from_trainer( factory=factory, + fname_ftypes=[], trainer=trainer, parse_fn=parse_fn, - extra_groups=extra_groups, - feature_to_type=feature_to_type, - record_count=record_count).items(): - group_feature_performance_results[group_name] = performances - else: - logging.info("Not computing performances for extra groups") - - return {INDIVIDUAL: individual_feature_performance_results, - GROUP: group_feature_performance_results} + record_count=record_count, + ) + out = {"None": baseline_performance} + + if stopping_metric not in baseline_performance: + raise ValueError( + f"The stopping metric '{stopping_metric}' not found in baseline_performance. 
Metrics are {baseline_performance.keys()}" + ) + + is_metric_larger_the_better = ( + is_metric_larger_the_better + if is_metric_larger_the_better is not None + else _infer_if_is_metric_larger_the_better(stopping_metric) + ) + logging.info( + f"Using {stopping_metric} as the stopping metric for the tree algorithm" + ) + + feature_to_type = _get_feature_types_from_records( + records=factory.records, fnames=fnames + ) + all_feature_types = list(feature_to_type.items()) + + individual_feature_performances = {} + feature_group_performances = {} + if dont_build_tree: + logging.info( + "Not building feature importance trie. Will only compute importance for the extra_groups" + ) + else: + logging.info("Building feature importance trie") + # Each element in the Queue will be a tuple of (prefix, list_of_feature_type_pairs) where + # each feature in list_of_feature_type_pairs will have have the prefix "prefix" + feature_list_queue = _repartition( + feature_list_queue=Queue(), + fnames_ftypes=all_feature_types, + split_feature_group_on_period=split_feature_group_on_period, + ) + + while not feature_list_queue.empty(): + # Pop the queue. We should never have an empty list in the queue + prefix, fnames_ftypes = feature_list_queue.get() + assert len(fnames_ftypes) > 0 + + # Compute performance from permuting all features in fname_ftypes + logging.info( + f"\n\nComputing importances for {prefix} ({fnames_ftypes[:5]}...). {feature_list_queue.qsize()} elements left in the queue \n\n" + ) + start = time.time() + computed_performance = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fnames_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + f"\n\nImportance computed for {prefix} in {float(time.time() - start):.3f} seconds \n\n" + ) + if len(fnames_ftypes) == 1: + individual_feature_performances[ + fnames_ftypes[0][0] + ] = computed_performance + else: + feature_group_performances[prefix] = computed_performance + # Dig deeper into the features in fname_ftypes only if there is more than one feature in the + # list and the performance drop is nontrivial + logging.info(f"Checking performance for {prefix} ({fnames_ftypes[:5]}...)") + check = _check_whether_tree_should_expand( + baseline_performance=baseline_performance, + computed_performance=computed_performance, + sensitivity=sensitivity, + stopping_metric=stopping_metric, + is_metric_larger_the_better=is_metric_larger_the_better, + ) + if len(fnames_ftypes) > 1 and check: + logging.info(f"Expanding {prefix} ({fnames_ftypes[:5]}...)") + feature_list_queue = _repartition( + feature_list_queue=feature_list_queue, + fnames_ftypes=fnames_ftypes, + split_feature_group_on_period=split_feature_group_on_period, + ) + else: + logging.info(f"Not expanding {prefix} ({fnames_ftypes[:5]}...)") + + # Baseline performance is grouped in with individual_feature_importance_results + individual_feature_performance_results = dict( + out, **{k: v for k, v in individual_feature_performances.items()} + ) + group_feature_performance_results = { + k: v for k, v in feature_group_performances.items() + } + + if extra_groups is not None: + logging.info(f"Computing performances for extra groups {extra_groups.keys()}") + for group_name, performances in _get_extra_feature_group_performances( + factory=factory, + trainer=trainer, + parse_fn=parse_fn, + extra_groups=extra_groups, + feature_to_type=feature_to_type, + record_count=record_count, + ).items(): + group_feature_performance_results[group_name] = 
performances + else: + logging.info("Not computing performances for extra groups") + + return { + INDIVIDUAL: individual_feature_performance_results, + GROUP: group_feature_performance_results, + } def _feature_importances_serial_algorithm( - data_dir, trainer, parse_fn, fnames, file_list=None, datarecord_filter_fn=None, factory=None, record_count=99999): - """Serial algorithm for feature importances. This algorithm computes the - importance of each feature. - """ - factory = PermutedInputFnFactory( - data_dir=data_dir, record_count=record_count, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn) - feature_to_type = _get_feature_types_from_records(records=factory.records, fnames=fnames) - - out = {} - for fname, ftype in list(feature_to_type.items()) + [(None, None)]: - logging.info("\n\nComputing importances for {}\n\n".format(fname)) - start = time.time() - fname_ftypes = [(fname, ftype)] if fname is not None else [] - out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( - factory=factory, fname_ftypes=fname_ftypes, - trainer=trainer, parse_fn=parse_fn, record_count=record_count) - logging.info("\n\nImportances computed for {} in {} seconds \n\n".format( - fname, int(time.time() - start))) - # The serial algorithm does not compute group feature results. - return {INDIVIDUAL: out, GROUP: {}} + data_dir: str, + trainer: Trainer, + parse_fn: callable, + fnames: List[str], + file_list: List[str] = None, + datarecord_filter_fn: callable = None, + factory: PermutedInputFnFactory = None, + record_count: int = 99999, +): + """Serial algorithm for feature importances. This algorithm computes the + importance of each feature. + """ + factory = PermutedInputFnFactory( + data_dir=data_dir, + record_count=record_count, + file_list=file_list, + datarecord_filter_fn=datarecord_filter_fn, + ) + feature_to_type = _get_feature_types_from_records( + records=factory.records, fnames=fnames + ) + + out = {} + for fname, ftype in list(feature_to_type.items()) + [(None, None)]: + logging.info(f"\n\nComputing importances for {fname}\n\n") + start = time.time() + fname_ftypes = [(fname, ftype)] if fname is not None else [] + out[str(fname)] = _compute_multiple_permuted_performances_from_trainer( + factory=factory, + fname_ftypes=fname_ftypes, + trainer=trainer, + parse_fn=parse_fn, + record_count=record_count, + ) + logging.info( + f"\n\nImportances computed for {fname} in {float(time.time() - start):.3f} seconds \n\n" + ) + # The serial algorithm does not compute group feature results. + return {INDIVIDUAL: out, GROUP: {}} def _process_feature_name_for_mldash(feature_name): - # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as - # part of a url - return feature_name.replace("/", "__") + # Using a forward slash in the name causes feature importance writing to fail because strato interprets it as + # part of a url + return feature_name.replace("/", "__") def compute_feature_importances( - trainer, data_dir=None, feature_config=None, algorithm=TREE, parse_fn=None, datarecord_filter_fn=None, **kwargs): - """Perform a feature importance analysis on a trained model - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - data_dir: (str): The location of the training or testing data to compute importances over. - If None, the trainer._eval_files are used - feature_config (contrib.FeatureConfig): The feature config object. 
If this is not provided, it - is taken from the trainer - algorithm (str): The algorithm to use - parse_fn: (function): The parse_fn used by eval_input_fn. By default this is - feature_config.get_parse_fn() - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - - # We only use the trainer's eval files if an override data_dir is not provided - if data_dir is None: - logging.info("Using trainer._eval_files (found {} as files)".format(trainer._eval_files)) - file_list = trainer._eval_files - else: - logging.info("data_dir provided. Looking at {} for data.".format(data_dir)) - file_list = None - - feature_config = feature_config or trainer._feature_config - out = {} - if not feature_config: - logging.warn("WARN: Not computing feature importance because trainer._feature_config is None") - out = None - else: - parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn() - fnames = _get_feature_name_from_config(feature_config) - logging.info("Computing importances for {}".format(fnames)) - logging.info("Using the {} feature importance computation algorithm".format(algorithm)) - algorithm = { - SERIAL: _feature_importances_serial_algorithm, - TREE: _feature_importances_tree_algorithm}[algorithm] - out = algorithm(data_dir=data_dir, trainer=trainer, parse_fn=parse_fn, fnames=fnames, file_list=file_list, datarecord_filter_fn=datarecord_filter_fn, **kwargs) - return out + trainer: DataRecordTrainer, + data_dir: str = None, + feature_config: contrib.feature_config = None, + algorithm: str = TREE, + parse_fn: callable = None, + datarecord_filter_fn: callable = None, + **kwargs, +): + """Perform a feature importance analysis on a trained model + Args: + trainer (DataRecordTrainer): + A DataRecordTrainer object + data_dir (str): + The location of the training or testing data to compute importances over. + If None, the trainer._eval_files are used + feature_config (contrib.FeatureConfig): + The feature config object. If this is not provided, it + is taken from the trainer + algorithm (str): + The algorithm to use + parse_fn (function): + The parse_fn used by eval_input_fn. By default this is feature_config.get_parse_fn() + datarecord_filter_fn (function): + a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format and + return a boolean value, to indicate if this data record should be kept in feature importance module or not. + """ + + # We only use the trainer's eval files if an override data_dir is not provided + if data_dir is None: + logging.info( + f"Using trainer._eval_files (found {trainer._eval_files} as files)" + ) + file_list = trainer._eval_files + else: + logging.info(f"data_dir provided. 
Looking at {data_dir} for data.") + file_list = None + + feature_config = feature_config or trainer._feature_config + out = {} + if not feature_config: + logging.warn( + "WARN: Not computing feature importance because trainer._feature_config is None" + ) + out = None + else: + parse_fn = parse_fn if parse_fn is not None else feature_config.get_parse_fn() + fnames = _get_feature_name_from_config(feature_config) + logging.info(f"Computing importances for {fnames}") + logging.info(f"Using the {algorithm} feature importance computation algorithm") + algorithm = { + SERIAL: _feature_importances_serial_algorithm, + TREE: _feature_importances_tree_algorithm, + }[algorithm] + out = algorithm( + data_dir=data_dir, + trainer=trainer, + parse_fn=parse_fn, + fnames=fnames, + file_list=file_list, + datarecord_filter_fn=datarecord_filter_fn, + **kwargs, + ) + return out def write_feature_importances_to_hdfs( - trainer, feature_importances, output_path=None, metric="roc_auc"): - """Publish a feature importance analysis to hdfs as a tsv - Args: - (see compute_feature_importances for other args) - trainer (Trainer) - feature_importances (dict): Dictionary of feature importances - output_path (str): The remote or local file to write the feature importances to. If not - provided, this is inferred to be the trainer save dir - metric (str): The metric to write to tsv - """ - # String formatting appends (Individual) or (Group) to feature name depending on type - perfs = {"{} ({})".format(k, importance_key) if k != "None" else k: v[metric] - for importance_key, importance_value in feature_importances.items() - for k, v in importance_value.items()} - - output_path = ("{}/feature_importances-{}".format( - trainer._save_dir[:-1] if trainer._save_dir.endswith('/') else trainer._save_dir, - output_path if output_path is not None else str(time.time()))) - - if len(perfs) > 0: - logging.info("Writing feature_importances for {} to hdfs".format(perfs.keys())) - entries = [ - { - "name": name, - "drop": perfs["None"] - perfs[name], - "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8), - "perf": perfs[name] - } for name in perfs.keys()] - out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"] - for entry in sorted(entries, key=lambda d: d["drop"]): - out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry)) - logging.info("\n".join(out)) - write_list_to_hdfs_gfile(out, output_path) - logging.info("Wrote feature feature_importances to {}".format(output_path)) - else: - logging.info("Not writing feature_importances to hdfs") - return output_path - - -def write_feature_importances_to_ml_dash(trainer, feature_importances, feature_config=None): - # type: (DataRecordTrainer, FeatureConfig, dict) -> None - """Publish feature importances + all feature names to ML Metastore - Args: - trainer: (DataRecordTrainer): A DataRecordTrainer object - feature_config (contrib.FeatureConfig): The feature config object. 
If this is not provided, it - is taken from the trainer - feature_importances (dict, default=None): Dictionary of precomputed feature importances - feature_importance_metric (str, default=None): The metric to write to ML Dashboard - """ - experiment_tracking_path = trainer.experiment_tracker.tracking_path\ - if trainer.experiment_tracker.tracking_path\ - else ExperimentTracker.guess_path(trainer._save_dir) - - logging.info('Computing feature importances for run: {}'.format(experiment_tracking_path)) - - feature_importance_list = [] - for key in feature_importances: - for feature, imps in feature_importances[key].items(): - logging.info('FEATURE NAME: {}'.format(feature)) - feature_name = feature.split(' (').pop(0) - for metric_name, value in imps.items(): - try: - imps[metric_name] = float(value) - logging.info('Wrote feature importance value {} for metric: {}'.format(str(value), metric_name)) - except Exception as ex: - logging.error("Skipping writing metric:{} to ML Metastore due to invalid metric value: {} or value type: {}. Exception: {}".format(metric_name, str(value), type(value), str(ex))) - pass - - feature_importance_list.append(FeatureImportance( - run_id=experiment_tracking_path, - feature_name=_process_feature_name_for_mldash(feature_name), - feature_importance_metrics=imps, - is_group=key == GROUP - )) - -# setting feature config to match the one used in compute_feature_importances - feature_config = feature_config or trainer._feature_config - feature_names = FeatureNames( - run_id=experiment_tracking_path, - names=list(feature_config.features.keys()) - ) - - try: - client = ModelRepoClient() - logging.info('Writing feature importances to ML Metastore') - client.add_feature_importances(feature_importance_list) - logging.info('Writing feature names to ML Metastore') - client.add_feature_names(feature_names) - except (HTTPError, RetryError) as err: - logging.error('Feature importance is not being written due to: ' - 'HTTPError when attempting to write to ML Metastore: \n{}.'.format(err)) + trainer: Trainer, + feature_importances: Dict, + output_path: str = None, + metric: str = "roc_auc", +) -> str: + """Publish a feature importance analysis to hdfs as a tsv + Args: + (see compute_feature_importances for other args) + trainer (Trainer) + feature_importances (dict): + Dictionary of feature importances + output_path (str): + The remote or local file to write the feature importances to. 
If not + provided, this is inferred to be the trainer save dir + metric (str): + The metric to write to tsv + """ + # String formatting appends (Individual) or (Group) to feature name depending on type + perfs = { + f"{k} ({importance_key})" if k != "None" else k: v[metric] + for importance_key, importance_value in feature_importances.items() + for k, v in importance_value.items() + } + + output_path = f"{trainer._save_dir[:-1] if trainer._save_dir.endswith('/') else trainer._save_dir}/feature_importances-{output_path if output_path is not None else str(time.time())}" + if len(perfs) > 0: + logging.info(f"Writing feature_importances for {perfs.keys()} to hdfs") + entries = [ + { + "name": name, + "drop": perfs["None"] - perfs[name], + "pdrop": 100 * (perfs["None"] - perfs[name]) / (perfs["None"] + 1e-8), + "perf": perfs[name], + } + for name in perfs.keys() + ] + out = ["Name\tPerformance Drop\tPercent Performance Drop\tPerformance"] + for entry in sorted(entries, key=lambda d: d["drop"]): + out.append("{name}\t{drop}\t{pdrop}%\t{perf}".format(**entry)) + logging.info("\n".join(out)) + write_list_to_hdfs_gfile(out, output_path) + logging.info(f"Wrote feature feature_importances to {output_path}") + else: + logging.info("Not writing feature_importances to hdfs") + return output_path + + +def write_feature_importances_to_ml_dash( + trainer: DataRecordTrainer, + feature_importances: Dict[str, Dict[str, Dict[str, float]]], + feature_config: contrib.FeatureConfig = None, +) -> None: + """Publish feature importances + all feature names to ML Metastore + Args: + trainer (DataRecordTrainer): + A DataRecordTrainer object + feature_importances (dict, default=None): + Dictionary of precomputed feature importances + feature_config (contrib.FeatureConfig): + The feature config object. If this is not provided, it is taken from the trainer + """ + experiment_tracking_path = ( + trainer.experiment_tracker.tracking_path + if trainer.experiment_tracker.tracking_path + else ExperimentTracker.guess_path(trainer._save_dir) + ) + + logging.info(f"Computing feature importances for run: {experiment_tracking_path}") + feature_importance_list = [] + for key in feature_importances: + for feature, imps in feature_importances[key].items(): + logging.info(f"FEATURE NAME: {feature}") + feature_name = feature.split(" (").pop(0) + for metric_name, value in imps.items(): + try: + imps[metric_name] = float(value) + logging.info( + f"Wrote feature importance value {value} for metric: {metric_name}" + ) + except Exception as ex: + logging.error( + f"Skipping writing metric:{metric_name} to ML Metastore due to invalid metric value: {value} or value type: {type(value)}. 
Exception: {ex}" + ) + + feature_importance_list.append( + FeatureImportance( + run_id=experiment_tracking_path, + feature_name=_process_feature_name_for_mldash(feature_name), + feature_importance_metrics=imps, + is_group=key == GROUP, + ) + ) + + # setting feature config to match the one used in compute_feature_importances + feature_config = feature_config or trainer._feature_config + feature_names = FeatureNames( + run_id=experiment_tracking_path, names=list(feature_config.features.keys()) + ) + + try: + client = ModelRepoClient() + logging.info("Writing feature importances to ML Metastore") + client.add_feature_importances(feature_importance_list) + logging.info("Writing feature names to ML Metastore") + client.add_feature_names(feature_names) + except (HTTPError, RetryError) as err: + logging.error( + "Feature importance is not being written due to: " + f"HTTPError when attempting to write to ML Metastore: \n{err}." + ) diff --git a/twml/twml/contrib/feature_importances/feature_permutation.py b/twml/twml/contrib/feature_importances/feature_permutation.py index 809f5fde0..c523106dc 100644 --- a/twml/twml/contrib/feature_importances/feature_permutation.py +++ b/twml/twml/contrib/feature_importances/feature_permutation.py @@ -1,129 +1,190 @@ -from copy import deepcopy import random import types +from copy import deepcopy +from typing import Callable, List, Tuple +import tensorflow.compat.v1 as tf +from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error +from tensorflow.compat.v1 import logging from twitter.deepbird.util.thrift.simple_converters import ( - bytes_to_thrift_object, thrift_object_to_bytes) + bytes_to_thrift_object, + thrift_object_to_bytes, +) -from tensorflow.compat.v1 import logging -from com.twitter.ml.api.ttypes import DataRecord # pylint: disable=import-error -import tensorflow.compat.v1 as tf import twml class PermutedInputFnFactory(object): + def __init__( + self, + data_dir: str, + record_count: int, + file_list: List[str] = None, + datarecord_filter_fn: Callable[[DataRecord], bool] = None, + ): + """ + Args: + data_dir (str): + The location of the records on hdfs + record_count (int): + The number of records to process + file_list (list[str], default=None): + The list of data files on HDFS. If provided, use this instead of data_dir + datarecord_filter_fn (function): + a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format + and return a boolean value, to indicate if this data record should be kept in feature importance module or not. + """ + if not (data_dir is None) ^ (file_list is None): + raise ValueError( + f"Exactly one of data_dir and file_list can be provided. 
Got {data_dir} for data_dir and {file_list} for file_list" + ) + + file_list = ( + file_list + if file_list is not None + else twml.util.list_files(twml.util.preprocess_path(data_dir)) + ) + _next_batch = twml.input_fns.default_input_fn( + file_list, 1, lambda x: x, num_threads=2, shuffle=True, shuffle_files=True + ) + self.records = [] + # Validate datarecord_filter_fn + if datarecord_filter_fn is not None and not isinstance( + datarecord_filter_fn, types.FunctionType + ): + raise TypeError("datarecord_filter_fn is not function type") + with tf.Session() as sess: + for i in range(record_count): + try: + record = bytes_to_thrift_object( + sess.run(_next_batch)[0], DataRecord + ) + if datarecord_filter_fn is None or datarecord_filter_fn(record): + self.records.append(record) + except tf.errors.OutOfRangeError: + logging.info( + f"Stopping after reading {i} records out of {record_count}" + ) + break + if datarecord_filter_fn: + logging.info( + f"datarecord_filter_fn has been applied; keeping {len(self.records)} records out of {record_count}" + ) + + def _get_record_generator(self) -> Tuple[bytes]: + return (thrift_object_to_bytes(r) for r in self.records) + + def get_permuted_input_fn( + self, batch_size: int, parse_fn: callable, fname_ftypes: List[Tuple[str, str]] + ) -> callable: + """Get an input function that passes in a preset number of records that have been feature permuted + + Args: + batch_size (int): The batch size to use + parse_fn (function): The function to parse inputs + fname_ftypes: (list<(str, str)>): The names and types of the features to permute + + Returns: + A function that returns a batch of permuted records + """ + + def permuted_parse_pyfn(bytes_array: List[bytes]) -> List[bytes]: + """Parse a list of bytes into a list of parsed bytes""" + + out = [] + for b in bytes_array: + rec = bytes_to_thrift_object(b, DataRecord) + if fname_ftypes: + rec = _permutate_features( + rec, fname_ftypes=fname_ftypes, records=self.records + ) + out.append(thrift_object_to_bytes(rec)) + return [out] + + def permuted_parse_fn(bytes_tensor: tf.Tensor) -> tf.Tensor: + """Parse a tensor of bytes into a tensor of parsed bytes""" + parsed_bytes_tensor = parse_fn( + tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string) + ) + return parsed_bytes_tensor + + def input_fn( + batch_size: int = batch_size, parse_fn: callable = parse_fn, factory=self + ) -> tf.Tensor: + """The input function to return""" + + return ( + tf.data.Dataset.from_generator(self._get_record_generator, tf.string) + .batch(batch_size) + .map(permuted_parse_fn, 4) + .make_one_shot_iterator() + .get_next() + ) + + return input_fn + + +def _permutate_features( + rec: DataRecord, fname_ftypes: List[Tuple[str, str]], records: List[DataRecord] +) -> DataRecord: + """Replace a feature value with a value from random selected record - def __init__(self, data_dir, record_count, file_list=None, datarecord_filter_fn=None): - """ - Args: - data_dir (str): The location of the records on hdfs - record_count (int): The number of records to process - file_list (list, default=None): The list of data files on HDFS. If provided, use this instead - of data_dir - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - if not (data_dir is None) ^ (file_list is None): - raise ValueError("Exactly one of data_dir and file_list can be provided. 
Got {} for data_dir and {} for file_list".format( - data_dir, file_list)) - - file_list = file_list if file_list is not None else twml.util.list_files(twml.util.preprocess_path(data_dir)) - _next_batch = twml.input_fns.default_input_fn(file_list, 1, lambda x: x, - num_threads=2, shuffle=True, shuffle_files=True) - self.records = [] - # Validate datarecord_filter_fn - if datarecord_filter_fn is not None and not isinstance(datarecord_filter_fn, types.FunctionType): - raise TypeError("datarecord_filter_fn is not function type") - with tf.Session() as sess: - for i in range(record_count): - try: - record = bytes_to_thrift_object(sess.run(_next_batch)[0], DataRecord) - if datarecord_filter_fn is None or datarecord_filter_fn(record): - self.records.append(record) - except tf.errors.OutOfRangeError: - logging.info("Stopping after reading {} records out of {}".format(i, record_count)) - break - if datarecord_filter_fn: - logging.info("datarecord_filter_fn has been applied; keeping {} records out of {}".format(len(self.records), record_count)) - - def _get_record_generator(self): - return (thrift_object_to_bytes(r) for r in self.records) - - def get_permuted_input_fn(self, batch_size, parse_fn, fname_ftypes): - """Get an input function that passes in a preset number of records that have been feature permuted Args: - parse_fn (function): The function to parse inputs - fname_ftypes: (list<(str, str)>): The names and types of the features to permute + rec: (datarecord): + A datarecord returned from DataRecordGenerator + fname_ftypes: (list<(str, str)>): + The names and types of the features to permute + records: (list): + The records to sample from + + Returns: + The record with the feature permuted """ - def permuted_parse_pyfn(bytes_array): - out = [] - for b in bytes_array: - rec = bytes_to_thrift_object(b, DataRecord) - if fname_ftypes: - rec = _permutate_features(rec, fname_ftypes=fname_ftypes, records=self.records) - out.append(thrift_object_to_bytes(rec)) - return [out] - - def permuted_parse_fn(bytes_tensor): - parsed_bytes_tensor = parse_fn(tf.py_func(permuted_parse_pyfn, [bytes_tensor], tf.string)) - return parsed_bytes_tensor - - def input_fn(batch_size=batch_size, parse_fn=parse_fn, factory=self): - return (tf.data.Dataset - .from_generator(self._get_record_generator, tf.string) - .batch(batch_size) - .map(permuted_parse_fn, 4) - .make_one_shot_iterator() - .get_next()) - return input_fn - - -def _permutate_features(rec, fname_ftypes, records): - """Replace a feature value with a value from random selected record - Args: - rec: (datarecord): A datarecord returned from DataRecordGenerator - fname_ftypes: (list<(str, str)>): The names and types of the features to permute - records: (list): The records to sample from - Returns: - The record with the feature permuted - """ - rec_new = deepcopy(rec) - rec_replace = random.choice(records) - - # If the replacement datarecord does not have the feature type entirely, add it in - # to make the logic a bit simpler - for fname, feature_type in fname_ftypes: - fid = twml.feature_id(fname)[0] - if rec_replace.__dict__.get(feature_type, None) is None: - rec_replace.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - if rec_new.__dict__.get(feature_type, None) is None: - rec_new.__dict__[feature_type] = ( - dict() if feature_type != 'binaryFeatures' else set()) - - if feature_type != 'binaryFeatures': - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, dict()): - # If the 
replacement datarecord does not contain the feature but the original does - del rec_new.__dict__[feature_type][fid] - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = dict() - rec_new.__dict__[feature_type][fid] = rec_replace.__dict__[feature_type][fid] - else: - # If neither datarecord contains this feature - pass - else: - if fid not in rec_replace.__dict__[feature_type] and fid in rec_new.__dict__.get(feature_type, set()): - # If the replacement datarecord does not contain the feature but the original does - rec_new.__dict__[feature_type].remove(fid) - elif fid in rec_replace.__dict__[feature_type]: - # If the replacement datarecord does contain the feature - if rec_new.__dict__[feature_type] is None: - rec_new.__dict__[feature_type] = set() - rec_new.__dict__[feature_type].add(fid) - # If neither datarecord contains this feature - else: - # If neither datarecord contains this feature - pass - return rec_new + rec_new = deepcopy(rec) + rec_replace = random.choice(records) + + # If the replacement datarecord does not have the feature type entirely, add it in + # to make the logic a bit simpler + for fname, feature_type in fname_ftypes: + fid = twml.feature_id(fname)[0] + if rec_replace.__dict__.get(feature_type, None) is None: + rec_replace.__dict__[feature_type] = ( + dict() if feature_type != "binaryFeatures" else set() + ) + if rec_new.__dict__.get(feature_type, None) is None: + rec_new.__dict__[feature_type] = ( + dict() if feature_type != "binaryFeatures" else set() + ) + + if feature_type != "binaryFeatures": + if fid not in rec_replace.__dict__[ + feature_type + ] and fid in rec_new.__dict__.get(feature_type, dict()): + # If the replacement datarecord does not contain the feature but the original does + del rec_new.__dict__[feature_type][fid] + elif fid in rec_replace.__dict__[feature_type]: + # If the replacement datarecord does contain the feature + if rec_new.__dict__[feature_type] is None: + rec_new.__dict__[feature_type] = dict() + rec_new.__dict__[feature_type][fid] = rec_replace.__dict__[ + feature_type + ][fid] + else: + # If neither datarecord contains this feature + pass + else: + if fid not in rec_replace.__dict__[ + feature_type + ] and fid in rec_new.__dict__.get(feature_type, set()): + # If the replacement datarecord does not contain the feature but the original does + rec_new.__dict__[feature_type].remove(fid) + elif fid in rec_replace.__dict__[feature_type]: + # If the replacement datarecord does contain the feature + if rec_new.__dict__[feature_type] is None: + rec_new.__dict__[feature_type] = set() + rec_new.__dict__[feature_type].add(fid) + # If neither datarecord contains this feature + else: + # If neither datarecord contains this feature + pass + return rec_new diff --git a/twml/twml/contrib/feature_importances/helpers.py b/twml/twml/contrib/feature_importances/helpers.py index f3f600e8b..f794c33bf 100644 --- a/twml/twml/contrib/feature_importances/helpers.py +++ b/twml/twml/contrib/feature_importances/helpers.py @@ -1,96 +1,113 @@ import uuid +from typing import List +import tensorflow.compat.v1 as tf from tensorflow.compat.v1 import logging + import twml -import tensorflow.compat.v1 as tf -def write_list_to_hdfs_gfile(list_to_write, output_path): - """Use tensorflow gfile to write a list to a location on hdfs""" - locname = "/tmp/{}".format(str(uuid.uuid4())) - with open(locname, "w") as f: - for row in list_to_write: - 
f.write("%s\n" % row) - tf.io.gfile.copy(locname, output_path, overwrite=False) - - -def decode_str_or_unicode(str_or_unicode): - return str_or_unicode.decode() if hasattr(str_or_unicode, 'decode') else str_or_unicode - - -def longest_common_prefix(strings, split_character): - """ - Args: - string (list): The list of strings to find the longest common prefix of - split_character (str): If not None, require that the return string end in this character or - be the length of the entire string - Returns: - The string corresponding to the longest common prefix - """ - sorted_strings = sorted(strings) - s1, s2 = sorted_strings[0], sorted_strings[-1] - if s1 == s2: - # If the strings are the same, just return the full string - out = s1 - else: - # If the strings are not the same, return the longest common prefix optionally ending in split_character - ix = 0 - for i in range(min(len(s1), len(s2))): - if s1[i] != s2[i]: - break - if split_character is None or s1[i] == split_character: - ix = i + 1 - out = s1[:ix] - return out - - -def _expand_prefix(fname, prefix, split_character): - if len(fname) == len(prefix): - # If the prefix is already the full feature, just take the feature name - out = fname - elif split_character is None: - # Advance the prefix by one character - out = fname[:len(prefix) + 1] - else: - # Advance the prefix to the next instance of split_character or the end of the string - for ix in range(len(prefix), len(fname)): - if fname[ix] == split_character: - break - out = fname[:ix + 1] - return out - - -def _get_feature_types_from_records(records, fnames): - # This method gets the types of the features in fnames by looking at the datarecords themselves. - # The reason why we do this rather than extract the feature types from the feature_config is - # that the feature naming conventions in the feature_config are different from those in the - # datarecords. 
-  fids = [twml.feature_id(fname)[0] for fname in fnames]
-  feature_to_type = {}
-  for record in records:
-    for feature_type, values in record.__dict__.items():
-      if values is not None:
-        included_ids = set(values)
-        for fname, fid in zip(fnames, fids):
-          if fid in included_ids:
-            feature_to_type[fname] = feature_type
-  return feature_to_type
-
-
-def _get_metrics_hook(trainer):
-  def get_metrics_fn(trainer=trainer):
-    return {k: v[0]for k, v in trainer.current_estimator_spec.eval_metric_ops.items()}
-  return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn)
-
-
-def _get_feature_name_from_config(feature_config):
-  """Extract the names of the features on a feature config object
-  """
-  decoded_feature_names = []
-  for f in feature_config.get_feature_spec()['features'].values():
-    try:
-      fname = decode_str_or_unicode(f['featureName'])
-    except UnicodeEncodeError as e:
-      logging.error("Encountered decoding exception when decoding %s: %s" % (f, e))
-    decoded_feature_names.append(fname)
-  return decoded_feature_names
+def write_list_to_hdfs_gfile(list_to_write: List[str], output_path: str) -> None:
+    """Use tensorflow gfile to write a list to a location on hdfs"""
+    locname = f"/tmp/{str(uuid.uuid4())}"
+    with open(locname, "w") as f:
+        for row in list_to_write:
+            f.write("%s\n" % row)
+    tf.io.gfile.copy(locname, output_path, overwrite=False)
+
+
+def decode_str_or_unicode(str_or_unicode: str) -> str:
+    if hasattr(str_or_unicode, "decode"):
+        return str_or_unicode.decode()
+    return str_or_unicode
+
+
+def longest_common_prefix(strings: List[str], split_character: str) -> str:
+    """
+    Args:
+      strings (list): The list of strings to find the longest common prefix of
+      split_character (str): If not None, require that the return string end in this character or
+        be the length of the entire string
+    Returns:
+      The string corresponding to the longest common prefix
+    """
+    sorted_strings = sorted(strings)
+    s1, s2 = sorted_strings[0], sorted_strings[-1]
+    if s1 == s2:
+        # If the strings are the same, just return the full string
+        out = s1
+    else:
+        # If the strings are not the same, return the longest common prefix optionally ending in split_character
+        ix = 0
+        for i in range(min(len(s1), len(s2))):
+            if s1[i] != s2[i]:
+                break
+            if split_character is None or s1[i] == split_character:
+                ix = i + 1
+        out = s1[:ix]
+    return out
+
+
+def _expand_prefix(fname: str, prefix: str, split_character: str) -> str:
+    """Expand the prefix of a feature name to the next split_character or the end of the string"""
+
+    if len(fname) == len(prefix):
+        # If the prefix is already the full feature, just take the feature name
+        out = fname
+    elif split_character is None:
+        # Advance the prefix by one character
+        out = fname[: len(prefix) + 1]
+    else:
+        # Advance the prefix to the next instance of split_character or the end of the string
+        for ix in range(len(prefix), len(fname)):
+            if fname[ix] == split_character:
+                break
+        out = fname[: ix + 1]
+    return out
+
+
+def _get_feature_types_from_records(
+    records: List["DataRecord"], fnames: List[str]
+) -> dict:
+    """Get the types of the features in fnames by looking at the datarecords themselves"""
+
+    # This method gets the types of the features in fnames by looking at the datarecords themselves.
+    # The reason why we do this rather than extract the feature types from the feature_config is
+    # that the feature naming conventions in the feature_config are different from those in the
+    # datarecords.
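Concretely, the inference loop that follows behaves like this sketch, with plain dicts standing in for thrift DataRecords and made-up feature ids (the real code derives ids from feature names via twml.feature_id):

records = [
    {"continuousFeatures": {101: 0.5}, "binaryFeatures": {202}},
    {"continuousFeatures": {101: 1.0}, "binaryFeatures": None},
]
fname_to_fid = {"user.age": 101, "user.verified": 202}  # hypothetical ids

feature_to_type = {}
for record in records:
    for feature_type, values in record.items():  # record.__dict__ in the real code
        if values is not None:
            included_ids = set(values)
            for fname, fid in fname_to_fid.items():
                if fid in included_ids:
                    feature_to_type[fname] = feature_type
print(feature_to_type)
# {'user.age': 'continuousFeatures', 'user.verified': 'binaryFeatures'}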
+ fids = [twml.feature_id(fname)[0] for fname in fnames] + feature_to_type = {} + for record in records: + for feature_type, values in record.__dict__.items(): + if values is not None: + included_ids = set(values) + for fname, fid in zip(fnames, fids): + if fid in included_ids: + feature_to_type[fname] = feature_type + return feature_to_type + + +def _get_metrics_hook(trainer: twml.Trainer) -> tf.train.SessionRunHook: + """Get a hook that returns the metrics from the current estimator spec""" + + def get_metrics_fn(trainer=trainer): + return { + k: v[0] for k, v in trainer.current_estimator_spec.eval_metric_ops.items() + } + + return twml.hooks.GetMetricsHook(get_metrics_fn=get_metrics_fn) + + +def _get_feature_name_from_config(feature_config: twml.FeatureConfig) -> List[str]: + """Extract the names of the features on a feature config object""" + + decoded_feature_names = [] + for f in feature_config.get_feature_spec()["features"].values(): + try: + fname = decode_str_or_unicode(f["featureName"]) + except UnicodeEncodeError as e: + logging.error( + "Encountered decoding exception when decoding %s: %s" % (f, e) + ) + decoded_feature_names.append(fname) + return decoded_feature_names diff --git a/twml/twml/contrib/hooks.py b/twml/twml/contrib/hooks.py index 6d68831fc..d76cefbab 100644 --- a/twml/twml/contrib/hooks.py +++ b/twml/twml/contrib/hooks.py @@ -1,42 +1,50 @@ import datetime +from typing import Union -from absl import logging import pytz import tensorflow.compat.v1 as tf +from absl import logging class StopAtTimeHook(tf.train.SessionRunHook): - """ - Hook that stops training at a fixed datetime - """ - - def __init__(self, stop_time): """ - Arguments: - stop_time: - a datetime.datetime or a datetime.timedelta specifying when to stop. - For naive datetime.datetime objects (with no time zone specified), - UTC time zone is assumed. + Hook that stops training at a fixed datetime """ - if isinstance(stop_time, datetime.timedelta): - self._stop_datetime = pytz.utc.localize(datetime.datetime.utcnow() + stop_time) - elif isinstance(stop_time, datetime.datetime): - if stop_time.tzinfo is None: - self._stop_datetime = pytz.utc.localize(stop_time) - else: - self._stop_datetime = stop_time.astimezone(pytz.UTC) - else: - raise ValueError("Expecting datetime or timedelta for stop_time arg") - self._stop_requested = False - def after_run(self, run_context, run_values): - delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow()) - if delta.total_seconds() <= 0: - logging.info("StopAtTimeHook reached stop_time; requesting stop") - run_context.request_stop() - self._stop_requested = True + def __init__(self, stop_time: Union[datetime.datetime, datetime.timedelta]): + """ + Args: + stop_time: + a datetime.datetime or a datetime.timedelta specifying when to stop. + For naive datetime.datetime objects (with no time zone specified), + UTC time zone is assumed. 
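        Example (illustrative; assumes an Estimator-style training loop):

            hook = StopAtTimeHook(datetime.timedelta(hours=8))
            estimator.train(input_fn, hooks=[hook])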
+ """ + if isinstance(stop_time, datetime.timedelta): + self._stop_datetime = pytz.utc.localize( + datetime.datetime.utcnow() + stop_time + ) + elif isinstance(stop_time, datetime.datetime): + if stop_time.tzinfo is None: + self._stop_datetime = pytz.utc.localize(stop_time) + else: + self._stop_datetime = stop_time.astimezone(pytz.UTC) + else: + raise ValueError("Expecting datetime or timedelta for stop_time arg") + self._stop_requested = False + + def after_run( + self, + run_context: tf.train.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + """Called after each call to run().""" + delta = self._stop_datetime - pytz.utc.localize(datetime.datetime.utcnow()) + if delta.total_seconds() <= 0: + logging.info("StopAtTimeHook reached stop_time; requesting stop") + run_context.request_stop() + self._stop_requested = True - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + @property + def stop_requested(self) -> bool: + """true if this hook requested a stop""" + return self._stop_requested diff --git a/twml/twml/contrib/initializers.py b/twml/twml/contrib/initializers.py index 52bad3a19..fd7c73abe 100644 --- a/twml/twml/contrib/initializers.py +++ b/twml/twml/contrib/initializers.py @@ -1,61 +1,76 @@ +from typing import Optional + import numpy as np import tensorflow.compat.v1 as tf - TWML_INIT_FEED_KEY = "TWML_INIT_FEED_COLLECTION" class PartitionConstant(tf.keras.initializers.Constant): - """A constant initializer that supports partitions""" - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if not isinstance(self.value, np.ndarray): - raise ValueError( - "Currently, PartitionConstant only supports " - "partitioning on np.ndarrays. Got {}".format(type(self.value).__name__)) - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value + """A constant initializer that supports partitions""" + + def __call__( + self, + shape: tf.TensorShape, + dtype: Optional[tf.Dtype] = None, + partition_info: Optional[tf.VariablePartitionInfo] = None, + ) -> tf.Tensor: + if partition_info is not None: + if not isinstance(self.value, np.ndarray): + raise ValueError( + "Currently, PartitionConstant only supports " + f"partitioning on np.ndarrays. Got {type(self.value).__name__}" + ) + offsets = partition_info.var_offset + indices = tuple( + [slice(offset, offset + size) for offset, size in zip(offsets, shape)] + ) + subset = self.value[indices] + return subset + return self.value partition_constant_initializer = PartitionConstant class PlaceholderInitializer(tf.keras.initializers.Initializer): - """A placeholder initializer that supports partitions""" - - def __init__(self, shape, dtype): - self.dtype = dtype - self.value = tf.placeholder(dtype=dtype, shape=shape) - - def __call__(self, shape, dtype=None, partition_info=None): - if partition_info is not None: - if self.dtype != dtype: - raise ValueError("dtype does not match placeholder dtype") - offsets = partition_info.var_offset - indices = tuple([slice(offset, offset + size) for offset, size in zip(offsets, shape)]) - subset = self.value[indices] - return subset - else: - return self.value - - -def get_init_feed_dict(): - """Get the init feed dictionary to be used when running the init op.""" - # Get the reference to the collection. 
- init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) - init_feed_dict = {} - for d in init_feed_collection: - init_feed_dict.update(d) - return init_feed_dict - - -def clear_init_feed_collection(): - """Clear the init feed collection.""" - init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) - while init_feed_collection: - init_feed_collection.pop() + """A placeholder initializer that supports partitions""" + + def __init__(self, shape: tf.TensorShape, dtype: tf.Dtype = tf.float32): + self.dtype = dtype + self.value = tf.placeholder(dtype=dtype, shape=shape) + + def __call__( + self, + shape: tf.TensorShape, + dtype: Optional[tf.Dtype] = None, + partition_info: Optional[tf.VariablePartitionInfo] = None, + ) -> tf.Tensor: + if partition_info is not None: + if self.dtype != dtype: + raise ValueError("dtype does not match placeholder dtype") + offsets = partition_info.var_offset + indices = tuple( + [slice(offset, offset + size) for offset, size in zip(offsets, shape)] + ) + subset = self.value[indices] + return subset + else: + return self.value + + +def get_init_feed_dict() -> dict: + """Get the init feed dictionary to be used when running the init op.""" + # Get the reference to the collection. + init_feed_collection = tf.get_collection(TWML_INIT_FEED_KEY) + init_feed_dict = {} + for d in init_feed_collection: + init_feed_dict.update(d) + return init_feed_dict + + +def clear_init_feed_collection() -> None: + """Clear the init feed collection.""" + init_feed_collection = tf.get_collection_ref(TWML_INIT_FEED_KEY) + while init_feed_collection: + init_feed_collection.pop() diff --git a/twml/twml/contrib/layers/__init__.py b/twml/twml/contrib/layers/__init__.py index aa6e7d7e4..ac29dcc7c 100644 --- a/twml/twml/contrib/layers/__init__.py +++ b/twml/twml/contrib/layers/__init__.py @@ -1,11 +1,12 @@ # pylint: disable=wildcard-import """ This module contains all contrib Layers. """ +from .embedding_lookup import EmbeddingLookup # noqa: F401 +from .factorization_machine import FactorizationMachine # noqa: F401 +from .full_dense import FullDense, full_dense # noqa: F401 from .hashed_percentile_discretizer import HashedPercentileDiscretizer # noqa: F401 from .hashing_discretizer import HashingDiscretizer # noqa: F401 from .mask_layer import MaskLayer # noqa: F401 -from .embedding_lookup import EmbeddingLookup # noqa: F401 -from .factorization_machine import FactorizationMachine # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 from .stacked_rnn import StackedRNN, stacked_rnn # noqa: F401 -from .zscore_normalization import ZscoreNormalization, zscore_normalization # noqa: F401 +from .zscore_normalization import ZscoreNormalization # noqa: F401 +from .zscore_normalization import zscore_normalization diff --git a/twml/twml/contrib/layers/embedding_lookup.py b/twml/twml/contrib/layers/embedding_lookup.py index c83dc7edd..b86565da0 100644 --- a/twml/twml/contrib/layers/embedding_lookup.py +++ b/twml/twml/contrib/layers/embedding_lookup.py @@ -1,12 +1,12 @@ +import argparse import os import re import time +from typing import Dict, Optional, Tuple -from collections import OrderedDict - -from absl import logging import numpy as np import tensorflow.compat.v1 as tf +from absl import logging from tensorflow.python.ops.lookup_ops import index_table_from_tensor import twml @@ -17,403 +17,435 @@ def load_initializers_from_csv( - embedding_path, vocab_size=-1, embedding_size=None, separator=None, vocab=None -): - """ - Loads embeddings saved in the `glove format `_. 
- The glove format is a txt file separated by spaces. - Each line looks like: "word 0.00001 0.2334 ...". - - Arguments: - embedding_path: - path to the embeddings file on HDFS (hdfs://default/...) - or its local_path (/path/to/...). - The embedding_path may also specify a pattern. In which case, the embeddings - are read in the lexical order of the filenames that match the order. - vocab_size: - the maximum size of the vocabulary. The top ``vocab_size`` words in the file - are included in the vocabulary. If you specify a positive vocab_size, - the words are expected to be in descending order of frequency. - This allows the embeddings to be easily filtered to top vocab_size words. - Reducing the vocab_size acts as a regularizer, preventing the model to overfit on rarer words. - A negative vocab_size loads all embeddings. - Reducing the vocab_size may also help with memory issues, - allowing the embedding initializers to fit inside the graph. - embedding_size: - Defaults to None. If None, the embedding size is infered from the file name. - For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both infrered - as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is - inferred from the first line in the file. If ``embedding_size`` is provided, - only the last ``embedding_size`` values of each line are considered. This - allows the line parser to recover from partial word parsing errors. - separator: - Specifies the separator to use when splitting each line into values. - Default value is a whitespace (same as glove format). - vocab: - OrderedDict mapping words to np.array embedding vectors. Initializes the vocabulary. - Duplicate words found in the file are ignored. - Defaults to a vocabulary of two words:: - - vocab = OrderedDict() - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - Returns: - tuple of (vocab_initializer, weight_initializer, shape) - - vocab_initializer: - A tf.constant_initializer containing a vector of word strings of size vocab_size. - weight_initializer: - A twml.contrib.initializers.partition_constant_initializer containing - the weight matrix of embeddings of size vocab_size x embedding_size. - shape: - A tuple containing of (vocab_size, embedding_size). - - """ - - start = time.time() - - embedding_path = twml.util.sanitize_hdfs_path(embedding_path) - - is_user_vocab = True - if vocab is None: - vocab = OrderedDict() - vocab[''] = True - vocab[''] = True - is_user_vocab = False - elif not isinstance(vocab, OrderedDict): - raise RuntimeError( - "Expecting vocab argument of type OrderedDict or None. " - "Got type %s instead." % type(vocab).__name__ - ) + embedding_path: str, + vocab_size: int = -1, + embedding_size: int = None, + separator: str = " ", + vocab: Dict[str, np.ndarray] = None, +) -> Tuple[ + tf.constant_initializer, tf.keras.initializers.PartitionedConstant, Tuple[int, int] +]: + """ + Loads embeddings saved in the `glove format `_. + The glove format is a txt file separated by spaces. + Each line looks like: "word 0.00001 0.2334 ...". + + Args: + embedding_path: + path to the embeddings file on HDFS (hdfs://default/...) + or its local_path (/path/to/...). + The embedding_path may also specify a pattern. In which case, the embeddings + are read in the lexical order of the filenames that match the order. + vocab_size: + the maximum size of the vocabulary. The top ``vocab_size`` words in the file + are included in the vocabulary. 
If you specify a positive vocab_size,
+            the words are expected to be in descending order of frequency.
+            This allows the embeddings to be easily filtered to top vocab_size words.
+            Reducing the vocab_size acts as a regularizer, preventing the model
+            from overfitting on rarer words.
+            A negative vocab_size loads all embeddings.
+            Reducing the vocab_size may also help with memory issues,
+            allowing the embedding initializers to fit inside the graph.
+        embedding_size:
+            Defaults to None. If None, the embedding size is inferred from the file name.
+            For example, ``glove.300d.txt`` and ``glove300d200.txt`` will both be inferred
+            as ``embedding_size=300``. If this can't be done, the ``embedding_size`` is
+            inferred from the first line in the file. If ``embedding_size`` is provided,
+            only the last ``embedding_size`` values of each line are considered. This
+            allows the line parser to recover from partial word parsing errors.
+        separator:
+            Specifies the separator to use when splitting each line into values.
+            Default value is a whitespace (same as glove format).
+        vocab:
+            dict mapping words to np.array embedding vectors. Initializes the vocabulary.
+            Duplicate words found in the file are ignored.
+            Defaults to a vocabulary of two words::
+
+                vocab = dict()
+                vocab[''] = np.random.randn(embedding_size)
+                vocab[''] = np.random.randn(embedding_size)
-  if embedding_size is None:
-    embedding_file = os.path.basename(embedding_path)
-    match = re.search(r"[^\d]([\d]+)d", embedding_file)
-    if match is not None:
-      embedding_size = int(match.group(1))
+    Returns:
+        tuple of (vocab_initializer, weight_initializer, shape)
+        vocab_initializer:
+            A tf.constant_initializer containing a vector of word strings of size vocab_size.
+        weight_initializer:
+            A twml.contrib.initializers.partition_constant_initializer containing
+            the weight matrix of embeddings of size vocab_size x embedding_size.
+        shape:
+            A tuple of (vocab_size, embedding_size).
+    """
-  if embedding_size is not None and not isinstance(embedding_size, int):
-    raise RuntimeError(
-      "Expecting embedding_size argument of type int or None. "
-      "Got type %s, instead." % type(embedding_size).__name__
+    start = time.time()
+
+    embedding_path = twml.util.sanitize_hdfs_path(embedding_path)
+
+    is_user_vocab = True
+    if vocab is None:
+        vocab = dict()
+        vocab[""] = True
+        vocab[""] = True
+        is_user_vocab = False
+
+    elif not isinstance(vocab, dict):
+        raise RuntimeError(
+            "Expecting vocab argument of type dict or None. "
+            "Got type %s instead." % type(vocab).__name__
+        )
+
+    if embedding_size is None:
+        embedding_file = os.path.basename(embedding_path)
+        match = re.search(r"[^\d]([\d]+)d", embedding_file)
+        if match is not None:
+            embedding_size = int(match.group(1))
+
+    if embedding_size is not None and not isinstance(embedding_size, int):
+        raise RuntimeError(
+            "Expecting embedding_size argument of type int or None. "
+            "Got type %s, instead." % type(embedding_size).__name__
+        )
+
+    embedding_paths = sorted(tf.io.gfile.glob(embedding_path))
+
+    if len(embedding_paths) > 1:
+        raise ValueError("You are most likely using the wrong --embedding.path")
+
+    embedding_path = embedding_paths[0]
+    logging.info("Reading embeddings file from path %s.." % embedding_path)
+
+    with tf.io.gfile.GFile(embedding_path) as f:
+        lines = f.readlines()
+
+    logging.info("Done reading embeddings file from path %s."
% embedding_path) + + logging.info("Parsing vocabulary and embeddings...") + + for line in lines: + # Word and weights separated by space + values = line.strip().split(separator) + # Word is first symbol on each line + word = values[0] + + if word not in vocab: + if embedding_size is None or embedding_size <= 0: + # get all elements after the first one. + word_weights = values[1:] + embedding_size = len(word_weights) + else: + # get the last embedding_size elements + word_weights = values[-min(embedding_size, len(values) - 1) :] + + try: + if len(word_weights) != embedding_size: + raise ValueError + + word_weights = np.asarray(word_weights, dtype=np.float32) + vocab[word] = word_weights + except ValueError: + logging.info( + "Wasn't able to load embeddings for word '%s'. Ignoring it" % word + ) + + vocab_len = len(vocab) + if vocab_size > 0 and vocab_len == vocab_size: + # Limit vocabulary to top terms + break + elif (vocab_len % 1000) == 0: + logging.info("Loaded %d words into vocab" % vocab_len) + + else: + logging.info("found duplicate word: %s" % word) + + if not is_user_vocab: + vocab[""] = np.random.randn(embedding_size) + vocab[""] = np.random.randn(embedding_size) + + words = list(vocab.keys()) + weights = list(vocab.values()) + + weights = np.asarray(weights, dtype=np.float32) + assert weights.shape[0] == len(vocab) + assert weights.shape[1] == embedding_size + + vocab_initializer = tf.constant_initializer(words, tf.string) + weight_initializer = twml.contrib.initializers.PartitionConstant( + weights, tf.float32 ) - embedding_paths = sorted(tf.io.gfile.glob(embedding_path)) - - if len(embedding_paths) > 1: - raise ValueError( - "You are most likely using a the wrong --embedding.path" + logging.info( + "Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start) ) + return vocab_initializer, weight_initializer, weights.shape - embedding_path = embedding_paths[0] - logging.info("Reading embeddings file from path %s.." % embedding_path) - - with tf.io.gfile.GFile(embedding_path) as f: - lines = f.readlines() - - logging.info("Done reading embeddings file from path %s." % embedding_path) - - logging.info("Parsing vocbulary and embeddings...") - - for line in lines: - # Word and weights separated by space - values = line.strip().split(separator) - # Word is first symbol on each line - word = values[0] - - if word not in vocab: - if embedding_size is None or embedding_size <= 0: - # get all elements after the first one. - word_weights = values[1:] - embedding_size = len(word_weights) - else: - # get the last embedding_size elements - word_weights = values[-min(embedding_size, len(values) - 1) :] - - try: - if len(word_weights) != embedding_size: - raise ValueError - - word_weights = np.asarray(word_weights, dtype=np.float32) - vocab[word] = word_weights - except ValueError: - logging.info("Wasn't able to load embeddings for word '%s'. 
Ignoring it" % word) - - vocab_len = len(vocab) - if vocab_size > 0 and vocab_len == vocab_size: - # Limit vocabulary to top terms - break - elif (vocab_len % 1000) == 0: - logging.info("Loaded %d words into vocab" % vocab_len) - - else: - logging.info("found duplicate word: %s" % word) - - if not is_user_vocab: - vocab[''] = np.random.randn(embedding_size) - vocab[''] = np.random.randn(embedding_size) - - words = list(vocab.keys()) - weights = list(vocab.values()) - - weights = np.asarray(weights, dtype=np.float32) - assert weights.shape[0] == len(vocab) - assert weights.shape[1] == embedding_size - - vocab_initializer = tf.constant_initializer(words, tf.string) - weight_initializer = twml.contrib.initializers.PartitionConstant(weights, tf.float32) - - logging.info("Loaded %d embeddings in %d seconds." % (len(vocab), time.time() - start)) - return vocab_initializer, weight_initializer, weights.shape - - -def add_parser_arguments(parser): - """ - Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. - These can be used to call an initializer loader function like - the ``load_initializers_from_csv`` function. - - Arguments: - parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser - - Returns: - argparse.ArgumentParser instance with discretizer-specific arguments added - """ - - parser.add_argument( - "--embedding.path", - "--embedding_path", - dest="embedding_path", - type=str, - default=None, - help="When specified, loads glove embeddings from .txt glove file", - ) - parser.add_argument( - "--embedding.vocab_size", - "--embedding_vocab_size", - dest="embedding_vocab_size", - type=int, - default=-1, - help="Size of vocabulary. Uses this many of the most frequent terms. Defaults to -1 (use full vocab).", - ) - - return parser - -class EmbeddingLookup(twml.layers.Layer): - """Layer for looking up embeddings. - Transforms a sequence of strings to a sequence of embeddings. - - Arguments: - vocab_size: - The number of word strings and embeddings in the vocabulary. - output_size: - Long or Integer, dimensionality of the output space. The embedding vector size. - vocab_initializer: - Initializer function for the vocabulary. Required. The initializer should - return a list of strings of size vocab_size. - weight_initializer: - Initializer function for the weight matrix of size vocab_size x output_size. - This argument defaults to zeros_initializer(). - This is valid when the EmbeddingLookup is the first layer of - parameters but should be changed otherwise. - trainable: - Boolean, if `True` adds variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - Defaults to True: trains the embeddings. - num_oov_buckets: - The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket - ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not - specified, index `OOV_WORD_ID` is used for OOV strings. - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column, does not support yet) - weight_regularizer: - Regularizer function for the weight matrix. 
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - dtype: - Defaults to tf.float32. Specifies the dtype of the weights. - use_placeholder: - Defaults to True. - If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`. - If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support. - checkpoint_dir: - Default to None. - If set to the path of a checkpoint, load embedding from the checkpoint. - convert_to_lowercase: - Default to True. - Converting all string inputs to lowercase. - - Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`. - """ - - def __init__( - self, - vocab_size, - output_size, - vocab_initializer, - weight_initializer=None, - trainable=True, - num_oov_buckets=None, - oov_word_id=None, - name=None, - num_partitions=1, - partition_axis=0, - weight_regularizer=None, - dtype=None, - use_placeholder=True, - checkpoint_dir=None, - convert_to_lowercase=True, - **kwargs, - ): - if dtype is None: - # prevents a bug where the parent class defaults to the type of the first input tensor. - dtype = tf.float32 - super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. - - is_constant_init = isinstance(weight_initializer, tf.keras.initializers.Constant) - if use_placeholder and (not is_constant_init) and (weight_initializer is not None): - raise ValueError("Weight initializer should be a `Constant` or `None`.") - - if weight_initializer is None: - self.weight_initializer = tf.zeros_initializer() - else: - self.weight_initializer = weight_initializer - self.use_placeholder = use_placeholder - self.checkpoint_dir = checkpoint_dir - self.convert_to_lowercase = convert_to_lowercase - - self.vocab_initializer = vocab_initializer - self.vocab_size = vocab_size - self.output_size = output_size - self.num_partitions = num_partitions - self.partition_axis = partition_axis - self.weight_regularizer = weight_regularizer - self.trainable = trainable - self.oov_word_id = oov_word_id - self.num_oov_buckets = num_oov_buckets - - if self.oov_word_id is not None and self.num_oov_buckets is not None: - raise ValueError("At most one of oov_word_id or num_oov_buckets should be specified") - elif self.oov_word_id is None and self.num_oov_buckets is None: - self.oov_word_id = OOV_WORD_ID # use the default OOV word id - - if partition_axis != 0: - raise NotImplementedError("embedding_lookup only supports partition_axis = 0") - - def build(self, input_shapes): +def add_parser_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """ - creates the ``vocab`` and ``weight`` Variables - of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively. 
- """ - partitioner = None - - additional_buckets_for_oov = self.num_oov_buckets if self.num_oov_buckets is not None else 0 - shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] - - if self.use_placeholder: - embedding_weight_initializer = twml.contrib.initializers.PlaceholderInitializer( - shape, self.dtype - ) - tf.add_to_collection( - twml.contrib.initializers.TWML_INIT_FEED_KEY, - {embedding_weight_initializer.value: self.weight_initializer.value}, - ) - else: - embedding_weight_initializer = self.weight_initializer - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self.vocab = self.add_variable( - 'vocab', - initializer=self.vocab_initializer, - shape=[self.vocab_size], - dtype=tf.string, - trainable=False, - ) + Adds the embedding.path and embedding.vocab_size command-line arguments to the parser. + These can be used to call an initializer loader function like + the ``load_initializers_from_csv`` function. - self.weight = self.add_variable( - 'weight', - initializer=None if self.checkpoint_dir is not None else embedding_weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=self.trainable, - partitioner=partitioner, - ) - if self.checkpoint_dir is not None: - twml.trainers.trainer.init_from_checkpoint(self.checkpoint_dir, {'weight': self.weight.name}) - - self.built = True - - def call( - self, inputs, debug=False, oov_summaries=False, **kwargs - ): # pylint: disable=unused-argument - """Converts word strings to word ids using the vocabulary lookup table. - Then converts the word ids to their commensurate embedding vector. - - Arguments: - inputs: - A tensor of word strings. Typically, of size batch_size x seq_len. - debug: - When True, prints the input strings and their commensurate input_ids. - Defaults to False. - oov_summaries: - When True, log the out-of-vocabulary (OOV) rate to TensorBoard - Defaults to False. + Args: + parser: argparse.ArgumentParser instance obtained from Trainer.get_trainer_parser Returns: - The mapping of input word strings to output embedding vectors. - Given an input of shape ``batch_size x seq_len``, the output has shape - ``batch_size x seq_len x embedding_size``. 
+ argparse.ArgumentParser instance with discretizer-specific arguments added """ - if self.convert_to_lowercase: - inputs = tf.strings.lower(inputs) - if self.num_oov_buckets is None: - lookup_table = index_table_from_tensor(self.vocab, default_value=self.oov_word_id) - else: - lookup_table = index_table_from_tensor(self.vocab, num_oov_buckets=self.num_oov_buckets) - input_ids = lookup_table.lookup(inputs) - - if oov_summaries: - oov_count = tf.reduce_sum( - tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) - ) - valid_count = tf.reduce_sum( - tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) - ) - oov_rate = oov_count / valid_count - tf.summary.scalar('OOV_rate', oov_rate) - - if debug: - - def print_debug(): - return tf.print("input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140) - - with tf.control_dependencies([twml.util.do_every_n_steps(print_debug, 1000)]): - input_ids = tf.identity(input_ids) - - output_embeddings = tf.nn.embedding_lookup( - params=self.weight, ids=input_ids, partition_strategy='div' + + parser.add_argument( + "--embedding.path", + "--embedding_path", + dest="embedding_path", + type=str, + default=None, + help="When specified, loads glove embeddings from .txt glove file", + ) + parser.add_argument( + "--embedding.vocab_size", + "--embedding_vocab_size", + dest="embedding_vocab_size", + type=int, + default=-1, + help="Size of vocabulary. Uses this many of the most frequent terms. Defaults to -1 (use full vocab).", ) - output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) - output_embeddings.set_shape(output_shape) + return parser + + +class EmbeddingLookup(twml.layers.Layer): + """Layer for looking up embeddings. + Transforms a sequence of strings to a sequence of embeddings. + + Args: + vocab_size: + The number of word strings and embeddings in the vocabulary. + output_size: + Long or Integer, dimensionality of the output space. The embedding vector size. + vocab_initializer: + Initializer function for the vocabulary. Required. The initializer should + return a list of strings of size vocab_size. + weight_initializer: + Initializer function for the weight matrix of size vocab_size x output_size. + This argument defaults to zeros_initializer(). + This is valid when the EmbeddingLookup is the first layer of + parameters but should be changed otherwise. + trainable: + Boolean, if `True` adds variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + Defaults to True: trains the embeddings. + num_oov_buckets: + The number of buckets to use for OOV strings. These bucket ids occur after the vocab bucket + ids. Hashing is used to assign OOV strings to these buckets. If `num_oov_buckets` is not + specified, index `OOV_WORD_ID` is used for OOV strings. + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column, does not support yet) + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + dtype: + Defaults to tf.float32. Specifies the dtype of the weights. + use_placeholder: + Defaults to True. 
+            If set to `True`, the initializer is passed via a placeholder. The initializer in this case needs to be of type `keras.initializers.Constant`.
+            If set to `False`, the initializer becomes part of the graph. This can sometimes be beyond what protobuf clients support.
+        checkpoint_dir:
+            Defaults to None.
+            If set to the path of a checkpoint, load embedding from the checkpoint.
+        convert_to_lowercase:
+            Defaults to True.
+            Converts all string inputs to lowercase.
+
+    Notes: If `use_placeholder` is set to `True`, the feed dictionary can be accessed by calling `twml.contrib.initializers.get_init_feed_dict()`.
+    """
+
+    def __init__(
+        self,
+        vocab_size: int,
+        output_size: int,
+        vocab_initializer: tf.keras.initializers.Initializer,
+        weight_initializer: Optional[tf.keras.initializers.Initializer] = None,
+        trainable: bool = True,
+        num_oov_buckets: Optional[int] = None,
+        oov_word_id: Optional[int] = None,
+        name: Optional[str] = None,
+        num_partitions: int = 1,
+        partition_axis: int = 0,
+        weight_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+        dtype: tf.DType = tf.float32,
+        use_placeholder: bool = True,
+        checkpoint_dir: Optional[str] = None,
+        convert_to_lowercase: bool = True,
+        **kwargs,
+    ):
+        super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs)
+        # Weights initialization is set to 0s. This is safe for full sparse layers because
+        # you are supposed to learn your embedding from the label.
+
+        is_constant_init = isinstance(
+            weight_initializer, tf.keras.initializers.Constant
+        )
+        if (
+            use_placeholder
+            and (not is_constant_init)
+            and (weight_initializer is not None)
+        ):
+            raise ValueError("Weight initializer should be a `Constant` or `None`.")
+
+        # Fall back to zeros when no initializer is given, and keep a
+        # reference so that build() can use it.
+        if weight_initializer is None:
+            self.weight_initializer = tf.zeros_initializer()
+        else:
+            self.weight_initializer = weight_initializer
+        self.use_placeholder = use_placeholder
+        self.checkpoint_dir = checkpoint_dir
+        self.convert_to_lowercase = convert_to_lowercase
+
+        self.vocab_initializer = vocab_initializer
+        self.vocab_size = vocab_size
+        self.output_size = output_size
+        self.num_partitions = num_partitions
+        self.partition_axis = partition_axis
+        self.weight_regularizer = weight_regularizer
+        self.trainable = trainable
+        self.oov_word_id = oov_word_id
+        self.num_oov_buckets = num_oov_buckets
+
+        if self.oov_word_id is not None and self.num_oov_buckets is not None:
+            raise ValueError(
+                "At most one of oov_word_id or num_oov_buckets should be specified"
+            )
+        elif self.oov_word_id is None and self.num_oov_buckets is None:
+            self.oov_word_id = OOV_WORD_ID  # use the default OOV word id
+
+        if partition_axis != 0:
+            raise NotImplementedError(
+                "embedding_lookup only supports partition_axis = 0"
+            )
+
+    def build(self, input_shapes: tf.TensorShape) -> None:
+        """
+        creates the ``vocab`` and ``weight`` Variables
+        of shape ``[vocab_size]`` and ``[vocab_size, output_size]`` respectively.
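+
+        When ``use_placeholder`` is True, the embedding weights are routed
+        through a placeholder registered in the init-feed collection, so the
+        init op must be run with that feed dict (illustrative sketch)::
+
+            with tf.Session() as sess:
+                sess.run(
+                    tf.global_variables_initializer(),
+                    feed_dict=twml.contrib.initializers.get_init_feed_dict(),
+                )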
+ """ + partitioner = None + + additional_buckets_for_oov = ( + self.num_oov_buckets if self.num_oov_buckets is not None else 0 + ) + shape = [self.vocab_size + additional_buckets_for_oov, self.output_size] + + if self.use_placeholder: + embedding_weight_initializer = ( + twml.contrib.initializers.PlaceholderInitializer(shape, self.dtype) + ) + tf.add_to_collection( + twml.contrib.initializers.TWML_INIT_FEED_KEY, + {embedding_weight_initializer.value: self.weight_initializer.value}, + ) + else: + embedding_weight_initializer = self.weight_initializer + + if self.num_partitions: + partition_axis = int(self.partition_axis) + partitioner = tf.fixed_size_partitioner( + self.num_partitions, axis=partition_axis + ) + else: + # Regular variables do not like it when you pass both constant tensors and shape + if not callable(self.weight_initializer): + shape = None + + self.vocab = self.add_variable( + "vocab", + initializer=self.vocab_initializer, + shape=[self.vocab_size], + dtype=tf.string, + trainable=False, + ) + + self.weight = self.add_variable( + "weight", + initializer=None + if self.checkpoint_dir is not None + else embedding_weight_initializer, + regularizer=self.weight_regularizer, + shape=shape, + dtype=self.dtype, + trainable=self.trainable, + partitioner=partitioner, + ) + if self.checkpoint_dir is not None: + twml.trainers.trainer.init_from_checkpoint( + self.checkpoint_dir, {"weight": self.weight.name} + ) + + self.built = True + + def call( + self, + inputs: tf.Tensor, + debug: bool = False, + oov_summaries: bool = False, + **kwargs, + ): # pylint: disable=unused-argument + """Converts word strings to word ids using the vocabulary lookup table. + Then converts the word ids to their commensurate embedding vector. + + Args: + inputs: + A tensor of word strings. Typically, of size batch_size x seq_len. + debug: + When True, prints the input strings and their commensurate input_ids. + Defaults to False. + oov_summaries: + When True, log the out-of-vocabulary (OOV) rate to TensorBoard + Defaults to False. + + Returns: + The mapping of input word strings to output embedding vectors. + Given an input of shape ``batch_size x seq_len``, the output has shape + ``batch_size x seq_len x embedding_size``. 
+ """ + if self.convert_to_lowercase: + inputs = tf.strings.lower(inputs) + if self.num_oov_buckets is None: + lookup_table = index_table_from_tensor( + self.vocab, default_value=self.oov_word_id + ) + else: + lookup_table = index_table_from_tensor( + self.vocab, num_oov_buckets=self.num_oov_buckets + ) + input_ids = lookup_table.lookup(inputs) + + if oov_summaries: + oov_count = tf.reduce_sum( + tf.cast(tf.math.equal(input_ids, self.oov_word_id), tf.dtypes.float32) + ) + valid_count = tf.reduce_sum( + tf.cast(tf.math.not_equal(input_ids, PAD_WORD_ID), tf.dtypes.float32) + ) + oov_rate = oov_count / valid_count + tf.summary.scalar("OOV_rate", oov_rate) + + if debug: + + def print_debug(): + return tf.print( + "input_strings:", inputs, "\ninput_ids: ", input_ids, summarize=140 + ) + + with tf.control_dependencies( + [twml.util.do_every_n_steps(print_debug, 1000)] + ): + input_ids = tf.identity(input_ids) + + output_embeddings = tf.nn.embedding_lookup( + params=self.weight, ids=input_ids, partition_strategy="div" + ) + + output_shape = inputs.shape.concatenate(tf.TensorShape([self.output_size])) + output_embeddings.set_shape(output_shape) + + return output_embeddings diff --git a/twml/twml/contrib/layers/factorization_machine.py b/twml/twml/contrib/layers/factorization_machine.py index 3b8adae42..2484d0a8d 100644 --- a/twml/twml/contrib/layers/factorization_machine.py +++ b/twml/twml/contrib/layers/factorization_machine.py @@ -3,177 +3,198 @@ Implementing factorization Layer """ -from twitter.deepbird.sparse.sparse_ops import _pad_empty_outputs +from typing import Optional import tensorflow.compat.v1 as tf +from twitter.deepbird.sparse.sparse_ops import _pad_empty_outputs + import twml from twml.layers.layer import Layer class FactorizationMachine(Layer): - """factorization machine layer class. - This layer implements the factorization machine operation. - The paper is "Factorization Machines" by Steffen Rendle. - TDD: go/tf-fm-tdd - - Arguments: - num_latent_variables: - num of latent variables - The number of parameter in this layer is num_latent_variables x n where n is number of - input features. - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - use_binary_values: - Assume all non zero values are 1. Defaults to False. 
- This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - """ - - def __init__(self, - num_latent_variables=10, - weight_initializer=None, - activation=None, - trainable=True, - name=None, - use_sparse_grads=True, - use_binary_values=False, - weight_regularizer=None, - substract_self_cross=True, - **kwargs): - super(FactorizationMachine, self).__init__(trainable=trainable, name=name, **kwargs) - - if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.num_latent_variables = num_latent_variables - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.substract_self_cross = substract_self_cross - - def build(self, input_shape): - """ - creates``weight`` Variable of shape``[input_size, num_latent_variables]``. - - """ - - shape = [input_shape[1], self.num_latent_variables] - - # There is a 2GB limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - requested_size = input_shape[1] * self.num_latent_variables * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor can not be larger than 2GB. " % - "Requested Dimensions(%d, %d) of type %s (%d bytes total)" - (input_shape[1], self.num_latent_variables, dtype.name)) - - if not callable(self.weight_initializer): - shape = None - - # dense tensor - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - ) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """factorization machine layer class. + This layer implements the factorization machine operation. + The paper is "Factorization Machines" by Steffen Rendle. + TDD: go/tf-fm-tdd Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - + num_latent_variables: + num of latent variables + The number of parameter in this layer is num_latent_variables x n where n is number of + input features. + weight_initializer: + Initializer function for the weight matrix. + This argument defaults to zeros_initializer(). + This is valid when the FullSparse is the first layer of + parameters but should be changed otherwise. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activation: + Activation function (callable). Set it to None to maintain a linear activation. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). 
If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + This parameter can also be a list of binary values if `inputs` passed to `call` a list. """ - raise NotImplementedError - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - A SparseTensor - Returns: - - If `inputs` is `SparseTensor`, then returns a number with cross info - """ - # The following are given: - # - inputs is a sparse tensor, we call it sp_x. - # - The dense_v tensor is a dense matrix, whose row i - # corresponds to the vector V_i. - # weights has shape [num_features, k] - sp_x = inputs - if isinstance(inputs, twml.SparseTensor): - sp_x = inputs.to_tf() - elif not isinstance(sp_x, tf.SparseTensor): - raise TypeError("The sp_x must be of type tf.SparseTensor or twml.SparseTensor") - - indices = sp_x.indices[:, 1] - batch_ids = sp_x.indices[:, 0] - values = tf.reshape(sp_x.values, [-1, 1], name=self.name) - if self.use_sparse_grads: - v = tf.nn.embedding_lookup(self.weight, indices) - # if (self.use_binary_values): - # values = tf.ones(tf.shape(values), dtype=values.dtype) - v_times_x = v * values - # First term: Sum_k [Sum_i (v_ik * x_i)]^2 - all_crosses = tf.segment_sum(v_times_x, batch_ids, name=self.name) - all_crosses_squared = tf.reduce_sum((all_crosses * all_crosses), 1) - - if self.substract_self_cross: - # Second term: Sum_k Sum_i [ (v_ik * x_i)^2 ] - v_times_x_2 = v_times_x**2 - self_crosses = tf.reduce_sum(tf.segment_sum(v_times_x_2, batch_ids, name=self.name), 1) - outputs = all_crosses_squared - self_crosses - else: - outputs = all_crosses_squared - else: - # need to check if prediction is faster with code below - crossTerm = tf.reduce_sum((tf.sparse_tensor_dense_matmul(sp_x, self.weight)**2), 1) - - if self.substract_self_cross: - # compute self-cross term - self_crossTerm = tf.reduce_sum(tf.segment_sum((tf.gather(self.weight, indices) * values)**2, batch_ids), 1) - outputs = crossTerm - self_crossTerm - else: - outputs = crossTerm - - if self.activation is not None: - outputs = self.activation(outputs) - - outputs = tf.reshape(outputs, [-1, 1], name=self.name) - outputs = _pad_empty_outputs(outputs, tf.cast(sp_x.dense_shape[0], tf.int32)) - # set more explicit and static shape to avoid shape inference error - # valueError: The last dimension of the inputs to `Dense` should be defined. 
Found `None`
-    outputs.set_shape([None, 1])
-    return outputs
+    def __init__(
+        self,
+        num_latent_variables: int = 10,
+        weight_initializer: tf.keras.initializers.Initializer = tf.zeros_initializer(),
+        activation: Optional[tf.keras.activations.Activation] = None,
+        trainable: bool = True,
+        name: Optional[str] = None,
+        use_sparse_grads: bool = True,
+        use_binary_values: bool = False,
+        weight_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+        substract_self_cross: bool = True,
+        **kwargs
+    ):
+        super(FactorizationMachine, self).__init__(
+            trainable=trainable,
+            name=name,
+            **kwargs,
+        )
+
+        self.weight_initializer = weight_initializer
+        self.num_latent_variables = num_latent_variables
+        self.activation = activation
+        self.use_sparse_grads = use_sparse_grads
+        self.use_binary_values = use_binary_values
+        self.weight_regularizer = weight_regularizer
+        self.substract_self_cross = substract_self_cross
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        """creates `weight` Variable of shape `[input_size, num_latent_variables]`."""
+
+        shape = [input_shape[1], self.num_latent_variables]
+
+        # There is a 2GB limitation for each tensor because of protobuf.
+        # 2**30 is 1GB. 2 * (2**30) is 2GB.
+        dtype = tf.as_dtype(self.dtype)
+        requested_size = input_shape[1] * self.num_latent_variables * dtype.size
+        if requested_size >= (1 << 31):
+            raise ValueError(
+                "Weight tensor can not be larger than 2GB. "
+                "Requested Dimensions(%d, %d) of type %s (%d bytes total)"
+                % (
+                    input_shape[1],
+                    self.num_latent_variables,
+                    dtype.name,
+                    requested_size,
+                )
+            )
+
+        if not callable(self.weight_initializer):
+            shape = None
+
+        # dense tensor
+        self.weight = self.add_variable(
+            "weight",
+            initializer=self.weight_initializer,
+            regularizer=self.weight_regularizer,
+            shape=shape,
+            dtype=self.dtype,
+            trainable=True,
+        )
+
+        self.built = True
+
+    def compute_output_shape(
+        self, input_shape: tf.TensorShape
+    ):  # pylint: disable=unused-argument
+        """Computes the output shape of the layer given the input shape.
+
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
+
+        Raises NotImplementedError.
+        """
+        raise NotImplementedError
+
+    def call(
+        self, inputs: tf.SparseTensor, **kwargs
+    ):  # pylint: disable=unused-argument
+        """The logic of the layer lives here.
+
+        Args:
+            inputs:
+                A SparseTensor
+
+        Returns:
+            - If `inputs` is `SparseTensor`, then returns a number with cross info
+        """
+        # The following are given:
+        # - inputs is a sparse tensor, we call it sp_x.
+        # - The dense_v tensor is a dense matrix, whose row i
+        #   corresponds to the vector V_i.
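+        # Cross terms follow the standard FM identity
+        #   sum_{i<j} <v_i, v_j> x_i x_j
+        #     = 1/2 * sum_k [ (sum_i v_ik x_i)^2 - sum_i (v_ik x_i)^2 ],
+        # i.e. "square of sums minus sum of squares" per latent dimension k.
+        # Note: this implementation returns the bracketed difference without
+        # the conventional 1/2 factor.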
+ # weights has shape [num_features, k] + sp_x = inputs + if isinstance(inputs, twml.SparseTensor): + sp_x = inputs.to_tf() + elif not isinstance(sp_x, tf.SparseTensor): + raise TypeError( + "The sp_x must be of type tf.SparseTensor or twml.SparseTensor" + ) + + indices = sp_x.indices[:, 1] + batch_ids = sp_x.indices[:, 0] + values = tf.reshape(sp_x.values, [-1, 1], name=self.name) + if self.use_sparse_grads: + v = tf.nn.embedding_lookup(self.weight, indices) + # if (self.use_binary_values): + # values = tf.ones(tf.shape(values), dtype=values.dtype) + v_times_x = v * values + # First term: Sum_k [Sum_i (v_ik * x_i)]^2 + all_crosses = tf.segment_sum(v_times_x, batch_ids, name=self.name) + all_crosses_squared = tf.reduce_sum((all_crosses * all_crosses), 1) + + if self.substract_self_cross: + # Second term: Sum_k Sum_i [ (v_ik * x_i)^2 ] + v_times_x_2 = v_times_x**2 + self_crosses = tf.reduce_sum( + tf.segment_sum(v_times_x_2, batch_ids, name=self.name), 1 + ) + outputs = all_crosses_squared - self_crosses + else: + outputs = all_crosses_squared + else: + # need to check if prediction is faster with code below + crossTerm = tf.reduce_sum( + (tf.sparse_tensor_dense_matmul(sp_x, self.weight) ** 2), 1 + ) + + if self.substract_self_cross: + # compute self-cross term + self_crossTerm = tf.reduce_sum( + tf.segment_sum( + (tf.gather(self.weight, indices) * values) ** 2, batch_ids + ), + 1, + ) + outputs = crossTerm - self_crossTerm + else: + outputs = crossTerm + + if self.activation is not None: + outputs = self.activation(outputs) + + outputs = tf.reshape(outputs, [-1, 1], name=self.name) + outputs = _pad_empty_outputs(outputs, tf.cast(sp_x.dense_shape[0], tf.int32)) + # set more explicit and static shape to avoid shape inference error + # valueError: The last dimension of the inputs to `Dense` should be defined. Found `None` + outputs.set_shape([None, 1]) + return outputs diff --git a/twml/twml/contrib/layers/full_dense.py b/twml/twml/contrib/layers/full_dense.py index ad78a91a4..63990ba4e 100644 --- a/twml/twml/contrib/layers/full_dense.py +++ b/twml/twml/contrib/layers/full_dense.py @@ -2,379 +2,402 @@ """ Implementing Full Dense Layer """ -from twml.layers import Layer +from typing import List, Optional, Tuple, Union import tensorflow.compat.v1 as tf +from tensorflow import keras from tensorflow.python.layers import core +from twml.layers import Layer + class FullDense(Layer): - """ - Full-connected, Dense input layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. 
- weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: - Activation function (callable). Set it to None to maintain a linear activation. - use_bias: - Boolean whether to include a bias parameter in the layer - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weights: - list of underlying weight and bias matrix components. no guarantee on order of elements - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. 
- """ - - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - **kwargs): - super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) - self._output_sizes = self._get_output_partition_sizes(output_size, num_partitions) - self._units = output_size - self._activation = activation - self._weight_initializer = weight_initializer - self._bias_initializer = bias_initializer - self._weight_regularizer = weight_regularizer - self._bias_regularizer = bias_regularizer - self._weight_constraint = weight_constraint - self._bias_constraint = bias_constraint - self._use_bias = use_bias - # NOTE - many initializers depend on fan_in and fan_out - # - as such, initialization here may be different than - # - for a non-partitioned FullDense - self._parts = [core.Dense(units=out_size, - activation=activation, - use_bias=use_bias, - kernel_initializer=weight_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - **kwargs) for out_size in self._output_sizes] - - @staticmethod - def _get_output_partition_sizes(out_size, num_parts): - """ Returns the appropriate output sizes of the partitions """ - boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] - return [k - j for j, k in zip(boundaries[:], boundaries[1:])] - - def build(self, input_shapes): - """ Create the appropriately sized weights and biases in each layer partition """ - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." 
% input_shapes)
-    else:
-      input_shape = input_shapes
-
-    for part in self._parts:
-      part.build(input_shape)
-
-    self.built = True
-
-  @property
-  def units(self):
-    """ Returns the number of output units of the layer """
-    return self._units
-
-  @property
-  def output_size(self):
-    """ Returns the number of output units of the layer """
-    return self._units
-
-  @property
-  def activation(self):
-    """ Returns the activation function """
-    return self._activation
-
-  @property
-  def weight_initializer(self):
-    """ Returns the weight_initializer """
-    return self._weight_initializer
-
-  @property
-  def weight_regularizer(self):
-    """ Returns the weight_regularizer """
-    return self._weight_regularizer
-
-  @property
-  def weight_constraint(self):
-    """ Returns the weight_constraint """
-    return self._weight_constraint
-
-  @property
-  def bias_initializer(self):
-    """ Returns the bias_initializer """
-    return self._bias_initializer
-
-  @property
-  def bias_regularizer(self):
-    """ Returns the bias_regularizer """
-    return self._bias_regularizer
-
-  @property
-  def bias_constraint(self):
-    """ Returns the bias_constraint """
-    return self._bias_constraint
-
-  @property
-  def use_bias(self):
-    """ Returns whether a bias is used in the layer """
-    return self._use_bias
-
-  @property
-  def trainable_variables(self):
-    """ Returns the trainable variables of the layer """
-    trainable_vars = []
-    for pt in self._parts:
-      trainable_vars += pt.trainable_variables
-    return trainable_vars
-
-  @property
-  def trainable_weights(self):
-    """ Returns the trainable variables of the layer """
-    return self.trainable_variables
-
-  @property
-  def non_trainable_variables(self):
-    """ Returns the non-trainable variables of the layer """
-    non_trainable_vars = []
-    for pt in self._parts:
-      non_trainable_vars += pt.non_trainable_variables
-    return non_trainable_vars
-
-  @property
-  def non_trainable_weights(self):
-    """ Returns the non-trainable variables of the layer """
-    return self.non_trainable_variables
-
-  @property
-  def variables(self):
-    """ Returns a list of all weights and biases in this layer """
-    layer_vars = []
-    for pt in self._parts:
-      layer_vars += pt.weights
-    return layer_vars
-
-  @property
-  def weights(self):
-    """ Returns a list of all weights and biases in this layer """
-    return self.variables
-
-  @property
-  def dtype(self):
-    """ Returns the dtype of the layers weights """
-    return self._parts[0].dtype
-
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """The logic of the layer lives here.
-
-    Arguments:
-      inputs:
-        A dense Tensor or a list of such.
-        If `inputs` is a list, all tensors must have same `dense_shape`.
+    """
+    Fully-connected, Dense input layer class.
+    This layer implements the operation:
+
+    .. code-block:: python
+
+        outputs = activation(inputs.weight + bias)
+
+    Where ``activation`` is the activation function passed as the ``activation``
+    argument (if not ``None``), ``weight`` is a weights matrix created by the layer,
+    and ``bias`` is a bias vector created by the layer.
+
+    However, this layer breaks up ``weight`` into ``num_partitions`` parts,
+    for the purpose of even distribution of weights across parameter servers
+    for distributed training.
+
+    Note - This layer is created to allow distributed training optimizations,
+    but can also be used for single node training (e.g. hogwild) without
+    code modification
+
+    Args:
+        output_size:
+            Integer or Long, dimensionality of the output space.
+        weight_initializer:
+            Initializer function for the weight matrix.
+        weight_regularizer:
+            Regularizer function for the weight matrix.
+            Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect.
+        weight_constraint:
+            An optional projection function to be applied to the
+            weight after being updated by an `Optimizer` (e.g. used to implement
+            norm constraints or value constraints for layer weights). The function
+            must take as input the unprojected variable and must return the
+            projected variable (which must have the same shape). Constraints are
+            not safe to use when doing asynchronous distributed training.
+        bias_constraint:
+            An optional projection function to be applied to the
+            bias after being updated by an `Optimizer`.
+        num_partitions:
+            Number of pieces to partition the weights into. This layer does
+            column partitioning of the weights, which is equivalent to
+            processing the input tensor with multiple fully connected layers
+            of smaller output size, and then concatenating these outputs
+            (see the example at the end of this docstring).
+        activation:
+            Activation function (callable). Set it to None to maintain a linear activation.
+        use_bias:
+            Boolean whether to include a bias parameter in the layer
+        bias_initializer:
+            Initializer function for the bias.
+        bias_regularizer:
+            Regularizer function for the bias.
+            Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect.
+        activity_regularizer:
+            Regularizer function for the output.
+        trainable:
+            Boolean, if `True` also add variables to the graph collection
+            ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable
+            `_).
+        name:
+            String, the name of the layer. Layers with the same name will
+            share weights, but to avoid mistakes we require ``reuse=True`` in such cases.
+
+    Properties:
+        output_size:
+            Python integer, dimensionality of the output space.
+        activation:
+            Activation function (callable).
+        weight_initializer:
+            Initializer instance (or name) for the weight matrix.
+        bias_initializer:
+            Initializer instance (or name) for the bias.
+        weights:
+            list of underlying weight and bias matrix components. no guarantee on order of elements
+        weight_regularizer:
+            Regularizer instance for the weight matrix (callable)
+        bias_regularizer:
+            Regularizer instance for the bias (callable).
+        activity_regularizer:
+            Regularizer instance for the output (callable)
+        weight_constraint:
+            Constraint function for the weight matrix.
+        bias_constraint:
+            Constraint function for the bias.
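+
+    Example (illustrative sketch): with ``output_size=10`` and
+    ``num_partitions=3`` the weight columns are split 3/3/4, so the layer
+    behaves like three smaller Dense layers whose outputs are concatenated::
+
+        layer = FullDense(output_size=10, num_partitions=3, activation=tf.nn.relu)
+        outputs = layer(inputs)  # same shape as a single Dense(10) output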
+ """ + + def __init__( + self, + output_size: int, + weight_initializer: Optional[keras.initializers.Initializer] = None, + weight_regularizer: Optional[keras.regularizers.Regularizer] = None, + weight_constraint: Optional[keras.constraints.Constraint] = None, + bias_constraint: Optional[keras.constraints.Constraint] = None, + num_partitions: int = 3, + activation: Optional[tf.keras.activations.Activation] = None, + use_bias: bool = True, + bias_initializer: keras.initializers.Initializer = tf.zeros_initializer(), + bias_regularizer: Optional[keras.regularizers.Regularizer] = None, + activity_regularizer: Optional[keras.regularizers.Regularizer] = None, + trainable: bool = True, + name: Optional[str] = None, + **kwargs + ): + super(FullDense, self).__init__(trainable=trainable, name=name, **kwargs) + self._output_sizes = self._get_output_partition_sizes( + output_size, num_partitions + ) + self._units = output_size + self._activation = activation + self._weight_initializer = weight_initializer + self._bias_initializer = bias_initializer + self._weight_regularizer = weight_regularizer + self._bias_regularizer = bias_regularizer + self._weight_constraint = weight_constraint + self._bias_constraint = bias_constraint + self._use_bias = use_bias + # NOTE - many initializers depend on fan_in and fan_out + # - as such, initialization here may be different than + # - for a non-partitioned FullDense + self._parts = [ + core.Dense( + units=out_size, + activation=activation, + use_bias=use_bias, + kernel_initializer=weight_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=weight_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + for out_size in self._output_sizes + ] + + @staticmethod + def _get_output_partition_sizes(out_size: int, num_parts: int) -> List[int]: + """Returns the appropriate output sizes of the partitions""" + boundaries = [out_size * n // num_parts for n in range(num_parts + 1)] + return [k - j for j, k in zip(boundaries[:], boundaries[1:])] + + def build(self, input_shapes: Union[tf.TensorShape, List[tf.TensorShape]]): + """Create the appropriately sized weights and biases in each layer partition""" + if isinstance(input_shapes, (list, tuple)): + input_shape = input_shapes[0] + is_compatible = True + for other_shape in input_shapes[1:]: + is_compatible &= input_shape.is_compatible_with(other_shape) + if not is_compatible: + raise ValueError("Input shapes %s are not compatible." 
% input_shapes)
+        else:
+            input_shape = input_shapes
+
+        for part in self._parts:
+            part.build(input_shape)
+
+        self.built = True
+
+    @property
+    def units(self) -> int:
+        """Returns the number of output units of the layer"""
+        return self._units
+
+    @property
+    def output_size(self) -> int:
+        """Returns the number of output units of the layer"""
+        return self._units
+
+    @property
+    def activation(self) -> Optional[tf.keras.activations.Activation]:
+        """Returns the activation function"""
+        return self._activation
+
+    @property
+    def weight_initializer(self) -> Optional[keras.initializers.Initializer]:
+        """Returns the weight_initializer"""
+        return self._weight_initializer
+
+    @property
+    def weight_regularizer(self) -> Optional[keras.regularizers.Regularizer]:
+        """Returns the weight_regularizer"""
+        return self._weight_regularizer
+
+    @property
+    def weight_constraint(self) -> Optional[keras.constraints.Constraint]:
+        """Returns the weight_constraint"""
+        return self._weight_constraint
+
+    @property
+    def bias_initializer(self) -> Optional[keras.initializers.Initializer]:
+        """Returns the bias_initializer"""
+        return self._bias_initializer
+
+    @property
+    def bias_regularizer(self) -> Optional[keras.regularizers.Regularizer]:
+        """Returns the bias_regularizer"""
+        return self._bias_regularizer
+
+    @property
+    def bias_constraint(self) -> Optional[keras.constraints.Constraint]:
+        """Returns the bias_constraint"""
+        return self._bias_constraint
+
+    @property
+    def use_bias(self) -> bool:
+        """Returns whether a bias is used in the layer"""
+        return self._use_bias
+
+    @property
+    def trainable_variables(self) -> List[tf.Variable]:
+        """Returns the trainable variables of the layer"""
+        trainable_vars = []
+        for pt in self._parts:
+            trainable_vars += pt.trainable_variables
+        return trainable_vars
+
+    @property
+    def trainable_weights(self) -> List[tf.Variable]:
+        """Returns the trainable variables of the layer"""
+        return self.trainable_variables
+
+    @property
+    def non_trainable_variables(self) -> List[tf.Variable]:
+        """Returns the non-trainable variables of the layer"""
+        non_trainable_vars = []
+        for pt in self._parts:
+            non_trainable_vars += pt.non_trainable_variables
+        return non_trainable_vars
+
+    @property
+    def non_trainable_weights(self) -> List[tf.Variable]:
+        """Returns the non-trainable variables of the layer"""
+        return self.non_trainable_variables
+
+    @property
+    def variables(self) -> List[tf.Variable]:
+        """Returns a list of all weights and biases in this layer"""
+        layer_vars = []
+        for pt in self._parts:
+            layer_vars += pt.weights
+        return layer_vars
+
+    @property
+    def weights(self) -> List[tf.Variable]:
+        """Returns a list of all weights and biases in this layer"""
+        return self.variables
+
+    @property
+    def dtype(self) -> tf.DType:
+        """Returns the dtype of the layer's weights"""
+        return self._parts[0].dtype
+
+    def call(
+        self,
+        inputs: Union[
+            tf.SparseTensor, Union[List[tf.SparseTensor], Tuple[tf.SparseTensor]]
+        ],
+        **kwargs
+    ):  # pylint: disable=unused-argument
+        """The logic of the layer lives here.
+
+        Args:
+            inputs:
+                A dense Tensor or a list of such.
+                If `inputs` is a list, all tensors must have same `dense_shape`.
+
+        Returns:
+            - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`.
+            - If `inputs` is a `list[SparseTensor]`, then returns
+              `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`.
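The call body that follows concatenates each partition's output and then sums across the input list. A small standalone numpy sketch of that combine step (illustrative only; shapes and values are hypothetical):

import numpy as np

# Hypothetical partition outputs: two inputs, three partitions of sizes 3+3+4.
parts_per_input = [
    [np.ones((2, 3)), np.ones((2, 3)), np.ones((2, 4))],    # input 0
    [np.zeros((2, 3)), np.zeros((2, 3)), np.zeros((2, 4))],  # input 1
]
# Per input: concatenate partition outputs along the last axis -> (2, 10).
outputs = [np.concatenate(parts, axis=-1) for parts in parts_per_input]
# Across inputs: elementwise sum, the role tf.accumulate_n plays in call().
combined = sum(outputs)
assert combined.shape == (2, 10)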
+ """ + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + outputs = [] + for inp in inputs: + part_outputs = [part(inp) for part in self._parts] + outputs.append(tf.concat(part_outputs, axis=-1)) + + return tf.accumulate_n(outputs) + + +def full_dense( + inputs: tf.Tensor, + output_size: int, + weight_initializer: Optional[keras.initializers.Initializer] = None, + weight_regularizer: Optional[keras.regularizers.Regularizer] = None, + weight_constraint: Optional[keras.constraints.Constraint] = None, + bias_constraint: Optional[keras.constraints.Constraint] = None, + num_partitions: int = 3, + activation: Optional[tf.keras.activations.Activation] = None, + use_bias: bool = True, + bias_initializer: keras.initializers.Initializer = tf.zeros_initializer(), + bias_regularizer: Optional[keras.regularizers.Regularizer] = None, + activity_regularizer: Optional[keras.regularizers.Regularizer] = None, + trainable: bool = True, + name: Optional[str] = None, + reuse: Optional[bool] = None, + **kwargs +): + """Functional interface for the fully-connected dense-input layer. + This layer implements the operation: + `outputs = activation(inputs.weight + bias)` + Where `activation` is the activation function passed as the `activation` + argument (if not `None`), `weight` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). + + However, this layer breaks up ``weight`` into ``num_partitions`` parts, + for the purpose of even disribution of weights across parameter servers + for distributed training. + + Note - This layer is created to allow distributed training optimizations, + but can also be used for single node training (e.g. hogwild) without + code modification + + Args: + inputs: Tensor input. + output_size: Integer or Long, dimensionality of the output space. + weight_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.get_variable`. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + num_partitions: + Number of pieces to partition the weights into. This layer does + column partitioning of the weights, which is equivalent to + processing the input tensor with multiple fully connected layers + of smaller output size, and then concatenating these outputs + activation: Activation function (callable). Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + bias_initializer: + Initializer function for the bias. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. 
+ trainable: + Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: + String, the name of the layer. + reuse: + Boolean, whether to reuse the weights of a previous layer + by the same name. Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + accumulate_n([sp_a * dense_b for sp_a in inputs])`. + Output tensor with shape `inputs.shape[:-1] + [output_size]`. """ if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - outputs = [] - for inp in inputs: - part_outputs = [part(inp) for part in self._parts] - outputs.append(tf.concat(part_outputs, axis=-1)) - - return tf.accumulate_n(outputs) - - -def full_dense(inputs, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=3, - activation=None, - use_bias=True, - bias_initializer=tf.zeros_initializer(), - bias_regularizer=None, - activity_regularizer=None, - trainable=True, - name=None, - reuse=None, - **kwargs): - """Functional interface for the fully-connected dense-input layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - However, this layer breaks up ``weight`` into ``num_partitions`` parts, - for the purpose of even disribution of weights across parameter servers - for distributed training. - - Note - This layer is created to allow distributed training optimizations, - but can also be used for single node training (e.g. hogwild) without - code modification - - Arguments: - inputs: Tensor input. - output_size: Integer or Long, dimensionality of the output space. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - num_partitions: - Number of pieces to partition the weights into. This layer does - column partitioning of the weights, which is equivalent to - processing the input tensor with multiple fully connected layers - of smaller output size, and then concatenating these outputs - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - bias_initializer: - Initializer function for the bias. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. 
- trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor with shape `inputs.shape[:-1] + [output_size]`. - """ - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - dtype = inputs[0].dtype.base_dtype - - layer = FullDense(output_size=output_size, - weight_initializer=weight_initializer, - weight_regularizer=weight_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - num_partitions=num_partitions, - activation=activation, - use_bias=use_bias, - bias_initializer=bias_initializer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - trainable=trainable, - name=name, - dtype=dtype, - _scope=name, - _reuse=reuse, - **kwargs) - - return layer(inputs) + inputs = [inputs] + + dtype = inputs[0].dtype.base_dtype + + layer = FullDense( + output_size=output_size, + weight_initializer=weight_initializer, + weight_regularizer=weight_regularizer, + weight_constraint=weight_constraint, + bias_constraint=bias_constraint, + num_partitions=num_partitions, + activation=activation, + use_bias=use_bias, + bias_initializer=bias_initializer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + trainable=trainable, + name=name, + dtype=dtype, + _scope=name, + _reuse=reuse, + **kwargs + ) + + return layer(inputs) diff --git a/twml/twml/contrib/layers/hashed_percentile_discretizer.py b/twml/twml/contrib/layers/hashed_percentile_discretizer.py index b32c3be8d..3dc99a6e1 100644 --- a/twml/twml/contrib/layers/hashed_percentile_discretizer.py +++ b/twml/twml/contrib/layers/hashed_percentile_discretizer.py @@ -4,14 +4,16 @@ """ -from twitter.deepbird.util.hashing import ( - integer_multiplicative_hashing_uniform, - integer_multiplicative_hashing, -) # noqa: F401 +from typing import Callable, Optional -from libtwml import percentile_discretizer_bin_indices import numpy as np import tensorflow.compat.v1 as tf +from libtwml import percentile_discretizer_bin_indices +from twitter.deepbird.util.hashing import ( # noqa: F401 + integer_multiplicative_hashing, + integer_multiplicative_hashing_uniform, +) + import twml from twml.layers.layer import Layer from twml.layers.partition import Partition @@ -19,199 +21,219 @@ class HashedPercentileDiscretizer(Layer): - """ - HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator - after accumulating data - and performing minimum description length (PercentileDiscretizer) calibration. - - HashedPercentileDiscretizer takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an HashedPercentileDiscretizer - bin. - Each HashedPercentileDiscretizer input feature is converted to n_bin bins. - Each HashedPercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values - per bin is roughly equal (for each given HashedPercentileDiscretizer feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - The difference between this layer and PercentileDiscretizer is that the - DeterministicPercentileDiscretize always assigns the same output id in the SparseTensor to the - same input feature id + bin. 
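Stepping back to FullDense._get_output_partition_sizes above: a minimal standalone sketch of the same arithmetic (re-stated here for illustration), showing that the parts are near-equal and always sum to output_size:

def get_output_partition_sizes(out_size, num_parts):
    # Integer boundaries 0 <= b_0 <= ... <= b_num_parts == out_size.
    boundaries = [out_size * n // num_parts for n in range(num_parts + 1)]
    return [k - j for j, k in zip(boundaries, boundaries[1:])]

assert get_output_partition_sizes(100, 3) == [33, 33, 34]  # sums to 100
assert get_output_partition_sizes(7, 3) == [2, 2, 3]       # sums to 7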
This is useful if you want to user transfer learning on pre-trained
-  sparse to dense embedding layers, but re-calibrate your discretizer on newer data.
-  """
-
-  def __init__(self, n_feature, n_bin, out_bits,
-               bin_values=None, hash_keys=None, hash_values=None,
-               bin_ids=None, feature_offsets=None,
-               hash_fn=integer_multiplicative_hashing_uniform, **kwargs):
-    """
-    Creates a non-initialized `HashedPercentileDiscretizer` object.
-    Before using the table you will have to initialize it. After initialization
-    the table will be immutable.
-
-    Parent class args:
-      see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer)
-      for documentation of parent class arguments.
-
-    Required args:
-      n_feature:
-        number of unique features accumulated during HashedPercentileDiscretizer calibration.
-        This is the number of features in the hash map.
-        Used to initialize bin_values, hash_keys, hash_values,
-        bin_ids, bin_values and feature_offsets.
-      n_bin:
-        number of HashedPercentileDiscretizer bins used for
-        HashedPercentileDiscretizer calibration. Used to initialize bin_values, hash_keys,
-        hash_values, bin_ids, bin_values and feature_offsets.
-      out_bits:
-        Determines the maximum value for output feature IDs.
-        The dense_shape of the SparseTensor returned by lookup(x)
-        will be [x.shape[0], 1 << output_bits].
-
-    Optional args:
-      hash_keys:
-        contains the features ID that HashedPercentileDiscretizer discretizes and knows
-        about. The hash map (hash_keys->hash_values) is used for two reasons:
-        1. divide inputs into two feature spaces:
-           HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer
-        2. transate the HashedPercentileDiscretizer features into a hash_feature ID that
-           HashedPercentileDiscretizer understands.
-        The hash_map is expected to contain n_feature items.
-      hash_values:
-        translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer.
-      bin_ids:
-        a 1D Tensor of size n_feature * n_bin + 1 which contains
-        unique IDs to which the HashedPercentileDiscretizer features will be translated to.
-        For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
-        the most efficient output space.
-      bin_values:
-        a 1D Tensor aligned with bin_ids.
-        For a given hash_feature ID j, it's value bin's are indexed between
-        `j*n_bin` and `j*n_bin + n_bin-1`.
-        As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
-        and a inputs value between
-        `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
-      feature_offsets:
-        a 1D Tensor specifying the starting location of bins for a given feature id.
-        For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
-      hash_fn:
-        a function that takes in `feature_ids`, `bucket_indices` and `output_size` and
-        hashes the bucketed features into the `output_size` buckets. The default uses knuth's
-        multiplicative hashing
+    HashedPercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator
+    after accumulating data
+    and performing minimum description length (PercentileDiscretizer) calibration.
+
+    HashedPercentileDiscretizer takes sparse continuous features and converts them to sparse
+    binary features. Each binary output feature is associated with a HashedPercentileDiscretizer
+    bin.
+    Each HashedPercentileDiscretizer input feature is converted to n_bin bins.
+    Each HashedPercentileDiscretizer calibration tries to find bin delimiters such
+    that the number of feature values
+    per bin is roughly equal (for each given HashedPercentileDiscretizer feature).
+    Note that if an input feature is rarely used, its associated output bins/features will be too.
+    The difference between this layer and PercentileDiscretizer is that the
+    DeterministicPercentileDiscretize always assigns the same output id in the SparseTensor to the
+    same input feature id + bin. This is useful if you want to use transfer learning on pre-trained
+    sparse to dense embedding layers, but re-calibrate your discretizer on newer data.
+    """
-    super(HashedPercentileDiscretizer, self).__init__(**kwargs)
-
-    max_discretizer_feature = n_feature * (n_bin + 1)
-    self._n_feature = n_feature
-    self._n_bin = n_bin
-
-    if not self.built:
-      self.build(input_shape=None)
-
-    # build variables
-    self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
-    self._out_bits = out_bits
-
-    hash_keys = hash_keys
-    if hash_keys is None:
-      hash_keys = np.empty(n_feature, dtype=np.int64)
-
-    hash_values = hash_values
-    if hash_values is None:
-      hash_values = np.empty(n_feature, dtype=np.int64)
-    initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values)
-    self.hash_map = tf.lookup.StaticHashTable(initializer, -1)
-    self.bin_ids = bin_ids
-    if bin_ids is None:
-      bin_ids = np.empty(max_discretizer_feature, dtype=np.int64)
-
-    self.bin_values = bin_values
-    if bin_values is None:
-      bin_values = np.empty(max_discretizer_feature, dtype=np.float32)
-
-    self.feature_offsets = feature_offsets
-    if feature_offsets is None:
-      feature_offsets = np.empty(n_feature, dtype=np.int64)
-
-    self.hash_fn = hash_fn
-
-  def build(self, input_shape):  # pylint: disable=unused-argument
-    """
-    Creates the variables of the layer:
-    hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size.
-    """
-    # build layers
-    self.partition = Partition()
-    self.stitch = Stitch()
-    # make sure this is last
-    self.built = True
-
-  def call(self, inputs, **kwargs):
-    """Looks up `keys` in a table, outputs the corresponding values.
-
-    Implements HashedPercentileDiscretizer inference where inputs are intersected with a
-    hash_map.
-    Part of the inputs are discretized using twml.discretizer
-    to produce a discretizer_output SparseTensor.
-    This SparseTensor is then joined with the original inputs SparseTensor,
-    but only for the inputs keys that did not get discretized.
-
-    Args:
-      inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for
-        discretization. It has a dense_shape of [batch_size, input_size]
-      name: A name for the operation (optional).
-    Returns:
-      A `SparseTensor` of the same type as `inputs`.
-      Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits].
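A short note on out_bits, assuming the semantics documented above: it caps the output feature space at 1 << out_bits, so every hashed (feature id, bin index) pair must land in that range. An illustrative standalone sketch (the masking shown here is only one way to reduce a hash; the actual reduction lives in hash_fn):

out_bits = 22
output_size = 1 << out_bits  # 4194304 possible output feature IDs

# Any hash of a (feature_id, bin_index) pair is reduced into [0, output_size),
# e.g. by keeping only the low out_bits bits.
hashed = 123_456_789_123  # hypothetical raw hash value
assert 0 <= (hashed & (output_size - 1)) < output_size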
- """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - hashed_keys = self.hash_map.lookup(keys) - hashed_keys = tf.cast(hashed_keys, tf.int64) - - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - found = tf.reshape(found, [-1]) - continuous_feature_ids = tf.boolean_mask(keys, found) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_discretizer_keys, discretizer_in_keys = key - non_discretizer_vals, discretizer_in_vals = vals - - non_discretizer_keys = twml.util.limit_bits(non_discretizer_keys, self._out_bits) - self.non_discretizer_keys = non_discretizer_keys - - # run HashedPercentileDiscretizer on the keys/values it knows about - output = percentile_discretizer_bin_indices(discretizer_in_keys, - discretizer_in_vals, - self.bin_ids, - self.bin_values, - self.feature_offsets) - discretizer_bucket_idxs, discretizer_vals = output - new_discretizer_keys = self.hash_fn(continuous_feature_ids, discretizer_bucket_idxs, - self.output_size) - # Stitch the keys and values from discretizer and non discretizer indices back, with help - # of the Stitch Layer - self.discretizer_out_keys = new_discretizer_keys - - concat_data = self.stitch([non_discretizer_vals, discretizer_vals], - [non_discretizer_keys, new_discretizer_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() + def __init__( + self, + n_feature: int, + n_bin: int, + out_bits: int, + bin_values: Optional[tf.Tensor] = None, + hash_keys: Optional[tf.Tensor] = None, + hash_values: Optional[tf.Tensor] = None, + bin_ids: Optional[tf.Tensor] = None, + feature_offsets: Optional[tf.Tensor] = None, + hash_fn: Callable[ + [tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor + ] = integer_multiplicative_hashing_uniform, + **kwargs + ): + """ + Creates a non-initialized `HashedPercentileDiscretizer` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during HashedPercentileDiscretizer calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of HashedPercentileDiscretizer bins used for + HashedPercentileDiscretizer calibration. Used to initialize bin_values, hash_keys, + hash_values, bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + bin_values: + a 1D Tensor aligned with bin_ids. + For a given hash_feature ID j, it's value bin's are indexed between + `j*n_bin` and `j*n_bin + n_bin-1`. 
+                As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j
+                and an input value between
+                `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`.
+            hash_keys:
+                contains the features ID that HashedPercentileDiscretizer discretizes and knows
+                about. The hash map (hash_keys->hash_values) is used for two reasons:
+                1. divide inputs into two feature spaces:
+                   HashedPercentileDiscretizer vs non-HashedPercentileDiscretizer
+                2. translate the HashedPercentileDiscretizer features into a hash_feature ID that
+                   HashedPercentileDiscretizer understands.
+                The hash_map is expected to contain n_feature items.
+            hash_values:
+                translates the feature IDs into hash_feature IDs for HashedPercentileDiscretizer.
+            bin_ids:
+                a 1D Tensor of size n_feature * n_bin + 1 which contains
+                unique IDs to which the HashedPercentileDiscretizer features will be translated to.
+                For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce
+                the most efficient output space.
+            feature_offsets:
+                a 1D Tensor specifying the starting location of bins for a given feature id.
+                For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')).
+            hash_fn:
+                a function that takes in `feature_ids`, `bucket_indices` and `output_size` and
+                hashes the bucketed features into the `output_size` buckets. The default uses
+                Knuth's multiplicative hashing.
+        """
+        super(HashedPercentileDiscretizer, self).__init__(**kwargs)
+
+        max_discretizer_feature = n_feature * (n_bin + 1)
+        self._n_feature = n_feature
+        self._n_bin = n_bin
+
+        if not self.built:
+            self.build(input_shape=None)
+
+        # build variables
+        self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64)
+        self._out_bits = out_bits
+
+        hash_keys = hash_keys
+        if hash_keys is None:
+            hash_keys = np.empty(n_feature, dtype=np.int64)
+
+        hash_values = hash_values
+        if hash_values is None:
+            hash_values = np.empty(n_feature, dtype=np.int64)
+
+        initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values)
+        self.hash_map = tf.lookup.StaticHashTable(initializer, -1)
+        self.bin_ids = bin_ids
+        if bin_ids is None:
+            bin_ids = np.empty(max_discretizer_feature, dtype=np.int64)
+
+        self.bin_values = bin_values
+        if bin_values is None:
+            bin_values = np.empty(max_discretizer_feature, dtype=np.float32)
+
+        self.feature_offsets = feature_offsets
+        if feature_offsets is None:
+            feature_offsets = np.empty(n_feature, dtype=np.int64)
+
+        self.hash_fn = hash_fn
+
+    def build(self, input_shape):  # pylint: disable=unused-argument
+        """
+        Creates the variables of the layer:
+        hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size.
+        """
+        # build layers
+        self.partition = Partition()
+        self.stitch = Stitch()
+        # make sure this is last
+        self.built = True
+
+    def call(self, inputs: twml.SparseTensor, **kwargs) -> twml.SparseTensor:
+        """Looks up `keys` in a table, outputs the corresponding values.
+
+        Implements HashedPercentileDiscretizer inference where inputs are intersected with a
+        hash_map.
+        Part of the inputs are discretized using twml.discretizer
+        to produce a discretizer_output SparseTensor.
+        This SparseTensor is then joined with the original inputs SparseTensor,
+        but only for the inputs keys that did not get discretized.
+
+        Args:
+            inputs: A 2D SparseTensor that is input to HashedPercentileDiscretizer for
+                discretization. It has a dense_shape of [batch_size, input_size]
+            name: A name for the operation (optional).
+        Returns:
+            A `SparseTensor` of the same type as `inputs`.
+ Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. + """ + if isinstance(inputs, tf.SparseTensor): + inputs = twml.SparseTensor.from_tf(inputs) + + assert isinstance(inputs, twml.SparseTensor) + + # sparse column indices + ids = inputs.ids + # sparse row indices + keys = inputs.indices + # sparse values + vals = inputs.values + + hashed_keys = self.hash_map.lookup(keys) + hashed_keys = tf.cast(hashed_keys, tf.int64) + + found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) + partition_ids = tf.cast(found, tf.int32) + + found = tf.reshape(found, [-1]) + continuous_feature_ids = tf.boolean_mask(keys, found) + + vals, key, indices = self.partition( + partition_ids, vals, tf.where(found, hashed_keys, keys) + ) + non_discretizer_keys, discretizer_in_keys = key + non_discretizer_vals, discretizer_in_vals = vals + + non_discretizer_keys = twml.util.limit_bits( + non_discretizer_keys, self._out_bits + ) + self.non_discretizer_keys = non_discretizer_keys + + # run HashedPercentileDiscretizer on the keys/values it knows about + output = percentile_discretizer_bin_indices( + discretizer_in_keys, + discretizer_in_vals, + self.bin_ids, + self.bin_values, + self.feature_offsets, + ) + discretizer_bucket_idxs, discretizer_vals = output + new_discretizer_keys = self.hash_fn( + continuous_feature_ids, discretizer_bucket_idxs, self.output_size + ) + # Stitch the keys and values from discretizer and non discretizer indices back, with help + # of the Stitch Layer + self.discretizer_out_keys = new_discretizer_keys + + concat_data = self.stitch( + [non_discretizer_vals, discretizer_vals], + [non_discretizer_keys, new_discretizer_keys], + indices, + ) + + concat_vals, concat_keys = concat_data + + # Generate output shape using _compute_output_shape + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_shape = [batch_size, self.output_size] + return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() diff --git a/twml/twml/contrib/layers/hashing_discretizer.py b/twml/twml/contrib/layers/hashing_discretizer.py index 2a8244f4b..19ee51f9d 100644 --- a/twml/twml/contrib/layers/hashing_discretizer.py +++ b/twml/twml/contrib/layers/hashing_discretizer.py @@ -4,153 +4,170 @@ """ +from typing import Optional + import libtwml import tensorflow.compat.v1 as tf + import twml from twml.constants import HashingDiscretizerOptions from twml.layers.layer import Layer class HashingDiscretizer(Layer): - """A layer that discretizes continuous features, with hashed feature assignments - - HashingDiscretizer converts sparse continuous features into sparse - binary features. Each binary output feature indicates the presence of a - value in a HashingDiscretizer bin. + """A layer that discretizes continuous features, with hashed feature assignments - Each calibrated HashingDiscretizer input feature is converted to n_bin+1 bins. + HashingDiscretizer converts sparse continuous features into sparse + binary features. Each binary output feature indicates the presence of a + value in a HashingDiscretizer bin. - - n_bin bin boundaries for each feature (i.e. 
len(bin_vals[id])==n_bin) defines n_bin+1 bins - - bin assignment = sum(bin_vals 0: - # pass all inputs to the c++ op - # the op determines whether to discretize (when a feature is calibrated), - # or whether to simply limit bits and pass through (when not calibrated) - # NOTE - Hashing is done in C++ - discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( - input_ids=keys, # Input - input_vals=vals, # Input - bin_vals=self._bin_vals, # Input - feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr - n_bin=self._n_bin, # Attr - output_bits=self._out_bits, # Attr - cost_per_unit=self.cost_per_unit, # Attr - options=self._options, # Attr - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_size = tf.convert_to_tensor(1 << self._out_bits, tf.int64) - output_shape = [batch_size, output_size] - - return twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() + def __init__( + self, + feature_ids: tf.Tensor, + bin_vals: tf.Tensor, + n_bin: int, + out_bits: int, + cost_per_unit: int = 500, + options: Optional[HashingDiscretizerOptions] = None, + **kwargs + ): + """ + Creates a non-initialized `HashingDiscretizer` object. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + feature_ids (1D int64 numpy array): + - list of feature IDs that have been calibrated and have corresponding + bin boundary values in the bin_vals array + - bin values for feature feature_ids[i] live at bin_vals[i*n_bin:(i+1)*n_bin] + bin_vals (1D float numpy array): + - These are the bin boundary values for each calibrated feature + - len(bin_vals) = n_bin*len(feature_ids) + n_bin (int): + - number of HashingDiscretizer bins is actually n_bin + 1 + - ***Note*** that if a value N is passed for the value of n_bin to + HashingDiscretizerCalibrator, then HashingDiscretizerCalibrator + will generate N+1 bin boundaries for each feature, and hence there + will actually be N+2 potential bins for each feature + out_bits (int): + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + cost_per_unit (int): + - heuristic for intra op multithreading. approximate nanoseconds per input value. + options (int or None for default): + - Selects behavior of the op. Default is lower_bound and integer_multiplicative_hashing. + - Use values in twml.constants.HashingDiscretizerOptions to select options as follows + choose exactly one of HashingDiscretizerOptions.{SEARCH_LOWER_BOUND, SEARCH_LINEAR, SEARCH_UPPER_BOUND} + choose exactly one of HashingDiscretizerOptions.{HASH_32BIT, HASH_64BIT} + Bitwise OR these together to construct the options input. 
+ For example, `options=(HashingDiscretizerOptions.SEARCH_UPPER_BOUND | HashingDiscretizerOptions.HASH_64BIT)` + """ + + super(HashingDiscretizer, self).__init__(**kwargs) + self._feature_ids = feature_ids + self._bin_vals = bin_vals + self._n_bin = n_bin + self._out_bits = out_bits + self.cost_per_unit = cost_per_unit + if options is None: + options = ( + HashingDiscretizerOptions.SEARCH_LOWER_BOUND + | HashingDiscretizerOptions.HASH_32BIT + ) + self._options = options + + if not self.built: + self.build(input_shape=None) + + def build(self, input_shape): # pylint: disable=unused-argument + """Creates the variables of the layer""" + # make sure this is last + self.built = True + + def call( + self, inputs: tf.SparseTensor, name: Optional[str] = None + ) -> tf.SparseTensor: # pylint: disable=unused-argument + """ + Implements HashingDiscretizer inference on a twml.SparseTensor. + Alternatively, accepts a tf.SparseTensor that can be converted + to twml.SparseTensor. + + Performs discretization of input values. + i.e. bucket_val = bucket(val | feature_id) + + This bucket mapping depends on the calibration (i.e. the bin boundaries). + However, (feature_id, bucket_val) pairs are mapped to new_feature_id in + a way that is independent of the calibration procedure + + Args: + inputs: A 2D SparseTensor that is input to HashingDiscretizer for + discretization. It has a dense_shape of [batch_size, input_size] + name: A name for the operation (optional). + + Returns: + A tf.SparseTensor, created from twml.SparseTensor.to_tf() + Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. + """ + if isinstance(inputs, tf.SparseTensor): + inputs = twml.SparseTensor.from_tf(inputs) + + assert isinstance(inputs, twml.SparseTensor) + + # sparse column indices + ids = inputs.ids + # sparse row indices + keys = inputs.indices + # sparse values + vals = inputs.values + + if len(self._feature_ids) > 0: + # pass all inputs to the c++ op + # the op determines whether to discretize (when a feature is calibrated), + # or whether to simply limit bits and pass through (when not calibrated) + # NOTE - Hashing is done in C++ + discretizer_keys, discretizer_vals = libtwml.ops.hashing_discretizer( + input_ids=keys, # Input + input_vals=vals, # Input + bin_vals=self._bin_vals, # Input + feature_ids=tf.make_tensor_proto(self._feature_ids), # Attr + n_bin=self._n_bin, # Attr + output_bits=self._out_bits, # Attr + cost_per_unit=self.cost_per_unit, # Attr + options=self._options, # Attr + ) + else: + discretizer_keys = twml.util.limit_bits(keys, self._out_bits) + discretizer_vals = vals + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_size = tf.convert_to_tensor((1 << self._out_bits), tf.int64) + output_shape = [batch_size, output_size] + + return twml.SparseTensor( + ids, discretizer_keys, discretizer_vals, output_shape + ).to_tf() diff --git a/twml/twml/contrib/layers/mask_layer.py b/twml/twml/contrib/layers/mask_layer.py index f5e788c7b..0e632eeba 100644 --- a/twml/twml/contrib/layers/mask_layer.py +++ b/twml/twml/contrib/layers/mask_layer.py @@ -1,29 +1,31 @@ +import tensorflow.compat.v1 as tf + from twml.contrib.pruning import apply_mask from twml.layers import Layer class MaskLayer(Layer): - """ - This layer corresponds to `twml.contrib.pruning.apply_mask`. - - It applies a binary mask to mask out channels of a given tensor. The masks can be - optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. - """ + """ + This layer corresponds to `twml.contrib.pruning.apply_mask`. 
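To make the options bitfield described above concrete, a hedged sketch of how a caller might combine the flags (it relies only on the HashingDiscretizerOptions constants named in the docstring; the concrete flag values live in twml.constants):

from twml.constants import HashingDiscretizerOptions

# Default behavior reconstructed in __init__ when options=None:
default_options = (
    HashingDiscretizerOptions.SEARCH_LOWER_BOUND
    | HashingDiscretizerOptions.HASH_32BIT
)

# Exactly one search strategy and one hash width, OR'ed together:
custom_options = (
    HashingDiscretizerOptions.SEARCH_UPPER_BOUND
    | HashingDiscretizerOptions.HASH_64BIT
)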
- def call(self, inputs, **kwargs): + It applies a binary mask to mask out channels of a given tensor. The masks can be + optimized using `twml.contrib.trainers.PruningDataRecordTrainer`. """ - Applies a binary mask to the channels of the input. - Arguments: - inputs: - input tensor - **kwargs: - additional keyword arguments + def call(self, inputs: tf.Tensor, **kwargs): + """ + Applies a binary mask to the channels of the input. - Returns: - Masked tensor - """ - return apply_mask(inputs) + Args: + inputs: + input tensor + **kwargs: + additional keyword arguments + + Returns: + Masked tensor + """ + return apply_mask(inputs) - def compute_output_shape(self, input_shape): - return input_shape + def compute_output_shape(self, input_shape: tf.TensorShape) -> tf.TensorShape: + return input_shape diff --git a/twml/twml/contrib/layers/stacked_rnn.py b/twml/twml/contrib/layers/stacked_rnn.py index e05f5d853..2a86e9493 100644 --- a/twml/twml/contrib/layers/stacked_rnn.py +++ b/twml/twml/contrib/layers/stacked_rnn.py @@ -1,189 +1,235 @@ +from typing import Callable, List, Union +import tensorflow.compat.v1 as tf from twitter.deepbird.compat.v1.rnn import stack_bidirectional_dynamic_rnn -import tensorflow.compat.v1 as tf -import tensorflow import twml -def _get_rnn_cell_creator(cell_type): - if cell_type == "LSTM": - Cell = tf.nn.rnn_cell.LSTMCell - elif cell_type == "GRU": - Cell = tf.nn.rnn_cell.GRUCell - else: - raise ValueError("cell_type: %s is not supported." - "It should be one of 'LSTM' or 'GRU'." % cell_type) - return Cell - - -def _apply_dropout_wrapper(rnn_cells, dropout): - """ Apply dropout wrapper around each cell if necessary """ - if rnn_cells is None: - return None - - cells = [] - for i, dropout_rate in enumerate(dropout): - cell = rnn_cells[i] - if dropout_rate > 0: - cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=(1.0 - dropout_rate)) - cells.append(cell) - return cells - - -def _create_bidirectional_rnn_cell(num_units, dropout, cell_type): - scope_name = "lstm" if cell_type else "gru" - with tf.variable_scope(scope_name): - Cell = _get_rnn_cell_creator(cell_type) - cells_forward = [Cell(output_size) for output_size in num_units] - cells_backward = [Cell(output_size) for output_size in num_units] - cells_forward = _apply_dropout_wrapper(cells_forward, dropout) - cells_backward = _apply_dropout_wrapper(cells_backward, dropout) - - def stacked_rnn_cell(inputs, sequence_lengths): +def _get_rnn_cell_creator(cell_type: str): + if cell_type == "LSTM": + Cell = tf.nn.rnn_cell.LSTMCell + elif cell_type == "GRU": + Cell = tf.nn.rnn_cell.GRUCell + else: + raise ValueError( + "cell_type: %s is not supported." + "It should be one of 'LSTM' or 'GRU'." 
% cell_type
+        )
+    return Cell
+
+
+def _apply_dropout_wrapper(
+    rnn_cells: List[tf.nn.rnn_cell.RNNCell],
+    dropout: List[float],
+) -> List[tf.nn.rnn_cell.RNNCell]:
+    """Apply dropout wrapper around each cell if necessary"""
+
+    # Treat None or empty inputs as "nothing to wrap". Short-circuiting here
+    # avoids calling len() on None, which any([..., len(rnn_cells) == 0, ...])
+    # would have done eagerly.
+    if not rnn_cells or not dropout:
+        return None
+
+    cells = []
+    for i, dropout_rate in enumerate(dropout):
+        cell = rnn_cells[i]
+        if dropout_rate > 0:
+            cell = tf.nn.rnn_cell.DropoutWrapper(
+                cell, input_keep_prob=(1.0 - dropout_rate)
+            )
+        cells.append(cell)
+    return cells
+
+
+def _create_bidirectional_rnn_cell(
+    num_units: List[int],
+    dropout: List[float],
+    cell_type: str,
+) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
+    """Create a bidirectional RNN cell."""
+
+    # Compare against the cell type explicitly; a bare truthiness check would
+    # always pick "lstm", since any non-empty string is truthy.
+    scope_name = "lstm" if cell_type == "LSTM" else "gru"
    with tf.variable_scope(scope_name):
-      outputs, final_states, _ = stack_bidirectional_dynamic_rnn(
-        cells_fw=cells_forward, cells_bw=cells_backward, inputs=inputs,
-        sequence_length=sequence_lengths, dtype=inputs.dtype)
-      return final_states[-1][-1]
-
-  return stacked_rnn_cell
-
-
-def _create_unidirectional_rnn_cell(num_units, dropout, cell_type):
-  scope_name = "lstm" if cell_type else "gru"
-  with tf.variable_scope(scope_name):
-    Cell = _get_rnn_cell_creator(cell_type)
-    cells = [Cell(output_size) for output_size in num_units]
-    cells = _apply_dropout_wrapper(cells, dropout)
-    multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells)
-
-  def stacked_rnn_cell(inputs, sequence_lengths):
+    with tf.variable_scope(scope_name):
+        Cell = _get_rnn_cell_creator(cell_type)
+        cells_forward = [Cell(output_size) for output_size in num_units]
+        cells_backward = [Cell(output_size) for output_size in num_units]
+        cells_forward = _apply_dropout_wrapper(cells_forward, dropout)
+        cells_backward = _apply_dropout_wrapper(cells_backward, dropout)
+
+    def stacked_rnn_cell(inputs: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
+        """Apply the stacked bidirectional RNN to the inputs."""
+
+        with tf.variable_scope(scope_name):
+            outputs, final_states, _ = stack_bidirectional_dynamic_rnn(
+                cells_fw=cells_forward,
+                cells_bw=cells_backward,
+                inputs=inputs,
+                sequence_length=sequence_lengths,
+                dtype=inputs.dtype,
+            )
+        return final_states[-1][-1]
+
+    return stacked_rnn_cell
+
+
+def _create_unidirectional_rnn_cell(
+    num_units: List[int],
+    dropout: List[float],
+    cell_type: str,
+) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
+    """Create a unidirectional RNN cell."""
+
+    scope_name = "lstm" if cell_type == "LSTM" else "gru"
    with tf.variable_scope(scope_name):
-      outputs, final_states = tf.nn.static_rnn(
-        multi_cell,
-        tf.unstack(inputs, axis=1),
-        dtype=inputs.dtype,
-        sequence_length=sequence_lengths)
-      return final_states[-1].h
-
-  return stacked_rnn_cell
-
-
-def _create_regular_rnn_cell(num_units, dropout, cell_type, is_bidirectional):
-  if is_bidirectional:
-    return _create_bidirectional_rnn_cell(num_units, dropout, cell_type)
-  else:
+        Cell = _get_rnn_cell_creator(cell_type)
+        cells = [Cell(output_size) for output_size in num_units]
+        cells = _apply_dropout_wrapper(cells, dropout)
+        multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells)
+
+    def stacked_rnn_cell(inputs: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
+        """Apply the stacked unidirectional RNN to the inputs."""
+
+        with tf.variable_scope(scope_name):
+            outputs, final_states = tf.nn.static_rnn(
+                multi_cell,
+                tf.unstack(inputs, axis=1),
+                dtype=inputs.dtype,
+                sequence_length=sequence_lengths,
+            )
+        return final_states[-1].h
+
+    return stacked_rnn_cell
+
+
+def _create_regular_rnn_cell(
+    num_units: List[int],
+    dropout:
List[float],
+    cell_type: str,
+    is_bidirectional: bool = True,
+) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
+    if is_bidirectional:
+        return _create_bidirectional_rnn_cell(num_units, dropout, cell_type)
+    return _create_unidirectional_rnn_cell(num_units, dropout, cell_type)


 class StackedRNN(twml.layers.Layer):
-  """
-  Layer for stacking RNN modules.
-  This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs.
-
-  Arguments:
-    num_units:
-      A list specifying the number of units per layer.
-    dropout:
-      Dropout applied to the input of each cell.
-      If list, has to dropout used for each layer.
-      If number, the same amount of dropout is used everywhere.
-      Defaults to 0.
-    is_training:
-      Flag to specify if the layer is used in training mode or not.
-    cell_type:
-      Sepcifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented.
-    is_bidirectional:
-      Specifies if the stacked RNN layer is bidirectional.
-      This is for forward compatibility, this is not yet implemented.
-      Defaults to False.
-  """
-
-  def __init__(self,
-               num_units,
-               dropout=0,
-               is_training=True,
-               cell_type="LSTM",
-               is_bidirectional=False,
-               name="stacked_rnn"):
-
-    super(StackedRNN, self).__init__(name=name)
-
-    if (is_bidirectional):
-      raise NotImplementedError("Bidirectional RNN is not yet implemented")
-
-    if (cell_type != "LSTM"):
-      raise NotImplementedError("Only LSTMs are supported")
-
-    if not isinstance(num_units, (list, tuple)):
-      num_units = [num_units]
-    else:
-      num_units = num_units
-
-    self.num_layers = len(num_units)
-    if not isinstance(dropout, (tuple, list)):
-      dropout = [dropout] * self.num_layers
-    else:
-      dropout = dropout
-
-    self.is_training = is_training
-
-    is_gpu_available = twml.contrib.utils.is_gpu_available()
-    same_unit_size = all(size == num_units[0] for size in num_units)
-    same_dropout_rate = any(val == dropout[0] for val in dropout)
-
-    self.stacked_rnn_cell = None
-    self.num_units = num_units
-    self.dropout = dropout
-    self.cell_type = cell_type
-    self.is_bidirectional = is_bidirectional
-
-  def build(self, input_shape):
-    self.stacked_rnn_cell = _create_regular_rnn_cell(self.num_units,
-                                                     self.dropout,
-                                                     self.cell_type,
-                                                     self.is_bidirectional)
-
-  def call(self, inputs, sequence_lengths):
    """
-    Arguments:
-      inputs:
-        A tensor of size [batch_size, max_sequence_length, embedding_size].
-      sequence_lengths:
-        The length of each input sequence in the batch. Should be of size [batch_size].
-    Returns:
-      final_output
-        The output of at the end of sequence_length.
+    Layer for stacking RNN modules.
+    This layer provides a unified interface for RNN modules that perform well on CPUs and GPUs.
+
+    Args:
+        num_units: int or list
+            A list specifying the number of units per layer.
+        dropout: float or list
+            Dropout applied to the input of each cell.
+            If a list, gives the dropout used for each layer.
+            If number, the same amount of dropout is used everywhere.
+            Defaults to 0.
+        is_training: bool
+            Flag to specify if the layer is used in training mode or not.
+        cell_type: str
+            Specifies the type of RNN. Can be "LSTM". "GRU" is not yet implemented.
+        is_bidirectional: bool
+            Specifies if the stacked RNN layer is bidirectional.
+            This is for forward compatibility, this is not yet implemented.
+            Defaults to False.
+        name: str
+            Name of the layer.
+ """ + + def __init__( + self, + num_units: Union[int, List[int]], + dropout: Union[float, List[float]] = 0.0, + is_training: bool = True, + cell_type: str = "LSTM", + is_bidirectional: bool = False, + name: str = "stacked_rnn", + ): + super(StackedRNN, self).__init__(name=name) + + if is_bidirectional: + raise NotImplementedError("Bidirectional RNN is not yet implemented") + + assert cell_type in ["LSTM", "GRU"] + if cell_type != "LSTM": + raise NotImplementedError("Only LSTMs are supported") + + # Make sure num_units is a list + if not isinstance(num_units, (list, tuple)): + num_units = [num_units] + + # Make sure dropout is a list + self.num_layers = len(num_units) + if not isinstance(dropout, (tuple, list)): + dropout = [dropout] * self.num_layers + + # Check if all parameters are valid + is_gpu_available = twml.contrib.utils.is_gpu_available() + same_unit_size = all(size == num_units[0] for size in num_units) + same_dropout_rate = any(val == dropout[0] for val in dropout) + + # set all class variables + self.is_training = is_training + self.stacked_rnn_cell = None + self.num_units = num_units + self.dropout = dropout + self.cell_type = cell_type + self.is_bidirectional = is_bidirectional + + def build(self, input_shape: tf.TensorShape): + self.stacked_rnn_cell = _create_regular_rnn_cell( + self.num_units, self.dropout, self.cell_type, self.is_bidirectional + ) + + def call(self, inputs: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: + """ + Args: + inputs: + A tensor of size [batch_size, max_sequence_length, embedding_size]. + sequence_lengths: + The length of each input sequence in the batch. Should be of size [batch_size]. + Returns: + final_output + The output of at the end of sequence_length. + """ + return self.stacked_rnn_cell(inputs, sequence_lengths) + + +def stacked_rnn( + inputs: tf.Tensor, + sequence_lengths: tf.Tensor, + num_units: List[int], + dropout: Union[float, List[float]] = 0.0, + is_training: bool = True, + cell_type: str = "LSTM", + is_bidirectional: bool = False, + name: str = "stacked_rnn", +) -> StackedRNN: + """Functional interface for StackedRNN + + Args: + inputs: + A tensor of size [batch_size, max_sequence_length, embedding_size]. + sequence_lengths: + The length of each input sequence in the batch. Should be of size [batch_size]. + num_units: + A list specifying the number of units per layer. + dropout: + Dropout applied to the input of each cell. + If list, has to dropout used for each layer. + If number, the same amount of dropout is used everywhere. + Defaults to 0. + is_training: + Flag to specify if the layer is used in training mode or not. + cell_type: + Specifies the type of RNN. Can be "LSTM" or "GRU". + is_bidirectional: + Specifies if the stacked RNN layer is bidirectional. + Defaults to False. + + Returns + outputs, state. """ - return self.stacked_rnn_cell(inputs, sequence_lengths) - - -def stacked_rnn(inputs, sequence_lengths, num_units, - dropout=0, is_training=True, - cell_type="LSTM", is_bidirectional=False, name="stacked_rnn"): - """Functional interface for StackedRNN - Arguments: - inputs: - A tensor of size [batch_size, max_sequence_length, embedding_size]. - sequence_lengths: - The length of each input sequence in the batch. Should be of size [batch_size]. - num_units: - A list specifying the number of units per layer. - dropout: - Dropout applied to the input of each cell. - If list, has to dropout used for each layer. - If number, the same amount of dropout is used everywhere. - Defaults to 0. 
- is_training: - Flag to specify if the layer is used in training mode or not. - cell_type: - Sepcifies the type of RNN. Can be "LSTM" or "GRU". - is_bidirectional: - Specifies if the stacked RNN layer is bidirectional. - Defaults to False. - Returns - outputs, state. - """ - rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name) - return rnn(inputs, sequence_lengths) + rnn = StackedRNN(num_units, dropout, is_training, cell_type, is_bidirectional, name) + return rnn(inputs, sequence_lengths) diff --git a/twml/twml/contrib/layers/zscore_normalization.py b/twml/twml/contrib/layers/zscore_normalization.py index 8a1064965..05da7d4d1 100644 --- a/twml/twml/contrib/layers/zscore_normalization.py +++ b/twml/twml/contrib/layers/zscore_normalization.py @@ -1,247 +1,294 @@ """ Contains the twml.layers.ZscoreNormalization layer. """ -from twml.layers.layer import Layer -import tensorflow.compat.v1 as tf +from typing import Optional, Tuple, Union +import tensorflow.compat.v1 as tf from tensorflow.python.training import moving_averages +from twml.layers.layer import Layer + # This is copied from tensorflow.contrib.framework.python.ops.add_model_variable in 1.15 # Not available in 2.x # TODO: Figure out if this is really necessary. -def _add_model_variable(var): - """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection. - Args: - var: a variable. - """ - if var not in tf.get_collection(tf.GraphKeys.MODEL_VARIABLES): - tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, var) - +def _add_model_variable(var: tf.Variable) -> None: + """Adds a variable to the `GraphKeys.MODEL_VARIABLES` collection. -def update_moving_variable(batch_var, moving_var, decay, zero_debias=True, name=None): - update_op = moving_averages.assign_moving_average( - moving_var, batch_var, decay, zero_debias=zero_debias, name=None) - _add_model_variable(moving_var) - with tf.control_dependencies([update_op]): - return tf.identity(moving_var) + Args: + var: a variable. + """ + if var not in tf.get_collection(tf.GraphKeys.MODEL_VARIABLES): + tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, var) -class ZscoreNormalization(Layer): - """ - Perform z-score normalization using moving mean and std. - Missing values are not included during mean/std calculation - This layer should only be used right after input layer. - - Args: - decay: - using large decay to include longer moving means. - data_type: - use float64 to prevent overflow during variance calculation. - name: - Layer name - Returns: - A layer representing the output of the ZscoreNormalization transformation. - """ - - def __init__( - self, - decay=0.9999, - data_type=tf.float64, - name=None, - **kwargs): - super(ZscoreNormalization, self).__init__(name=name, **kwargs) - self.epsilon = tf.constant(1., data_type) - self.decay = decay - self.data_type = data_type - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the moving_mean and moving_var tf.Variables of the layer.""" - input_dim = input_shape[1] - self.moving_mean = self.add_variable( - '{}_mean/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.moving_var = self.add_variable( - '{}_variance/EMA'.format(self.name), - initializer=tf.constant_initializer(), - shape=[input_dim], - dtype=self.data_type, - trainable=False - ) - self.built = True - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. 
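Looking back at the stacked_rnn functional interface above, a minimal usage sketch with hypothetical shapes (it assumes the module import path matches the file layout, twml.contrib.layers.stacked_rnn):

import tensorflow.compat.v1 as tf
from twml.contrib.layers.stacked_rnn import stacked_rnn

# Batch of 8 sequences, up to 20 steps, 64-dim embeddings.
inputs = tf.placeholder(tf.float32, [8, 20, 64])
sequence_lengths = tf.placeholder(tf.int32, [8])

# Two stacked LSTM layers with 30% input dropout on each cell.
final_output = stacked_rnn(
    inputs,
    sequence_lengths,
    num_units=[128, 64],
    dropout=0.3,
    cell_type="LSTM",
)
# final_output has shape [8, 64]: the last layer's state at sequence end.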
+def update_moving_variable( + batch_var: tf.Variable, + moving_var: tf.Variable, + decay: float, + zero_debias: bool = True, + name: Optional[str] = None, +) -> tf.Variable: + """Update moving variable using batch variable. Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + batch_var: a variable. + moving_var: a variable. + decay: decay rate. + zero_debias: whether to use zero debias. + name: name of the operation. + Returns: + A variable representing the updated moving variable. """ - return input_shape - - def _training_pass(self, input, dense_mask, input_dtype, handle_single, zero_debias): - epsilon = self.epsilon - moving_mean, moving_var = self.moving_mean, self.moving_var - # calculate the number of exisiting value for each feature - tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0) - mask_ones = tf.cast(tensor_batch_num, tf.bool) - eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon) - # the following filled 0 with epision - tensor_batch_num_eps = tf.where(mask_ones, - tensor_batch_num, - eps_vector - ) - tensor_batch_num_eps_broacast = tf.expand_dims(tensor_batch_num_eps, 0) - tensor_batch_divided = input / tensor_batch_num_eps_broacast - tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0) - - # update moving mean here, and use it to calculate the std. - tensor_moving_mean = update_moving_variable(tensor_batch_mean, moving_mean, self.decay, - zero_debias, name="mean_ema_op") - - tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0) - tensor_batch_sub_mean = tf.where(dense_mask, - tensor_batch_sub_mean, - tf.zeros_like(tensor_batch_sub_mean)) - # divided by sqrt(n) before square, and then do summation for numeric stability. - broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0) - tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps - tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div) - tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0) - - # update moving var here, dont replace 0 with eps before updating. - tensor_moving_var = update_moving_variable(tensor_batch_var, moving_var, self.decay, - zero_debias, name="var_ema_op") - - # if std is 0, replace it with epsilon - tensor_moving_std = tf.sqrt(tensor_moving_var) - tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0), - eps_vector, - tensor_moving_std) - - missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0) - - if handle_single: - # if std==0 and value not missing, reset it to 1. 
-      moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0)
-      moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0)
-      missing_input_norm = tf.where(
-        tf.math.logical_and(dense_mask, moving_var_mask_zero),
-        tf.ones_like(missing_input_norm),
-        missing_input_norm
-      )
-    if input_dtype != self.data_type:
-      missing_input_norm = tf.cast(missing_input_norm, input_dtype)
-    return missing_input_norm
-
-  def _infer_pass(self, input, dense_mask, input_dtype, handle_single):
-    epsilon = tf.cast(self.epsilon, input_dtype)
-    testing_moving_mean = tf.cast(self.moving_mean, input_dtype)
-    tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype)
-
-    broad_mean = tf.expand_dims(testing_moving_mean, 0)
-    tensor_batch_sub_mean = input - broad_mean
-
-    tensor_batch_sub_mean = tf.where(dense_mask,
-                                     tensor_batch_sub_mean,
-                                     tf.zeros_like(tensor_batch_sub_mean)
-                                     )
-    tensor_moving_std_eps = tf.where(tf.equal(tensor_moving_std, 0),
-                                     tf.fill(tf.shape(tensor_moving_std), epsilon),
-                                     tensor_moving_std)
-    missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(tensor_moving_std_eps, 0)
-    if handle_single:
-      # if std==0 and value not missing, reset it to 1.
-      moving_var_broad = tf.expand_dims(tensor_moving_std, 0)
-      moving_var_mask_zero = tf.math.logical_not(tf.cast(moving_var_broad, tf.bool))
-
-      missing_input_norm = tf.where(tf.math.logical_and(dense_mask, moving_var_mask_zero),
-                                    tf.ones_like(missing_input_norm),
-                                    missing_input_norm
-                                    )
-    return missing_input_norm
-
-  def call(
-      self,
-      input,
-      is_training,
-      dense_mask=None,
-      zero_debias=True,
-      handle_single=False):
+    update_op = moving_averages.assign_moving_average(
+        moving_var, batch_var, decay, zero_debias=zero_debias, name=name
+    )
+    _add_model_variable(moving_var)
+    with tf.control_dependencies([update_op]):
+        return tf.identity(moving_var)
+
+
+class ZscoreNormalization(Layer):
     """
+    Perform z-score normalization using moving mean and std.
+    Missing values are not included during the mean/std calculation.
+    This layer should only be used right after the input layer.
+
     Args:
-    -----------
-    input: B x D : float32/float64
-      missing value must be set to 0.
-    is_training: bool
-      training phase or testing phase
-    dense_mask: B x D : bool
-      missing value should be marked as 0, non-missing as 1. same shape as input
-    zero_debias: bool
-      bias correction of the moving average. (biased towards 0 in the beginning.
-      see adam paper. https://arxiv.org/abs/1412.6980)
-    handle_single: bool
-      if std==0, and feature is not missing value, set the value to 1, instead of 0.
-      This is super rare if input only consists of continous feature.
-      But if one-hot feature is included,
-      they will all have same values 1, in that case, make sure to set handle_single to true.
+        decay:
+            using large decay to include longer moving means.
+        data_type:
+            use float64 to prevent overflow during variance calculation.
+        name:
+            Layer name
+
+    Returns:
+        A layer representing the output of the ZscoreNormalization transformation.
""" - if dense_mask is None: - dense_mask = tf.math.logical_not(tf.equal(input, 0)) - input_dtype = input.dtype + def __init__(self, decay=0.9999, data_type=tf.float64, name=None, **kwargs): + super(ZscoreNormalization, self).__init__(name=name, **kwargs) + self.epsilon = tf.constant(1.0, data_type) + self.decay = decay + self.data_type = data_type + + def build(self, input_shape: tf.TensorShape): + """Creates the moving_mean and moving_var tf.Variables of the layer.""" + input_dim = input_shape[1] + self.moving_mean = self.add_variable( + f"{self.name}_mean/EMA", + initializer=tf.constant_initializer(), + shape=[input_dim], + dtype=self.data_type, + trainable=False, + ) + self.moving_var = self.add_variable( + f"{self.name}_variance/EMA", + initializer=tf.constant_initializer(), + shape=[input_dim], + dtype=self.data_type, + trainable=False, + ) + self.built = True + + def compute_output_shape( + self, input_shape: Union[tf.TensorShape, Tuple[tf.TensorShape]] + ) -> tf.TensorShape: + """Computes the output shape of the layer given the input shape.""" + + return input_shape + + def _training_pass( + self, + input: tf.Tensor, + dense_mask: tf.Tensor, + input_dtype: tf.DType, + handle_single: bool = False, + zero_debias: bool = True, + ) -> tf.Tensor: + """Perform z-score normalization in training mode.""" - if is_training: - if input_dtype != self.data_type: - input = tf.cast(input, self.data_type) - return self._training_pass(input, dense_mask, input_dtype, handle_single, zero_debias) - else: - return self._infer_pass(input, dense_mask, input_dtype, handle_single) + epsilon = self.epsilon + moving_mean, moving_var = self.moving_mean, self.moving_var + # calculate the number of exisiting value for each feature + tensor_batch_num = tf.reduce_sum(tf.cast(dense_mask, self.data_type), axis=0) + mask_ones = tf.cast(tensor_batch_num, tf.bool) + eps_vector = tf.fill(tf.shape(tensor_batch_num), epsilon) + # the following filled 0 with epision + tensor_batch_num_eps = tf.where(mask_ones, tensor_batch_num, eps_vector) + tensor_batch_num_eps_broacast = tf.expand_dims(tensor_batch_num_eps, 0) + tensor_batch_divided = input / tensor_batch_num_eps_broacast + tensor_batch_mean = tf.reduce_sum(tensor_batch_divided, axis=0) + + # update moving mean here, and use it to calculate the std. + tensor_moving_mean = update_moving_variable( + tensor_batch_mean, moving_mean, self.decay, zero_debias, name="mean_ema_op" + ) + + tensor_batch_sub_mean = input - tf.expand_dims(tensor_moving_mean, 0) + tensor_batch_sub_mean = tf.where( + dense_mask, tensor_batch_sub_mean, tf.zeros_like(tensor_batch_sub_mean) + ) + # divided by sqrt(n) before square, and then do summation for numeric stability. + broad_sqrt_num_eps = tf.expand_dims(tf.sqrt(tensor_batch_num_eps), 0) + tensor_batch_sub_mean_div = tensor_batch_sub_mean / broad_sqrt_num_eps + tensor_batch_sub_mean_div_square = tf.square(tensor_batch_sub_mean_div) + tensor_batch_var = tf.reduce_sum(tensor_batch_sub_mean_div_square, axis=0) + + # update moving var here, dont replace 0 with eps before updating. 
+        tensor_moving_var = update_moving_variable(
+            tensor_batch_var, moving_var, self.decay, zero_debias, name="var_ema_op"
+        )
+
+        # if std is 0, replace it with epsilon
+        tensor_moving_std = tf.sqrt(tensor_moving_var)
+        tensor_moving_std_eps = tf.where(
+            tf.equal(tensor_moving_std, 0), eps_vector, tensor_moving_std
+        )
+
+        missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(
+            tensor_moving_std_eps, 0
+        )
+
+        if handle_single:
+            # if std==0 and value not missing, reset it to 1.
+            moving_var_mask_zero = tf.math.equal(tensor_moving_var, 0)
+            moving_var_mask_zero = tf.expand_dims(moving_var_mask_zero, 0)
+            missing_input_norm = tf.where(
+                tf.math.logical_and(dense_mask, moving_var_mask_zero),
+                tf.ones_like(missing_input_norm),
+                missing_input_norm,
+            )
+        if input_dtype != self.data_type:
+            missing_input_norm = tf.cast(missing_input_norm, input_dtype)
+        return missing_input_norm
+
+    def _infer_pass(
+        self,
+        input: tf.Tensor,
+        dense_mask: tf.Tensor,
+        input_dtype: tf.DType,
+        handle_single: bool = False,
+    ) -> tf.Tensor:
+        """Perform z-score normalization in inference mode."""
+
+        epsilon = tf.cast(self.epsilon, input_dtype)
+        testing_moving_mean = tf.cast(self.moving_mean, input_dtype)
+        tensor_moving_std = tf.cast(tf.sqrt(self.moving_var), input_dtype)
+
+        broad_mean = tf.expand_dims(testing_moving_mean, 0)
+        tensor_batch_sub_mean = input - broad_mean
+
+        tensor_batch_sub_mean = tf.where(
+            dense_mask, tensor_batch_sub_mean, tf.zeros_like(tensor_batch_sub_mean)
+        )
+        tensor_moving_std_eps = tf.where(
+            tf.equal(tensor_moving_std, 0),
+            tf.fill(tf.shape(tensor_moving_std), epsilon),
+            tensor_moving_std,
+        )
+        missing_input_norm = tensor_batch_sub_mean / tf.expand_dims(
+            tensor_moving_std_eps, 0
+        )
+        if handle_single:
+            # if std==0 and value not missing, reset it to 1.
+            moving_var_broad = tf.expand_dims(tensor_moving_std, 0)
+            moving_var_mask_zero = tf.math.logical_not(
+                tf.cast(moving_var_broad, tf.bool)
+            )
+
+            missing_input_norm = tf.where(
+                tf.math.logical_and(dense_mask, moving_var_mask_zero),
+                tf.ones_like(missing_input_norm),
+                missing_input_norm,
+            )
+        return missing_input_norm
+
+    def call(
+        self,
+        input: tf.Tensor,
+        is_training: bool = True,
+        dense_mask: Optional[tf.Tensor] = None,
+        zero_debias: bool = True,
+        handle_single: bool = False,
+    ) -> tf.Tensor:
+        """
+        Args:
+            input: B x D : float32/float64
+                missing value must be set to 0.
+            is_training: bool
+                training phase or testing phase
+            dense_mask: B x D : bool
+                missing value should be marked as 0, non-missing as 1. Same shape as input.
+            zero_debias: bool
+                bias correction of the moving average. (biased towards 0 in the beginning.
+                see adam paper. https://arxiv.org/abs/1412.6980)
+            handle_single: bool
+                if std==0, and feature is not missing value, set the value to 1, instead of 0.
+                This is super rare if input only consists of continuous features.
+                But if one-hot features are included,
+                they will all have the same value 1; in that case, make sure to set handle_single to true.
+ """ + + if dense_mask is None: + dense_mask = tf.math.logical_not(tf.equal(input, 0)) + input_dtype = input.dtype + + if is_training: + if input_dtype != self.data_type: + input = tf.cast(input, self.data_type) + return self._training_pass( + input, dense_mask, input_dtype, handle_single, zero_debias + ) + else: + return self._infer_pass(input, dense_mask, input_dtype, handle_single) def zscore_normalization( - input, - is_training, - decay=0.9999, - data_type=tf.float64, - name=None, - dense_mask=None, - zero_debias=True, - handle_single=False, **kwargs): - """ - Args: - ------------ - input: B x D : float32/float64 - missing value must be set to 0. - is_training: bool - training phase or testing phase - decay: - using large decay to include longer moving means. - data_type: - use float64 to zprevent overflow during variance calculation. - name: - Layer name - dense_mask: B x D : bool - missing value should be marked as 0, non-missing as 1. same shape as input - zero_debias: bool - bias correction of the moving average. (biased towards 0 in the beginning. - see adam paper. https://arxiv.org/abs/1412.6980) - handle_single: bool - if std==0, and feature is not missing value, set the value to 1, instead of 0. - This is super rare if input only consists of continous feature. - But if one-hot feature is included, - they will all have same values 1, in that case, make sure to set handle_single to true. - """ - - norm_layer = ZscoreNormalization(decay=decay, data_type=data_type, name=name, **kwargs) - return norm_layer(input, - is_training, - dense_mask=dense_mask, - zero_debias=zero_debias, - handle_single=handle_single) + input: tf.Tensor, + is_training: bool = True, + decay: float = 0.9999, + data_type: tf.DType = tf.float64, + name: Optional[str] = None, + dense_mask: Optional[tf.Tensor] = None, + zero_debias: bool = True, + handle_single: bool = False, + **kwargs, +): + """ + Args: + input: B x D : float32/float64 + missing value must be set to 0. + is_training: bool + training phase or testing phase + decay: float + using large decay to include longer moving means. + data_type: tf.DType + use float64 to zprevent overflow during variance calculation. + name: str + Layer name + dense_mask: B x D : bool + missing value should be marked as 0, non-missing as 1. same shape as input + zero_debias: bool + bias correction of the moving average. (biased towards 0 in the beginning. + see adam paper. https://arxiv.org/abs/1412.6980) + handle_single: bool + if std == 0, and feature is not missing value, set the value to 1, instead of 0. + This is super rare if input only consists of continuous feature. + But if one-hot feature is included, + they will all have same values 1, in that case, make sure to set handle_single to true. 
+ """ + + norm_layer = ZscoreNormalization( + decay=decay, data_type=data_type, name=name, **kwargs + ) + return norm_layer( + input, + is_training, + dense_mask=dense_mask, + zero_debias=zero_debias, + handle_single=handle_single, + ) diff --git a/twml/twml/contrib/metrics/__init__.py b/twml/twml/contrib/metrics/__init__.py index 37e6563c9..f2e26dafe 100644 --- a/twml/twml/contrib/metrics/__init__.py +++ b/twml/twml/contrib/metrics/__init__.py @@ -1,5 +1,5 @@ # pylint: disable=wildcard-import """This module contains experimental metric(s) for search and ranking""" -from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 from .metrics import * # noqa: F401 +from .search_metrics import get_search_metric_fn, ndcg # noqa: F401 diff --git a/twml/twml/contrib/metrics/metrics.py b/twml/twml/contrib/metrics/metrics.py index dea1a5273..d39e2bef5 100644 --- a/twml/twml/contrib/metrics/metrics.py +++ b/twml/twml/contrib/metrics/metrics.py @@ -2,208 +2,324 @@ Module containing extra tensorflow metrics used at Twitter. This module conforms to conventions used by tf.metrics.*. In particular, each metric constructs two subgraphs: value_op and update_op: - - The value op is used to fetch the current metric value. - - The update_op is used to accumulate into the metric. + - The value op is used to fetch the current metric value. + - The update_op is used to accumulate into the metric. Note: similar to tf.metrics.*, metrics in here do not support multi-label learning. We will have to write wrapper classes to create one metric per label. Note: similar to tf.metrics.*, batches added into a metric via its update_op are cumulative! - """ -from collections import OrderedDict +from typing import Callable, List, Optional, Tuple, Union import tensorflow.compat.v1 as tf -from twml.metrics import get_multi_binary_class_metric_fn +from twml.metrics import get_multi_binary_class_metric_fn # checkstyle: noqa -def get_partial_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1, predcols=None): - - def get_eval_metric_ops(graph_output, labels, weights): - if predcols is None: - preds = graph_output['output'] - else: - if isinstance(predcols, int): - predcol_list=[predcols] - else: - predcol_list=list(predcols) - for col in predcol_list: - assert 0 <= col < graph_output['output'].shape[class_dim], 'Invalid Prediction Column Index !' - preds = tf.gather(graph_output['output'], indices=predcol_list, axis=class_dim) # [batchSz, num_col] - labels = tf.gather(labels, indices=predcol_list, axis=class_dim) # [batchSz, num_col] - - predInfo = {'output': preds} - if 'threshold' in graph_output: - predInfo['threshold'] = graph_output['threshold'] - if 'hard_output' in graph_output: - predInfo['hard_output'] = graph_output['hard_output'] - - metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim) - metrics_op_res = metrics_op(predInfo, labels, weights) - return metrics_op_res - - return get_eval_metric_ops - +def get_partial_multi_binary_class_metric_fn( + metrics: List[str], + classes: Optional[List[str]] = None, + class_dim: int = 1, + predcols: Optional[Union[int, List[int]]] = None, +) -> callable: + def get_eval_metric_ops( + graph_output: dict, labels: tf.Tensor, weights: tf.Tensor + ) -> dict: + if predcols is None: + preds = graph_output["output"] + else: + if isinstance(predcols, int): + predcol_list = [predcols] + else: + predcol_list = list(predcols) + for col in predcol_list: + assert ( + 0 <= col < graph_output["output"].shape[class_dim] + ), "Invalid Prediction Column Index !" 
+            preds = tf.gather(
+                graph_output["output"], indices=predcol_list, axis=class_dim
+            )  # [batchSz, num_col]
+            labels = tf.gather(
+                labels, indices=predcol_list, axis=class_dim
+            )  # [batchSz, num_col]
+
+        predInfo = {"output": preds}
+        if "threshold" in graph_output:
+            predInfo["threshold"] = graph_output["threshold"]
+        if "hard_output" in graph_output:
+            predInfo["hard_output"] = graph_output["hard_output"]
+
+        metrics_op = get_multi_binary_class_metric_fn(metrics, classes, class_dim)
+        metrics_op_res = metrics_op(predInfo, labels, weights)
+        return metrics_op_res
+
+    return get_eval_metric_ops


 # Numeric Prediction Performance among TopK Predictions
-def mean_numeric_label_topK(labels, predictions, weights, name, topK_id):
-  top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
-  return tf.metrics.mean(values=top_k_labels, name=name)
+def mean_numeric_label_topK(
+    labels: tf.Tensor,
+    predictions: tf.Tensor,
+    weights: tf.Tensor,
+    name: str,
+    topK_id: tf.Tensor,
+) -> Tuple[tf.Tensor, tf.Tensor]:
+    top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
+    return tf.metrics.mean(values=top_k_labels, name=name)
+
+
+def mean_gated_numeric_label_topK(
+    labels: tf.Tensor,
+    predictions: tf.Tensor,
+    weights: tf.Tensor,
+    name: str,
+    topK_id: tf.Tensor,
+    bar: float = 2.0,
+) -> Tuple[tf.Tensor, tf.Tensor]:
+    assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float"
+    top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
+    gated_top_k_labels = tf.cast(top_k_labels > bar * 1.0, tf.int32)
+    return tf.metrics.mean(values=gated_top_k_labels, name=name)

-def mean_gated_numeric_label_topK(labels, predictions, weights, name, topK_id, bar=2.0):
-  assert isinstance(bar, int) or isinstance(bar, float), "bar must be int or float"
-  top_k_labels = tf.gather(params=labels, indices=topK_id, axis=0)  # [topK, 1]
-  gated_top_k_labels = tf.cast(top_k_labels > bar*1.0, tf.int32)
-  return tf.metrics.mean(values=gated_top_k_labels, name=name)

 SUPPORTED_NUMERIC_METRICS = {
-  'mean_numeric_label_topk': mean_numeric_label_topK,
-  'mean_gated_numeric_label_topk': mean_gated_numeric_label_topK
+    "mean_numeric_label_topk": mean_numeric_label_topK,
+    "mean_gated_numeric_label_topk": mean_gated_numeric_label_topK,
 }
-DEFAULT_NUMERIC_METRICS = ['mean_numeric_label_topk', 'mean_gated_numeric_label_topk']
+DEFAULT_NUMERIC_METRICS = ["mean_numeric_label_topk", "mean_gated_numeric_label_topk"]

-
-def get_metric_topK_fn_helper(targetMetrics, supportedMetrics_op, metrics=None, topK=(5,5,5), predcol=None, labelcol=None):
-  """
-  :param targetMetrics: Target Metric List
-  :param supportedMetrics_op: Supported Metric Operators Dict
-  :param metrics: Metric Set to evaluate
-  :param topK: (topK_min, topK_max, topK_delta) Tuple
-  :param predcol: Prediction Column Index
-  :param labelcol: Label Column Index
-  :return:
-  """
-  # pylint: disable=dict-keys-not-iterating
-  if targetMetrics is None or supportedMetrics_op is None:
-    raise ValueError("Invalid Target Metric List/op !")
-
-  targetMetrics = set([m.lower() for m in targetMetrics])
-  if metrics is None:
-    metrics = list(targetMetrics)
-  else:
-    metrics = [m.lower() for m in metrics if m.lower() in targetMetrics]
-
-  num_k = int((topK[1]-topK[0])/topK[2]+1)
-  topK_list = [topK[0]+d*topK[2] for d in range(num_k)]
-  if 1 not in topK_list:
-    topK_list = [1] + topK_list
-
-
-  def get_eval_metric_ops(graph_output, labels, weights):
+def get_metric_topK_fn_helper(
+    targetMetrics: List[str],
+    supportedMetrics_op: dict,
+
metrics: Optional[List[str]] = None, + topK: Tuple[int] = (5, 5, 5), + predcol: Optional[int] = None, + labelcol: Optional[int] = None, +) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + Helper function to get metric function for topK evaluation + + Args: + targetMetrics (list[str]): + Target Metric List + supportedMetrics_op (dict): + Supported Metric Operators + metrics (list[str], optional): + Metric Set to evaluate + topK (tuple[int], optional): + (topK_min, topK_max, topK_delta) + predcol (int, optional): + Prediction Column Index + labelcol (int, optional): + Label Column Index + + Returns: + callable: + Metric Function """ - eval_metric_ops = OrderedDict() - if predcol is None: - pred = graph_output['output'] + # pylint: disable=dict-keys-not-iterating + if targetMetrics is None or supportedMetrics_op is None: + raise ValueError("Invalid Target Metric List/op !") + + targetMetrics = set([m.lower() for m in targetMetrics]) + if metrics is None: + metrics = list(targetMetrics) else: - assert 0 <= predcol < graph_output['output'].shape[1], 'Invalid Prediction Column Index !' - assert labelcol is not None - pred = tf.reshape(graph_output['output'][:, predcol], shape=[-1, 1]) - labels = tf.reshape(labels[:, labelcol], shape=[-1, 1]) - numOut = graph_output['output'].shape[1] - pred_score = tf.reshape(graph_output['output'][:, numOut-1], shape=[-1, 1]) - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in supportedMetrics_op: - metric_factory = supportedMetrics_op.get(metric_name) - - if 'topk' not in metric_name: - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name) - eval_metric_ops[metric_name] = (value_op, update_op) + metrics = [m.lower() for m in metrics if m.lower() in targetMetrics] + + num_k = int((topK[1] - topK[0]) / topK[2] + 1) + topK_list = [topK[0] + d * topK[2] for d in range(num_k)] + if 1 not in topK_list: + topK_list = [1] + topK_list + + def get_eval_metric_ops( + graph_output: dict, labels: tf.Tensor, weights: tf.Tensor + ) -> dict: + """ + Get Evaluation Metric Ops + + Args: + graph_output (dict): + Graph Output + labels (tf.Tensor): + Labels + weights (tf.Tensor): + Weights + + Returns: + dict: + Evaluation Metric Ops + """ + eval_metric_ops = dict() + + if predcol is None: + pred = graph_output["output"] else: - for K in topK_list: - K_min = tf.minimum(K, tf.shape(pred_score)[0]) - topK_id = tf.nn.top_k(tf.reshape(pred_score, shape=[-1]), k=K_min)[1] # [topK] - value_op, update_op = metric_factory( - labels=labels, - predictions=pred, - weights=weights, - name=metric_name+'__k_'+str(K), - topK_id=topK_id) - eval_metric_ops[metric_name+'__k_'+str(K)] = (value_op, update_op) - - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - - - -def get_numeric_metric_fn(metrics=None, topK=(5,5,5), predcol=None, labelcol=None): - if metrics is None: - metrics = list(DEFAULT_NUMERIC_METRICS) - metrics = list(set(metrics)) - - metric_op = get_metric_topK_fn_helper(targetMetrics=list(DEFAULT_NUMERIC_METRICS), - supportedMetrics_op=SUPPORTED_NUMERIC_METRICS, - metrics=metrics, topK=topK, predcol=predcol, labelcol=labelcol) - return metric_op - - - -def 
get_single_binary_task_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False):
-  """
-  graph_output['output']: [BatchSz, 1] [pred_Task1]
-  labels: [BatchSz, 2] [Task1, NumericLabel]
-  """
-  def get_eval_metric_ops(graph_output, labels, weights):
-    metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames)
-    classnames_unw = ['unweighted_'+cs for cs in classnames]
-    metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=0, classes=classnames_unw)
-
-    metrics_base_res = metric_op_base(graph_output, labels, weights)
-    metrics_unw_res = metric_op_unw(graph_output, labels, None)
-    metrics_base_res.update(metrics_unw_res)
-
-    if use_topK:
-      metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=0, labelcol=1)
-      metrics_numeric_res = metric_op_numeric(graph_output, labels, weights)
-      metrics_base_res.update(metrics_numeric_res)
-    return metrics_base_res
-
-  return get_eval_metric_ops
-
-
-def get_dual_binary_tasks_metric_fn(metrics, classnames, topK=(5,5,5), use_topK=False):
-  """
-  graph_output['output']: [BatchSz, 3] [pred_Task1, pred_Task2, Score]
-  labels: [BatchSz, 3] [Task1, Task2, NumericLabel]
-  """
-  def get_eval_metric_ops(graph_output, labels, weights):
-
-    metric_op_base = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames)
-    classnames_unw = ['unweighted_'+cs for cs in classnames]
-    metric_op_unw = get_partial_multi_binary_class_metric_fn(metrics, predcols=[0, 1], classes=classnames_unw)
-
-    metrics_base_res = metric_op_base(graph_output, labels, weights)
-    metrics_unw_res = metric_op_unw(graph_output, labels, None)
-    metrics_base_res.update(metrics_unw_res)
+            assert (
+                0 <= predcol < graph_output["output"].shape[1]
+            ), "Invalid Prediction Column Index !"
+            assert labelcol is not None
+            pred = tf.reshape(graph_output["output"][:, predcol], shape=[-1, 1])
+            labels = tf.reshape(labels[:, labelcol], shape=[-1, 1])
+            numOut = graph_output["output"].shape[1]
+            pred_score = tf.reshape(graph_output["output"][:, numOut - 1], shape=[-1, 1])
+
+        # add metrics to eval_metric_ops dict
+        for metric_name in metrics:
+            metric_name = metric_name.lower()  # metric names are case-insensitive.
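The `(topK_min, topK_max, topK_delta)` tuple handled in this helper expands into a list of ranking cut-offs (with 1 prepended when absent), and the gated metric averages how many of the top-K-scored labels clear the `bar`. A one-shot NumPy sketch of both pieces (hypothetical helper names; the TF versions above return streaming value/update op pairs instead of a scalar):

import numpy as np

def expand_topk(topK=(5, 5, 5)):
    # same arithmetic as topK_list above, including the prepended 1
    num_k = int((topK[1] - topK[0]) / topK[2] + 1)
    topk_list = [topK[0] + d * topK[2] for d in range(num_k)]
    return topk_list if 1 in topk_list else [1] + topk_list

def gated_mean_label_topk(labels, scores, k, bar=2.0):
    k = min(k, len(scores))               # mirrors the tf.minimum(K, ...) guard
    topk_id = np.argsort(-scores)[:k]     # indices of the K highest scores
    return np.mean(labels[topk_id] > bar)

assert expand_topk((5, 15, 5)) == [1, 5, 10, 15]
scores = np.array([0.9, 0.1, 0.7])
labels = np.array([3.0, 0.0, 1.0])
# top-2 by score picks labels [3.0, 1.0]; only 3.0 clears bar=2.0 -> 0.5
assert gated_mean_label_topk(labels, scores, k=2) == 0.5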
+
+            if metric_name in supportedMetrics_op:
+                metric_factory = supportedMetrics_op.get(metric_name)
+
+                if "topk" not in metric_name:
+                    value_op, update_op = metric_factory(
+                        labels=labels,
+                        predictions=pred,
+                        weights=weights,
+                        name=metric_name,
+                    )
+                    eval_metric_ops[metric_name] = (value_op, update_op)
+                else:
+                    for K in topK_list:
+                        K_min = tf.minimum(K, tf.shape(pred_score)[0])
+                        topK_id = tf.nn.top_k(
+                            tf.reshape(pred_score, shape=[-1]), k=K_min
+                        )[1]  # [topK]
+                        value_op, update_op = metric_factory(
+                            labels=labels,
+                            predictions=pred,
+                            weights=weights,
+                            name=metric_name + "__k_" + str(K),
+                            topK_id=topK_id,
+                        )
+                        eval_metric_ops[metric_name + "__k_" + str(K)] = (
+                            value_op,
+                            update_op,
+                        )
+            else:
+                raise ValueError("Cannot find the metric named " + metric_name)
+        return eval_metric_ops
+
+    return get_eval_metric_ops
+
+
+def get_numeric_metric_fn(
+    metrics: Optional[List[str]] = None,
+    topK: Tuple[int] = (5, 5, 5),
+    predcol: Optional[int] = None,
+    labelcol: Optional[int] = None,
+) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]:
+    if metrics is None:
+        metrics = list(DEFAULT_NUMERIC_METRICS)
+    metrics = list(set(metrics))
+
+    metric_op = get_metric_topK_fn_helper(
+        targetMetrics=list(DEFAULT_NUMERIC_METRICS),
+        supportedMetrics_op=SUPPORTED_NUMERIC_METRICS,
+        metrics=metrics,
+        topK=topK,
+        predcol=predcol,
+        labelcol=labelcol,
+    )
+    return metric_op
+
+
+def get_single_binary_task_metric_fn(
+    metrics: List[str],
+    classnames: List[str],
+    topK: Tuple[int] = (5, 5, 5),
+    use_topK: bool = False,
+) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]:
+    """
+    graph_output['output']: [BatchSz, 1] [pred_Task1]
+    labels: [BatchSz, 2] [Task1, NumericLabel]
+    """
-
-    if use_topK:
-      metric_op_numeric = get_numeric_metric_fn(metrics=None, topK=topK, predcol=2, labelcol=2)
-      metrics_numeric_res = metric_op_numeric(graph_output, labels, weights)
-      metrics_base_res.update(metrics_numeric_res)
-    return metrics_base_res
+    def get_eval_metric_ops(graph_output, labels, weights):
+        metric_op_base = get_partial_multi_binary_class_metric_fn(
+            metrics, predcols=0, classes=classnames
+        )
+        classnames_unw = ["unweighted_" + cs for cs in classnames]
+        metric_op_unw = get_partial_multi_binary_class_metric_fn(
+            metrics, predcols=0, classes=classnames_unw
+        )
+
+        metrics_base_res = metric_op_base(graph_output, labels, weights)
+        metrics_unw_res = metric_op_unw(graph_output, labels, None)
+        metrics_base_res.update(metrics_unw_res)
+
+        if use_topK:
+            metric_op_numeric = get_numeric_metric_fn(
+                metrics=None, topK=topK, predcol=0, labelcol=1
+            )
+            metrics_numeric_res = metric_op_numeric(graph_output, labels, weights)
+            metrics_base_res.update(metrics_numeric_res)
+        return metrics_base_res
+
+    return get_eval_metric_ops
+
+
+def get_dual_binary_tasks_metric_fn(
+    metrics: List[str],
+    classnames: List[str],
+    topK: Tuple[int] = (5, 5, 5),
+    use_topK: bool = False,
+) -> Callable[[dict, tf.Tensor, tf.Tensor], dict]:
+    """
+    Args:
+        metrics (List[str]):
+            List of metrics to use
+        classnames (List[str]):
+            List of class names
+        topK (Tuple[int]):
+            Top K
+        use_topK (bool):
+            Whether to use top K
+
+    Returns:
+        callable:
+            Evaluation Metric Ops
+    """
-
-  return get_eval_metric_ops
+    def get_eval_metric_ops(graph_output: dict, labels: tf.Tensor, weights: tf.Tensor):
+        """
+        Args:
+            graph_output (dict):
+                Graph Output
+            labels (tf.Tensor):
+                Labels
+            weights (tf.Tensor):
+                Weights
+
+        Returns:
+            dict:
+                Evaluation Metric Ops
+        """
+        metric_op_base = get_partial_multi_binary_class_metric_fn(
+            metrics,
predcols=[0, 1], classes=classnames + ) + classnames_unw = ["unweighted_" + cs for cs in classnames] + metric_op_unw = get_partial_multi_binary_class_metric_fn( + metrics, predcols=[0, 1], classes=classnames_unw + ) + + metrics_base_res = metric_op_base(graph_output, labels, weights) + metrics_unw_res = metric_op_unw(graph_output, labels, None) + metrics_base_res.update(metrics_unw_res) + + if use_topK: + metric_op_numeric = get_numeric_metric_fn( + metrics=None, topK=topK, predcol=2, labelcol=2 + ) + metrics_numeric_res = metric_op_numeric(graph_output, labels, weights) + metrics_base_res.update(metrics_numeric_res) + return metrics_base_res + + return get_eval_metric_ops diff --git a/twml/twml/contrib/metrics/search_metrics.py b/twml/twml/contrib/metrics/search_metrics.py index 7d7a502f1..7038a2eb8 100644 --- a/twml/twml/contrib/metrics/search_metrics.py +++ b/twml/twml/contrib/metrics/search_metrics.py @@ -12,281 +12,302 @@ """ -from collections import OrderedDict from functools import partial +from typing import Callable, Dict, List, Optional, Tuple import tensorflow.compat.v1 as tf from tensorflow.python.eager import context from tensorflow.python.framework import dtypes, ops from tensorflow.python.ops import array_ops, state_ops + import twml from twml.contrib.utils import math_fns -def ndcg(labels, predictions, - metrics_collections=None, - updates_collections=None, - name=None, - top_k_int=1): - # pylint: disable=unused-argument - """ - Compute full normalized discounted cumulative gain (ndcg) based on predictions - ndcg = dcg_k/idcg_k, k is a cut off ranking postion - There are a few variants of ndcg - The dcg (discounted cumulative gain) formula used in - twml.contrib.metrics.ndcg is:: - - \\sum_{i=1}^k \frac{2^{relevance\\_score} -1}{\\log_{2}(i + 1)} - - k is the length of items to be ranked in a batch/query - Notice that whether k will be replaced with a fixed value requires discussions - The scores in predictions are transformed to order and relevance scores to calculate ndcg - A relevance score means how relevant a DataRecord is to a particular query - - Arguments: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Returns: - ndcg: A `Tensor` representing the ndcg score. - update_op: A update operation used to accumulate data into this metric. 
- """ - with tf.variable_scope(name, 'ndcg', (labels, predictions)): - label_scores = tf.to_float(labels, name='label_to_float') - predicted_scores = tf.to_float(predictions, name='predictions_to_float') - - if context.executing_eagerly(): - raise RuntimeError('ndcg is not supported when eager execution ' - 'is enabled.') - - total_ndcg = _metric_variable([], dtypes.float32, name='total_ndcg') - count_query = _metric_variable([], dtypes.float32, name='query_count') - - # actual ndcg cutoff position top_k_int - max_prediction_size = array_ops.size(predicted_scores) - top_k_int = tf.minimum(max_prediction_size, top_k_int) - # the ndcg score of the batch - ndcg = math_fns.cal_ndcg(label_scores, - predicted_scores, top_k_int=top_k_int) - # add ndcg of the current batch to total_ndcg - update_total_op = state_ops.assign_add(total_ndcg, ndcg) - with ops.control_dependencies([ndcg]): - # count_query stores the number of queries - # count_query increases by 1 for each batch/query - update_count_op = state_ops.assign_add(count_query, 1) - - mean_ndcg = math_fns.safe_div(total_ndcg, count_query, 'mean_ndcg') - update_op = math_fns.safe_div(update_total_op, update_count_op, 'update_mean_ndcg_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_ndcg) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean_ndcg, update_op +def ndcg( + labels: tf.Tensor, + predictions: tf.Tensor, + metrics_collections: Optional[tf.Tensor] = None, + updates_collections: Optional[tf.Tensor] = None, + name: Optional[str] = None, + top_k_int: int = 1, +) -> Tuple[tf.Tensor, tf.Tensor]: + # pylint: disable=unused-argument + """ + Compute full normalized discounted cumulative gain (ndcg) based on predictions + ndcg = dcg_k/idcg_k, k is a cut off ranking postion + There are a few variants of ndcg + The dcg (discounted cumulative gain) formula used in + twml.contrib.metrics.ndcg is:: + + \\sum_{i=1}^k \frac{2^{relevance\\_score} -1}{\\log_{2}(i + 1)} + + k is the length of items to be ranked in a batch/query + Notice that whether k will be replaced with a fixed value requires discussions + The scores in predictions are transformed to order and relevance scores to calculate ndcg + A relevance score means how relevant a DataRecord is to a particular query + + Args: + labels (tf.Tensor): + the ground truth value. + predictions (tf.Tensor): + the predicted values, whose shape must match labels. Ignored for CTR computation. + metrics_collections (tf.Tensor): + optional list of collections to add this metric into. + updates_collections (tf.Tensor): + optional list of collections to add the associated update_op into. + name (str): + an optional variable_scope name. + + Returns: + ndcg: A `Tensor` representing the ndcg score. + update_op: A update operation used to accumulate data into this metric. + """ + with tf.variable_scope(name, "ndcg", (labels, predictions)): + label_scores = tf.to_float(labels, name="label_to_float") + predicted_scores = tf.to_float(predictions, name="predictions_to_float") + + if context.executing_eagerly(): + raise RuntimeError( + "ndcg is not supported when eager execution " "is enabled." + ) + total_ndcg = _metric_variable([], dtypes.float32, name="total_ndcg") + count_query = _metric_variable([], dtypes.float32, name="query_count") -# Copied from metrics_impl.py with minor modifications. 
-# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39
-def _metric_variable(shape, dtype, validate_shape=True, name=None):
-  """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections."""
+        # actual ndcg cutoff position top_k_int
+        max_prediction_size = array_ops.size(predicted_scores)
+        top_k_int = tf.minimum(max_prediction_size, top_k_int)
+        # the ndcg score of the batch
+        ndcg = math_fns.cal_ndcg(label_scores, predicted_scores, top_k_int=top_k_int)
+        # add ndcg of the current batch to total_ndcg
+        update_total_op = state_ops.assign_add(total_ndcg, ndcg)
+        with ops.control_dependencies([ndcg]):
+            # count_query stores the number of queries
+            # count_query increases by 1 for each batch/query
+            update_count_op = state_ops.assign_add(count_query, 1)

-  return tf.Variable(
-    lambda: tf.zeros(shape, dtype),
-    trainable=False,
-    collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES],
-    validate_shape=validate_shape,
-    name=name)
+        mean_ndcg = math_fns.safe_div(total_ndcg, count_query, "mean_ndcg")
+        update_op = math_fns.safe_div(
+            update_total_op, update_count_op, "update_mean_ndcg_op"
+        )

+        if metrics_collections:
+            ops.add_to_collections(metrics_collections, mean_ndcg)

-# binary metric_name: (metric, requires thresholded output)
+        if updates_collections:
+            ops.add_to_collections(updates_collections, update_op)
+
+        return mean_ndcg, update_op
+
+
+# Copied from metrics_impl.py with minor modifications.
+# https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39
+def _metric_variable(
+    shape: Tuple[int, ...],
+    dtype: tf.DType,
+    validate_shape: bool = True,
+    name: Optional[str] = None,
+) -> tf.Variable:
+    """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections."""
+
+    return tf.Variable(
+        lambda: tf.zeros(shape, dtype),
+        trainable=False,
+        collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES],
+        validate_shape=validate_shape,
+        name=name,
+    )
+
+
+# binary metric_name: (metric, requires thresholded output)
 SUPPORTED_BINARY_CLASS_METRICS = {
-  # TWML binary metrics
-  'rce': (twml.metrics.rce, False),
-  'nrce': (partial(twml.metrics.rce, normalize=True), False),
-  # CTR measures positive sample ratio. This terminology is inherited from Ads.
-  'ctr': (twml.metrics.ctr, False),
-  # predicted CTR measures predicted positive ratio.
-  'predicted_ctr': (twml.metrics.predicted_ctr, False),
-  # thresholded metrics
-  'accuracy': (tf.metrics.accuracy, True),
-  'precision': (tf.metrics.precision, True),
-  'recall': (tf.metrics.recall, True),
-  # tensorflow metrics
-  'roc_auc': (partial(tf.metrics.auc, curve='ROC'), False),
-  'pr_auc': (partial(tf.metrics.auc, curve='PR'), False),
+    # TWML binary metrics
+    "rce": (twml.metrics.rce, False),
+    "nrce": (partial(twml.metrics.rce, normalize=True), False),
+    # CTR measures positive sample ratio. This terminology is inherited from Ads.
+    "ctr": (twml.metrics.ctr, False),
+    # predicted CTR measures predicted positive ratio.
+ "predicted_ctr": (twml.metrics.predicted_ctr, False), + # thresholded metrics + "accuracy": (tf.metrics.accuracy, True), + "precision": (tf.metrics.precision, True), + "recall": (tf.metrics.recall, True), + # tensorflow metrics + "roc_auc": (partial(tf.metrics.auc, curve="ROC"), False), + "pr_auc": (partial(tf.metrics.auc, curve="PR"), False), } # search metric_name: metric SUPPORTED_SEARCH_METRICS = { - # TWML search metrics - # ndcg needs the raw prediction scores to sort - 'ndcg': ndcg, + # TWML search metrics + # ndcg needs the raw prediction scores to sort + "ndcg": ndcg, } -def get_search_metric_fn(binary_metrics=None, search_metrics=None, - ndcg_top_ks=[1, 3, 5, 10], use_binary_metrics=False): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for ranking. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions. Required. - threshold: - Only used in SUPPORTED_BINARY_CLASS_METRICS - If the lables are 0s and 1s - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - Only used in SUPPORTED_BINARY_CLASS_METRICS - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Arguments: - only used in pointwise learning-to-rank - - binary_metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When binary_metrics is None (the default), it defaults to all supported metrics - - search_metrics (list of String): - a list of metrics of interest. E.g. ['ndcg'] - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - ndcg - - NOTE: ndcg works for ranking-relatd problems. - A batch contains all DataRecords that belong to the same query - If pair_in_batch_mode used in scalding -- a batch contains a pair of DataRecords - that belong to the same query and have different labels -- ndcg does not apply in here. 
-
-  When search_metrics is None (the default), it defaults to all supported search metrics
-  currently only 'ndcg'
-
-  ndcg_top_ks (list of integers):
-    The cut-off ranking postions for a query
-    When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10]
-
-  use_binary_metrics:
-    False (default)
-    Only set it to true in pointwise learning-to-rank
-  """
-  # pylint: disable=dict-keys-not-iterating
-
-  if ndcg_top_ks is None or not ndcg_top_ks:
-    ndcg_top_ks = [1, 3, 5, 10]
-
-  if search_metrics is None:
-    search_metrics = list(SUPPORTED_SEARCH_METRICS.keys())
-
-  if binary_metrics is None and use_binary_metrics:
-    # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metics as well
-    # they are only used in pointwise learing-to-rank
-    binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys())
-
-  def get_eval_metric_ops(graph_output, labels, weights):
+def get_search_metric_fn(
+    binary_metrics: Optional[List[str]] = None,
+    search_metrics: Optional[List[str]] = None,
+    ndcg_top_ks: List[int] = [1, 3, 5, 10],
+    use_binary_metrics: bool = False,
+) -> Callable[[Dict[str, tf.Tensor], tf.Tensor, tf.Tensor], Dict[str, tf.Tensor]]:
     """
-  graph_output:
-    dict that is returned by build_graph given input features.
-  labels:
-    target labels associated to batch.
-  weights:
-    weights of the samples..
+    Returns a function having signature:
+
+    .. code-block:: python
+
+        def get_eval_metric_ops(graph_output, labels, weights):
+            ...
+            return eval_metric_ops
+
+    where the returned eval_metric_ops is a dict of common evaluation metric
+    Ops for ranking. See `tf.estimator.EstimatorSpec
+    `_
+    for a description of eval_metric_ops. The graph_output is the result
+    dict returned by build_graph. Labels and weights are tf.Tensors.
+
+    The following graph_output keys are recognized:
+        output:
+            the raw predictions. Required.
+        threshold:
+            Only used in SUPPORTED_BINARY_CLASS_METRICS
+            If the labels are 0s and 1s
+            A value between 0 and 1 used to threshold the output into a hard_output.
+            Defaults to 0.5 when threshold and hard_output are missing.
+            Either threshold or hard_output can be provided, but not both.
+        hard_output:
+            Only used in SUPPORTED_BINARY_CLASS_METRICS
+            A thresholded output. Either threshold or hard_output can be provided, but not both.
+
+    Args:
+        binary_metrics (list of String):
+            only used in pointwise learning-to-rank
+            a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+            These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+            Supported metrics:
+                - ctr (same as positive sample ratio.)
+                - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+                - nrce (normalized rce, do not use this one if you do not understand what it is)
+                - pr_auc
+                - roc_auc
+                - accuracy (percentage of predictions that are correct)
+                - precision (true positives) / (true positives + false positives)
+                - recall (true positives) / (true positives + false negatives)
+            NOTE: accuracy / precision / recall apply to binary classification problems only.
+            I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+            is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+            precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+            your predictions into hard 0/1 labels.
+            When binary_metrics is None (the default), it defaults to all supported metrics
+        search_metrics (list of String):
+            a list of metrics of interest. E.g.
['ndcg'] + These metrics are evaluated and reported to tensorboard *during the eval phases only*. + Supported metrics: + - ndcg + NOTE: ndcg works for ranking-related problems. + A batch contains all DataRecords that belong to the same query + If pair_in_batch_mode used in scalding -- a batch contains a pair of DataRecords + that belong to the same query and have different labels -- ndcg does not apply in here. + When search_metrics is None (the default), it defaults to all supported search metrics + currently only 'ndcg' + ndcg_top_ks (list of integers): + The cut-off ranking positions for a query + When ndcg_top_ks is None or empty (the default), it defaults to [1, 3, 5, 10] + use_binary_metrics: + False (default) + Only set it to true in pointwise learning-to-rank """ - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - # hard_preds is a tensor - # check hard_preds is None and then check if it is empty - if hard_preds is None or tf.equal(tf.size(hard_preds), 0): - hard_preds = tf.greater_equal(preds, threshold) - - # add search metrics to eval_metric_ops dict - for metric_name in search_metrics: - metric_name = metric_name.lower() # metric name are case insensitive. - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - search_metric_factory = SUPPORTED_SEARCH_METRICS.get(metric_name) - if search_metric_factory: - if metric_name == 'ndcg': - for top_k in ndcg_top_ks: - # metric name will show as ndcg_1, ndcg_10, ... - metric_name_ndcg_top_k = metric_name + '_' + str(top_k) - top_k_int = tf.constant(top_k, dtype=tf.int32) - # Note: having weights in ndcg does not make much sense - # Because ndcg already has position weights/discounts - # Thus weights are not applied in ndcg metric - value_op, update_op = search_metric_factory( - labels=labels, - predictions=preds, - name=metric_name_ndcg_top_k, - top_k_int=top_k_int) - eval_metric_ops[metric_name_ndcg_top_k] = (value_op, update_op) - else: - raise ValueError('Cannot find the search metric named ' + metric_name) - - if use_binary_metrics: - # add binary metrics to eval_metric_ops dict - for metric_name in binary_metrics: - - if metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - metric_name = metric_name.lower() # metric name are case insensitive. 
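The dcg formula documented for `ndcg` above is easy to check numerically. A NumPy sketch of one batch's score, assuming labels serve directly as relevance scores and predictions only determine the ranking order (the exact transforms live in `twml.contrib.utils.math_fns.cal_ndcg`, which is not shown in this diff, so this is an illustrative variant, not the library implementation):

import numpy as np

def dcg_at_k(relevances, k):
    # sum_{i=1..k} (2^rel_i - 1) / log2(i + 1), as in the docstring above
    rel = np.asarray(relevances, dtype=float)[:k]
    positions = np.arange(1, len(rel) + 1)
    return np.sum((2.0 ** rel - 1.0) / np.log2(positions + 1))

def ndcg_at_k(labels, predictions, k):
    order = np.argsort(-np.asarray(predictions))       # rank by prediction
    dcg = dcg_at_k(np.asarray(labels)[order], k)
    idcg = dcg_at_k(np.sort(labels)[::-1], k)          # ideal (label-sorted) order
    return dcg / idcg if idcg > 0 else 0.0

labels = [3, 2, 0, 1]
predictions = [0.2, 0.9, 0.1, 0.4]   # ranks item 1 first, then 3, then 0
print(ndcg_at_k(labels, predictions, k=3))   # ~0.76 for this mis-ordering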
-      binary_metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
-      if binary_metric_factory:
-        value_op, update_op = binary_metric_factory(
-          labels=labels,
-          predictions=(hard_preds if requires_threshold else preds),
-          weights=weights,
-          name=metric_name)
-        eval_metric_ops[metric_name] = (value_op, update_op)
-      else:
-        raise ValueError('Cannot find the binary metric named ' + metric_name)
-
-    return eval_metric_ops
-
-  return get_eval_metric_ops
+    # pylint: disable=dict-keys-not-iterating
+
+    if ndcg_top_ks is None or not ndcg_top_ks:
+        ndcg_top_ks = [1, 3, 5, 10]
+
+    if search_metrics is None:
+        search_metrics = list(SUPPORTED_SEARCH_METRICS.keys())
+
+    if binary_metrics is None and use_binary_metrics:
+        # Added SUPPORTED_BINARY_CLASS_METRICS in twml.metrics as well
+        # they are only used in pointwise learning-to-rank
+        binary_metrics = list(SUPPORTED_BINARY_CLASS_METRICS.keys())
+
+    def get_eval_metric_ops(
+        graph_output: Dict[str, tf.Tensor], labels: tf.Tensor, weights: tf.Tensor
+    ) -> Dict[str, tf.Tensor]:
+        """
+        graph_output:
+            dict that is returned by build_graph given input features.
+        labels:
+            target labels associated to batch.
+        weights:
+            weights of the samples.
+        """
+
+        eval_metric_ops = dict()
+
+        preds = graph_output["output"]
+
+        threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5
+
+        hard_preds = graph_output.get("hard_output")
+        # hard_preds is a tensor
+        # check hard_preds is None and then check if it is empty
+        if hard_preds is None or tf.equal(tf.size(hard_preds), 0):
+            hard_preds = tf.greater_equal(preds, threshold)
+
+        # add search metrics to eval_metric_ops dict
+        for metric_name in search_metrics:
+            metric_name = metric_name.lower()  # metric names are case-insensitive.
+
+            if metric_name in eval_metric_ops:
+                # avoid adding duplicate metrics.
+                continue
+
+            search_metric_factory = SUPPORTED_SEARCH_METRICS.get(metric_name)
+            if search_metric_factory:
+                if metric_name == "ndcg":
+                    for top_k in ndcg_top_ks:
+                        # metric name will show as ndcg_1, ndcg_10, ...
+                        metric_name_ndcg_top_k = metric_name + "_" + str(top_k)
+                        top_k_int = tf.constant(top_k, dtype=tf.int32)
+                        # Note: having weights in ndcg does not make much sense
+                        # Because ndcg already has position weights/discounts
+                        # Thus weights are not applied in ndcg metric
+                        value_op, update_op = search_metric_factory(
+                            labels=labels,
+                            predictions=preds,
+                            name=metric_name_ndcg_top_k,
+                            top_k_int=top_k_int,
+                        )
+                        eval_metric_ops[metric_name_ndcg_top_k] = (value_op, update_op)
+            else:
+                raise ValueError("Cannot find the search metric named " + metric_name)
+
+        if use_binary_metrics:
+            # add binary metrics to eval_metric_ops dict
+            for metric_name in binary_metrics:
+                if metric_name in eval_metric_ops:
+                    # avoid adding duplicate metrics.
+                    continue
+
+                metric_name = metric_name.lower()  # metric names are case-insensitive.
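The defaulting logic above (raw `output`, optional `threshold`, optional `hard_output`) reduces to a small rule that is worth seeing in isolation. A NumPy sketch with a hypothetical helper name; only metrics flagged as requiring thresholded output receive the hard 0/1 predictions:

import numpy as np

def resolve_hard_preds(graph_output):
    preds = graph_output["output"]
    threshold = graph_output.get("threshold", 0.5)   # same 0.5 default as above
    hard = graph_output.get("hard_output")
    return hard if hard is not None else preds >= threshold

graph_output = {"output": np.array([0.2, 0.5, 0.8])}
print(resolve_hard_preds(graph_output))   # [False  True  True]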
+                (
+                    binary_metric_factory,
+                    requires_threshold,
+                ) = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
+                if binary_metric_factory:
+                    value_op, update_op = binary_metric_factory(
+                        labels=labels,
+                        predictions=(hard_preds if requires_threshold else preds),
+                        weights=weights,
+                        name=metric_name,
+                    )
+                    eval_metric_ops[metric_name] = (value_op, update_op)
+                else:
+                    raise ValueError(
+                        "Cannot find the binary metric named " + metric_name
+                    )
+
+        return eval_metric_ops
+
+    return get_eval_metric_ops
diff --git a/twml/twml/contrib/optimizers/__init__.py b/twml/twml/contrib/optimizers/__init__.py
index 112b2b410..c140e55af 100644
--- a/twml/twml/contrib/optimizers/__init__.py
+++ b/twml/twml/contrib/optimizers/__init__.py
@@ -1,4 +1,6 @@
 # pylint: disable=wildcard-import
 """This module contains experimental optimizer classes"""
-from .deep_gradient_compression_optimizer import DeepGradientCompressionOptimizer  # noqa: F401
+from .deep_gradient_compression_optimizer import (
+    DeepGradientCompressionOptimizer,
+)  # noqa: F401
 from .pruning_optimizer import PruningOptimizer  # noqa: F401
diff --git a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py
index 2c71ed13f..4447feb90 100644
--- a/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py
+++ b/twml/twml/contrib/optimizers/deep_gradient_compression_optimizer.py
@@ -8,173 +8,225 @@
 # TODO: Test how much communication overhead this DeepGradientCompressionOptimizer can reduce under
 # multi-GPU and distributed setting.
-import tensorflow.compat.v1 as tf
-
+from typing import List, Optional

-def compute_threshold(grad, density):
-  """
-  A utility function to compute the threshold for gradient sparsification, given the gradient
-  tensor and the density.
-  Args:
-    grad(tf.Tensor):
-      Gradient tensor for some variable.
-    density(float):
-      Density degree when sparsifying gradients.
-  Returns(float):
-    Threshold for gradient sparsification.
-  """
-  flat_grad = tf.reshape(grad, [-1])
-  abs_flat_grad = tf.abs(flat_grad)
-  size = tf.shape(abs_flat_grad)[0]
-  k = tf.maximum(tf.constant(1),
-                 tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32))
-  topk, _ = tf.nn.top_k(abs_flat_grad, k, False)
-  return topk[-1]
-
-
-def get_top_row_indices(values, density):
-  """
-  A utility function to get indices of most significant rows, given the density degree.
-  Args:
-    values(tf.Tensor):
-      Gradient or locally accumulated gradient for some variable.
-    density(float):
-      Density degree when filtering out rows.
-  Returns(list(int)):
-    Indices of most significant rows.
-  """
-  abs_values = tf.abs(values)
-
-  try:
-    row_num = tf.shape(abs_values)[0]
-    k = tf.maximum(tf.constant(1),
-                   tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32))
-    row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True))
-    _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False)
-    # print "abs_values", abs_values, "row_sums", row_sums
-    return top_row_indices
-    # return tf.range(row_num)
-
-  except ValueError:  # if the tensor is 0-D or 1-D
-    return None
+import tensorflow.compat.v1 as tf

-class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer):
-  """
-  A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887).
- """ - - def __init__(self, learning_rate, use_locking=False, name="Sparse", - density=1.0, - density_decay=False, - density_decay_steps=10000, - density_decay_rate=0.5, - min_density=0.1, - accumulation=False): - super(DeepGradientCompressionOptimizer, self).__init__(learning_rate, use_locking, name) - self._initial_density_t = tf.convert_to_tensor(density) - self._density_decay = density_decay - dtype = self._initial_density_t.dtype - self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) - self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) - self._min_density_t = tf.convert_to_tensor(min_density, dtype) - self._accumulation = accumulation - - def _prepare(self): - super(DeepGradientCompressionOptimizer, self)._prepare() - if not self._density_decay: - self._density_t = self._initial_density_t - else: - dtype = self._initial_density_t.dtype - global_step = tf.cast(tf.train.get_global_step(), dtype) - p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) - decayed_density = tf.multiply(self._initial_density_t, - tf.pow(self._density_decay_rate_t, p)) - self._density_t = tf.maximum(self._min_density_t, decayed_density) - - def _create_slots(self, var_list): +def compute_threshold(grad: tf.Tensor, density: float) -> float: """ - Create a slot variable to accumulate gradients locally for each variable in `var_list`. + A utility function to compute the threshold for gradient sparsification, given the gradient + tensor and the density. Args: - var_list(list(tf.Variable)): - List of variables to accumulate gradients locally for. + grad(tf.Tensor): + Gradient tensor for some variable. + density (float): + Density degree when sparsifying gradients. + Returns: + (float) Threshold for gradient sparsification. 
""" - for var in var_list: - self._zeros_slot(var, "g_buffer", self._name) - - def _apply_dense(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(grad, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - else: - g_buffer = self.get_slot(var, "g_buffer") - - g_buffer = tf.assign_add(g_buffer, grad) - - top_row_indices = get_top_row_indices(g_buffer, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_dense(grad, var) - - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) - - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) - - return tf.group(*[update_var, update_g_buffer]) - - def _apply_sparse_duplicate_indices(self, grad, var): - if not self._accumulation: - top_row_indices = get_top_row_indices(grad.values, self._density_t) - - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices(grad, var) # noqa: E501 - - sparsified_values = tf.gather(grad.values, top_row_indices) - sparsified_indices = tf.gather(grad.indices, top_row_indices) - - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + flat_grad = tf.reshape(grad, [-1]) + abs_flat_grad = tf.abs(flat_grad) + size = tf.shape(abs_flat_grad)[0] + k = tf.maximum( + tf.constant(1), + tf.cast(tf.scalar_mul(density, tf.cast(size, tf.float32)), tf.int32), + ) + topk, _ = tf.nn.top_k(abs_flat_grad, k, False) + return topk[-1] + + +def get_top_row_indices(values: tf.Tensor, density: float) -> List[int]: + """ + A utility function to get indices of most significant rows, given the density degree. + Args: + values(tf.Tensor): + Gradient or locally accumulated gradient for some variable. + density(float): + Density degree when filtering out rows. + Returns(list(int)): + Indices of most significant rows. + """ + abs_values = tf.abs(values) - return super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) + try: + row_num = tf.shape(abs_values)[0] + k = tf.maximum( + tf.constant(1), + tf.cast(tf.scalar_mul(density, tf.cast(row_num, tf.float32)), tf.int32), + ) + row_sums = tf.squeeze(tf.reduce_sum(values, axis=1, keepdims=True)) + _, top_row_indices = tf.nn.top_k(row_sums, k=k, sorted=False) + # print "abs_values", abs_values, "row_sums", row_sums + return top_row_indices + # return tf.range(row_num) - else: - g_buffer = self.get_slot(var, "g_buffer") + except ValueError: # if the tensor is 0-D or 1-D + return None - g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) - top_row_indices = get_top_row_indices(g_buffer, self._density_t) +class DeepGradientCompressionOptimizer(tf.train.GradientDescentOptimizer): + """ + A custom optimizer to implement Deep Gradient Compression (https://arxiv.org/abs/1712.01887). 
+ """ - if top_row_indices is None: - return super(DeepGradientCompressionOptimizer, - self)._apply_sparse_duplicate_indices(grad, var) + def __init__( + self, + learning_rate: float, + use_locking: bool = False, + name: str = "Sparse", + density: float = 1.0, + density_decay: bool = False, + density_decay_steps: int = 10000, + density_decay_rate: float = 0.5, + min_density: float = 0.1, + accumulation: bool = False, + ): + super(DeepGradientCompressionOptimizer, self).__init__( + learning_rate, use_locking, name + ) + self._initial_density_t = tf.convert_to_tensor(density) + self._density_decay = density_decay + dtype = self._initial_density_t.dtype + self._density_decay_steps_t = tf.convert_to_tensor(density_decay_steps, dtype) + self._density_decay_rate_t = tf.convert_to_tensor(density_decay_rate, dtype) + self._min_density_t = tf.convert_to_tensor(min_density, dtype) + self._accumulation = accumulation + + def _prepare(self) -> None: + super(DeepGradientCompressionOptimizer, self)._prepare() + if not self._density_decay: + self._density_t = self._initial_density_t + else: + dtype = self._initial_density_t.dtype + global_step = tf.cast(tf.train.get_global_step(), dtype) + p = tf.floor(tf.divide(global_step, self._density_decay_steps_t)) + decayed_density = tf.multiply( + self._initial_density_t, tf.pow(self._density_decay_rate_t, p) + ) + self._density_t = tf.maximum(self._min_density_t, decayed_density) + + def _create_slots(self, var_list: List[tf.Variable]) -> None: + """ + Create a slot variable to accumulate gradients locally for each variable in `var_list`. + Args: + var_list(list(tf.Variable)): + List of variables to accumulate gradients locally for. + """ + for var in var_list: + self._zeros_slot(var, "g_buffer", self._name) + + def _apply_dense(self, grad: tf.Tensor, var: tf.Variable) -> tf.Operation: + """ + Apply dense gradients to variables. + + Args: + grad(tf.Tensor): + Dense gradients to apply. + var(tf.Variable): + Variable to apply gradients to. + + Returns: + (tf.Operation) Operation to apply dense gradients to variables. + """ + if not self._accumulation: + top_row_indices = get_top_row_indices(grad, self._density_t) + + if top_row_indices is None: + return super(DeepGradientCompressionOptimizer, self)._apply_dense( + grad, var + ) + + sparsified_values = tf.gather(grad, top_row_indices) + sparsified_indices = top_row_indices + + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + else: + g_buffer = self.get_slot(var, "g_buffer") + + g_buffer = tf.assign_add(g_buffer, grad) + + top_row_indices = get_top_row_indices(g_buffer, self._density_t) + + if top_row_indices is None: + return super(DeepGradientCompressionOptimizer, self)._apply_dense( + grad, var + ) + + sparsified_values = tf.gather(g_buffer, top_row_indices) + sparsified_indices = top_row_indices + + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + update_var = super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + update_g_buffer = tf.scatter_update( + g_buffer, sparsified_indices, tf.zeros_like(sparsified_values) + ) + + return tf.group(*[update_var, update_g_buffer]) + + def _apply_sparse_duplicate_indices( + self, grad: tf.IndexedSlices, var: tf.Variable + ) -> tf.Operation: + """ + Apply sparse gradients to variables. + + Args: + grad(tf.IndexedSlices): + Sparse gradients to apply. 
+ var(tf.Variable): + Variable to apply gradients to. + + Returns: + (tf.Operation) Operation to apply sparse gradients to variables. + """ + + if not self._accumulation: + top_row_indices = get_top_row_indices(grad.values, self._density_t) + + if top_row_indices is None: + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices( + grad, var + ) # noqa: E501 + + sparsified_values = tf.gather(grad.values, top_row_indices) + sparsified_indices = tf.gather(grad.indices, top_row_indices) + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) + + else: + g_buffer = self.get_slot(var, "g_buffer") + g_buffer = tf.scatter_update(g_buffer, grad.indices, grad.values) + top_row_indices = get_top_row_indices(g_buffer, self._density_t) - sparsified_values = tf.gather(g_buffer, top_row_indices) - sparsified_indices = top_row_indices + if top_row_indices is None: + return super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(grad, var) - sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) + sparsified_values = tf.gather(g_buffer, top_row_indices) + sparsified_indices = top_row_indices + sparsified_grad = tf.IndexedSlices(sparsified_values, sparsified_indices) - update_var = super(DeepGradientCompressionOptimizer, self)._apply_sparse_duplicate_indices( - sparsified_grad, var) + update_var = super( + DeepGradientCompressionOptimizer, self + )._apply_sparse_duplicate_indices(sparsified_grad, var) - update_g_buffer = tf.scatter_update(g_buffer, sparsified_indices, tf.zeros_like( - sparsified_values)) + update_g_buffer = tf.scatter_update( + g_buffer, sparsified_indices, tf.zeros_like(sparsified_values) + ) - return tf.group(*[update_var, update_g_buffer]) + return tf.group(*[update_var, update_g_buffer]) diff --git a/twml/twml/contrib/optimizers/pruning_optimizer.py b/twml/twml/contrib/optimizers/pruning_optimizer.py index 2bcd612ed..40f2fc007 100644 --- a/twml/twml/contrib/optimizers/pruning_optimizer.py +++ b/twml/twml/contrib/optimizers/pruning_optimizer.py @@ -6,159 +6,168 @@ To make a layer prunable, use `twml.contrib.pruning.apply_mask`: - dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu) - dense1 = apply_mask(dense1) + dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu) + dense1 = apply_mask(dense1) To prune the network, apply PruningOptimizer to any cross-entropy loss: - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) + loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - optimizer = PruningOptimizer(learning_rate=0.001, momentum=0.5) - minimize = optimizer.minimize( - loss=loss, - prune_every=10, - burn_in=100, - global_step=tf.train.get_global_step()) + optimizer = PruningOptimizer(learning_rate=0.001, momentum=0.5) + minimize = optimizer.minimize( + loss=loss, + prune_every=10, + burn_in=100, + global_step=tf.train.get_global_step()) """ +from typing import Optional + import tensorflow.compat.v1 as tf -from twml.contrib.pruning import computational_cost, prune, update_pruning_signals -from twml.contrib.pruning import MASK_COLLECTION +from twml.contrib.pruning import ( + MASK_COLLECTION, + computational_cost, + prune, + update_pruning_signals, +) class PruningOptimizer(tf.train.MomentumOptimizer): - """ - Updates parameters with SGD and pruning masks using Fisher pruning. 
- - Arguments: - learning_rate: float - Learning rate of SGD - - momentum: float - Momentum used by SGD - - use_locking: bool - If `True`, use locks for update operations - - name: str - Optional name prefix for the operations created when applying gradients - - use_nesterov: bool - If `True`, use Nesterov momentum - """ - - def __init__( - self, - learning_rate, - momentum=0.9, - use_locking=False, - name="PruningOptimizer", - use_nesterov=False): - super(PruningOptimizer, self).__init__( - learning_rate=learning_rate, - momentum=momentum, - use_locking=use_locking, - name=name, - use_nesterov=use_nesterov) - - def minimize( - self, - loss, - prune_every=100, - burn_in=0, - decay=.96, - flops_weight='AUTO', - flops_target=0, - update_params=None, - method='Fisher', - *args, - **kwargs): """ - Create operations to minimize loss and to prune features. - - A pruning signal measures the importance of feature maps. This is weighed against the - computational cost of computing a feature map. Features are then iteratively pruned - based on a weighted average of feature importance S and computational cost C (in FLOPs): - - $$S + w * C$$ - - Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not - necessarily optimal. - - Arguments: - loss: tf.Tensor - The value to minimize - - prune_every: int - One entry of a mask is set to zero only every few update steps - - burn_in: int - Pruning starts only after this many parameter updates - - decay: float - Controls exponential moving average of pruning signals - - flops_weight: float or str - Controls the targeted trade-off between computational complexity and performance - - flops_target: float - Stop pruning when computational complexity is less or this many floating point ops - - update_params: tf.Operation - Optional training operation used instead of MomentumOptimizer to update parameters - - method: str - Method used to compute pruning signal (currently only supports 'Fisher') - - Returns: - A `tf.Operation` updating parameters and pruning masks - - References: - * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + Updates parameters with SGD and pruning masks using Fisher pruning. 
+ + Args: + learning_rate: float + Learning rate of SGD + momentum: float + Momentum used by SGD + use_locking: bool + If `True`, use locks for update operations + name: str + Optional name prefix for the operations created when applying gradients + use_nesterov: bool + If `True`, use Nesterov momentum """ - # gradient-based updates of parameters - if update_params is None: - update_params = super(PruningOptimizer, self).minimize(loss, *args, **kwargs) - - masks = tf.get_collection(MASK_COLLECTION) - - with tf.variable_scope('pruning_opt', reuse=True): - # estimate computational cost per data point - batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0] - cost = tf.divide(computational_cost(loss), batch_size, name='computational_cost') - - tf.summary.scalar('computational_cost', cost) - - if masks: - signals = update_pruning_signals(loss, masks=masks, decay=decay, method=method) - - # estimate computational cost per feature map - costs = tf.gradients(cost, masks) - - # trade off computational complexity and performance - if flops_weight.upper() == 'AUTO': - signals = [s / (c + 1e-6) for s, c in zip(signals, costs)] - elif not isinstance(flops_weight, float) or flops_weight != 0.: - signals = [s - flops_weight * c for s, c in zip(signals, costs)] - - counter = tf.Variable(0, name='pruning_counter') - counter = tf.assign_add(counter, 1, use_locking=True) - - # only prune every so often after a burn-in phase - pruning_cond = tf.logical_and(counter > burn_in, tf.equal(counter % prune_every, 0)) - - # stop pruning after reaching threshold - if flops_target > 0: - pruning_cond = tf.logical_and(pruning_cond, tf.greater(cost, flops_target)) - - update_masks = tf.cond( - pruning_cond, - lambda: prune(signals, masks=masks), - lambda: tf.group(masks)) - - return tf.group([update_params, update_masks]) - - # no masks found - return update_params + def __init__( + self, + learning_rate: float, + momentum: float = 0.9, + use_locking: bool = False, + name: str = "PruningOptimizer", + use_nesterov: bool = False, + ): + super(PruningOptimizer, self).__init__( + learning_rate=learning_rate, + momentum=momentum, + use_locking=use_locking, + name=name, + use_nesterov=use_nesterov, + ) + + def minimize( + self, + loss: tf.Tensor, + prune_every: int = 100, + burn_in: int = 0, + decay: float = 0.96, + flops_weight: str = "AUTO", + flops_target: int = 0, + update_params: Optional[tf.Operation] = None, + method: str = "Fisher", + *args, + **kwargs + ) -> tf.Operation: + """ + Create operations to minimize loss and to prune features. + + A pruning signal measures the importance of feature maps. This is weighed against the + computational cost of computing a feature map. Features are then iteratively pruned + based on a weighted average of feature importance S and computational cost C (in FLOPs): + + $$S + w * C$$ + + Setting `flops_weight` to 'AUTO' is the most convenient and recommended option, but not + necessarily optimal. 
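+
+        For example (illustrative numbers): with pruning signals S = [0.2, 0.05]
+        and per-feature costs C = [100.0, 10.0] FLOPs, 'AUTO' ranks features by
+        S / (C + 1e-6), i.e. [0.002, 0.005], so the first feature (expensive
+        relative to its importance) is pruned first; a fixed float weight w
+        ranks them by S - w * C instead.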
+ + Args: + loss: tf.Tensor + The value to minimize + prune_every: int + One entry of a mask is set to zero only every few update steps + burn_in: int + Pruning starts only after this many parameter updates + decay: float + Controls exponential moving average of pruning signals + flops_weight: float or str + Controls the targeted trade-off between computational complexity and performance + flops_target: float + Stop pruning when computational complexity is less or this many floating point ops + update_params: tf.Operation + Optional training operation used instead of MomentumOptimizer to update parameters + method: str + Method used to compute pruning signal (currently only supports 'Fisher') + + Returns: + A `tf.Operation` updating parameters and pruning masks + + References: + * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + """ + + # gradient-based updates of parameters + if update_params is None: + update_params = super(PruningOptimizer, self).minimize( + loss, *args, **kwargs + ) + + masks = tf.get_collection(MASK_COLLECTION) + + with tf.variable_scope("pruning_opt", reuse=True): + # estimate computational cost per data point + batch_size = tf.cast(tf.shape(masks[0].tensor), loss.dtype)[0] + cost = tf.divide( + computational_cost(loss), batch_size, name="computational_cost" + ) + + tf.summary.scalar("computational_cost", cost) + + if masks: + signals = update_pruning_signals( + loss, masks=masks, decay=decay, method=method + ) + + # estimate computational cost per feature map + costs = tf.gradients(cost, masks) + + # trade off computational complexity and performance + if flops_weight.upper() == "AUTO": + signals = [s / (c + 1e-6) for s, c in zip(signals, costs)] + elif not isinstance(flops_weight, float) or flops_weight != 0.0: + signals = [s - flops_weight * c for s, c in zip(signals, costs)] + + counter = tf.Variable(0, name="pruning_counter") + counter = tf.assign_add(counter, 1, use_locking=True) + + # only prune every so often after a burn-in phase + pruning_cond = tf.logical_and( + counter > burn_in, tf.equal(counter % prune_every, 0) + ) + + # stop pruning after reaching threshold + if flops_target > 0: + pruning_cond = tf.logical_and( + pruning_cond, tf.greater(cost, flops_target) + ) + + update_masks = tf.cond( + pruning_cond, + lambda: prune(signals, masks=masks), + lambda: tf.group(masks), + ) + + return tf.group([update_params, update_masks]) + + # no masks found + return update_params diff --git a/twml/twml/contrib/parsers.py b/twml/twml/contrib/parsers.py index a27f2acbd..448b724fa 100644 --- a/twml/twml/contrib/parsers.py +++ b/twml/twml/contrib/parsers.py @@ -1,21 +1,21 @@ -''' +""" Contains implementations of functions to parse the contrib.FeatureConfig Modelers can use the functions in this module as the the train/eval_parse_fn of the DataRecordTrainer constructor to customize how to parse their datasets. Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. 
-''' +""" -from twitter.deepbird.io.legacy.contrib.parsers import ( - _convert_to_fixed_length_tensor, # noqa: F401 - _get_input_receiver_fn_feature_dict, # noqa: F401 - _merge_dictionaries, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_keras_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_string_tensor_parse_fn, # noqa: F401 - get_string_tensor_serving_input_receiver_fn, # noqa: F401 - get_supervised_input_receiver_fn_feature_dict, # noqa: F401 - parse_string_tensor, # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import _merge_dictionaries # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import get_keras_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import parse_string_tensor # noqa: F401 +from twitter.deepbird.io.legacy.contrib.parsers import ( # noqa: F401 + _convert_to_fixed_length_tensor, + _get_input_receiver_fn_feature_dict, + get_features_as_tensor_dict, + get_serving_input_receiver_fn_feature_dict, + get_string_tensor_parse_fn, + get_string_tensor_serving_input_receiver_fn, + get_supervised_input_receiver_fn_feature_dict, ) diff --git a/twml/twml/contrib/pruning.py b/twml/twml/contrib/pruning.py index b6ddee693..950f3c9d1 100644 --- a/twml/twml/contrib/pruning.py +++ b/twml/twml/contrib/pruning.py @@ -3,361 +3,395 @@ In particular, it provides tools for dealing with masks: - features = apply_mask(features) + features = apply_mask(features) The function `apply_mask` applies a binary mask to the channels of a given tensor. Consider the following loss: - logits = tf.matmul(features, weights) - loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) + logits = tf.matmul(features, weights) + loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) Each mask has a corresponding pruning signal. The function `update_pruning_signals` will update and return these signals: - signals = update_pruning_signals(loss) + signals = update_pruning_signals(loss) The pruning operation will zero out the mask entry with the smallest corresponding pruning signal: - prune(signals) + prune(signals) The following function allows us to estimate the computational cost of a graph (number of FLOPs): - cost = computational_cost(loss) + cost = computational_cost(loss) To compute the cost of each feature per data point, we can do: - costs = tf.gradients(cost / batch_size, masks) + costs = tf.gradients(cost / batch_size, masks) The current implementation of `computational_cost` is designed to work with standard feed-forward and convolutional network architectures only, but may fail with more complicated architectures. """ +from typing import List, Optional, Set, Union + import numpy as np import tensorflow.compat.v1 as tf -MASK_COLLECTION = 'pruning/masks' -MASK_EXTENDED_COLLECTION = 'pruning/masks_extended' -OP_COLLECTION = 'pruning/ops' - - -def apply_mask(tensor, name='pruning'): - """ - Point-wise multiplies a tensor with a binary mask. - - During training, pruning is simulated by setting entries of the mask to zero. 
- - Arguments: - tensor: tf.Tensor - A tensor where the last dimension represents channels which will be masked - - Returns: - `tf.Tensor` with same shape as `tensor` - """ - - tensor_shape = tensor.shape - - with tf.variable_scope(name, reuse=True): - # allocate masks and corresponding pruning signals - mask = tf.Variable(tf.ones(tensor.shape.as_list()[-1]), trainable=False, name='mask') - pruning_signal = tf.Variable(tf.zeros_like(mask), trainable=False, name='signal') - - # extending masks is a trick to get a separate gradient for each data point - mask_extended = extend_mask(mask, tensor) - - # store extended mask, pruning signal, and other vars for easy access later - mask.extended = mask_extended - mask.pruning_signal = pruning_signal - mask.tensor = tensor - - # mask tensor - tensor = tf.multiply(tensor, mask_extended) - tensor.set_shape(tensor_shape) - tensor._mask = mask - - tf.add_to_collection(MASK_COLLECTION, mask) - tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended) - tf.add_to_collection(OP_COLLECTION, tensor.op) - - return tensor - - -def extend_mask(mask, tensor): - """ - Repeats the mask for each data point stored in a tensor. - - If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional - tensor with A copies or `mask`. - - Arguments: - mask: tf.Tensor - The mask which will be extended - - tensor: tf.Tensor - The tensor to which the extended mask will be applied - - Returns: - The extended mask - """ - - batch_size = tf.shape(tensor)[:1] - ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype) - multiples = tf.concat([batch_size, ones], 0) - mask_shape = tf.concat([ones, [-1]], 0) - return tf.tile(tf.reshape(mask, mask_shape), multiples) - - -def find_input_mask(tensor): - """ - Find ancestral mask affecting the number of pruned channels of a tensor. - - Arguments: - tensor: tf.Tensor - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if hasattr(tensor, '_mask'): - return tensor._mask - if tensor.op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D', 'Transpose']: - # op produces a new number of channels, preceding mask therefore irrelevant - return None - if not tensor.op.inputs: - return None - for input in tensor.op.inputs: - mask = find_input_mask(input) - if mask is not None: - return mask - - -def find_output_mask(tensor): - """ - Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. - - Arguments: - tensor: tf.Tensor or tf.Variable - Tensor for which to identify relevant mask - - Returns: - A `tf.Tensor` or `None` - """ - - if isinstance(tensor, tf.Variable): - return find_output_mask(tensor.op.outputs[0]) - if hasattr(tensor, '_mask'): - return tensor._mask - for op in tensor.consumers(): - if len(op.outputs) != 1: - continue - if op.type in ['MatMul', 'Conv1D', 'Conv2D', 'Conv3D']: - # masks of descendants are only relevant if tensor is right-multiplied - if tensor == op.inputs[1]: - return find_output_mask(op.outputs[0]) - return None - mask = find_output_mask(op.outputs[0]) - if mask is not None: - return mask - - -def find_mask(tensor): - """ - Returns masks indicating channels of the tensor that are effectively removed from the graph. 
-
-  Arguments:
-    tensor: tf.Tensor
-      Tensor for which to compute a mask
-
-  Returns:
-    A `tf.Tensor` with binary entries indicating disabled channels
-  """
-
-  input_mask = find_input_mask(tensor)
-  output_mask = find_output_mask(tensor)
-  if input_mask is None:
-    return output_mask
-  if output_mask is None:
-    return input_mask
-  if input_mask is output_mask:
-    return input_mask
-  return input_mask * output_mask
-
-
-def pruned_shape(tensor):
-  """
-  Computes the shape of a tensor after taking into account pruning of channels.
-
-  Note that the shape will only differ in the last dimension, even if other dimensions are also
-  effectively disabled by pruning masks.
-
-  Arguments:
-    tensor: tf.Tensor
-      Tensor for which to compute a pruned shape
-
-  Returns:
-    A `tf.Tensor[tf.float32]` representing the pruned shape
-  """
-
-  mask = find_mask(tensor)
-
-  if mask is None:
-    return tf.cast(tf.shape(tensor), tf.float32)
-
-  return tf.concat([
-    tf.cast(tf.shape(tensor)[:-1], mask.dtype),
-    tf.reduce_sum(mask, keepdims=True)], 0)
-
-
-def computational_cost(op_or_tensor, _observed=None):
-  """
-  Estimates the computational complexity of a pruned graph (number of floating point operations).
-
-  This function currently only supports sequential graphs such as those of MLPs and
-  simple CNNs with 2D convolutions in NHWC format.
-
-  Note that the computational cost returned by this function is proportional to batch size.
-
-  Arguments:
-    op_or_tensor: tf.Tensor or tf.Operation
-      Root node of graph for which to compute computational cost
-
-  Returns:
-    A `tf.Tensor` representing a number of floating point operations
-  """
+MASK_COLLECTION = "pruning/masks"
+MASK_EXTENDED_COLLECTION = "pruning/masks_extended"
+OP_COLLECTION = "pruning/ops"
+

-  cost = tf.constant(0.)
+def apply_mask(tensor: tf.Tensor, name: str = "pruning") -> tf.Tensor:
+    """
+    Point-wise multiplies a tensor with a binary mask.
+    During training, pruning is simulated by setting entries of the mask to zero.
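+
+    Usage sketch (mirrors the pruning-optimizer module docstring; illustrative):
+
+        dense1 = tf.layers.dense(inputs=inputs, units=50, activation=tf.nn.relu)
+        dense1 = apply_mask(dense1)  # masks the 50 output channels of dense1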
- # exclude cost of computing extended pruning masks - masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] - if op_or_tensor in masks_extended: - return cost - - # convert tensor to op - op = op_or_tensor.op if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) else op_or_tensor - - # make sure cost of op will not be counted twice - if _observed is None: - _observed = [] - elif op in _observed: - return cost - _observed.append(op) + Args: + tensor: tf.Tensor + A tensor where the last dimension represents channels which will be masked - # compute cost of computing inputs - for tensor in op.inputs: - cost = cost + computational_cost(tensor, _observed) + Returns: + `tf.Tensor` with same shape as `tensor` + """ + + tensor_shape = tensor.shape + + with tf.variable_scope(name, reuse=True): + # allocate masks and corresponding pruning signals + mask = tf.Variable( + tf.ones(tensor.shape.as_list()[-1]), trainable=False, name="mask" + ) + pruning_signal = tf.Variable( + tf.zeros_like(mask), trainable=False, name="signal" + ) + + # extending masks is a trick to get a separate gradient for each data point + mask_extended = extend_mask(mask, tensor) + + # store extended mask, pruning signal, and other vars for easy access later + mask.extended = mask_extended + mask.pruning_signal = pruning_signal + mask.tensor = tensor + + # mask tensor + tensor = tf.multiply(tensor, mask_extended) + tensor.set_shape(tensor_shape) + tensor._mask = mask + + tf.add_to_collection(MASK_COLLECTION, mask) + tf.add_to_collection(MASK_EXTENDED_COLLECTION, mask.extended) + tf.add_to_collection(OP_COLLECTION, tensor.op) + + return tensor + + +def extend_mask(mask: tf.Tensor, tensor: tf.Tensor) -> tf.Tensor: + """ + Repeats the mask for each data point stored in a tensor. + If `tensor` is AxBxC dimensional and `mask` is C dimensional, returns an Ax1xC dimensional + tensor with A copies or `mask`. + + Args: + mask: tf.Tensor + The mask which will be extended + tensor: tf.Tensor + The tensor to which the extended mask will be applied + + Returns: + The extended mask + """ + + batch_size = tf.shape(tensor)[:1] + ones = tf.ones([tf.rank(tensor) - 1], dtype=batch_size.dtype) + multiples = tf.concat([batch_size, ones], 0) + mask_shape = tf.concat([ones, [-1]], 0) + return tf.tile(tf.reshape(mask, mask_shape), multiples) + + +def find_input_mask(tensor: tf.Tensor) -> Optional[tf.Tensor]: + """ + Find ancestral mask affecting the number of pruned channels of a tensor. + + Args: + tensor: tf.Tensor + Tensor for which to identify relevant mask + + Returns: + A `tf.Tensor` or `None` + """ + + if hasattr(tensor, "_mask"): + return tensor._mask + if tensor.op.type in ["MatMul", "Conv1D", "Conv2D", "Conv3D", "Transpose"]: + # op produces a new number of channels, preceding mask therefore irrelevant + return None + if not tensor.op.inputs: + return None + for input in tensor.op.inputs: + mask = find_input_mask(input) + if mask is not None: + return mask + + +def find_output_mask(tensor: Union[tf.Tensor, tf.Variable]) -> Optional[tf.Tensor]: + """ + Find mask applied to the tensor or one of its descendants if it affects the tensor's pruned shape. 
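+    For example (illustrative): if ``hidden = apply_mask(tf.matmul(x, w))``,
+    then ``find_output_mask(w)`` returns the mask applied to ``hidden``, since
+    ``w`` is right-multiplied and pruned channels of ``hidden`` correspond to
+    pruned columns of ``w``.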
+
+    Args:
+        tensor: tf.Tensor or tf.Variable
+            Tensor for which to identify relevant mask
+
+    Returns:
+        A `tf.Tensor` or `None`
+    """
+
+    if isinstance(tensor, tf.Variable):
+        return find_output_mask(tensor.op.outputs[0])
+
+    if hasattr(tensor, "_mask"):
+        return tensor._mask
+    for op in tensor.consumers():
+        if len(op.outputs) != 1:
+            continue
+        if op.type in ["MatMul", "Conv1D", "Conv2D", "Conv3D"]:
+            # masks of descendants are only relevant if tensor is right-multiplied
+            if tensor == op.inputs[1]:
+                return find_output_mask(op.outputs[0])
+            return None
+        mask = find_output_mask(op.outputs[0])
+        if mask is not None:
+            return mask
+
+
+def find_mask(tensor: tf.Tensor) -> Optional[tf.Tensor]:
+    """
+    Returns masks indicating channels of the tensor that are effectively removed from the graph.
+
+    Args:
+        tensor: tf.Tensor
+            Tensor for which to compute a mask
+
+    Returns:
+        A `tf.Tensor` with binary entries indicating disabled channels, or
+        `None` if no mask affects `tensor`
+    """
+
+    input_mask = find_input_mask(tensor)
+    output_mask = find_output_mask(tensor)
+    if input_mask is None:
+        return output_mask
+    if output_mask is None:
+        return input_mask
+    if input_mask is output_mask:
+        return input_mask
+    return input_mask * output_mask
+
+
+def pruned_shape(tensor: tf.Tensor) -> tf.Tensor:
+    """
+    Computes the shape of a tensor after taking into account pruning of channels.
+
+    Note that the shape will only differ in the last dimension, even if other dimensions are also
+    effectively disabled by pruning masks.
+
+    Args:
+        tensor: tf.Tensor
+            Tensor for which to compute a pruned shape
+
+    Returns:
+        A `tf.Tensor[tf.float32]` representing the pruned shape
+    """
+
+    mask = find_mask(tensor)
+
+    if mask is None:
+        return tf.cast(tf.shape(tensor), tf.float32)
+
+    return tf.concat(
+        [
+            tf.cast(tf.shape(tensor)[:-1], mask.dtype),
+            tf.reduce_sum(mask, keepdims=True),
+        ],
+        0,
+    )
+
+
+def computational_cost(
+    op_or_tensor: Union[tf.Tensor, tf.Operation], _observed: Optional[List] = None
+) -> tf.Tensor:
+    """
+    Estimates the computational complexity of a pruned graph (number of floating point operations).
+
+    This function currently only supports sequential graphs such as those of MLPs and
+    simple CNNs with 2D convolutions in NHWC format.
+
+    Note that the computational cost returned by this function is proportional to batch size.
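+
+    Example (illustrative): a dense layer multiplying a [64, 128] batch by a
+    [128, 10] weight matrix is counted as 64 * 10 * (2 * 128 - 1) = 163,200
+    FLOPs; channels zeroed by masks are excluded via `pruned_shape`.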
+ + Args: + op_or_tensor: tf.Tensor or tf.Operation + Root node of graph for which to compute computational cost + _observed: Set + Internal parameter used to avoid counting the same operation twice + + Returns: + A `tf.Tensor` representing a number of floating point operations + """ + + cost = tf.constant(0.0) + + # exclude cost of computing extended pruning masks + masks_extended = [mask.extended for mask in tf.get_collection(MASK_COLLECTION)] + if op_or_tensor in masks_extended: + return cost + + # convert tensor to op + op = ( + op_or_tensor.op + if isinstance(op_or_tensor, (tf.Tensor, tf.Variable)) + else op_or_tensor + ) + + # make sure cost of op will not be counted twice + if _observed is None: + _observed = [] + elif op in _observed: + return cost + _observed.append(op) + + # compute cost of computing inputs + for tensor in op.inputs: + cost = cost + computational_cost(tensor, _observed) + + # add cost of operation + if op.op_def is None or op in tf.get_collection(OP_COLLECTION): + # exclude cost of undefined ops and pruning ops + return cost + + elif op.op_def.name == "MatMul": + shape_a = pruned_shape(op.inputs[0]) + shape_b = pruned_shape(op.inputs[1]) + return cost + shape_a[0] * shape_b[1] * (2.0 * shape_a[1] - 1.0) + + elif op.op_def.name in ["Add", "Mul", "BiasAdd"]: + return cost + tf.cond( + tf.size(op.inputs[0]) > tf.size(op.inputs[1]), + lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), + lambda: tf.reduce_prod(pruned_shape(op.inputs[1])), + ) + + elif op.op_def.name in ["Conv2D"]: + output_shape = pruned_shape(op.outputs[0]) + input_shape = pruned_shape(op.inputs[0]) + kernel_shape = pruned_shape(op.inputs[1]) + inner_prod_cost = tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2.0 - 1.0 + return cost + tf.reduce_prod(output_shape) * inner_prod_cost - # add cost of operation - if op.op_def is None or op in tf.get_collection(OP_COLLECTION): - # exclude cost of undefined ops and pruning ops return cost - elif op.op_def.name == 'MatMul': - shape_a = pruned_shape(op.inputs[0]) - shape_b = pruned_shape(op.inputs[1]) - return cost + shape_a[0] * shape_b[1] * (2. * shape_a[1] - 1.) - - elif op.op_def.name in ['Add', 'Mul', 'BiasAdd']: - return cost + tf.cond( - tf.size(op.inputs[0]) > tf.size(op.inputs[1]), - lambda: tf.reduce_prod(pruned_shape(op.inputs[0])), - lambda: tf.reduce_prod(pruned_shape(op.inputs[1]))) - - elif op.op_def.name in ['Conv2D']: - output_shape = pruned_shape(op.outputs[0]) - input_shape = pruned_shape(op.inputs[0]) - kernel_shape = pruned_shape(op.inputs[1]) - inner_prod_cost = (tf.reduce_prod(kernel_shape[:2]) * input_shape[-1] * 2. - 1.) - return cost + tf.reduce_prod(output_shape) * inner_prod_cost - - return cost - - -def update_pruning_signals(loss, decay=.96, masks=None, method='Fisher'): - """ - For each mask, computes corresponding pruning signals indicating the importance of a feature. 
-
-  Arguments:
-    loss: tf.Tensor
-      Any cross-entropy loss
-
-    decay: float
-      Controls exponential moving average of pruning signals
-
-    method: str
-      Method used to compute pruning signal (currently only supports 'Fisher')
-
-  Returns:
-    A `list[tf.Tensor]` of pruning signals corresponding to masks
-
-  References:
-    * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018
-  """
-
-  if masks is None:
-    masks = tf.get_collection(MASK_COLLECTION)
-
-  if method not in ['Fisher']:
-    raise ValueError('Pruning method \'{0}\' not supported.'.format(method))
-
-  if not masks:
-    return []
-
-  with tf.variable_scope('pruning_opt', reuse=True):
-    # compute gradients of extended masks (yields separate gradient for each data point)
-    grads = tf.gradients(loss, [m.extended for m in masks])
-
-    # estimate Fisher pruning signals from batch
-    signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads]
-
-    # update pruning signals
-    signals = [m.pruning_signal for m in masks]
-    signals = [tf.assign(s, decay * s + (1. - decay) * f, use_locking=True)
-               for s, f in zip(signals, signals_batch)]
-
-    return signals
-
-
-def prune(signals, masks=None):
-  """
-  Prunes a single feature by zeroing the mask entry with the smallest pruning signal.
-
-  Arguments:
-    signals: list[tf.Tensor]
-      A list of pruning signals
-
-    masks: list[tf.Tensor]
-      A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)`
-
-  Returns:
-    A `tf.Operation` which updates masks
-  """
-
-  if masks is None:
-    masks = tf.get_collection(MASK_COLLECTION)
-
-  with tf.variable_scope('pruning_opt', reuse=True):
-    # make sure we don't select already pruned units
-    signals = [tf.where(m > .5, s, tf.zeros_like(s) + np.inf) for m, s in zip(masks, signals)]
-
-    # find units with smallest pruning signal in each layer
-    min_idx = [tf.argmin(s) for s in signals]
-    min_signals = [s[i] for s, i in zip(signals, min_idx)]
-
-    # find layer with smallest pruning signal
-    l = tf.argmin(min_signals)
-
-    # construct pruning operations, one for each mask
-    updates = []
-    for k, i in enumerate(min_idx):
-      # set mask of layer l to 0 where pruning signal is smallest
-      updates.append(
-        tf.cond(
-          tf.equal(l, k),
-          lambda: tf.scatter_update(
-            masks[k], tf.Print(i, [i], message="Pruning layer [{0}] at index ".format(k)), 0.),
-          lambda: masks[k]))
-
-    updates = tf.group(updates, name='prune')
-    return updates
+def update_pruning_signals(
+    loss: tf.Tensor,
+    decay: float = 0.96,
+    masks: Optional[List[tf.Tensor]] = None,
+    method: str = "Fisher",
+) -> List[tf.Tensor]:
+    """
+    For each mask, computes corresponding pruning signals indicating the importance of a feature.
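+
+    Usage sketch (taken from the module docstring above; illustrative):
+
+        loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
+        signals = update_pruning_signals(loss)  # one signal tensor per mask
+        prune_op = prune(signals)               # zeroes one mask entry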
+ + Args: + loss: tf.Tensor + Any cross-entropy loss + decay: float + Controls exponential moving average of pruning signals + method: str + Method used to compute pruning signal (currently only supports 'Fisher') + + Returns: + A `list[tf.Tensor]` of pruning signals corresponding to masks + + References: + * Theis et al., Faster gaze prediction with dense networks and Fisher pruning, 2018 + """ + + if masks is None: + masks = tf.get_collection(MASK_COLLECTION) + + allowed_methods = ["Fisher"] + if method not in allowed_methods: + raise ValueError(f"Pruning method '{method}' not supported.") + + if not masks: + return [] + + with tf.variable_scope("pruning_opt", reuse=True): + # compute gradients of extended masks (yields separate gradient for each data point) + grads = tf.gradients(loss, [m.extended for m in masks]) + + # estimate Fisher pruning signals from batch + signals_batch = [tf.squeeze(tf.reduce_mean(tf.square(g), 0)) for g in grads] + + # update pruning signals + signals = [m.pruning_signal for m in masks] + signals = [ + tf.assign(s, decay * s + (1.0 - decay) * f, use_locking=True) + for s, f in zip(signals, signals_batch) + ] + + return signals + + +def prune( + signals: List[tf.Tensor], masks: Optional[List[tf.Tensor]] = None +) -> tf.Operation: + """ + Prunes a single feature by zeroing the mask entry with the smallest pruning signal. + + Args: + signals: list[tf.Tensor] + A list of pruning signals + masks: list[tf.Tensor] + A list of corresponding masks, defaults to `tf.get_collection(MASK_COLLECTION)` + + Returns: + A `tf.Operation` which updates masks + """ + + if masks is None: + masks = tf.get_collection(MASK_COLLECTION) + + with tf.variable_scope("pruning_opt", reuse=True): + # make sure we don't select already pruned units + signals = [ + tf.where(m > 0.5, s, tf.zeros_like(s) + np.inf) + for m, s in zip(masks, signals) + ] + + # find units with smallest pruning signal in each layer + min_idx = [tf.argmin(s) for s in signals] + min_signals = [s[i] for s, i in zip(signals, min_idx)] + + # find layer with smallest pruning signal + l = tf.argmin(min_signals) + + # construct pruning operations, one for each mask + updates = [] + for index, id in enumerate(min_idx): + # set mask of layer l to 0 where pruning signal is smallest + updates.append( + tf.cond( + tf.equal(l, index), + lambda: tf.scatter_update( + masks[index], + tf.Print( + id, [id], message=f"Pruning layer [{index}] at index " + ), + 0.0, + ), + lambda: masks[index], + ) + ) + + updates = tf.group(updates, name="prune") + + return updates diff --git a/twml/twml/contrib/readers/batch_prediction_request.py b/twml/twml/contrib/readers/batch_prediction_request.py index 4408b33b4..3341cc851 100644 --- a/twml/twml/contrib/readers/batch_prediction_request.py +++ b/twml/twml/contrib/readers/batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.contrib.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) + BatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/contrib/readers/data_record.py b/twml/twml/contrib/readers/data_record.py index ae8cc0b68..84ca74f2e 100644 --- a/twml/twml/contrib/readers/data_record.py +++ b/twml/twml/contrib/readers/data_record.py @@ -4,7 +4,7 @@ The result of this subclass methods are dictionaries of Tensors and SparseTensors """ -from twitter.deepbird.io.legacy.contrib.readers.data_record import ( - SUPPORTED_DENSE_FEATURE_TYPES, # noqa: F401 - DataRecord, # noqa: F401 +from twitter.deepbird.io.legacy.contrib.readers.data_record 
import ( # noqa: F401 + SUPPORTED_DENSE_FEATURE_TYPES, + DataRecord, ) diff --git a/twml/twml/contrib/readers/hashed_batch_prediction_request.py b/twml/twml/contrib/readers/hashed_batch_prediction_request.py index 3454f8483..d97c47a2f 100644 --- a/twml/twml/contrib/readers/hashed_batch_prediction_request.py +++ b/twml/twml/contrib/readers/hashed_batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.contrib.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) + HashedBatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/contrib/trainers/__init__.py b/twml/twml/contrib/trainers/__init__.py index 3226cd805..cc9508628 100644 --- a/twml/twml/contrib/trainers/__init__.py +++ b/twml/twml/contrib/trainers/__init__.py @@ -1,5 +1,7 @@ # pylint: disable=wildcard-import """This module contains experimental trainer classes""" -from .batch_prediction_request_trainer import BatchPredictionRequestTrainer # noqa: F401 +from .batch_prediction_request_trainer import ( + BatchPredictionRequestTrainer, +) # noqa: F401 from .pruning_data_record_trainer import PruningDataRecordTrainer # noqa: F401 -from .trainer_utils import build_keras_trainer # noqa: F401 +from .trainer_utils import build_keras_trainer # noqa: F401 diff --git a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py index 2effa87ed..09d61aaf3 100644 --- a/twml/twml/contrib/trainers/batch_prediction_request_trainer.py +++ b/twml/twml/contrib/trainers/batch_prediction_request_trainer.py @@ -2,179 +2,212 @@ """ This file contains the DataRecordTrainer class. """ +import argparse import warnings +from typing import Callable, Optional import twml from twml.trainers import DataRecordTrainer -class BatchPredictionRequestTrainer(DataRecordTrainer): # pylint: disable=abstract-method - """ - The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases - that input is BatchPredictionRequest at Twitter and also where only the build_graph methods - needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): +class BatchPredictionRequestTrainer( + DataRecordTrainer +): # pylint: disable=abstract-method """ - The BatchPredictionRequestTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer and twml.DataRecordTrainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. 
But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. + The ``BatchPredictionRequestTrainer`` implementation is intended to satisfy use cases + that input is BatchPredictionRequest at Twitter and also where only the build_graph methods + needs to be overridden. For this reason, ``Trainer.[train,eval]_input_fn`` methods + assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. + + For use-cases that differ from this common Twitter use-case, + further Trainer methods can be overridden. + If that still doesn't provide enough flexibility, the user can always + use the tf.estimator.Esimator or tf.session.run directly. """ - # Check and update train_batch_size and eval_batch_size in params before initialization - # to print correct parameter logs and does not stop running - # This overwrites batch_size parameter constrains in twml.trainers.Trainer.check_params - updated_params = self.check_batch_size_params(params) - super(BatchPredictionRequestTrainer, self).__init__( - name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs) - - def check_batch_size_params(self, params): - """ Verify that params has the correct key,values """ - # updated_params is an instance of tensorflow.contrib.training.HParams - updated_params = twml.util.convert_to_hparams(params) - param_values = updated_params.values() - - # twml.trainers.Trainer.check_params already checks other constraints, - # such as being an integer - if 'train_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if param_values['train_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that train_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'train_batch_size is always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 --eval.batch_size 1') - # If the upper error warning, change/pass --train.batch_size 1 - # so that train_batch_size = 1 - updated_params.train_batch_size = 1 - - if 'eval_batch_size' in param_values: - if not isinstance(updated_params.train_batch_size, int): - raise ValueError('Expecting params.eval_batch_size to be an integer.') - if param_values['eval_batch_size'] != 1: - # This can be a bit annoying to force users to pass the batch sizes, - # but it is good to let them know what they actually use in the models - # Use warning instead of ValueError in there to continue the run - # and print out that eval_batch_size is changed - warnings.warn('You are processing BatchPredictionRequest data, ' - 'eval_batch_size is also always 1.\n' - 'The number of DataRecords in a batch is determined by the size ' - 'of each BatchPredictionRequest.\n' - 'If you did not pass train.batch_size or eval.batch_size, and ' - 'the default batch_size 32 was in use,\n' - 'please pass --train.batch_size 1 
--eval.batch_size 1') - # If the upper warning raises, change/pass --eval.batch_size 1 - # so that eval_batch_size = 1 - updated_params.eval_batch_size = 1 - - if 'eval_batch_size' not in param_values: - updated_params.eval_batch_size = 1 - - if not updated_params.eval_batch_size: - updated_params.eval_batch_size = 1 - - return updated_params - - @staticmethod - def add_batch_prediction_request_arguments(): - """ - Add commandline args to parse typically for the BatchPredictionRequestTrainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ - for a list and description of all cmd-line arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. - """ - parser = super(BatchPredictionRequestTrainer, - BatchPredictionRequestTrainer).add_parser_arguments() - - # mlp arguments - parser.add_argument( - '--model.use_existing_discretizer', action='store_true', - dest="model_use_existing_discretizer", - help='Load a pre-trained calibration or train a new one') - parser.add_argument( - '--model.use_binary_values', action='store_true', - dest='model_use_binary_values', - help='Use the use_binary_values optimization') - - # control hom many featues we keep in sparse tensors - # 12 is enough for learning-to-rank for now - parser.add_argument( - '--input_size_bits', type=int, default=12, - help='Number of bits allocated to the input size') - - parser.add_argument( - '--loss_function', type=str, default='ranknet', - dest='loss_function', - help='Options are pairwise: ranknet (default), lambdarank, ' - 'listnet, listmle, attrank, ' - 'pointwise') - - # whether convert sparse tensors to dense tensor - # in order to use dense normalization methods - parser.add_argument( - '--use_dense_tensor', action='store_true', - dest='use_dense_tensor', - default=False, - help='If use_dense_tensor is False, ' - 'sparse tensor and spare normalization are in use. ' - 'If use_dense_tensor is True, ' - 'dense tensor and dense normalization are in use.') - - parser.add_argument( - '--dense_normalization', type=str, default='mean_max_normalizaiton', - dest='dense_normalization', - help='Options are mean_max_normalizaiton (default), standard_normalizaiton') - - parser.add_argument( - '--sparse_normalization', type=str, default='SparseMaxNorm', - dest='sparse_normalization', - help='Options are SparseMaxNorm (default), SparseBatchNorm') - - # so far only used in pairwise learning-to-rank - parser.add_argument( - '--mask', type=str, default='full_mask', - dest='mask', - help='Options are full_mask (default), diag_mask') - - return parser + def __init__( + self, + name: str, + params: dict, + build_graph_fn: Callable, + feature_config: Optional[dict] = None, + **kwargs, + ): + """ + The BatchPredictionRequestTrainer constructor builds a + ``tf.estimator.Estimator`` and stores it in self.estimator. + For this reason, BatchPredictionRequestTrainer accepts the same Estimator constructor arguments. + It also accepts additional arguments to facilitate metric evaluation and multi-phase training + (init_from_dir, init_map). + + Args: + parent Args: + See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation + for a full list of arguments accepted by the parent class. 
+ name, params, build_graph_fn (and other parent class args): + see documentation for twml.Trainer and twml.DataRecordTrainer doc. + feature_config: + An object of type FeatureConfig describing what features to decode. + Defaults to None. But it is needed in the following cases: + - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` + - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. + **kwargs: + further kwargs can be specified and passed to the Estimator constructor. + """ + + # Check and update train_batch_size and eval_batch_size in params before initialization + # to print correct parameter logs and does not stop running + # This overwrites batch_size parameter constrains in twml.trainers.Trainer.check_params + updated_params = self.check_batch_size_params(params) + super(BatchPredictionRequestTrainer, self).__init__( + name=name, params=updated_params, build_graph_fn=build_graph_fn, **kwargs + ) + + def check_batch_size_params(self, params: dict): + """Verify that params has the correct key,values""" + # updated_params is an instance of tensorflow.contrib.training.HParams + updated_params = twml.util.convert_to_hparams(params) + param_values = updated_params.values() + + # twml.trainers.Trainer.check_params already checks other constraints, + # such as being an integer + if "train_batch_size" in param_values: + if not isinstance(updated_params.train_batch_size, int): + raise ValueError("Expecting params.train_batch_size to be an integer.") + if param_values["train_batch_size"] != 1: + # This can be a bit annoying to force users to pass the batch sizes, + # but it is good to let them know what they actually use in the models + # Use warning instead of ValueError in there to continue the run + # and print out that train_batch_size is changed + warnings.warn( + "You are processing BatchPredictionRequest data, " + "train_batch_size is always 1.\n" + "The number of DataRecords in a batch is determined by the size " + "of each BatchPredictionRequest.\n" + "If you did not pass train.batch_size or eval.batch_size, and " + "the default batch_size 32 was in use,\n" + "please pass --train.batch_size 1 --eval.batch_size 1" + ) + # If the upper error warning, change/pass --train.batch_size 1 + # so that train_batch_size = 1 + updated_params.train_batch_size = 1 + + if "eval_batch_size" in param_values: + if not isinstance(updated_params.train_batch_size, int): + raise ValueError("Expecting params.eval_batch_size to be an integer.") + if param_values["eval_batch_size"] != 1: + # This can be a bit annoying to force users to pass the batch sizes, + # but it is good to let them know what they actually use in the models + # Use warning instead of ValueError in there to continue the run + # and print out that eval_batch_size is changed + warnings.warn( + "You are processing BatchPredictionRequest data, " + "eval_batch_size is also always 1.\n" + "The number of DataRecords in a batch is determined by the size " + "of each BatchPredictionRequest.\n" + "If you did not pass train.batch_size or eval.batch_size, and " + "the default batch_size 32 was in use,\n" + "please pass --train.batch_size 1 --eval.batch_size 1" + ) + # If the upper warning raises, change/pass --eval.batch_size 1 + # so that eval_batch_size = 1 + updated_params.eval_batch_size = 1 + + if "eval_batch_size" not in param_values: + updated_params.eval_batch_size = 1 + + if not updated_params.eval_batch_size: + updated_params.eval_batch_size = 1 + + return updated_params + + 
@staticmethod + def add_batch_prediction_request_arguments() -> argparse.ArgumentParser: + """ + Add commandline args to parse typically for the BatchPredictionRequestTrainer class. + Typically, the user calls this function and then parses cmd-line arguments + into an argparse.Namespace object which is then passed to the Trainer constructor + via the params argument. + + See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_ + for a list and description of all cmd-line arguments. + + Returns: + argparse.ArgumentParser instance with some useful args already added. + """ + parser = super( + BatchPredictionRequestTrainer, BatchPredictionRequestTrainer + ).add_parser_arguments() + + # mlp arguments + parser.add_argument( + "--model.use_existing_discretizer", + action="store_true", + dest="model_use_existing_discretizer", + help="Load a pre-trained calibration or train a new one", + ) + parser.add_argument( + "--model.use_binary_values", + action="store_true", + dest="model_use_binary_values", + help="Use the use_binary_values optimization", + ) + + # control hom many featues we keep in sparse tensors + # 12 is enough for learning-to-rank for now + parser.add_argument( + "--input_size_bits", + type=int, + default=12, + help="Number of bits allocated to the input size", + ) + + parser.add_argument( + "--loss_function", + type=str, + default="ranknet", + dest="loss_function", + help="Options are pairwise: ranknet (default), lambdarank, " + "listnet, listmle, attrank, " + "pointwise", + ) + + # whether convert sparse tensors to dense tensor + # in order to use dense normalization methods + parser.add_argument( + "--use_dense_tensor", + action="store_true", + dest="use_dense_tensor", + default=False, + help="If use_dense_tensor is False, " + "sparse tensor and spare normalization are in use. 
" + "If use_dense_tensor is True, " + "dense tensor and dense normalization are in use.", + ) + + parser.add_argument( + "--dense_normalization", + type=str, + default="mean_max_normalizaiton", + dest="dense_normalization", + help="Options are mean_max_normalizaiton (default), standard_normalizaiton", + ) + + parser.add_argument( + "--sparse_normalization", + type=str, + default="SparseMaxNorm", + dest="sparse_normalization", + help="Options are SparseMaxNorm (default), SparseBatchNorm", + ) + + # so far only used in pairwise learning-to-rank + parser.add_argument( + "--mask", + type=str, + default="full_mask", + dest="mask", + help="Options are full_mask (default), diag_mask", + ) + + return parser diff --git a/twml/twml/contrib/trainers/pruning_data_record_trainer.py b/twml/twml/contrib/trainers/pruning_data_record_trainer.py index 4796e5390..3a3fead02 100644 --- a/twml/twml/contrib/trainers/pruning_data_record_trainer.py +++ b/twml/twml/contrib/trainers/pruning_data_record_trainer.py @@ -1,59 +1,87 @@ +import argparse +from typing import Optional + import tensorflow.compat.v1 as tf -from twml.trainers import DataRecordTrainer from twml.contrib.optimizers import PruningOptimizer +from twml.trainers import DataRecordTrainer class PruningDataRecordTrainer(DataRecordTrainer): - @staticmethod - def get_train_op(params, loss): - train_op = DataRecordTrainer.get_train_op(params, loss) - - optimizer = PruningOptimizer(learning_rate=params.get('learning_rate')) - - return optimizer.minimize( - loss=loss, - prune_every=params.get('pruning_iter', 5000), - burn_in=params.get('pruning_burn_in', 100000), - decay=params.get('pruning_decay', .9999), - flops_target=params.get('pruning_flops_target', 250000), - update_params=train_op, - global_step=tf.train.get_global_step()) - - def __init__(self, name, params, build_graph_fn, feature_config=None, **kwargs): - kwargs['optimize_loss_fn'] = self.get_train_op - - super(PruningDataRecordTrainer, self).__init__( - name=name, - params=params, - build_graph_fn=build_graph_fn, - feature_config=feature_config, - **kwargs) - - def export_model(self, *args, **kwargs): - # TODO: modify graph before exporting to take into account masks - return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs) - - @staticmethod - def add_parser_arguments(): - parser = DataRecordTrainer.add_parser_arguments() - parser.add_argument( - "--pruning.iter", "--pruning_iter", type=int, default=5000, - dest="pruning_iter", - help="A single feature or feature map is pruned every this many iterations") - parser.add_argument( - "--pruning.burn_in", "--pruning_burn_in", type=int, default=100000, - dest="pruning_burn_in", - help="Only start pruning after collecting statistics for this many training steps") - parser.add_argument( - "--pruning.flops_target", "--pruning_flops_target", type=int, default=250000, - dest="pruning_flops_target", - help="Stop pruning when estimated number of floating point operations reached this target. \ - For example, a small feed-forward network might require 250,000 FLOPs to run.") - parser.add_argument( - "--pruning.decay", "--pruning_decay", type=float, default=.9999, - dest="pruning_decay", - help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \ - signal statistics. 
A value of 0.9999 can be thought of as averaging statistics over 10,000 \ - steps.") - return parser + @staticmethod + def get_train_op(params: dict, loss: tf.Tensor) -> tf.Operation: + train_op = DataRecordTrainer.get_train_op(params, loss) + + optimizer = PruningOptimizer(learning_rate=params.get("learning_rate")) + + return optimizer.minimize( + loss=loss, + prune_every=params.get("pruning_iter", 5000), + burn_in=params.get("pruning_burn_in", 100000), + decay=params.get("pruning_decay", 0.9999), + flops_target=params.get("pruning_flops_target", 250000), + update_params=train_op, + global_step=tf.train.get_global_step(), + ) + + def __init__( + self, + name: str, + params: dict, + build_graph_fn: callable, + feature_config: Optional[dict] = None, + **kwargs, + ): + kwargs["optimize_loss_fn"] = self.get_train_op + + super(PruningDataRecordTrainer, self).__init__( + name=name, + params=params, + build_graph_fn=build_graph_fn, + feature_config=feature_config, + **kwargs, + ) + + def export_model(self, *args, **kwargs) -> str: + # TODO: modify graph before exporting to take into account masks + return super(PruningDataRecordTrainer, self).export_model(*args, **kwargs) + + @staticmethod + def add_parser_arguments() -> argparse.ArgumentParser: + parser = DataRecordTrainer.add_parser_arguments() + parser.add_argument( + "--pruning.iter", + "--pruning_iter", + type=int, + default=5000, + dest="pruning_iter", + help="A single feature or feature map is pruned every this many iterations", + ) + parser.add_argument( + "--pruning.burn_in", + "--pruning_burn_in", + type=int, + default=100000, + dest="pruning_burn_in", + help="Only start pruning after collecting statistics for this many training steps", + ) + parser.add_argument( + "--pruning.flops_target", + "--pruning_flops_target", + type=int, + default=250000, + dest="pruning_flops_target", + help="Stop pruning when estimated number of floating point operations reached this target. \ + For example, a small feed-forward network might require 250,000 FLOPs to run.", + ) + parser.add_argument( + "--pruning.decay", + "--pruning_decay", + type=float, + default=0.9999, + dest="pruning_decay", + help="A float value in [0.0, 1.0) controlling an exponential moving average of pruning \ + signal statistics. A value of 0.9999 can be thought of as averaging statistics over 10,000 \ + steps.", + ) + return parser diff --git a/twml/twml/contrib/trainers/trainer_utils.py b/twml/twml/contrib/trainers/trainer_utils.py index f279571be..cb6ec1be6 100644 --- a/twml/twml/contrib/trainers/trainer_utils.py +++ b/twml/twml/contrib/trainers/trainer_utils.py @@ -4,14 +4,14 @@ As of now (Q4 2019), Keras model training using `model.fit()` has various issues, making it unfit for production training: - 1. `model.fit()` is slow in TF 1.14. This will be fixed with future TensorFlow updates. - 2. `model.fit()` crashes during model saving or in eager mode when the input has SparseTensor. - 3. Models saved using TF 2.0 API cannot be served by TensorFlow's Java API. + 1. `model.fit()` is slow in TF 1.14. This will be fixed with future TensorFlow updates. + 2. `model.fit()` crashes during model saving or in eager mode when the input has SparseTensor. + 3. Models saved using TF 2.0 API cannot be served by TensorFlow's Java API. Until MLCE team resolves the above issues, MLCE team recommends the following: - - Please feel free to use Keras models for experimentation and exploration. 
- - Please stick to twml Trainer for production training & exporting, - especially if you want to serve your model using Twitter's prediction servers. + - Please feel free to use Keras models for experimentation and exploration. + - Please stick to twml Trainer for production training & exporting, + especially if you want to serve your model using Twitter's prediction servers. This module provide tooling for easily training keras models using twml Trainer. @@ -22,90 +22,97 @@ This input function can be created from the tf.data.Dataset you used with your Keras model. .. note: this util handles the most common case. If you have cases not satisfied by this util, - consider writing your own build_graph to wrap your keras models. + consider writing your own build_graph to wrap your keras models. """ -from twitter.deepbird.hparam import HParams +from typing import Callable import tensorflow # noqa: F401 import tensorflow.compat.v2 as tf +from twitter.deepbird.hparam import HParams import twml def build_keras_trainer( - name, - model_factory, - save_dir, - loss_fn=None, - metrics_fn=None, - **kwargs): - """ - Compile the given model_factory into a twml Trainer. - - Args: - name: a string name for the returned twml Trainer. - - model_factory: a callable that returns a keras model when called. - This keras model is expected to solve a binary classification problem. - This keras model takes a dict of tensors as input, and outputs a logit or probability. - - save_dir: a directory where the trainer saves data. Can be an HDFS path. - - loss_fn: the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy. - - metrics_fn: metrics function used by TensorFlow estimators. - Defaults to twml.metrics.get_binary_class_metric_fn(). - - **kwargs: for people familiar with twml Trainer's options, they can be passed in here - as kwargs, and they will be forwarded to Trainer as opts. - See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args. - - Returns: - a twml.trainers.Trainer object which can be used for training and exporting models. - """ - build_graph = create_build_graph_fn(model_factory, loss_fn) - - if metrics_fn is None: - metrics_fn = twml.metrics.get_binary_class_metric_fn() - - opts = HParams(**kwargs) - opts.add_hparam('save_dir', save_dir) - - return twml.trainers.Trainer( - name, - opts, - build_graph_fn=build_graph, - save_dir=save_dir, - metric_fn=metrics_fn) - - -def create_build_graph_fn(model_factory, loss_fn=None): - """Create a build graph function from the given keras model.""" - - def build_graph(features, label, mode, params, config=None): - # create model from model factory. - model = model_factory() - - # create loss function if the user didn't specify one. 
-    if loss_fn is None:
-      build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
-    else:
-      build_graph_loss_fn = loss_fn
-
-    output = model(features)
-    if mode == 'infer':
-      loss = None
-    else:
-      weights = features.get('weights', None)
-      loss = build_graph_loss_fn(y_true=label, y_pred=output, sample_weight=weights)
-
-    if isinstance(output, dict):
-      if loss is None:
-        return output
-      else:
-        output['loss'] = loss
-        return output
-    else:
-      return {'output': output, 'loss': loss}
-
-  return build_graph
+    name: str,
+    model_factory: Callable[[], tf.keras.Model],
+    save_dir: str,
+    loss_fn: tf.keras.losses.Loss = None,
+    metrics_fn: Callable = twml.metrics.get_binary_class_metric_fn(),
+    **kwargs,
+) -> twml.trainers.Trainer:
+    """
+    Compile the given model_factory into a twml Trainer.
+
+    Args:
+        name:
+            a string name for the returned twml Trainer.
+        model_factory:
+            a callable that returns a keras model when called.
+            This keras model is expected to solve a binary classification problem.
+            This keras model takes a dict of tensors as input, and outputs a logit or probability.
+        save_dir:
+            a directory where the trainer saves data. Can be an HDFS path.
+        loss_fn:
+            the loss function to use. Defaults to tf.keras.losses.BinaryCrossentropy.
+        metrics_fn:
+            metrics function used by TensorFlow estimators.
+            Defaults to twml.metrics.get_binary_class_metric_fn().
+        **kwargs:
+            for people familiar with twml Trainer's options, they can be passed in here
+            as kwargs, and they will be forwarded to Trainer as opts.
+            See https://cgit.twitter.biz/source/tree/twml/twml/argument_parser.py#n43 for available args.
+
+    Returns:
+        a twml.trainers.Trainer object which can be used for training and exporting models.
+    """
+    build_graph = create_build_graph_fn(model_factory, loss_fn)
+
+    opts = HParams(**kwargs)
+    opts.add_hparam("save_dir", save_dir)
+
+    return twml.trainers.Trainer(
+        name,
+        opts,
+        build_graph_fn=build_graph,
+        save_dir=save_dir,
+        metric_fn=metrics_fn,
+    )
+
+
+def create_build_graph_fn(model_factory: Callable[[], tf.keras.Model], loss_fn=None):
+    """Create a build graph function from the given keras model."""
+
+    def build_graph(
+        features: dict,
+        label: tf.Tensor,
+        mode: str,
+        params: HParams,
+        config: dict,
+    ) -> dict:  # pylint: disable=unused-argument
+        # create model from model factory.
+        model = model_factory()
+
+        # create loss function if the user didn't specify one.
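+        # BinaryCrossentropy(from_logits=False) expects the model to output
+        # probabilities in [0, 1]; if the model emits raw logits, pass a
+        # loss_fn constructed with from_logits=True instead.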
+ if loss_fn is None: + build_graph_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False) + else: + build_graph_loss_fn = loss_fn + + output = model(features) + if mode == "infer": + loss = None + else: + weights = features.get("weights", None) + loss = build_graph_loss_fn( + y_true=label, y_pred=output, sample_weight=weights + ) + + if isinstance(output, dict): + if loss is None: + return output + output["loss"] = loss + return output + return {"output": output, "loss": loss} + + return build_graph diff --git a/twml/twml/contrib/utils/__init__.py b/twml/twml/contrib/utils/__init__.py index 56a083048..4b7dbdf87 100644 --- a/twml/twml/contrib/utils/__init__.py +++ b/twml/twml/contrib/utils/__init__.py @@ -1,18 +1,22 @@ # pylint: disable=wildcard-import """This module contains experimental util functions for contrib.""" -from .math_fns import safe_div, safe_log, cal_ndcg, cal_swapped_ndcg # noqa: F401 -from .masks import diag_mask, full_mask # noqa: F401 -from .normalizer import mean_max_normalizaiton, standard_normalizaiton # noqa: F401 -from .scores import get_pairwise_scores, get_pairwise_label_scores # noqa: F401 -# pointwise functions -from .loss_fns import get_pointwise_loss # noqa: F401 -# ranknet functions -from .loss_fns import get_pair_loss # noqa: F401 -# listwise functions -from .loss_fns import get_attrank_loss, get_listnet_loss, get_listmle_loss # noqa: F401 +from . import interp # noqa: F401 +from .device import get_gpu_list # noqa: F401 +from .device import get_device_map, get_gpu_count, is_gpu_available + # lambdarank functions +# listwise functions +# ranknet functions +# pointwise functions from .loss_fns import get_lambda_pair_loss # noqa: F401 -from .device import get_device_map, get_gpu_list, get_gpu_count, is_gpu_available # noqa: F401 +from .loss_fns import get_pair_loss # noqa: F401 +from .loss_fns import get_pointwise_loss # noqa: F401 +from .loss_fns import get_attrank_loss, get_listmle_loss, get_listnet_loss # noqa: F401 +from .masks import diag_mask, full_mask # noqa: F401 +from .math_fns import cal_ndcg, cal_swapped_ndcg, safe_div, safe_log # noqa: F401 +from .normalizer import mean_max_normalizaiton # noqa: F401 +from .normalizer import standard_normalizaiton +from .scores import get_pairwise_label_scores # noqa: F401 +from .scores import get_pairwise_scores from .similarities import cosine_similarity # noqa: F401 -from . import interp # noqa: F401 diff --git a/twml/twml/contrib/utils/datasets.py b/twml/twml/contrib/utils/datasets.py index d31ea3ae4..d6e203811 100644 --- a/twml/twml/contrib/utils/datasets.py +++ b/twml/twml/contrib/utils/datasets.py @@ -1,4 +1,6 @@ import random +from datetime import datetime +from typing import List, Optional, Tuple import twml @@ -6,88 +8,95 @@ def resolve_train_and_eval_files_overlap( - train_files, eval_files, fraction_kept_for_eval, seed=None -): - """Resolve any overlap between train and eval files. + train_files: List[str], + eval_files: List[str], + fraction_kept_for_eval: float, + seed: Optional[int] = None, +) -> Tuple[List[str], List[str]]: + """Resolve any overlap between train and eval files. - Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of - the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the - `eval_files`. + Specifically, if there's an overlap between `train_files` and `eval_files`, then a fraction of + the overlap (i.e. `fraction_kept_for_eval`) will be randomly assigned (exclusively) to the + `eval_files`. 
- The following example demonstrates its usage: + The following example demonstrates its usage: - >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] - >>> orig_eval_files = ['f1', 'f2', 'f3'] - >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( - ... orig_train_files, orig_eval_files, 0.5 - ... ) - >>> set(resolved_train_files) & set(resolved_eval_files) == set() - True - >>> len(resolved_train_files) == 3 - True - >>> len(resolved_eval_files) == 2 - True + >>> orig_train_files = ['f1', 'f2', 'f3', 'f4'] + >>> orig_eval_files = ['f1', 'f2', 'f3'] + >>> resolved_train_files, resolved_eval_files = resolve_train_and_eval_files_overlap( + ... orig_train_files, orig_eval_files, 0.5 + ... ) + >>> set(resolved_train_files) & set(resolved_eval_files) == set() + True + >>> len(resolved_train_files) == 3 + True + >>> len(resolved_eval_files) == 2 + True - Args: - train_files: A list of the files used for training. - eval_files: A list of the files used for validation. - fraction_kept_for_eval: A fraction of files in the intersection between `train_files` and - `eval_files` exclusively kept for evaluation. - seed: A seed for generating random numbers. + Args: + train_files: + A list of the files used for training. + eval_files: + A list of the files used for validation. + fraction_kept_for_eval: + A fraction of files in the intersection between `train_files` and `eval_files` exclusively kept for evaluation. + seed: + A seed for generating random numbers. - Returns: - A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. - """ + Returns: + A tuple `(new_train_files, new_eval_files)` with the overlapping resolved. + """ - rng = random.Random(seed) + rng = random.Random(seed) - train_files = set(train_files) - eval_files = set(eval_files) - overlapping_files = train_files & eval_files - train_files_selected_for_eval = set(rng.sample( - overlapping_files, - int(len(overlapping_files) * fraction_kept_for_eval) - )) - train_files = train_files - train_files_selected_for_eval - eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval - return list(train_files), list(eval_files) + train_files = set(train_files) + eval_files = set(eval_files) + overlapping_files = train_files & eval_files + train_files_selected_for_eval = set( + rng.sample( + overlapping_files, int(len(overlapping_files) * fraction_kept_for_eval) + ) + ) + train_files = train_files - train_files_selected_for_eval + eval_files = (eval_files - overlapping_files) | train_files_selected_for_eval + return list(train_files), list(eval_files) def get_time_based_dataset_files_for_train_and_eval( - base_path, - train_start_datetime, - train_end_datetime, - eval_start_datetime, - eval_end_datetime, - fraction_kept_for_eval, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1 -): - """Get train/eval dataset files organized with a time-based prefix. + base_path: str, + train_start_datetime: datetime, + train_end_datetime: datetime, + eval_start_datetime: datetime, + eval_end_datetime: datetime, + fraction_kept_for_eval: float, + datetime_prefix_format: str = "%Y/%m/%d/%H", + extension: str = "lzo", + parallelism: int = 1, +) -> Tuple[List[str], List[str]]: + """ + Get train/eval dataset files organized with a time-based prefix. + This is just a convenience built around `get_dataset_files_prefixed_by_time` and + `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. 
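+
+    A sketch of typical usage, with a hypothetical base path and date ranges:
+
+    >>> from datetime import datetime
+    >>> train_files, eval_files = get_time_based_dataset_files_for_train_and_eval(
+    ...     base_path="hdfs:///path/to/dataset",
+    ...     train_start_datetime=datetime(2023, 1, 1),
+    ...     train_end_datetime=datetime(2023, 1, 7),
+    ...     eval_start_datetime=datetime(2023, 1, 7),
+    ...     eval_end_datetime=datetime(2023, 1, 8),
+    ...     fraction_kept_for_eval=0.5,
+    ... )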
+ """ - This is just a convenience built around `get_dataset_files_prefixed_by_time` and - `resolve_train_and_eval_files_overlap`. Please refer to these functions for documentation. - """ - - train_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=train_start_datetime, - end_datetime=train_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - eval_files = get_time_based_dataset_files( - base_path=base_path, - start_datetime=eval_start_datetime, - end_datetime=eval_end_datetime, - datetime_prefix_format=datetime_prefix_format, - extension=extension, - parallelism=parallelism - ) - return resolve_train_and_eval_files_overlap( - train_files=train_files, - eval_files=eval_files, - fraction_kept_for_eval=fraction_kept_for_eval - ) + train_files = get_time_based_dataset_files( + base_path=base_path, + start_datetime=train_start_datetime, + end_datetime=train_end_datetime, + datetime_prefix_format=datetime_prefix_format, + extension=extension, + parallelism=parallelism, + ) + eval_files = get_time_based_dataset_files( + base_path=base_path, + start_datetime=eval_start_datetime, + end_datetime=eval_end_datetime, + datetime_prefix_format=datetime_prefix_format, + extension=extension, + parallelism=parallelism, + ) + return resolve_train_and_eval_files_overlap( + train_files=train_files, + eval_files=eval_files, + fraction_kept_for_eval=fraction_kept_for_eval, + ) diff --git a/twml/twml/contrib/utils/device.py b/twml/twml/contrib/utils/device.py index ab189c98a..d3f3cef42 100644 --- a/twml/twml/contrib/utils/device.py +++ b/twml/twml/contrib/utils/device.py @@ -2,26 +2,32 @@ Functions to query devices being used by tensorflow """ +from typing import Dict, List + from tensorflow.python.client import device_lib -def get_device_map(): - """Returns the map of device name to device type""" - local_device_protos = device_lib.list_local_devices() - return {x.name: x.device_type for x in local_device_protos} +def get_device_map() -> Dict[str, str]: + """Returns the map of device name to device type""" + + local_device_protos = device_lib.list_local_devices() + return {x.name: x.device_type for x in local_device_protos} + + +def get_gpu_list() -> List[str]: + """Returns the list of GPUs available""" + + device_map = get_device_map() + return [name for name in device_map if device_map[name] == "GPU"] -def get_gpu_list(): - """Returns the list of GPUs available""" - device_map = get_device_map() - return [name for name in device_map if device_map[name] == 'GPU'] +def get_gpu_count() -> int: + """Returns the count of GPUs available""" + return len(get_gpu_list()) -def get_gpu_count(): - """Returns the count of GPUs available""" - return len(get_gpu_list()) +def is_gpu_available() -> bool: + """Returns if GPUs are available""" -def is_gpu_available(): - """Returns if GPUs are available""" - return get_gpu_count() > 0 + return get_gpu_count() > 0 diff --git a/twml/twml/contrib/utils/interp.py b/twml/twml/contrib/utils/interp.py index 419d89030..5893352c6 100644 --- a/twml/twml/contrib/utils/interp.py +++ b/twml/twml/contrib/utils/interp.py @@ -4,91 +4,105 @@ import libtwml import tensorflow.compat.v1 as tf + import twml -def linear_interp1(inputs, ref_inputs, ref_outputs): - """ - Perform 1D linear interpolation. - Arguments: - inputs: - The query input values. - ref_inputs: - Reference grid points used for interpolation. - ref_outputs: - Reference output values used for interpolation. 
-
-  Returns:
-    The interpolated outputs for the requested input values.
-  """
-
-  inputs = tf.convert_to_tensor(inputs)
-  ref_inputs = tf.convert_to_tensor(ref_inputs)
-  ref_outputs = tf.convert_to_tensor(ref_outputs)
-
-  ndims = inputs.shape.ndims
-  ref_inputs_ndims = ref_inputs.shape.ndims
-  ref_outputs_ndims = ref_inputs.shape.ndims
-
-  if (ref_inputs_ndims != ndims):
-    raise ValueError("Dimension mismatch. inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims))
-
-  if (ref_outputs_ndims != ndims):
-    raise ValueError("Dimension mismatch. inputs: %d, ref_outputs: %d" % (ndims, ref_outputs_ndims))
-
-  if ndims > 2:
-    raise ValueError("Input dimensions should be < 2D. But got %d." % ndims)
-
-  original_input_shape = tf.shape(inputs)
-  # This is needed because isotonic_calibration expects:
-  # - inputs of size [num_samples, num_classes]
-  # - ref_inputs, ref_outputs of size [num_classes, num_bins]
-  inputs = tf.reshape(inputs, [-1, 1])
-  ref_inputs = tf.reshape(ref_inputs, [1, -1])
-  ref_outputs = tf.reshape(ref_outputs, [1, -1])
-
-  # isotonic_calibration is simply doing linear interpolation.
-  # This needs to be renamed in the future to make it consistent.
-  outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs)
-  return tf.reshape(outputs, original_input_shape)
-
-
-def linear_interp1_by_class(inputs, input_classes, ref_inputs, ref_outputs):
-  """
-  Perform 1D linear interpolation.
-  Arguments:
-    inputs:
-      The query input values.
-    input_classes:
-      The class index to use from the reference grid.
-    ref_inputs:
-      Reference 2D grid points used for interpolation.
-      Each row denotes the grid from a different class.
-    ref_outputs:
-      Reference 2D output values used for interpolation.
-      Each row denotes the grid from a different class.
-
-  Returns:
-    The interpolated outputs for the requested input values.
-  """
-
-  inputs = tf.convert_to_tensor(inputs)
-  input_classes = tf.convert_to_tensor(input_classes)
-  ref_inputs = tf.convert_to_tensor(ref_inputs)
-  ref_outputs = tf.convert_to_tensor(ref_outputs)
-
-  original_input_shape = tf.shape(inputs)
-
-  # pass through
-  def in_func(x):
-    return x
-
-  # indexed function
-  def cond_func(i, fn):
-    idx = input_classes[i]
-    x = tf.expand_dims(fn(), axis=0)
-    return linear_interp1(x, ref_inputs[idx], ref_outputs[idx])
-
-  # Use while loop for now, needs to be replace by a custom C++ op later.
-  outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func)
-  return tf.reshape(outputs, original_input_shape)
+def linear_interp1(
+    inputs: tf.Tensor, ref_inputs: tf.Tensor, ref_outputs: tf.Tensor
+) -> tf.Tensor:
+    """
+    Perform 1D linear interpolation.
+    Args:
+        inputs:
+            The query input values.
+        ref_inputs:
+            Reference grid points used for interpolation.
+        ref_outputs:
+            Reference output values used for interpolation.
+
+    Returns:
+        The interpolated outputs for the requested input values.
+    """
+
+    inputs = tf.convert_to_tensor(inputs)
+    ref_inputs = tf.convert_to_tensor(ref_inputs)
+    ref_outputs = tf.convert_to_tensor(ref_outputs)
+
+    ndims = inputs.shape.ndims
+    ref_inputs_ndims = ref_inputs.shape.ndims
+    ref_outputs_ndims = ref_outputs.shape.ndims
+
+    if ref_inputs_ndims != ndims:
+        raise ValueError(
+            "Dimension mismatch. inputs: %d, ref_inputs: %d" % (ndims, ref_inputs_ndims)
+        )
+
+    if ref_outputs_ndims != ndims:
+        raise ValueError(
+            "Dimension mismatch. inputs: %d, ref_outputs: %d"
+            % (ndims, ref_outputs_ndims)
+        )
+
+    if ndims > 2:
+        raise ValueError("Input dimensions should be < 2D. But got %d."
% ndims) + + original_input_shape = tf.shape(inputs) + # This is needed because isotonic_calibration expects: + # - inputs of size [num_samples, num_classes] + # - ref_inputs, ref_outputs of size [num_classes, num_bins] + inputs = tf.reshape(inputs, [-1, 1]) + ref_inputs = tf.reshape(ref_inputs, [1, -1]) + ref_outputs = tf.reshape(ref_outputs, [1, -1]) + + # isotonic_calibration is simply doing linear interpolation. + # This needs to be renamed in the future to make it consistent. + outputs = libtwml.ops.isotonic_calibration(inputs, ref_inputs, ref_outputs) + return tf.reshape(outputs, original_input_shape) + + +def linear_interp1_by_class( + inputs: tf.Tensor, + input_classes: tf.Tensor, + ref_inputs: tf.Tensor, + ref_outputs: tf.Tensor, +) -> tf.Tensor: + """ + Perform 1D linear interpolation. + + Args: + inputs: + The query input values. + input_classes: + The class index to use from the reference grid. + ref_inputs: + Reference 2D grid points used for interpolation. + Each row denotes the grid from a different class. + ref_outputs: + Reference 2D output values used for interpolation. + Each row denotes the grid from a different class. + + Returns: + The interpolated outputs for the requested input values. + """ + + inputs = tf.convert_to_tensor(inputs) + input_classes = tf.convert_to_tensor(input_classes) + ref_inputs = tf.convert_to_tensor(ref_inputs) + ref_outputs = tf.convert_to_tensor(ref_outputs) + + original_input_shape = tf.shape(inputs) + + # pass through + def in_func(x): + return x + + # indexed function + def cond_func(i: int, fn: callable): + idx = input_classes[i] + x = tf.expand_dims(fn(), axis=0) + return linear_interp1(x, ref_inputs[idx], ref_outputs[idx]) + + # Use while loop for now, needs to be replace by a custom C++ op later. 
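+    # batch_apply calls cond_func once per element: for element i it looks up
+    # that element's class index and interpolates it against the reference
+    # grid of that class via linear_interp1.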
+ outputs = twml.util.batch_apply(in_func, inputs, cond_func=cond_func) + return tf.reshape(outputs, original_input_shape) diff --git a/twml/twml/contrib/utils/loss_fns.py b/twml/twml/contrib/utils/loss_fns.py index eb25b430a..ce01f2de8 100644 --- a/twml/twml/contrib/utils/loss_fns.py +++ b/twml/twml/contrib/utils/loss_fns.py @@ -1,302 +1,387 @@ +from typing import Optional + import tensorflow.compat.v1 as tf + from twml.contrib.utils import masks, math_fns -def get_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params): - """ - Paiwise learning-to-rank ranknet loss - Check paper https://www.microsoft.com/en-us/research/publication/ - learning-to-rank-using-gradient-descent/ - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count)) - return loss - - -def get_lambda_pair_loss(pairwise_label_scores, pairwise_predicted_scores, - params, swapped_ndcg): - """ - Paiwise learning-to-rank lambdarank loss - faster than the previous gradient method - Note: this loss depends on ranknet cross-entropy - delta NDCG is applied to ranknet cross-entropy - Hence, it is still a gradient descent method - Check paper http://citeseerx.ist.psu.edu/viewdoc/ - download?doi=10.1.1.180.634&rep=rep1&type=pdf for more information - for more information - Args: - pairwise_label_scores: a dense tensor of shape [n_data, n_data] - pairwise_predicted_scores: a dense tensor of shape [n_data, n_data] - n_data is the number of tweet candidates in a BatchPredictionRequest - params: network parameters - swapped_ndcg: swapped ndcg of shape [n_data, n_data] - ndcg values when swapping each pair in the prediction ranking order - mask options: full_mask and diag_mask - Returns: - average loss over pairs defined by the masks - """ - n_data = tf.shape(pairwise_label_scores)[0] - if params.mask == "full_mask": - # full_mask that only covers pairs that have different labels - # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) - mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) - else: - # diag_mask that covers all pairs - # (only selfs/diags are 0s) - mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) - - # pairwise sigmoid_cross_entropy_with_logits loss - loss = tf.cond(tf.equal(pair_count, 0), lambda: 0., - lambda: _get_average_cross_entropy_loss(pairwise_label_scores, - pairwise_predicted_scores, mask, pair_count, swapped_ndcg)) - return loss - - -def _get_average_cross_entropy_loss(pairwise_label_scores, pairwise_predicted_scores, - mask, pair_count, swapped_ndcg=None): - """ - Average the loss for a batchPredictionRequest based on a 
desired number of pairs - """ - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=pairwise_label_scores, - logits=pairwise_predicted_scores) - loss = mask * loss - if swapped_ndcg is not None: - loss = loss * swapped_ndcg - loss = tf.reduce_sum(loss) / pair_count - return loss - - -def get_listmle_loss(labels, predicted_scores): - r""" - listwise learning-to-rank listMLE loss - Note: Simplified MLE formula is used in here (omit the proof in here) - \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) - n is tf.shape(predicted_scores)[0] - Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - Returns: - average loss - """ - labels = tf.reshape(labels, [-1, 1]) - n_data = tf.shape(labels)[0] - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - - predicted_scores_ordered_by_labels = _get_ordered_predicted_scores(labels, - predicted_scores, n_data) - - loss = (-1) * tf.reduce_sum(predicted_scores) - # sum over 1 to n_data - 1 - temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) - temp = tf.reshape(temp, []) - loss = tf.add(loss, temp) - - exps = tf.exp(predicted_scores_ordered_by_labels) - exp_sum = tf.reduce_sum(exps) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - - iteration = tf.constant(0) - - def _cond(iteration, loss, exp_sum, exp): - return tf.less(iteration, n_data - 2) - - def _gen_loop_body(): - def loop_body(iteration, loss, exp_sum, exps): - temp = tf.gather(exps, [iteration]) - temp = tf.reshape(temp, []) - exp_sum = tf.subtract(exp_sum, temp) - # clip exp_sum for safer log - loss = tf.add(loss, math_fns.safe_log(exp_sum)) - return tf.add(iteration, 1), loss, exp_sum, exps - return loop_body - - iteration, loss, exp_sum, exps = tf.while_loop(_cond, _gen_loop_body(), - (iteration, loss, exp_sum, exps)) - loss = loss / tf.cast(n_data, dtype=tf.float32) - return loss - - -def _get_ordered_predicted_scores(labels, predicted_scores, n_data): - """ - Order predicted_scores based on sorted labels - """ - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(labels), k=n_data) - ordered_labels_indices = tf.transpose(ordered_labels_indices) - predicted_scores_ordered_by_labels = tf.gather_nd(predicted_scores, - ordered_labels_indices) - return predicted_scores_ordered_by_labels - - -def get_attrank_loss(labels, predicted_scores, weights=None): - """ - Modified listwise learning-to-rank AttRank loss - Check paper https://arxiv.org/abs/1804.05936 for more information - Note: there is an inconsistency between the paper statement and - their public code - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # The authors immeplemented the following, which is basically listnet - # attention_labels = _get_attentions(labels) - # attention_labels = tf.reshape(attention_labels, [1, -1]) - # predicted_scores = tf.reshape(predicted_scores, [1, -1]) - # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=attention_labels, - # logits=predicted_scores)) - - # The paper proposed the following - # attention_labels = _get_attentions(labels) - # # 
However the following line is wrong based on their statement - # # as _get_attentions can give 0 results when input < 0 - # # and the result cannot be used in _get_attrank_cross_entropy - # # log(a_i^S) - # # attention_predicted_scores = _get_attentions(predicted_scores) - # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - # # the range of attention_predicted_scores is [0, 1) - # # this gives sigmoid [0.5, 0.732) - # # hence, it is not good to use in sigmoid_cross_entropy_with_logits either - - # Implemented the following instead - # _get_attentions is applied to labels - # softmax is applied to predicted_scores - reshaped_labels = tf.reshape(labels, [1, -1]) - attention_labels = _get_attentions(reshaped_labels) - reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1]) - attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores) - loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores) - return loss - - -def _get_attentions(raw_scores): - """ - Used in attention weights in AttRank loss - for a query/batch/batchPreidictionRequest - (a rectified softmax function) - """ - not_consider = tf.less_equal(raw_scores, 0) - mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - expon_labels = mask * tf.exp(raw_scores) - - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = math_fns.safe_div(expon_labels, expon_label_sum) - return attentions - - -def _get_attrank_cross_entropy(labels, logits): - # logits is not safe based on their satement - # do not use this function directly elsewhere - results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log(1 - logits) - results = (-1) * results - results = tf.reduce_mean(results) - return results - - -def get_listnet_loss(labels, predicted_scores, weights=None): - """ - Listwise learning-to-rank listet loss - Check paper https://www.microsoft.com/en-us/research/ - wp-content/uploads/2016/02/tr-2007-40.pdf - for more information - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - # top one probability is the same as softmax - labels_top_one_probs = _get_top_one_probs(labels) - predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) - - if weights is None: +def get_pair_loss( + pairwise_label_scores: tf.Tensor, pairwise_predicted_scores: tf.Tensor, params: dict +) -> tf.Tensor: + """ + Pairwise learning-to-rank ranknet loss. 
Check paper for more information: + https://www.microsoft.com/en-us/research/publication/learning-to-rank-using-gradient-descent/ + + Args: + pairwise_label_scores: + a dense tensor of shape [n_data, n_data] + pairwise_predicted_scores: + a dense tensor of shape [n_data, n_data] + n_data is the number of tweet candidates in a BatchPredictionRequest + params: + network parameters + mask: + full_mask or diag_mask + Returns: + average loss over pairs defined by the masks + """ + + n_data = tf.shape(pairwise_label_scores)[0] + if params.mask == "full_mask": + # full_mask that only covers pairs that have different labels + # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) + mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) + else: + # diag_mask that covers all pairs + # (only selfs/diags are 0s) + mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) + + # pairwise sigmoid_cross_entropy_with_logits loss + loss = tf.cond( + tf.equal(pair_count, 0), + lambda: 0.0, + lambda: _get_average_cross_entropy_loss( + pairwise_label_scores, pairwise_predicted_scores, mask, pair_count + ), + ) + return loss + + +def get_lambda_pair_loss( + pairwise_label_scores: tf.Tensor, + pairwise_predicted_scores: tf.Tensor, + params: dict, + swapped_ndcg: tf.Tensor, +) -> tf.Tensor: + """ + Pairwise learning-to-rank lambdarank loss faster than the previous gradient method + Note: this loss depends on ranknet cross-entropy delta NDCG is applied to ranknet cross-entropy + Hence, it is still a gradient descent method + + For more information, check paper: + http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.180.634&rep=rep1&type=pdf + + Args: + pairwise_label_scores: + a dense tensor of shape [n_data, n_data] + pairwise_predicted_scores: + a dense tensor of shape [n_data, n_data] + n_data is the number of tweet candidates in a BatchPredictionRequest + params: + network parameters + swapped_ndcg: + swapped ndcg of shape [n_data, n_data] + ndcg values when swapping each pair in the prediction ranking order + mask options: full_mask and diag_mask + + Returns: + average loss over pairs defined by the masks + """ + n_data = tf.shape(pairwise_label_scores)[0] + if params.mask == "full_mask": + # full_mask that only covers pairs that have different labels + # (all pairwise_label_scores = 0.5: selfs and same labels are 0s) + mask, pair_count = masks.full_mask(n_data, pairwise_label_scores) + else: + # diag_mask that covers all pairs + # (only selfs/diags are 0s) + mask, pair_count = masks.diag_mask(n_data, pairwise_label_scores) + + # pairwise sigmoid_cross_entropy_with_logits loss + loss = tf.cond( + tf.equal(pair_count, 0), + lambda: 0.0, + lambda: _get_average_cross_entropy_loss( + pairwise_label_scores, + pairwise_predicted_scores, + mask, + pair_count, + swapped_ndcg, + ), + ) + return loss + + +def _get_average_cross_entropy_loss( + pairwise_label_scores: tf.Tensor, + pairwise_predicted_scores: tf.Tensor, + mask: tf.Tensor, + pair_count: tf.Tensor, + swapped_ndcg: Optional[tf.Tensor] = None, +) -> tf.Tensor: + """Average the loss for a batchPredictionRequest based on a desired number of pairs""" + + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=pairwise_label_scores, + logits=pairwise_predicted_scores, + ) + loss = mask * loss + if swapped_ndcg is not None: + loss = loss * swapped_ndcg + loss = tf.reduce_sum(loss) / pair_count + return loss + + +def get_listmle_loss(labels: tf.Tensor, predicted_scores: tf.Tensor) -> tf.Tensor: + """ + listwise learning-to-rank 
listMLE loss + Note: Simplified MLE formula is used in here (omit the proof in here) + \sum_{s=1}^{n-1} (-predicted_scores + ln(\sum_{i=s}^n exp(predicted_scores))) + n is tf.shape(predicted_scores)[0] + + Check paper http://icml2008.cs.helsinki.fi/papers/167.pdf for more information + + Args: + labels: + a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: + a dense tensor of same shape and type as labels + + Returns: + average loss + """ + labels = tf.reshape(labels, [-1, 1]) + n_data = tf.shape(labels)[0] + predicted_scores = tf.reshape(predicted_scores, [-1, 1]) + + predicted_scores_ordered_by_labels = _get_ordered_predicted_scores( + labels, predicted_scores, n_data + ) + + loss = (-1) * tf.reduce_sum(predicted_scores) + # sum over 1 to n_data - 1 + temp = tf.gather(predicted_scores_ordered_by_labels, [n_data - 1]) + temp = tf.reshape(temp, []) + loss = tf.add(loss, temp) + + exps = tf.exp(predicted_scores_ordered_by_labels) + exp_sum = tf.reduce_sum(exps) + # clip exp_sum for safer log + loss = tf.add(loss, math_fns.safe_log(exp_sum)) + + iteration = tf.constant(0) + + def _cond( + iteration: tf.Tensor, loss: tf.Tensor, exp_sum: tf.Tensor, exps: tf.Tensor + ) -> tf.Tensor: + return tf.less(iteration, n_data - 2) + + def _gen_loop_body() -> callable: + def loop_body( + iteration: tf.Tensor, loss: tf.Tensor, exp_sum: tf.Tensor, exps: tf.Tensor + ) -> tf.Tensor: + temp = tf.gather(exps, [iteration]) + temp = tf.reshape(temp, []) + exp_sum = tf.subtract(exp_sum, temp) + # clip exp_sum for safer log + loss = tf.add(loss, math_fns.safe_log(exp_sum)) + return tf.add(iteration, 1), loss, exp_sum, exps + + return loop_body + + iteration, loss, exp_sum, exps = tf.while_loop( + _cond, _gen_loop_body(), (iteration, loss, exp_sum, exps) + ) + loss = loss / tf.cast(n_data, dtype=tf.float32) + return loss + + +def _get_ordered_predicted_scores( + labels: tf.Tensor, predicted_scores: tf.Tensor, n_data: tf.Tensor +) -> tf.Tensor: + """Order predicted_scores based on sorted labels""" + + sorted_labels, ordered_labels_indices = tf.nn.top_k(tf.transpose(labels), k=n_data) + ordered_labels_indices = tf.transpose(ordered_labels_indices) + predicted_scores_ordered_by_labels = tf.gather_nd( + predicted_scores, ordered_labels_indices + ) + return predicted_scores_ordered_by_labels + + +def get_attrank_loss( + labels: tf.Tensor, predicted_scores: tf.Tensor, weights: Optional[tf.Tensor] = None +) -> tf.Tensor: + """ + Modified listwise learning-to-rank AttRank loss. 
For more info, check paper:
+    https://arxiv.org/abs/1804.05936
+
+    Note: there is an inconsistency between the paper statement and their public code
+
+    Args:
+        labels:
+            a dense tensor of shape [n_data, 1]
+            n_data is the number of tweet candidates in a BatchPredictionRequest
+        predicted_scores:
+            a dense tensor of same shape and type as labels
+        weights:
+            a dense tensor of the same shape as labels
+
+    Returns:
+        average loss
+    """
+    # The authors implemented the following, which is basically listnet
+    # attention_labels = _get_attentions(labels)
+    # attention_labels = tf.reshape(attention_labels, [1, -1])
+    # predicted_scores = tf.reshape(predicted_scores, [1, -1])
+    # loss = tf.reduce_mean(
+    #     tf.nn.softmax_cross_entropy_with_logits(
+    #         labels=attention_labels, logits=predicted_scores
+    #     )
+    # )
+
+    # The paper proposed the following
+    # attention_labels = _get_attentions(labels)
+    # # However the following line is wrong based on their statement
+    # # as _get_attentions can give 0 results when input < 0
+    # # and the result cannot be used in _get_attrank_cross_entropy
+    # # log(a_i^S)
+    # # attention_predicted_scores = _get_attentions(predicted_scores)
+    # loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores)
+    # # the range of attention_predicted_scores is [0, 1)
+    # # this gives sigmoid [0.5, 0.732)
+    # # hence, it is not good to use in sigmoid_cross_entropy_with_logits either
+
+    # Implemented the following instead
+    # _get_attentions is applied to labels
+    # softmax is applied to predicted_scores
+    reshaped_labels = tf.reshape(labels, [1, -1])
+    attention_labels = _get_attentions(reshaped_labels)
+    reshaped_predicted_scores = tf.reshape(predicted_scores, [1, -1])
+    attention_predicted_scores = tf.nn.softmax(reshaped_predicted_scores)
+    loss = _get_attrank_cross_entropy(attention_labels, attention_predicted_scores)
+    return loss
+
+
+def _get_attentions(raw_scores: tf.Tensor) -> tf.Tensor:
+    """
+    Used in attention weights in AttRank loss for a query/batch/batchPredictionRequest
+    (a rectified softmax function)
+    """
+
+    not_consider = tf.less_equal(raw_scores, 0)
+    mask = tf.ones(tf.shape(raw_scores)) - tf.cast(not_consider, dtype=tf.float32)
+    mask = tf.cast(mask, dtype=tf.float32)
+    expon_labels = mask * tf.exp(raw_scores)
+
+    expon_label_sum = tf.reduce_sum(expon_labels)
+    # expon_label_sum is safe as a denominator
+    attentions = math_fns.safe_div(expon_labels, expon_label_sum)
+    return attentions
+
+
+def _get_attrank_cross_entropy(labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
+    # logits is not safe based on their statement
+    # do not use this function directly elsewhere
+    results = labels * math_fns.safe_log(logits) + (1 - labels) * math_fns.safe_log(
+        1 - logits
+    )
+    results = (-1) * results
+    results = tf.reduce_mean(results)
+    return results
+
+
+def get_listnet_loss(
+    labels: tf.Tensor,
+    predicted_scores: tf.Tensor,
+    weights: Optional[tf.Tensor] = None,
+) -> tf.Tensor:
+    """
+    Listwise learning-to-rank listnet loss.
For more information, check paper: + https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-2007-40.pdf + + Args: + labels: + a dense tensor of shape [n_data, 1] + n_data is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: + a dense tensor of same shape and type as labels + weights: + a dense tensor of the same shape as labels + + Returns: + average loss + """ + # top one probability is the same as softmax + labels_top_one_probs = _get_top_one_probs(labels) + predicted_scores_top_one_probs = _get_top_one_probs(predicted_scores) + + if weights is None: + loss = tf.reduce_mean( + _get_listnet_cross_entropy( + labels=labels_top_one_probs, logits=predicted_scores_top_one_probs + ) + ) + return loss + loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs)) + _get_listnet_cross_entropy( + labels=labels_top_one_probs, logits=predicted_scores_top_one_probs + ) + * weights + ) / tf.reduce_mean(weights) return loss - loss = tf.reduce_mean( - _get_listnet_cross_entropy(labels=labels_top_one_probs, - logits=predicted_scores_top_one_probs) * weights) / tf.reduce_mean(weights) - return loss - - -def _get_top_one_probs(labels): - """ - Used in listnet top-one probabilities - for a query/batch/batchPreidictionRequest - (essentially a softmax function) - """ - expon_labels = tf.exp(labels) - expon_label_sum = tf.reduce_sum(expon_labels) - # expon_label_sum is safe as a denominator - attentions = expon_labels / expon_label_sum - return attentions - - -def _get_listnet_cross_entropy(labels, logits): - """ - Used in listnet - cross entropy on top-one probabilities - between ideal/label top-one probabilities - and predicted/logits top-one probabilities - for a query/batch/batchPreidictionRequest - """ - # it is safe to use log on logits - # that come from _get_top_one_probs - # do not use this function directly elsewhere - results = (-1) * labels * math_fns.safe_log(logits) - return results - - -def get_pointwise_loss(labels, predicted_scores, weights=None): - """ - Pointwise learning-to-rank pointwise loss - Args: - labels: a dense tensor of shape [n_data, 1] - n_data is the number of tweet candidates in a BatchPredictionRequest - predicted_scores: a dense tensor of same shape and type as labels - weights: a dense tensor of the same shape as labels - Returns: - average loss - """ - if weights is None: + +def _get_top_one_probs(labels: tf.Tensor) -> tf.Tensor: + """ + Used in listnet top-one probabilities + for a query/batch/batchPredictionRequest + (essentially a softmax function) + """ + expon_labels = tf.exp(labels) + expon_label_sum = tf.reduce_sum(expon_labels) + # expon_label_sum is safe as a denominator + attentions = expon_labels / expon_label_sum + return attentions + + +def _get_listnet_cross_entropy(labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: + """ + Used in listnet + cross entropy on top-one probabilities + between ideal/label top-one probabilities + and predicted/logits top-one probabilities + for a query/batch/batchPredictionRequest + """ + # it is safe to use log on logits + # that come from _get_top_one_probs + # do not use this function directly elsewhere + results = (-1) * labels * math_fns.safe_log(logits) + return results + + +def get_pointwise_loss( + labels: tf.Tensor, + predicted_scores: tf.Tensor, + weights: Optional[tf.Tensor] = None, +) -> tf.Tensor: + """ + Pointwise learning-to-rank pointwise loss + + Args: + labels: + a dense tensor of shape [n_data, 1] + n_data 
is the number of tweet candidates in a BatchPredictionRequest + predicted_scores: + a dense tensor of same shape and type as labels + weights: + a dense tensor of the same shape as labels + + Returns: + average loss + """ + + if weights is None: + loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits( + labels=labels, logits=predicted_scores + ) + ) + return loss loss = tf.reduce_mean( - tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores)) + tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=predicted_scores) + * weights + ) / tf.reduce_mean(weights) return loss - loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, - logits=predicted_scores) * weights) / tf.reduce_mean(weights) - return loss diff --git a/twml/twml/contrib/utils/masks.py b/twml/twml/contrib/utils/masks.py index f3143dc52..3a76a7607 100644 --- a/twml/twml/contrib/utils/masks.py +++ b/twml/twml/contrib/utils/masks.py @@ -1,38 +1,53 @@ +from typing import Tuple + import tensorflow.compat.v1 as tf -def diag_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. - Returns: - values in pairwise_label_scores except the diagonal - each cell contains a paiwise score difference - only selfs/diags are 0s - """ - mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count - - -def full_mask(n_data, pairwise_label_scores): - """ - This is so far only used in pariwise learning-to-rank - Args: - n_data: a int `Tensor`. - pairwise_label_scores: a dense `Tensor` of shape [n_data, n_data]. - Returns: - values in pairwise_label_scores except pairs that have the same labels - each cell contains a paiwise score difference - all pairwise_label_scores = 0.5: selfs and same labels are 0s - """ - not_consider = tf.equal(pairwise_label_scores, 0.5) - mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) - mask = tf.cast(mask, dtype=tf.float32) - pair_count = tf.reduce_sum(mask) - pair_count = tf.cast(pair_count, dtype=tf.float32) - return mask, pair_count +def diag_mask( + n_data: tf.Tensor, pairwise_label_scores: tf.Tensor +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + This is so far only used in pairwise learning-to-rank + + Args: + n_data: + a int `Tensor`. + pairwise_label_scores: + a dense `Tensor` of shape [n_data, n_data]. + + Returns: + values in pairwise_label_scores except the diagonal each cell contains a + pairwise score difference only selfs/diags are 0s + """ + + mask = tf.ones([n_data, n_data]) - tf.diag(tf.ones([n_data])) + mask = tf.cast(mask, dtype=tf.float32) + pair_count = tf.to_float(n_data) * (tf.to_float(n_data) - 1) + pair_count = tf.cast(pair_count, dtype=tf.float32) + return mask, pair_count + + +def full_mask( + n_data: tf.Tensor, pairwise_label_scores: tf.Tensor +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + This is so far only used in pairwise learning-to-rank + + Args: + n_data: + a int `Tensor`. + pairwise_label_scores: + a dense `Tensor` of shape [n_data, n_data]. 
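+
+    A sketch with hypothetical scores (0.5 marks self-pairs and equal labels):
+
+    >>> scores = tf.constant([[0.5, 1.0], [0.0, 0.5]])
+    >>> mask, pair_count = full_mask(2, scores)
+    >>> # mask zeroes the 0.5 cells, so pair_count evaluates to 2.0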
+ + Returns: + values in pairwise_label_scores except pairs that have the same labels + each cell contains a pairwise score difference + all pairwise_label_scores = 0.5: selfs and same labels are 0s + """ + + not_consider = tf.equal(pairwise_label_scores, 0.5) + mask = tf.ones([n_data, n_data]) - tf.cast(not_consider, dtype=tf.float32) + mask = tf.cast(mask, dtype=tf.float32) + pair_count = tf.reduce_sum(mask) + pair_count = tf.cast(pair_count, dtype=tf.float32) + return mask, pair_count diff --git a/twml/twml/contrib/utils/math_fns.py b/twml/twml/contrib/utils/math_fns.py index 2d9e72282..924b8968e 100644 --- a/twml/twml/contrib/utils/math_fns.py +++ b/twml/twml/contrib/utils/math_fns.py @@ -1,171 +1,226 @@ +from typing import Optional, Union + import tensorflow.compat.v1 as tf from tensorflow.python.ops import array_ops, math_ops # Copied from metrics_impl.py # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/metrics_impl.py#L216 -def safe_div(numerator, denominator, name=None): - """ - Example usage: calculating NDCG = DCG / IDCG to handle cases when - IDCG = 0 returns 0 instead of Infinity - Do not use this dividing funciton unless it makes sense to your problem - Divides two tensors element-wise, returns 0 if the denominator is <= 0. - Args: - numerator: a real `Tensor`. - denominator: a real `Tensor`, with dtype matching `numerator`. - name: Name for the returned op. - Returns: - 0 if `denominator` <= 0, else `numerator` / `denominator` - """ - t = math_ops.truediv(numerator, denominator) - zero = array_ops.zeros_like(t, dtype=denominator.dtype) - condition = math_ops.greater(denominator, zero) - zero = math_ops.cast(zero, t.dtype) - return array_ops.where(condition, t, zero, name=name) - - -def cal_ndcg(label_scores, predicted_scores, top_k_int=1): - """ - Calculate NDCG score for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds DCG / IDCG. - """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - dcg = _dcg_idcg(predicted_relevance, cg_discount) - idcg = _dcg_idcg(sorted_relevance, cg_discount) - # the ndcg score of the batch - # idcg is 0 if label_scores are all 0 - ndcg = safe_div(dcg, idcg, 'one_ndcg') - return ndcg - - -def cal_swapped_ndcg(label_scores, predicted_scores, top_k_int): - """ - Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: An int or an int `Tensor`. - Returns: - a `Tensor` that holds swapped NDCG by . 
- """ - sorted_labels, predicted_order = _get_ranking_orders( - label_scores, predicted_scores, top_k_int=top_k_int) - - predicted_relevance = _get_relevance_scores(predicted_order) - sorted_relevance = _get_relevance_scores(sorted_labels) - - cg_discount = _get_cg_discount(top_k_int) - - # cg_discount is safe as a denominator - dcg_k = predicted_relevance / cg_discount - dcg = tf.reduce_sum(dcg_k) - - idcg_k = sorted_relevance / cg_discount - idcg = tf.reduce_sum(idcg_k) - - ndcg = safe_div(dcg, idcg, 'ndcg_in_lambdarank_training') - - # remove the gain from label i then add the gain from label j - tiled_ij = tf.tile(dcg_k, [1, top_k_int]) - new_ij = (predicted_relevance / tf.transpose(cg_discount)) - - tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1]) - new_ji = tf.transpose(predicted_relevance) / cg_discount - - # if swap i and j, remove the stale cg for i, then add the new cg for i, - # remove the stale cg for j, and then add the new cg for j - new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji - - new_ndcg = safe_div(new_dcg, idcg, 'new_ndcg_in_lambdarank_training') - swapped_ndcg = tf.abs(ndcg - new_ndcg) - return swapped_ndcg - - -def _dcg_idcg(relevance_scores, cg_discount): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - relevance_scores: a real `Tensor`. - cg_discount: a real `Tensor`, with dtype matching relevance_scores - Returns: - a `Tensor` that holds \\sum_{i=1}^k \frac{relevance_scores_k}{cg_discount} - """ - # cg_discount is safe - dcg_k = relevance_scores / cg_discount - return tf.reduce_sum(dcg_k) - - -def _get_ranking_orders(label_scores, predicted_scores, top_k_int=1): - """ - Calculate DCG scores for top_k_int ranking positions - Args: - label_scores: a real `Tensor`. - predicted_scores: a real `Tensor`, with dtype matching label_scores - top_k_int: an integer or an int `Tensor`. - Returns: - two `Tensors` that hold sorted_labels: the ground truth relevance socres - and predicted_order: relevance socres based on sorted predicted_scores - """ - # sort predictions_scores and label_scores - # size [batch_size/num of DataRecords, 1] - label_scores = tf.reshape(label_scores, [-1, 1]) - predicted_scores = tf.reshape(predicted_scores, [-1, 1]) - # sorted_labels contians the relevance scores of the correct order - sorted_labels, ordered_labels_indices = tf.nn.top_k( - tf.transpose(label_scores), k=top_k_int) - sorted_labels = tf.transpose(sorted_labels) - # sort predicitons and use the indices to obtain the relevance scores of the predicted order - sorted_predictions, ordered_predictions_indices = tf.nn.top_k( - tf.transpose(predicted_scores), k=top_k_int) - ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices) - # predicted_order contians the relevance scores of the predicted order - predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels) - return sorted_labels, predicted_order - - -def _get_cg_discount(top_k_int=1): - r""" - Calculate discounted gain factor for ranking position till top_k_int - Args: - top_k_int: An int or an int `Tensor`. 
- Returns: - a `Tensor` that holds \log_{2}(i + 1), i \in [1, k] - """ - log_2 = tf.log(tf.constant(2.0, dtype=tf.float32)) - # top_k_range needs to start from 1 to top_k_int - top_k_range = tf.range(top_k_int) + 1 - top_k_range = tf.reshape(top_k_range, [-1, 1]) - # cast top_k_range to float - top_k_range = tf.cast(top_k_range, dtype=tf.float32) - cg_discount = tf.log(top_k_range + 1.0) / log_2 - return cg_discount - - -def _get_relevance_scores(scores): - return 2 ** scores - 1 - - -def safe_log(raw_scores, name=None): - """ - Calculate log of a tensor, handling cases that - raw_scores are close to 0s - Args: - raw_scores: An float `Tensor`. - Returns: - A float `Tensor` that hols the safe log base e of input - """ - epsilon = 1E-8 - clipped_raw_scores = tf.maximum(raw_scores, epsilon) - return tf.log(clipped_raw_scores) +def safe_div( + numerator: tf.Tensor, denominator: tf.Tensor, name: Optional[str] = None +) -> tf.Tensor: + """ + Example usage: calculating NDCG = DCG / IDCG to handle cases when + IDCG = 0 returns 0 instead of Infinity + Do not use this dividing function unless it makes sense to your problem + Divides two tensors element-wise, returns 0 if the denominator is <= 0. + + Args: + numerator: + a real `Tensor`. + denominator: + a real `Tensor`, with dtype matching `numerator`. + name: + Name for the returned op. + + Returns: + 0 if `denominator` <= 0, else `numerator` / `denominator` + """ + t = math_ops.truediv(numerator, denominator) + zero = array_ops.zeros_like(t, dtype=denominator.dtype) + condition = math_ops.greater(denominator, zero) + zero = math_ops.cast(zero, t.dtype) + return array_ops.where(condition, t, zero, name=name) + + +def cal_ndcg( + label_scores: tf.Tensor, + predicted_scores: tf.Tensor, + top_k_int: Union[int, tf.Tensor], +) -> tf.Tensor: + """ + Calculate NDCG score for top_k_int ranking positions + + Args: + label_scores: + a real `Tensor`. + predicted_scores: + a real `Tensor`, with dtype matching label_scores + top_k_int: + An int or an int `Tensor`. + + Returns: + a `Tensor` that holds DCG / IDCG. + """ + + sorted_labels, predicted_order = _get_ranking_orders( + label_scores, predicted_scores, top_k_int=top_k_int + ) + + predicted_relevance = _get_relevance_scores(predicted_order) + sorted_relevance = _get_relevance_scores(sorted_labels) + + cg_discount = _get_cg_discount(top_k_int) + + dcg = _dcg_idcg(predicted_relevance, cg_discount) + idcg = _dcg_idcg(sorted_relevance, cg_discount) + # the ndcg score of the batch + # idcg is 0 if label_scores are all 0 + ndcg = safe_div(dcg, idcg, "one_ndcg") + return ndcg + + +def cal_swapped_ndcg( + label_scores: tf.Tensor, + predicted_scores: tf.Tensor, + top_k_int: Union[int, tf.Tensor], +) -> tf.Tensor: + """ + Calculate swapped NDCG score in Lambda Rank for full/top k ranking positions + + Args: + label_scores: + a real `Tensor`. + predicted_scores: + a real `Tensor`, with dtype matching label_scores + top_k_int: + An int or an int `Tensor`. + + Returns: + a `Tensor` that holds swapped NDCG by . 
+    """
+
+    sorted_labels, predicted_order = _get_ranking_orders(
+        label_scores, predicted_scores, top_k_int=top_k_int
+    )
+
+    predicted_relevance = _get_relevance_scores(predicted_order)
+    sorted_relevance = _get_relevance_scores(sorted_labels)
+
+    cg_discount = _get_cg_discount(top_k_int)
+
+    # cg_discount is safe as a denominator
+    dcg_k = predicted_relevance / cg_discount
+    dcg = tf.reduce_sum(dcg_k)
+
+    idcg_k = sorted_relevance / cg_discount
+    idcg = tf.reduce_sum(idcg_k)
+
+    ndcg = safe_div(dcg, idcg, "ndcg_in_lambdarank_training")
+
+    # remove the gain from label i then add the gain from label j
+    tiled_ij = tf.tile(dcg_k, [1, top_k_int])
+    new_ij = predicted_relevance / tf.transpose(cg_discount)
+
+    tiled_ji = tf.tile(tf.transpose(dcg_k), [top_k_int, 1])
+    new_ji = tf.transpose(predicted_relevance) / cg_discount
+
+    # if swap i and j, remove the stale cg for i, then add the new cg for i,
+    # remove the stale cg for j, and then add the new cg for j
+    new_dcg = dcg - tiled_ij + new_ij - tiled_ji + new_ji
+
+    new_ndcg = safe_div(new_dcg, idcg, "new_ndcg_in_lambdarank_training")
+    swapped_ndcg = tf.abs(ndcg - new_ndcg)
+    return swapped_ndcg
+
+
+def _dcg_idcg(relevance_scores: tf.Tensor, cg_discount: tf.Tensor) -> tf.Tensor:
+    """
+    Calculate DCG scores for top_k_int ranking positions
+
+    Args:
+        relevance_scores:
+            a real `Tensor`.
+        cg_discount:
+            a real `Tensor`, with dtype matching relevance_scores
+
+    Returns:
+        a `Tensor` that holds \\sum_{i=1}^k \\frac{relevance_scores_k}{cg_discount}
+    """
+
+    # cg_discount is safe
+    dcg_k = relevance_scores / cg_discount
+    return tf.reduce_sum(dcg_k)
+
+
+def _get_ranking_orders(
+    label_scores: tf.Tensor,
+    predicted_scores: tf.Tensor,
+    top_k_int: Union[int, tf.Tensor],
+) -> tf.Tensor:
+    """
+    Sort the labels by ground-truth score and by predicted score to obtain the
+    two ranking orders used for the top_k_int ranking positions
+
+    Args:
+        label_scores:
+            a real `Tensor`.
+        predicted_scores:
+            a real `Tensor`, with dtype matching label_scores
+        top_k_int:
+            an integer or an int `Tensor`.
+
+    Returns:
+        two `Tensors` that hold sorted_labels: the ground truth relevance scores
+        and predicted_order: relevance scores based on sorted predicted_scores
+    """
+
+    # sort predicted_scores and label_scores
+    # size [batch_size/num of DataRecords, 1]
+    label_scores = tf.reshape(label_scores, [-1, 1])
+    predicted_scores = tf.reshape(predicted_scores, [-1, 1])
+    # sorted_labels contains the relevance scores of the correct order
+    sorted_labels, ordered_labels_indices = tf.nn.top_k(
+        tf.transpose(label_scores), k=top_k_int
+    )
+    sorted_labels = tf.transpose(sorted_labels)
+    # sort predictions and use the indices to obtain the relevance scores of the predicted order
+    sorted_predictions, ordered_predictions_indices = tf.nn.top_k(
+        tf.transpose(predicted_scores), k=top_k_int
+    )
+    ordered_predictions_indices_for_labels = tf.transpose(ordered_predictions_indices)
+    # predicted_order contains the relevance scores of the predicted order
+    predicted_order = tf.gather_nd(label_scores, ordered_predictions_indices_for_labels)
+    return sorted_labels, predicted_order
+
+
+def _get_cg_discount(top_k_int: int = 1):
+    r"""
+    Calculate discounted gain factor for ranking position till top_k_int
+
+    Args:
+        top_k_int: An int or an int `Tensor`.
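The swap matrix computed above is the LambdaRank pair weighting: entry (i, j) is |NDCG - NDCG_after_swap| when the documents at ranks i and j trade places. A NumPy sketch of the same broadcasting arithmetic (illustrative; it assumes top_k_int equals the full list length, and the gains are made up):

.. code-block:: python

    import numpy as np

    gains = 2.0 ** np.array([[1.0], [3.0], [2.0]]) - 1.0  # [k, 1], predicted order
    disc = np.log2(np.arange(1, 4) + 1.0)[:, None]        # [k, 1] cg_discount

    dcg_k = gains / disc                                  # per-position contributions
    dcg = dcg_k.sum()
    idcg = (np.sort(gains, axis=0)[::-1] / disc).sum()

    # Swapping positions i and j removes both stale contributions and re-adds
    # each gain at the other position's discount (broadcasting replaces tf.tile).
    new_dcg = dcg - dcg_k - dcg_k.T + gains / disc.T + gains.T / disc
    swapped_ndcg = np.abs(dcg / idcg - new_dcg / idcg)    # [k, k], zero diagonal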
+
+    Returns:
+        a `Tensor` that holds \log_{2}(i + 1), i \in [1, k]
+    """
+
+    log_2 = tf.log(tf.constant(2.0, dtype=tf.float32))
+    # top_k_range needs to start from 1 to top_k_int
+    top_k_range = tf.range(top_k_int) + 1
+    top_k_range = tf.reshape(top_k_range, [-1, 1])
+    # cast top_k_range to float
+    top_k_range = tf.cast(top_k_range, dtype=tf.float32)
+    cg_discount = tf.log(top_k_range + 1.0) / log_2
+    return cg_discount
+
+
+def _get_relevance_scores(scores: tf.Tensor) -> tf.Tensor:
+    """Calculate relevance scores for top_k_int ranking positions"""
+
+    return 2**scores - 1
+
+
+def safe_log(raw_scores: tf.Tensor, name: Optional[str] = None) -> tf.Tensor:
+    """
+    Calculate log of a tensor, handling cases where raw_scores is close to 0
+
+    Args:
+        raw_scores: A float `Tensor`.
+
+    Returns:
+        A float `Tensor` that holds the safe log base e of the input
+    """
+
+    epsilon = 1e-8
+    clipped_raw_scores = tf.maximum(raw_scores, epsilon)
+    return tf.log(clipped_raw_scores)
diff --git a/twml/twml/contrib/utils/normalizer.py b/twml/twml/contrib/utils/normalizer.py
index a6a7035b8..0b9a94602 100644
--- a/twml/twml/contrib/utils/normalizer.py
+++ b/twml/twml/contrib/utils/normalizer.py
@@ -1,39 +1,48 @@
 import tensorflow.compat.v1 as tf
+
 from twml.contrib.utils import math_fns


-def mean_max_normalizaiton(dense_tensor):
-  """
-  In-batch normalization
-  Args:
-    dense_tensor: A dense `Tensor`.
-  Returns:
-    (dense_tensor - mean) / abs(max value)
-  Note:
-    when dense_tensor is of size [1, ?] it will give 0
-    If this is not what you want handle it outside the function
-  """
-  dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0])
-  dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0]))
-  dense_tensor = math_fns.safe_div(dense_tensor - dense_mean, dense_abs_max,
-                                   'mean_max_normalization_in_batch')
-  return dense_tensor
-
-
-def standard_normalizaiton(dense_tensor):
-  """
-  In-batch normalization
-  z-normalization or standard_normalization in batch
-  Args:
-    dense_tensor: A dense `Tensor`.
-  Returns:
-    (dense_tensor - mean) / variance
-  Note:
-    when dense_tensor is of size [1, ?] it will give 0
-    If this is not what you want handle it outside the function
-  """
-  epsilon = 1E-7
-  dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0)
-  # using epsilon is safer than math_fns.safe_div in here
-  dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon)
-  return dense_tensor
+def mean_max_normalizaiton(dense_tensor: tf.Tensor) -> tf.Tensor:
+    """
+    In-batch normalization
+
+    Args:
+        dense_tensor: A dense `Tensor`.
+
+    Returns:
+        (dense_tensor - mean) / abs(max value)
+
+    Note:
+        when dense_tensor is of size [1, ?] it will give 0
+        If this is not what you want, handle it outside the function
+    """
+
+    dense_mean = tf.reduce_mean(dense_tensor, reduction_indices=[0])
+    dense_abs_max = tf.abs(tf.reduce_max(dense_tensor, reduction_indices=[0]))
+    dense_tensor = math_fns.safe_div(
+        dense_tensor - dense_mean, dense_abs_max, "mean_max_normalization_in_batch"
+    )
+    return dense_tensor
+
+
+def standard_normalizaiton(dense_tensor: tf.Tensor) -> tf.Tensor:
+    """
+    In-batch normalization
+    z-normalization or standard_normalization in batch
+
+    Args:
+        dense_tensor: A dense `Tensor`.
+
+    Returns:
+        (dense_tensor - mean) / variance
+
+    Note:
+        when dense_tensor is of size [1, ?] it will give 0
+        If this is not what you want, handle it outside the function
+    """
+    epsilon = 1e-7
+    dense_mean, dense_variance = tf.nn.moments(dense_tensor, 0)
+    # using epsilon is safer than math_fns.safe_div in here
+    dense_tensor = (dense_tensor - dense_mean) / (dense_variance + epsilon)
+    return dense_tensor
diff --git a/twml/twml/contrib/utils/scores.py b/twml/twml/contrib/utils/scores.py
index 84e792c13..641b3fbc9 100644
--- a/twml/twml/contrib/utils/scores.py
+++ b/twml/twml/contrib/utils/scores.py
@@ -1,33 +1,35 @@
 import tensorflow.compat.v1 as tf


-def get_pairwise_scores(tensor_input):
-  """
-  This is so far used in pariwise learning-to-rank
-
-  Arguments:
-    tensor_input: a dense `Tensor` of shape [n_data, 1]
-      n_data is the number of teet candidates
-
-  Returns:
-    pairwise scores: a dense `Tensor` of shape [n_data, n_data].
-  """
-  return tensor_input - tf.transpose(tensor_input)
-
-
-def get_pairwise_label_scores(labels):
-  """
-  This is so far used in pariwise learning-to-rank
-  Args:
-    labels: a dense `Tensor` of shape [n_data, 1]
-      n_data is the number of teet candidates
-  Returns:
-    pairwise label scores: a dense `Tensor` of shape [n_data, n_data].
-      each value is within [0, 1]
-  """
-  # raw pairwise label scores/differences
-  pairwise_label_scores = get_pairwise_scores(labels)
-  # sanity check to make sure values in differences_ij are [-1, 1]
-  differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0)
-  # values in pairwise_label_scores are within [0, 1] for cross entropy
-  return (1.0 / 2.0) * (1.0 + differences_ij)
+def get_pairwise_scores(tensor_input: tf.Tensor) -> tf.Tensor:
+    """
+    This is so far used in pairwise learning-to-rank
+
+    Args:
+        tensor_input: a dense `Tensor` of shape [n_data, 1]
+            n_data is the number of tweet candidates
+
+    Returns:
+        pairwise scores: a dense `Tensor` of shape [n_data, n_data].
+    """
+
+    return tensor_input - tf.transpose(tensor_input)
+
+
+def get_pairwise_label_scores(labels: tf.Tensor) -> tf.Tensor:
+    """
+    This is so far used in pairwise learning-to-rank
+    Args:
+        labels: a dense `Tensor` of shape [n_data, 1]
+            n_data is the number of tweet candidates
+    Returns:
+        pairwise label scores: a dense `Tensor` of shape [n_data, n_data].
+            each value is within [0, 1]
+    """
+
+    # raw pairwise label scores/differences
+    pairwise_label_scores = get_pairwise_scores(labels)
+    # sanity check to make sure values in differences_ij are [-1, 1]
+    differences_ij = tf.maximum(tf.minimum(1.0, pairwise_label_scores), -1.0)
+    # values in pairwise_label_scores are within [0, 1] for cross entropy
+    return (1.0 / 2.0) * (1.0 + differences_ij)
diff --git a/twml/twml/contrib/utils/similarities.py b/twml/twml/contrib/utils/similarities.py
index 212065f88..3107ab8f6 100644
--- a/twml/twml/contrib/utils/similarities.py
+++ b/twml/twml/contrib/utils/similarities.py
@@ -1,17 +1,21 @@
 import tensorflow.compat.v1 as tf


-def cosine_similarity(x1, x2, axis):
-  """
-  cosine similarity of two tensors.
+def cosine_similarity(x1: tf.Tensor, x2: tf.Tensor, axis: int = 1) -> tf.Tensor:
+    """
+    cosine similarity of two tensors.

-  Arguments:
-    x1:
-      A tf.Tensor
-    x2:
-      A tf.Tensor
-    axis: Dimension along which to normalize.
-  """
-  normalize_x1 = tf.nn.l2_normalize(x1, axis=axis)
-  normalize_x2 = tf.nn.l2_normalize(x2, axis=axis)
-  return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis)
+    Args:
+        x1:
+            A tf.Tensor
+        x2:
+            A tf.Tensor
+        axis:
+            Dimension along which to normalize.
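To see the mapping get_pairwise_label_scores performs, clipping the raw label difference to [-1, 1] and then affine-mapping it into [0, 1], here is the same computation in NumPy (a sketch; the labels are made up):

.. code-block:: python

    import numpy as np

    labels = np.array([[1.0], [0.0], [0.5]])  # n_data = 3 candidates
    diff = labels - labels.T                  # get_pairwise_scores
    diff = np.clip(diff, -1.0, 1.0)           # keep differences in [-1, 1]
    pairwise = 0.5 * (1.0 + diff)             # cross-entropy targets in [0, 1]
    # pairwise[i, j] > 0.5 when label i > label j, and == 0.5 on ties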
+ + Returns: + A tf.Tensor + """ + normalize_x1 = tf.nn.l2_normalize(x1, axis=axis) + normalize_x2 = tf.nn.l2_normalize(x2, axis=axis) + return tf.reduce_sum(tf.multiply(normalize_x1, normalize_x2), axis=axis) diff --git a/twml/twml/dataset.py b/twml/twml/dataset.py index 4356fdc7c..537caa316 100644 --- a/twml/twml/dataset.py +++ b/twml/twml/dataset.py @@ -2,371 +2,429 @@ This module implements custom tf.data.datasets for twml. """ import numbers +from typing import Callable, Generator, Iterator, List, Optional +import tensorflow.compat.v1 as tf from absl import logging from kazoo.client import KazooClient from libtwml import OPLIB -import tensorflow.compat.v1 as tf + from twml.constants import DEFAULT_ZOOKEEPER_BASE_ZNODE, DEFAULT_ZOOKEEPER_HOST class BlockFormatDataset(tf.data.Dataset): - """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" - - def __init__(self, filenames, compression_type="auto", buffer_size=1 << 20): + """A ``tf.data.Dataset`` comprising records from one or more TFRecord files.""" + + def __init__( + self, + filenames: tf.Tensor, + compression_type: str = "auto", + buffer_size: int = 1 << 20, + ): + """ + Creates a ``BlockFormatDataset``. + + Args: + filenames: + A `tf.string` tensor containing one or more filenames. + compression_type: + A string specifying the compression type. + Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). + When compression_type == 'auto', it is inferred from file extension. + buffer_size: + Buffer size to be used during decompression. default: 1<<20. + """ + self._filenames = tf.convert_to_tensor( + filenames, dtype=tf.string, name="filenames" + ) + self._compression_type = tf.convert_to_tensor( + compression_type.lower(), name="compression_type" + ) + self._buffer_size = tf.convert_to_tensor( + buffer_size, dtype=tf.int64, name="buffer_size" + ) + # Parent class calls self._as_variant_tensor in init. So call this at the end. + super(BlockFormatDataset, self).__init__() + + def _as_variant_tensor(self) -> tf.Tensor: + """Create the resource handle for the dataset.""" + try: + block_format_dataset = __import__( + "libtwml_internal" + ).OPLIB.block_format_dataset + return block_format_dataset(self._filenames) + except ImportError: + block_format_dataset = OPLIB.block_format_dataset_v2 + return block_format_dataset( + self._filenames, self._compression_type, self._buffer_size + ) + + def _inputs(self) -> list: + return [] + + @property + def output_shapes(self) -> tf.TensorShape: + """Return output shapes""" + return tf.TensorShape([]) + + @property + def output_types(self) -> tf.DType: + """Return output types""" + return tf.string + + @property + def output_classes(self) -> tf.Tensor: + """Return output classes""" + return tf.Tensor + + +def downsample_dataset( + dataset: tf.data.Dataset, + sample_rate: numbers.Real, + rate_name: str = "rate", +) -> tf.data.Dataset: """ - Creates a ``BlockFormatDataset``. - - Args: - filenames: - A `tf.string` tensor containing one or more filenames. - compression_type: - A string specifying the compression type. - Can be one of 'gz' (or 'gzip'), 'none', 'auto' (default). - When compression_type == 'auto', it is inferred from file extension. - buffer_size: - Buffer size to be used during decompression. default: 1<<20. 
+ Downsample a tf.data.Dataset at sample_rate """ - self._filenames = tf.convert_to_tensor(filenames, dtype=tf.string, name="filenames") - self._compression_type = tf.convert_to_tensor(compression_type.lower(), name="compression_type") - self._buffer_size = tf.convert_to_tensor(buffer_size, dtype=tf.int64, name="buffer_size") - # Parent class calss self._as_variant_tensor in init. So call this at the end. - super(BlockFormatDataset, self).__init__() - - def _as_variant_tensor(self): + if sample_rate is None or sample_rate == 1.0: + return dataset + elif not isinstance(sample_rate, numbers.Real): + raise TypeError("dataset %s must be a real number" % rate_name) + elif sample_rate <= 0 or sample_rate > 1: + raise ValueError("dataset %s must be in range (0, 1])" % rate_name) + return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) + + +def _filenames_dataset( + files: List[str], + shards: Optional[int] = None, + shard_index: Optional[int] = None, +) -> tf.data.Dataset: """ - Create the resource handle for the dataset. + Get a tf.data.Dataset with file names from a list of files + Optionally shard the file list (see stream_block_format_dataset) """ - try: - block_format_dataset = __import__("libtwml_internal").OPLIB.block_format_dataset - return block_format_dataset(self._filenames) - except ImportError: - block_format_dataset = OPLIB.block_format_dataset_v2 - return block_format_dataset(self._filenames, self._compression_type, self._buffer_size) - - def _inputs(self): - return [] - - @property - def output_shapes(self): - """Return output shapes""" - return tf.TensorShape([]) - - @property - def output_types(self): - """Return output types""" - return tf.string - - @property - def output_classes(self): - """Return output classes""" - return tf.Tensor - - -def downsample_dataset(dataset, sample_rate, rate_name): - """ - Downsample a tf.data.Dataset at sample_rate - """ - if sample_rate is None or sample_rate == 1.0: - return dataset - elif not isinstance(sample_rate, numbers.Real): - raise TypeError("dataset %s must be a real number" % rate_name) - elif sample_rate <= 0 or sample_rate > 1: - raise ValueError("dataset %s must be in range (0, 1])" % rate_name) - return dataset.filter(lambda _: tf.squeeze(tf.random_uniform([1])) < sample_rate) - + files = tf.data.Dataset.from_tensor_slices(files) -def _filenames_dataset(files, shards=None, shard_index=None): - """ - Get a tf.data.Dataset with file names from a list of files - Optionally shard the file list (see stream_block_format_dataset) - """ - files = tf.data.Dataset.from_tensor_slices(files) + if [shards, shard_index] != [None, None]: + logging.info( + "Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards) + ) + files = files.shard(num_shards=shards, index=shard_index) - if [shards, shard_index] != [None, None]: - logging.info("Sharding files dataset (index: %d, shards: %d)" % (shard_index, shards)) - files = files.shard(num_shards=shards, index=shard_index) - - return files + return files def stream_block_format_dataset( - files, parse_fn, batch_size, num_threads, - shuffle=True, repeat=False, - block_length=None, part_file_parallelism=None, file_shuffle_size=None, - record_shuffle_size=None, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, prefetch_size=2, - shards=None, shard_index=None, shuffle_files=True, interleave=True): - """ - Helper function to stream a list of part files. - - Args: - files: - List of input files which will create a dataset. 
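The two helpers above compose as follows; a short usage sketch (the file names are hypothetical, and the calls assume this module's own namespace):

.. code-block:: python

    # Worker 1 of 4 sees every 4th file, then keeps ~20% of those files,
    # dropping each file independently (Bernoulli with keep probability 0.2).
    files = _filenames_dataset(
        ["part-%05d" % i for i in range(16)], shards=4, shard_index=1
    )
    files = downsample_dataset(files, 0.2, "parts_downsampling_rate")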
- parse_fn: - A function that takes a byte tensor containing a datarecord and decodes it. - batch_size: - The batch size for each step. - num_threads: - Number of threads working on the data in parallel. - shuffle: - Shuffle records within each file using ``record_shuffle_size``. Defaults to True. - repeat: - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset - (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). - block_length (optional): - Number of consecutive records to pull from a single part file. - Defaults to batch_size. - part_file_parallelism (optional): - Number of part files to read from in parallel. Once a part file is completely read, it will - be replaced by the next part file in the part file list. - - ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies - the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or - equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand, - if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader - thread pool will be underutilized, since it can never be the case that every reader thread has - a part file to read from. - - file_shuffle_size (optional): - the buffer_size used for shuffling of the list of files. - Defaults to 1000. For example, if you have 2000 files, the first - 1000 files are shuffled together, iterated through, then the next 1000 files are shuffled - and iterated through. - record_shuffle_size (optional): - the ``buffer_size`` used for shuffling records in each thread. - Defaults to ``batch_size * 8`` records. - dataset_fn (optional): - A function of that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - Note that this argument is only useful in conjunction with a [train,eval]_steps of -1 - (that is, when the entire dataset is used). Furthermore, note that even in this case, each - epoch will see a different set of part files. This is because new part files are re-sampled - every epoch. In other words, this argument is only provided for backwards compatibility with - DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate) - instead. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation and other - techniques that require each worker to train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. 
- - Returns: - tf.data.DataSet of batches of HashedDataRecord resource handles decoded and streamed online. - """ - # Creating a dataset from an input directory - - files = _filenames_dataset(files, shards=shards, shard_index=shard_index) - - file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000 - record_shuffle_size = record_shuffle_size if record_shuffle_size is not None else (batch_size * 8) - block_length = block_length if block_length is not None else batch_size - - logging.info("NUM_THREADS: %d", num_threads) - - if repeat: - files = files.repeat() - - if shuffle_files: - # Randomly shuffle the files list. - files = files.shuffle(buffer_size=file_shuffle_size) - - # Downsample parts files - files = downsample_dataset(files, parts_downsampling_rate, "parts_downsampling_rate") - - # Interleave the result from BlockFormatDataset - # block_length == batch_size results in batch_size records being read from a single file. - def map_fn(filenames): - '''function that maps each filename to a BlockFormatDataset''' - # reach each file using BlockFormatDataset - dataset = BlockFormatDataset(filenames) - - # early prefetching can sometimes improve performance (like on GCS) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - - # Shuffling before repeating ensures strong ordering. - if shuffle: - dataset = dataset.shuffle(buffer_size=record_shuffle_size) - - return dataset - - if interleave: - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - dataset = files.interleave( - map_fn, cycle_length=part_file_parallelism, block_length=block_length, num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, keep_rate, "keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - return dataset.batch(batch_size).map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).prefetch(prefetch_size) + files: List[str], + parse_fn: Callable[[tf.Tensor], tf.Tensor], + batch_size: int, + num_threads: int, + shuffle: bool = True, + repeat: bool = False, + block_length: Optional[int] = None, + part_file_parallelism: Optional[int] = None, + file_shuffle_size: Optional[int] = None, + record_shuffle_size: Optional[int] = None, + dataset_fn: Optional[Callable[[tf.data.Dataset], tf.data.Dataset]] = None, + keep_rate: Optional[float] = None, + parts_downsampling_rate: Optional[float] = None, + prefetch_size: int = 2, + shards: int = None, + shard_index: int = None, + shuffle_files: bool = True, + interleave: bool = True, +) -> tf.data.Dataset: + """ + Helper function to stream a list of part files. - return dataset_fn(dataset, parse_fn, batch_size) + Args: + files: + List of input files which will create a dataset. + parse_fn: + A function that takes a byte tensor containing a data record and decodes it. + batch_size: + The batch size for each step. + num_threads: + Number of threads working on the data in parallel. + shuffle: + Shuffle records within each file using ``record_shuffle_size``. Defaults to True. + repeat: + Repeat the dataset indefinitely. Defaults to False. + Useful when you want to use an ``[train,eval]_steps`` greater than the size of the dataset + (otherwise ``Estimator.[train,evaluate]`` stop when the end of the dataset is reached). + block_length (optional): + Number of consecutive records to pull from a single part file. + Defaults to batch_size. 
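Before the remaining arguments, a quick illustration of the block-reading semantics just described for ``block_length`` (an illustrative eager-mode sketch, not part of the diff; see also the ``part_file_parallelism`` notes that follow):

.. code-block:: python

    import tensorflow.compat.v1 as tf

    tf.enable_eager_execution()

    files = tf.data.Dataset.from_tensor_slices(
        [["a1", "a2", "a3", "a4"], ["b1", "b2", "b3", "b4"]]
    )
    # cycle_length=2 reads both "files" at once; block_length=2 takes two
    # consecutive records from one file before moving to the next.
    ds = files.interleave(
        tf.data.Dataset.from_tensor_slices, cycle_length=2, block_length=2
    )
    print([r.numpy() for r in ds])  # a1, a2, b1, b2, a3, a4, b3, b4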
+        part_file_parallelism (optional):
+            Number of part files to read from in parallel. Once a part file is completely read, it will
+            be replaced by the next part file in the part file list.
+            ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies
+            the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or
+            equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand,
+            if ``part_file_parallelism`` is smaller than ``num_threads``, it is very likely that the reader
+            thread pool will be underutilized, since it can never be the case that every reader thread has
+            a part file to read from.
+        file_shuffle_size (optional):
+            the buffer_size used for shuffling the list of files.
+            Defaults to 100000. When the buffer is smaller than the number of files,
+            e.g. a buffer of 1000 with 2000 files, the first 1000 files are shuffled
+            together, iterated through, then the next 1000 files are shuffled
+            and iterated through.
+        record_shuffle_size (optional):
+            the ``buffer_size`` used for shuffling records in each thread.
+            Defaults to ``batch_size * 8`` records.
+        dataset_fn (optional):
+            A function that modifies the dataset after it reads the different interleaved part files.
+            Defaults to:
+
+            .. code-block:: python
+
+                def dataset_fn(dataset, parse_fn, batch_size):
+                    return dataset.batch(batch_size).map(parse_fn, 1)
+
+        keep_rate (optional):
+            A float value in (0.0, 1.0] indicating that records are dropped according to the Bernoulli
+            distribution with p = 1 - keep_rate.
+            Defaults to None (no records dropped).
+        parts_downsampling_rate (optional):
+            A float value in ``(0.0, 1.0]`` that indicates the factor by which to downsample part files.
+            For example, a value of 0.2 means only 20 percent of part files become part of the dataset.
+            Note that this argument is only useful in conjunction with a [train,eval]_steps of -1
+            (that is, when the entire dataset is used). Furthermore, note that even in this case, each
+            epoch will see a different set of part files. This is because new part files are re-sampled
+            every epoch. In other words, this argument is only provided for backwards compatibility with
+            DeepBird v1. We recommend you use a smaller [train,eval]_steps (or specify a keep_rate)
+            instead.
+        shards (optional):
+            Number of partitions to shard the dataset into. This is useful for codistillation and other
+            techniques that require each worker to train on disjoint partitions of the dataset.
+            The dataset is not sharded by default.
+        shard_index (optional):
+            Which partition of the dataset to use if ``shards`` is set.
+        shuffle_files (optional):
+            Shuffle the list of files. Defaults to True.
+            When False, files are iterated in the order they are passed in.
+        interleave (optional):
+            Interleave records from multiple files in parallel. Defaults to True.
+
+    Returns:
+        tf.data.DataSet of batches of HashedDataRecord resource handles decoded and streamed online.
+    """
+    # Creating a dataset from an input directory
+
+    files = _filenames_dataset(files, shards=shards, shard_index=shard_index)
+
+    file_shuffle_size = file_shuffle_size if file_shuffle_size is not None else 100000
+    record_shuffle_size = (
+        record_shuffle_size if record_shuffle_size is not None else (batch_size * 8)
+    )
+    block_length = block_length if block_length is not None else batch_size
+
+    logging.info("NUM_THREADS: %d", num_threads)
+
+    if repeat:
+        files = files.repeat()
+
+    if shuffle_files:
+        # Randomly shuffle the files list.
+        files = files.shuffle(buffer_size=file_shuffle_size)
+
+    # Downsample parts files
+    files = downsample_dataset(
+        files, parts_downsampling_rate, "parts_downsampling_rate"
+    )
+
+    # Interleave the result from BlockFormatDataset
+    # block_length == batch_size results in batch_size records being read from a single file.
+    def map_fn(filenames):
+        """function that maps each filename to a BlockFormatDataset"""
+        # read each file using BlockFormatDataset
+        dataset = BlockFormatDataset(filenames)
+
+        # early prefetching can sometimes improve performance (like on GCS)
+        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+
+        # Shuffling before repeating ensures strong ordering.
+        if shuffle:
+            dataset = dataset.shuffle(buffer_size=record_shuffle_size)
+
+        return dataset
+
+    if interleave:
+        part_file_parallelism = (
+            num_threads if part_file_parallelism is None else part_file_parallelism
+        )
+        dataset = files.interleave(
+            map_fn,
+            cycle_length=part_file_parallelism,
+            block_length=block_length,
+            num_parallel_calls=num_threads,
+        )
+    else:
+        dataset = files.flat_map(map_fn)
+
+    # Downsample DataRecords
+    dataset = downsample_dataset(dataset, keep_rate, "keep_rate")
+
+    if dataset_fn is None:
+        # Create a batch of datarecords and decode them
+        return (
+            dataset.batch(batch_size)
+            .map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+            .prefetch(prefetch_size)
+        )
+
+    return dataset_fn(dataset, parse_fn, batch_size)
+
+
+def cx_zk_path(path: str) -> str:
+    if path is None:
+        raise ValueError(
+            "Path for zookeeper dataset pointer is None. You must specify a path."
+        )
+    return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path])
+    logging.info(f"Zookeeper path is: {return_path}")
+    return return_path


-def cx_zk_path(path):
-  if path is None:
-    raise ValueError("Path for zookeeper dataset pointer is None. You must specify a path.")
-  return_path = "/".join([DEFAULT_ZOOKEEPER_BASE_ZNODE, path])
-  logging.info("Zookeeper path is: {}".format(return_path))
-  return return_path


+def zookeeper_ordered_dataset(
+    files: List[str],
+    parse_fn: Callable[[tf.Tensor], tf.Tensor],
+    batch_size: int,
+    zk_counter_path: str,
+    repeat: bool = False,
+    num_threads: int = 2,
+    block_length: Optional[int] = None,
+    part_file_parallelism: Optional[int] = None,
+    batch_shuffle_size: Optional[int] = None,
+    file_keep_rate: Optional[float] = None,
+    record_keep_rate: Optional[float] = None,
+    prefetch_size: int = 2,
+    interleave: bool = False,
+    dataset_fn: Optional[
+        Callable[[tf.data.Dataset, Callable[[tf.Tensor], tf.Tensor], int], tf.data.Dataset]
+    ] = None,
+    verbose: bool = False,
+) -> tf.data.Dataset:
+    """
+    Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of
+    which file to read, and to coordinate multiple workers.
+
+    Args:
+        files:
+            ordered list of (typically HDFS) filenames. This must remain consistent
+            between different workers, and between worker restarts (e.g. in the case
+            of instance failure or preemption).
+            To ensure this remains consistent, consider using the --train.files_list
+            option from DataRecordTrainer.
+        parse_fn:
+            A function that takes a byte tensor containing a datarecord and decodes it.
+        batch_size:
+            The batch size for each step.
+        zk_counter_path:
+            Path under the root node for the underlying zookeeper shared counter that
+            is used to coordinate distributed iteration over the list of files.
+            Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`.
+        repeat:
+            Default False. Set True to repeat over the files forever.
+        num_threads:
+            Default 2. Number of threads working on the data in parallel.
+            Only used if interleave=True.
+        block_length:
+            Default None. Number of consecutive records to pull from a single part file.
+            If None, then block_length=batch_size will be used.
+            Only used if interleave=True.
+        part_file_parallelism:
+            Default None. Number of part files to read from in parallel. Once a part file is completely
+            read, it will be replaced by the next part file indicated by the zookeeper counter.
+            Only used if interleave=True.
+            ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies
+            the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or
+            equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand,
+            if ``part_file_parallelism`` is smaller than ``num_threads``, it is very likely that the reader
+            thread pool will be underutilized, since it can never be the case that every reader thread has
+            a part file to read from.
+        batch_shuffle_size:
+            Default None. Size of shuffle buffer, for shuffling that will be applied after batching.
+            If None, then batches will not be shuffled. Ignored if dataset_fn is provided.
+        file_keep_rate:
+            Default None. Fraction of files to keep, or None to keep all files.
+        record_keep_rate:
+            Default None. Fraction of records to keep, or None to keep all records.
+        prefetch_size:
+            Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided.
+        interleave:
+            Default False. Set True to use tf.data.Dataset.interleave rather than flat_map.
+        dataset_fn:
+            A function that is applied to the dataset of individual records, after
+            these have been read from the parts files.
+            If ``None`` (the default), the behavior will be as though dataset_fn were set to:
+
+            .. code-block:: python
+
+                def dataset_fn(dataset, parse_fn, batch_size):
+                    dataset = dataset.batch(batch_size)
+                    dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE)
+                    if batch_shuffle_size:
+                        dataset = dataset.shuffle(batch_shuffle_size)
+                    return dataset.prefetch(prefetch_size)
+
+        verbose:
+            Default False. Set True to log the names of files loaded by TF.
+    """
+    block_length = batch_size if block_length is None else block_length
+    part_file_parallelism = (
+        num_threads if part_file_parallelism is None else part_file_parallelism
+    )
+
+    def zk_index_generator(my_files: List[str] = files) -> Generator[str, None, None]:
+        zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST)
+        zk.start()
+        my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0)
+        while True:
+            my_counter += 1
+            counter_pre_value = my_counter.pre_value
+            if repeat:
+                counter_pre_value = counter_pre_value % len(my_files)
+            if counter_pre_value >= len(my_files):
+                break
+            else:
+                chosen_file = my_files[counter_pre_value]
+                if verbose:
+                    logging.info(f"{counter_pre_value}. yielding {chosen_file}")
+                yield chosen_file
+        zk.stop()
+
+    files = tf.data.Dataset.from_generator(zk_index_generator, tf.string)
+
+    # Downsample parts files
+    files = downsample_dataset(files, file_keep_rate, "file_keep_rate")
+
+    def map_fn(filenames: tf.Tensor) -> tf.data.Dataset:
+        return BlockFormatDataset(filenames).prefetch(20)
+
+    # Don't interleave for sequential training
+    if interleave:
+        dataset = files.interleave(
+            map_fn,
+            cycle_length=part_file_parallelism,
+            block_length=block_length,
+            num_parallel_calls=num_threads,
+        )
+    else:
+        dataset = files.flat_map(map_fn)
+
+    # Downsample DataRecords
+    dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate")
+
+    if dataset_fn is None:
+        # Create a batch of data records and decode them
+        dataset = dataset.batch(batch_size)
+        dataset = dataset.map(
+            parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE
+        )
+        # shuffle after batching and parsing for performance reasons
+        # faster b/c 1 random selection is made per batch rather than per record
+        if batch_shuffle_size:
+            dataset = dataset.shuffle(buffer_size=batch_shuffle_size)
+        dataset = dataset.prefetch(prefetch_size)
+
+    else:
+        dataset = dataset_fn(dataset, parse_fn, batch_size)

-def zookeeper_ordered_dataset(
-    files, parse_fn, batch_size, zk_counter_path, repeat=False,
-    num_threads=2, block_length=None, part_file_parallelism=None,
-    batch_shuffle_size=None, file_keep_rate=None, record_keep_rate=None,
-    prefetch_size=2, interleave=False, dataset_fn=None, verbose=False):
-  """
-  Make a tf.Dataset given an ordered list of filenames, using Zookeeper to keep track of
-  which file to read, and to coordinate multiple workers.
-
-  Args:
-    files:
-      ordered list of (typically HDFS) filenames. This must remain consistent
-      between different workers, and between worker restarts (e.g. in the case
-      of instance failure or preemption).
-      To ensure this remains consistent, consider using the --train.files_list
-      option from DataRecordTrainer.
-    parse_fn:
-      A function that takes a byte tensor containing a datarecord and decodes it.
-    batch_size:
-      The batch size for each step.
-    zk_counter_path:
-      Path under the root node for the underlying zookeeper shared counter that
-      is used to coordinate distributed iteration over the list of files.
-      Full path will be `'/'.join([DEFAULT_ZOOKEEPER_BASE_ZNODE, zk_counter_path])`.
-    repeat:
-      Default False. Set True to repeat over the files forever.
-    num_threads:
-      Default 2. Number of threads working on the data in parallel.
-      Only used if interleave=True.
-    block_length:
-      Default None. Number of consecutive records to pull from a single part file.
-      If None, then block_length=batch_size will be used.
-      Only used if interleave=True.
-    part_file_parallelism:
-      Default None. Number of part files to read from in parallel. Once a part file is completely
-      read, it will be replaced by the next part file indicated by the zookeeper counter.
-      Only used if interleave=True.
-
-      ``num_threads`` specifies a reader thread pool size, while ``part_file_parallelism`` specifies
-      the number of files to read from in parallel. If ``part_file_parallelism`` is greater than or
-      equal to ``num_threads``, the reads will be distributed over ``num_threads``. On the other hand,
-      if ``part_file_parallelism`` is smaller than``num_threads``, it is very likely that the reader
-      thread pool will be underutilized, since it can never be the case that every reader thread has
-      a part file to read from.
-
-    batch_shuffle_size:
-      Default None.
Size of shuffle buffer, for shuffling that will be applied after batching. - if None, then batches will not be shuffled. Ignored if dataset_fn is provided. - file_keep_rate: - Default None. Fraction of files to keep, or None to keep all files. - record_keep_rate: - Default None. Fraction of records to keep, or None to keep all records. - prefetch_size: - Default 2. Number of parsed batches to prefetch. Ignored if dataset_fn is provided. - interleave: - Default False. Set True to use tf.data.Dataset.interleave rather than flat_map. - dataset_fn: - A function that is applied to the dataset of individual records, after - these have been read from the parts files. - If ``None`` (the default), the behavior will be as though dataset_fn were set to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, tf.data.experimental.AUTOTUNE) - if batch_shuffle_size: - dataset = dataset.shuffle(batch_shuffle_size) - return dataset.prefetch(prefetch_size) - - verbose: - Default False. Set True to log the names of files loaded by TF. - """ - block_length = batch_size if block_length is None else block_length - part_file_parallelism = num_threads if part_file_parallelism is None else part_file_parallelism - - def zk_index_generator(my_files=files): - zk = KazooClient(hosts=DEFAULT_ZOOKEEPER_HOST) - zk.start() - my_counter = zk.Counter(cx_zk_path(zk_counter_path), default=0) - while True: - my_counter += 1 - counter_pre_value = my_counter.pre_value - if repeat: - counter_pre_value = counter_pre_value % len(my_files) - if counter_pre_value >= len(my_files): - break - else: - chosen_file = my_files[counter_pre_value] - if verbose: - logging.info("{}. yielding {}".format(counter_pre_value, chosen_file)) - yield chosen_file - zk.stop() - - files = tf.data.Dataset.from_generator(zk_index_generator, tf.string) - - # Downsample parts files - files = downsample_dataset(files, file_keep_rate, "file_keep_rate") - - def map_fn(filenames): - return BlockFormatDataset(filenames).prefetch(20) - - # Dont interleave for sequential training - if interleave: - dataset = files.interleave( - map_fn, - cycle_length=part_file_parallelism, - block_length=block_length, - num_parallel_calls=num_threads) - else: - dataset = files.flat_map(map_fn) - - # Downsample DataRecords - dataset = downsample_dataset(dataset, record_keep_rate, "record_keep_rate") - - if dataset_fn is None: - # Create a batch of datarecords and decode them - dataset = dataset.batch(batch_size) - dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - # shuffle after batching and parsing for performance reasons - # faster b/c 1 random selection is made per batch rather than per record - if batch_shuffle_size: - dataset = dataset.shuffle(buffer_size=batch_shuffle_size) - dataset = dataset.prefetch(prefetch_size) - - else: - dataset = dataset_fn(dataset, parse_fn, batch_size) - - return dataset + return dataset diff --git a/twml/twml/errors.py b/twml/twml/errors.py index 9b50fcd79..ff39a5139 100644 --- a/twml/twml/errors.py +++ b/twml/twml/errors.py @@ -4,10 +4,12 @@ class EarlyStopError(Exception): - """Exception used to indicate evaluator needs to early stop.""" - pass + """Exception used to indicate evaluator needs to early stop.""" + + pass class CheckpointNotFoundError(Exception): - """Exception used to indicate a checkpoint hasnt been found.""" - pass + """Exception used to indicate a checkpoint hasn't been found.""" + + pass diff --git 
a/twml/twml/export_output_fns.py b/twml/twml/export_output_fns.py index f72e1d0fe..e80a73eeb 100644 --- a/twml/twml/export_output_fns.py +++ b/twml/twml/export_output_fns.py @@ -1,4 +1,4 @@ -''' +""" Contains implemenations of DataRecordTrainer.get_export_output_fns that specify how to export model graph outputs from build_graph to DataRecords for prediction servers. @@ -6,12 +6,12 @@ the DataRecordTrainer constructor to customize how to export their model outputs. Modelers may also provide a custom implementation of export_output_fn using these as reference. -''' +""" # pylint: disable=invalid-name -from twitter.deepbird.io.legacy.export_output_fns import ( - batch_prediction_continuous_output_fn, # noqa: F401 - batch_prediction_tensor_output_fn, # noqa: F401 - default_output_fn, # noqa: F401 - variable_length_continuous_output_fn, # noqa: F401 +from twitter.deepbird.io.legacy.export_output_fns import default_output_fn # noqa: F401 +from twitter.deepbird.io.legacy.export_output_fns import ( # noqa: F401 + batch_prediction_continuous_output_fn, + batch_prediction_tensor_output_fn, + variable_length_continuous_output_fn, ) diff --git a/twml/twml/feature_config.py b/twml/twml/feature_config.py index 37004f442..0efd437fc 100644 --- a/twml/twml/feature_config.py +++ b/twml/twml/feature_config.py @@ -10,45 +10,45 @@ class FeatureConfig(feature_config.FeatureConfig): - def get_feature_spec(self): - """ - Generates a serialization-friendly dict representing this FeatureConfig. - """ - doc = super(FeatureConfig, self).get_feature_spec() - # Override the class in the spec. - doc["class"] = "twml.FeatureConfig" - return doc + def get_feature_spec(self): + """ + Generates a serialization-friendly dict representing this FeatureConfig. + """ + doc = super(FeatureConfig, self).get_feature_spec() + # Override the class in the spec. + doc["class"] = "twml.FeatureConfig" + return doc class FeatureConfigBuilder(feature_config.FeatureConfigBuilder): - def build(self): - # Overwrite self.build() to return twml.FeatureConfig instead - """ - Builds and returns FeatureConfig object. - """ - - ( - features, - tensor_types, - sparse_tensor_types, - feature_map, - feature_name_to_feature_parser, - feature_in_bq_name, - ) = self._build() - - return FeatureConfig( - features=features, - labels=self._labels, - weight=self._weight, - filters=self._filter_features, - tensor_types=tensor_types, - sparse_tensor_types=sparse_tensor_types, - feature_types=feature_map, - decode_mode=self._decode_mode, - legacy_sparse=self._legacy_sparse, - feature_name_to_feature_parser=self._feature_name_to_feature_parser, - feature_in_bq_name=self._feature_in_bq_name, - ) + def build(self) -> FeatureConfig: + # Overwrite self.build() to return twml.FeatureConfig instead + """ + Builds and returns FeatureConfig object. 
+ """ + + ( + features, + tensor_types, + sparse_tensor_types, + feature_map, + feature_name_to_feature_parser, + feature_in_bq_name, + ) = self._build() + + return FeatureConfig( + features=features, + labels=self._labels, + weight=self._weight, + filters=self._filter_features, + tensor_types=tensor_types, + sparse_tensor_types=sparse_tensor_types, + feature_types=feature_map, + decode_mode=self._decode_mode, + legacy_sparse=self._legacy_sparse, + feature_name_to_feature_parser=self._feature_name_to_feature_parser, + feature_in_bq_name=self._feature_in_bq_name, + ) _name_to_id = feature_config._name_to_id diff --git a/twml/twml/filters.py b/twml/twml/filters.py index e48633808..d1376e45c 100644 --- a/twml/twml/filters.py +++ b/twml/twml/filters.py @@ -1,9 +1,10 @@ -''' +""" Includes functions to filter features dict build from data records. -''' +""" +from twitter.deepbird.io.legacy.filters import sparse_keep_feature_if # noqa: F401 +from twitter.deepbird.io.legacy.filters import sparse_keep_sample_if # noqa: F401 from twitter.deepbird.io.legacy.filters import ( - balance_binary_class_samples, # noqa: F401 - sparse_keep_feature_if, # noqa: F401 - sparse_keep_sample_if) # noqa: F401 + balance_binary_class_samples, +) # noqa: F401 diff --git a/twml/twml/hooks.py b/twml/twml/hooks.py index cdf733535..c2118166a 100644 --- a/twml/twml/hooks.py +++ b/twml/twml/hooks.py @@ -1,562 +1,616 @@ """ This file contains tf.train.SessionRunHooks defined by TWML """ -from datetime import datetime import json import operator import os +from datetime import datetime +from typing import Callable, Dict, Optional -from absl import logging import numpy as np import tensorflow.compat.v1 as tf -from tensorflow.python.training.basic_session_run_hooks import NeverTriggerTimer, SecondOrStepTimer +from absl import logging +from tensorflow.python.training.basic_session_run_hooks import ( + NeverTriggerTimer, + SecondOrStepTimer, +) + import twml class StepProgressHook(tf.train.SessionRunHook): - """Hook that displays a progress bar to monitor global step progress """ - - def __init__(self, max_step): - """ - Initializes a `StepProgressHook`. - This hook displays a progress bar for max_steps. - - Note that this hook only works for training and calibration. - - Args: - max_steps: - maximum steps to monitor in progress bar. - When this many steps is reached, the progress bar will be full. - """ - self._max_step = max_step - self._start_step = 0 - self._global_step_tensor = None - self._progress_bar = None - - def begin(self): - """ sets the global_step_tensor """ - self._global_step_tensor = tf.train.get_or_create_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use StepProgressHook.") - - def after_create_session(self, session, coord): - """ creates the progress bar and keeps track of the first global step upon session creation """ - global_step = session.run(self._global_step_tensor) - self._start_step = global_step - self._progress_bar = tf.keras.utils.Progbar(self._max_step) - - def before_run(self, run_context): # pylint: disable=unused-argument - """ invoked before calling session.run """ - return tf.train.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): - """ invoked after run is called. Updates the progress bar. 
""" - step = run_context.session.run(self._global_step_tensor) - self._progress_bar.update(step - self._start_step) + """Hook that displays a progress bar to monitor global step progress""" + + def __init__(self, max_step): + """ + Initializes a `StepProgressHook`. + This hook displays a progress bar for max_steps. + + Note that this hook only works for training and calibration. + + Args: + max_steps: + maximum steps to monitor in progress bar. + When this many steps is reached, the progress bar will be full. + """ + self._max_step = max_step + self._start_step = 0 + self._global_step_tensor = None + self._progress_bar = None + + def begin(self) -> None: + """sets the global_step_tensor""" + self._global_step_tensor = tf.train.get_or_create_global_step() + if self._global_step_tensor is None: + raise RuntimeError("Global step should be created to use StepProgressHook.") + + def after_create_session( + self, session: tf.Session, coord: tf.train.Coordinator + ) -> None: # pylint: disable=unused-argument + """creates the progress bar and keeps track of the first global step upon session creation""" + global_step = session.run(self._global_step_tensor) + self._start_step = global_step + self._progress_bar = tf.keras.utils.Progbar(self._max_step) + + def before_run( + self, run_context: tf.train.SessionRunContext + ) -> None: # pylint: disable=unused-argument + """invoked before calling session.run""" + return tf.train.SessionRunArgs(self._global_step_tensor) + + def after_run( + self, + run_context: tf.train.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: # pylint: disable=unused-argument + """invoked after run is called. Updates the progress bar.""" + step = run_context.session.run(self._global_step_tensor) + self._progress_bar.update(step - self._start_step) class GetMetricsHook(tf.train.SessionRunHook): - """ - Hook used to obtain evaluation metrics. - Typically used for early-stopping by obtaining the value of a - metric at the end of an epoch. - Note that the metric tensor and its commensurate update Op - are responsible for aggregating the metric during the session - (one session per epoch). Used for evaluation. - """ - - def __init__(self, get_metrics_fn): - """GetMetricsHook constructor. - - Args: - get_metrics_fn: - Function that returns a dict mapping metric keys to - tensors as a tf.Tensor. - See Trainer.learn for an example use-case. + """ + Hook used to obtain evaluation metrics. + Typically used for early-stopping by obtaining the value of a + metric at the end of an epoch. + Note that the metric tensor and its commensurate update Op + are responsible for aggregating the metric during the session + (one session per epoch). Used for evaluation. """ - self._get_metrics_fn = get_metrics_fn - self._metric_tensors = None - self.metric_values = None + def __init__(self, get_metrics_fn: Callable[[], Dict[str, tf.Tensor]]): + """GetMetricsHook constructor. - def begin(self): - """ sets the global_step_tensor and metric tensor""" - self._metric_tensors = self._get_metrics_fn() - assert isinstance(self._metric_tensors, dict) + Args: + get_metrics_fn: + Function that returns a dict mapping metric keys to + tensors as a tf.Tensor. + See Trainer.learn for an example use-case. 
+ """ - def end(self, session): - self.metric_values = session.run(self._metric_tensors) + self._get_metrics_fn = get_metrics_fn + self._metric_tensors = None + self.metric_values = None + def begin(self) -> None: + """sets the global_step_tensor and metric tensor""" + self._metric_tensors = self._get_metrics_fn() + assert isinstance(self._metric_tensors, dict) -class EarlyStopHook(GetMetricsHook): - """ - A GetMetricsHook augmented with early-stopping logic for use - within the Trainer.learn method. - """ - - def __init__(self, - metric, - patience, - minimize, - get_estimator_spec_fn, - checkpoint_dir, - file_path=None, - exit_on_end=True, - start_epoch=0, - tolerance=0): - """ - Prepare early-stopping hook and variables. + def end(self, session: tf.Session) -> None: + self.metric_values = session.run(self._metric_tensors) - Args: - metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - minimize: - Set this to True for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - tolerance: - A non-negative tolerance for comparing early_stop_metric. - e.g. when maximizing the condition is current_metric > best_metric + tolerance." - Defaults to 0. - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - checkpoint_dir: - path to directory containing the Estimator checkpoints. - file_path: - path to file that is used by this hook to communicate early-stopping - to StopIfExistsHook. This hook would be used for evaluation, while - the StopIfExistsHooks (the listeners) would be used for training. - When the file is created, the StopIfExistsHooks detect and terminate training. - This argument is used by ``Trainer.train_and_evaluate``. - exit_on_end: - when the end() method is called to indicate that the session is terminating, - and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. - This is set to False by the trainer for non distributed jobs. - start_epoch: - Specifies the starting epoch number. This is used for logging purposes only. 
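As an aside, a sketch of how the hooks above attach to an Estimator run; `estimator`, the input functions, and the metric tensor name are hypothetical stand-ins, not part of this change:

.. code-block:: python

    import tensorflow.compat.v1 as tf
    # assuming: from twml.hooks import StepProgressHook, GetMetricsHook

    # Progress bar over a fixed training budget.
    estimator.train(
        input_fn=train_input_fn,
        hooks=[StepProgressHook(max_step=10000)],
        max_steps=10000,
    )

    # GetMetricsHook runs its tensors once, when the evaluation session ends.
    metrics_hook = GetMetricsHook(
        get_metrics_fn=lambda: {
            # hypothetical tensor name; any dict of metric tensors works
            "auc": tf.get_default_graph().get_tensor_by_name("auc/value:0")
        }
    )
    estimator.evaluate(input_fn=eval_input_fn, hooks=[metrics_hook])
    print(metrics_hook.metric_values)  # e.g. {"auc": 0.71}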
- """ - if not isinstance(metric, str): - raise ValueError("Expecting string for metric arg") - if not isinstance(patience, int): - raise ValueError("Expecting positive number for metric arg") - - self.should_stop = False - self._metric = metric - self._patience = patience - self._current_patience = patience - self._checkpoint_dir = checkpoint_dir - self._exit_on_end = exit_on_end - self._latest_checkpoint_path = None - # used for distributed training (tf.estimator.train_and_evaluate) - self._file_path = file_path - self._epoch = start_epoch - if self._file_path is not None: - # TODO try to read epoch from a file that we create - if tf.io.gfile.exists(self._file_path): - # delete the file if it exists (not sure this makes sense) - logging.info("EarlyStopHook: Removing existing file: %s.", self._file_path) - tf.io.gfile.remove(self._file_path) - - # best_checkpoint dir will contain the best checkpoint - self._best_checkpoint_path = os.path.join(checkpoint_dir, 'best_checkpoint') - self._eval_checkpoint_path = os.path.join(checkpoint_dir, 'eval_checkpoint') - self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) - - if tf.io.gfile.exists(self._best_metric_path): - with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: - best_metric_from_file = float(f.read()) - else: - best_metric_from_file = None - - if minimize: - # current < best : is better - self._is_better_than = operator.lt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = np.inf - else: - self._best_metric = best_metric_from_file - tolerance - # used for printing - self._early_stop_name = "minimum" - else: - # current > best : is better - self._is_better_than = operator.gt - # worse metric possible - if best_metric_from_file is None: - self._best_metric = -np.inf - else: - self._best_metric = best_metric_from_file + tolerance - # used for printing - self._early_stop_name = "maximum" - - def get_metrics_fn(): - """ function to get metric tensors to early-stopping """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - if metric not in eval_metric_ops: - raise ValueError( - "Expecting early_stop_metric '%s' key in eval_metric_ops dict" - % (metric)) - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - - # initialize GetMetricsHook to get current value of metric from session - super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def early_stop(self, epoch): - """ - Looks at the current value of the early stopping metric. - Decrements current patience. If metric improves, patience is reset - and latest checkpoint is moved to checkpoint_dir/best_checkpoint. - If current patience reaches zero, returns True. - Args: - epoch: - The current epoch number. - - Returns: - True when early-stopped. False otherwise. 
- """ - # decrement patience - self._current_patience -= 1 - - # get the current metric value - current_metric = self.metric_values[self._metric] - - if self._is_better_than(current_metric, self._best_metric): - # save best version of model - self._best_metric = current_metric - logging.info( - "Found new %s %s=%f @ epoch %d", - self._early_stop_name, self._metric, self._best_metric, epoch) - # backup the file to checkpoint_dir/best_checkpoint - assert self._latest_checkpoint_path, "expecting latest checkpoint" - logging.info("Backing up " + self._latest_checkpoint_path) - - try: - eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) - twml.util.backup_checkpoint( - checkpoint_path_prefix=eval_checkpoint, - backup_path=self._best_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) - - tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) - with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: - # Write with enough precision - f.write("%.8f" % self._best_metric) - - # reset patience - self._current_patience = self._patience - - elif self._current_patience > 0: - logging.info("No new %s found after %d epochs", - self._early_stop_name, self._patience - self._current_patience) - elif self._current_patience == 0: - logging.info( - "No new %s found after %d epochs. Early-stopping experiment.", - self._early_stop_name, self._patience) - return True - - return False - - def cleanup_checkpoints(self): +class EarlyStopHook(GetMetricsHook): """ - makes it so that the best checkpoint is the only checkpoint - in checkpoint_dir. + A GetMetricsHook augmented with early-stopping logic for use + within the Trainer.learn method. """ - raise NotImplementedError("cleanup_checkpoints is no longer supported") - def end(self, session): - """ - This method is called at the end of an evaluation/epoch. - When file_path constructor argument is provided, this - will call ``early_stop()``. - When ``early_stop()`` returns True, it creates the file_path, - which will be detected by StopIfExistsHooks - and stop training for all workers and the chief. It will - also call ``cleanup_checkpoints()``. - """ - super(EarlyStopHook, self).end(session) - - # Checks for early stopping criteria and makes a backup - self.should_stop = self.early_stop(self._epoch) - - if self._file_path is not None: - if self.should_stop: - # create a file to inform workers - with tf.io.gfile.GFile(self._file_path, "wb") as gfile: - gfile.write("early-stop\n") - # makes the best checkpoint the only checkpoint in save_dir. - msg = "early-stopping evaluation at epoch %d" % self._epoch - logging.info(msg) - if self._exit_on_end: - raise twml.errors.EarlyStopError(msg) - else: + def __init__( + self, + metric: str, + patience: int, + minimize: bool, + get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec], + checkpoint_dir: str, + file_path: str = None, + exit_on_end: bool = True, + start_epoch: int = 0, + tolerance: float = 0.0, + ): + """ + Prepare early-stopping hook and variables. + + Args: + metric: + String specifying the metric to early-stop on. Required with positive + ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. + The string is used to extract the relevant tensor Op from the dict returned by + the get_eval_metric_ops method. For ``metrics`` pass to the constructor, + the string is one of those. 
For multi-class (that is, multi-metric) + metrics, the string may be appended with a ``_0``, ``_1``, etc. or one + of the ``multi_metric_names`` (one per class). + patience: + Maximum number of epochs to wait for an improvement in the early_stop_metric + before breaking off training. For example, a patience of 10 means that + training will have 10 epochs to improve the metric before it is killed. + Whenever the metric is improved before running out of patience, + patience is reset to ``early_stop_patience``. + minimize: + Set this to True for metrics that need to be minimized + (like ``loss``). Metrics like ``accuracy`` that need to be maximized + should set this to False. + get_estimator_spec_fn: + function that returns the current EstimatorSpec. + The EstimatorSpec is used to obtain the current eval_metric_ops. + checkpoint_dir: + path to directory containing the Estimator checkpoints. + file_path: + path to file that is used by this hook to communicate early-stopping + to StopIfExistsHook. This hook would be used for evaluation, while + the StopIfExistsHooks (the listeners) would be used for training. + When the file is created, the StopIfExistsHooks detect and terminate training. + This argument is used by ``Trainer.train_and_evaluate``. + exit_on_end: + when the end() method is called to indicate that the session is terminating, + and exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the evaluation job. + This is set to False by the trainer for non distributed jobs. + start_epoch: + Specifies the starting epoch number. This is used for logging purposes only. + tolerance: + A non-negative tolerance for comparing early_stop_metric. + e.g. when maximizing the condition is current_metric > best_metric + tolerance." + Defaults to 0. + """ + if not isinstance(metric, str): + raise ValueError("Expecting string for metric arg") + if not isinstance(patience, int): + raise ValueError("Expecting positive number for metric arg") + + self.should_stop = False + self._metric = metric + self._patience = patience + self._current_patience = patience + self._checkpoint_dir = checkpoint_dir + self._exit_on_end = exit_on_end self._latest_checkpoint_path = None - - self._epoch += 1 - - def begin(self): - """ - Saves the latest_checkpoint in case it gets superseded by another checkpoint. - Remember that when used with train_and_evaluate, the chief saves checkpoints - continuouly. The chief could save a checkpoint after evaluation started. - So saving the checkpoint at the beginning of evaluation ensures that we - later save the correct best checkpoint. 
- """ - super(EarlyStopHook, self).begin() - self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) - - assert self._latest_checkpoint_path, "expecting latest checkpoint" - # Backup to temporary directory - try: - twml.util.backup_checkpoint( - checkpoint_path_prefix=self._latest_checkpoint_path, - backup_path=self._eval_checkpoint_path) - except twml.errors.CheckpointNotFoundError as ex: - msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" - raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) + # used for distributed training (tf.estimator.train_and_evaluate) + self._file_path = file_path + self._epoch = start_epoch + if self._file_path is not None: + # TODO try to read epoch from a file that we create + if tf.io.gfile.exists(self._file_path): + # delete the file if it exists (not sure this makes sense) + logging.info( + "EarlyStopHook: Removing existing file: %s.", self._file_path + ) + tf.io.gfile.remove(self._file_path) + + # best_checkpoint dir will contain the best checkpoint + self._best_checkpoint_path = os.path.join(checkpoint_dir, "best_checkpoint") + self._eval_checkpoint_path = os.path.join(checkpoint_dir, "eval_checkpoint") + self._best_metric_path = os.path.join(self._best_checkpoint_path, self._metric) + + if tf.io.gfile.exists(self._best_metric_path): + with tf.io.gfile.GFile(self._best_metric_path, mode="r") as f: + best_metric_from_file = float(f.read()) + else: + best_metric_from_file = None + + if minimize: + # current < best : is better + self._is_better_than = operator.lt + # worse metric possible + if best_metric_from_file is None: + self._best_metric = np.inf + else: + self._best_metric = best_metric_from_file - tolerance + # used for printing + self._early_stop_name = "minimum" + else: + # current > best : is better + self._is_better_than = operator.gt + # worse metric possible + if best_metric_from_file is None: + self._best_metric = -np.inf + else: + self._best_metric = best_metric_from_file + tolerance + # used for printing + self._early_stop_name = "maximum" + + def get_metrics_fn() -> Dict[str, tf.Tensor]: + """function to get metric tensors to early-stopping""" + estimator_spec = get_estimator_spec_fn() + eval_metric_ops = estimator_spec.eval_metric_ops + if metric not in eval_metric_ops: + raise ValueError( + "Expecting early_stop_metric '%s' key in eval_metric_ops dict" + % (metric) + ) + # get the value_op from the (value_op, update_op) value + return {k: v[0] for k, v in eval_metric_ops.items()} + + # initialize GetMetricsHook to get current value of metric from session + super(EarlyStopHook, self).__init__(get_metrics_fn=get_metrics_fn) + + def early_stop(self, epoch: int) -> bool: + """ + Looks at the current value of the early stopping metric. + Decrements current patience. If metric improves, patience is reset + and latest checkpoint is moved to checkpoint_dir/best_checkpoint. + If current patience reaches zero, returns True. + + Args: + epoch: The current epoch number. + + Returns: + True when early-stopped. False otherwise. 
+ """ + # decrement patience + self._current_patience -= 1 + + # get the current metric value + current_metric = self.metric_values[self._metric] + + if self._is_better_than(current_metric, self._best_metric): + # save best version of model + self._best_metric = current_metric + logging.info( + "Found new %s %s=%f @ epoch %d", + self._early_stop_name, + self._metric, + self._best_metric, + epoch, + ) + # backup the file to checkpoint_dir/best_checkpoint + assert self._latest_checkpoint_path, "expecting latest checkpoint" + logging.info("Backing up " + self._latest_checkpoint_path) + + try: + eval_checkpoint = tf.train.latest_checkpoint(self._eval_checkpoint_path) + twml.util.backup_checkpoint( + checkpoint_path_prefix=eval_checkpoint, + backup_path=self._best_checkpoint_path, + ) + except twml.errors.CheckpointNotFoundError as ex: + msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" + raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) + + tf.io.gfile.makedirs(os.path.dirname(self._best_metric_path)) + with tf.io.gfile.GFile(self._best_metric_path, mode="w") as f: + # Write with enough precision + f.write("%.8f" % self._best_metric) + + # reset patience + self._current_patience = self._patience + + elif self._current_patience > 0: + logging.info( + "No new %s found after %d epochs", + self._early_stop_name, + self._patience - self._current_patience, + ) + elif self._current_patience == 0: + logging.info( + "No new %s found after %d epochs. Early-stopping experiment.", + self._early_stop_name, + self._patience, + ) + return True + + return False + + def cleanup_checkpoints(self) -> None: + """ + makes it so that the best checkpoint is the only checkpoint + in checkpoint_dir. + """ + raise NotImplementedError("cleanup_checkpoints is no longer supported") + + def end(self, session: tf.Session) -> None: + """ + This method is called at the end of an evaluation/epoch. + When file_path constructor argument is provided, this + will call ``early_stop()``. + When ``early_stop()`` returns True, it creates the file_path, + which will be detected by StopIfExistsHooks + and stop training for all workers and the chief. It will + also call ``cleanup_checkpoints()``. + """ + super(EarlyStopHook, self).end(session) + + # Checks for early stopping criteria and makes a backup + self.should_stop = self.early_stop(self._epoch) + + if self._file_path is not None: + if self.should_stop: + # create a file to inform workers + with tf.io.gfile.GFile(self._file_path, "wb") as gfile: + gfile.write("early-stop\n") + # makes the best checkpoint the only checkpoint in save_dir. + msg = "early-stopping evaluation at epoch %d" % self._epoch + logging.info(msg) + if self._exit_on_end: + raise twml.errors.EarlyStopError(msg) + else: + self._latest_checkpoint_path = None + + self._epoch += 1 + + def begin(self) -> None: + """ + Saves the latest_checkpoint in case it gets superseded by another checkpoint. + Remember that when used with train_and_evaluate, the chief saves checkpoints + continuouly. The chief could save a checkpoint after evaluation started. + So saving the checkpoint at the beginning of evaluation ensures that we + later save the correct best checkpoint. 
+ """ + super(EarlyStopHook, self).begin() + self._latest_checkpoint_path = tf.train.latest_checkpoint(self._checkpoint_dir) + + assert self._latest_checkpoint_path, "expecting latest checkpoint" + # Backup to temporary directory + try: + twml.util.backup_checkpoint( + checkpoint_path_prefix=self._latest_checkpoint_path, + backup_path=self._eval_checkpoint_path, + ) + except twml.errors.CheckpointNotFoundError as ex: + msg = "Consider increasing 'keep_checkpoint_max' or 'save_checkpoint_secs'" + raise twml.errors.CheckpointNotFoundError(str(ex) + "\n" + msg) class MetricsUpdateHook(GetMetricsHook): - """ - A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. - It is mainly used by `TrackRun` to persist model metrics via Model Repo. - """ - - def __init__(self, - get_estimator_spec_fn, - add_metrics_fn, - every_n_iter=None, - every_n_secs=None - ): - """ - Args: - get_estimator_spec_fn: - function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - add_metrics_fn: `function` callback used to report metrics, called automatically - at the end of every epoch. - every_n_iter: `int`, log the metrics once every N local - steps taken in the current epoch. - every_n_secs: `int` or `float`, log the metrics once every N - seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` - should be provided. - Raises: - ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and - `every_n_secs` is set when `add_progress_metrics_fn` is provided. - """ - only_log_at_end = (every_n_iter is None) and (every_n_secs is None) - - if (not only_log_at_end and every_n_iter and every_n_secs): - raise ValueError( - 'exactly one of every_n_iter and every_n_secs must be provided' - ) - - # TODO: should have a minimum to avoid too many calls to ModelRepo? - if every_n_iter is not None and every_n_iter <= 0: - raise ValueError("invalid every_n_iter=%s." % every_n_iter) - - self._timer = ( - NeverTriggerTimer() if only_log_at_end else - SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) - ) - - self._should_trigger = False - self._iter_count = 0 - - self._add_metrics_fn = add_metrics_fn - - def get_metrics_fn(): - """ - Function that returns the current EstimatorSpec. - The EstimatorSpec is used to obtain the current eval_metric_ops. - """ - estimator_spec = get_estimator_spec_fn() - eval_metric_ops = estimator_spec.eval_metric_ops - # get the value_op from the (value_op, update_op) value - return {k: v[0] for k, v in eval_metric_ops.items()} - super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) - - def report_metrics(self): """ - Triggers a metrics report. + A GetMetricsHook augmented with logic to map SessionRun events to metrics updates. + It is mainly used by `TrackRun` to persist model metrics via Model Repo. """ - self._timer.update_last_triggered_step(self._iter_count) - if self.metric_values is not None: - self._add_metrics_fn(self.metric_values) - def begin(self): - """ - Triggered before each epoch. - """ - self._timer.reset() - self._iter_count = 0 - return super(MetricsUpdateHook, self).begin() + def __init__( + self, + get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec], + add_metrics_fn: Callable[[Dict[str, float]], None], + every_n_iter: Optional[int] = None, + every_n_secs: Optional[float] = None, + ): + """ + Args: + get_estimator_spec_fn: + function that returns the current EstimatorSpec. 
+ The EstimatorSpec is used to obtain the current eval_metric_ops. + add_metrics_fn: `function` callback used to report metrics, called automatically + at the end of every epoch. + every_n_iter: `int`, log the metrics once every N local + steps taken in the current epoch. + every_n_secs: `int` or `float`, log the metrics once every N + seconds passed in the current epoch. Exactly one of `every_n_iter` and `every_n_secs` + should be provided. + Raises: + ValueError: if `every_n_iter` is non-positive or if not exactly one of `every_n_iter` and + `every_n_secs` is set when `add_progress_metrics_fn` is provided. + """ + only_log_at_end = (every_n_iter is None) and (every_n_secs is None) + + if not only_log_at_end and every_n_iter and every_n_secs: + raise ValueError( + "exactly one of every_n_iter and every_n_secs must be provided" + ) + + # TODO: should have a minimum to avoid too many calls to ModelRepo? + if every_n_iter is not None and every_n_iter <= 0: + raise ValueError("invalid every_n_iter=%s." % every_n_iter) + + self._timer = ( + NeverTriggerTimer() + if only_log_at_end + else SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_iter) + ) + + self._should_trigger = False + self._iter_count = 0 + + self._add_metrics_fn = add_metrics_fn + + def get_metrics_fn(): + """ + Function that returns the current EstimatorSpec. + The EstimatorSpec is used to obtain the current eval_metric_ops. + """ + estimator_spec = get_estimator_spec_fn() + eval_metric_ops = estimator_spec.eval_metric_ops + # get the value_op from the (value_op, update_op) value + return {k: v[0] for k, v in eval_metric_ops.items()} + + super(MetricsUpdateHook, self).__init__(get_metrics_fn=get_metrics_fn) + + def report_metrics(self) -> None: + """ + Triggers a metrics report. + """ + self._timer.update_last_triggered_step(self._iter_count) + if self.metric_values is not None: + self._add_metrics_fn(self.metric_values) + + def begin(self) -> None: + """ + Triggered before each epoch. + """ + self._timer.reset() + self._iter_count = 0 + return super(MetricsUpdateHook, self).begin() + + def before_run( + self, run_context: tf.estimator.SessionRunContext + ) -> tf.train.SessionRunArgs: + """ + Triggered before each step. + """ + self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) + return super(MetricsUpdateHook, self).before_run(run_context) + + def after_run( + self, + run_context: tf.estimator.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + """ + Triggered after each step. + """ + if self._should_trigger: + self.report_metrics() + self._iter_count += 1 + return super(MetricsUpdateHook, self).after_run(run_context, run_values) + + def end(self, session: tf.Session) -> None: + """ + Triggered after each epoch. + """ + self.report_metrics() + return super(MetricsUpdateHook, self).end(session) - def before_run(self, run_context): - """ - Triggered before each step. - """ - self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) - return super(MetricsUpdateHook, self).before_run(run_context) - def after_run(self, run_context, run_values): - """ - Triggered after each step. +class EarlyStopDuration(tf.train.SessionRunHook): """ - if self._should_trigger: - self.report_metrics() - self._iter_count += 1 - return super(MetricsUpdateHook, self).after_run(run_context, run_values) + Hook that can be used to terminate a job (training or validation) after a certain duration. 
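
The throttling in ``MetricsUpdateHook`` is driven by ``tf.compat.v1.train.SecondOrStepTimer`` (``NeverTriggerTimer`` is TF-internal and only used for the log-at-end case). A sketch of the trigger pattern with illustrative values:

```python
import tensorflow.compat.v1 as tf

timer = tf.train.SecondOrStepTimer(every_steps=100)  # or every_secs=30.0

iter_count = 0
for _ in range(250):
    if timer.should_trigger_for_step(iter_count):
        # In the hook, this is where add_metrics_fn(metric_values) runs.
        timer.update_last_triggered_step(iter_count)
        print("reporting at local step", iter_count)
    iter_count += 1
# With every_steps=100 this triggers at steps 0, 100 and 200.
```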
+ The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, + then it will only run for 15 minutes once restarted. - def end(self, session): - """ - Triggered after each epoch. + Args: + max_duration: + A float. When this argument is defined, the job will automatically terminate after + `max_duration` seconds if it has not already completed. + + overwrite: + A boolean. If set to True, this hook will overwrite the file containing the elapsed time + since the beginning of the job. In a distributed setting, this will be used so only one + job writes to the file while all others will have read access. In a distributed setting, + if all executors have this parameter set to False, then it just means that the hook will + not be fault tolerant. When restarted, the job will restart the clock from 0. + + save_dir: + String. A directory (located on a file system that is Tensorflow compatible) where + we can store the file which contains the record of the elapsed time. This file is what makes + the hook fault tolerant. + + exit_on_end: + when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. + This is usually set to True to kill a validation job in a distributed setting. """ - self.report_metrics() - return super(MetricsUpdateHook, self).end(session) - -class EarlyStopDuration(tf.train.SessionRunHook): - """ - Hook that can be used to terminate a job (training or validation) after a certain duration. - The hook is fault tolerant, i.e., if a job is allotted 1 hour to run and fails after 45 minutes, - then it will only run for 15 minutes once restarted. - - Args: - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - overwrite: - A boolean. If set to True, this hook will overwrite the file containing the elapsed time - since the beginning of the job. In a distributed setting, this will be used so only one - job writes to the file while all others will have read access. In a distributed setting, - if all executors have this parameter set to False, then it just means that the hook will - not be fault tolerant. When restarted, the job will restart the clock from 0. - - save_dir: - String. A directory (located on a file system that is Tensorflow compatible) where - we can store the file which contains the record of the elapsed time. This file is what makes - the hook faul tolerant. - - exit_on_end: - when exit_on_end is True, twml.errors.EarlyStopError() is triggered to stop the job. - This is usually set to True to kill a validation job in a distributed setting. 
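
The fault tolerance promised above reduces to persisting a running total of elapsed seconds. A condensed sketch of the accounting that the methods below implement (file location illustrative):

```python
import json
import os
from datetime import datetime

import tensorflow.compat.v1 as tf

def accumulate_elapsed(path, last_check, max_duration):
    """Adds wall time since last_check to the persisted total; returns both."""
    recorded = 0.0
    if tf.io.gfile.exists(path):
        with tf.io.gfile.GFile(path, mode="r") as f:
            recorded = json.loads(f.read())["elapsed_time"]
    now = datetime.now()
    elapsed = recorded + (now - last_check).total_seconds()
    tf.io.gfile.makedirs(os.path.dirname(path))
    with tf.io.gfile.GFile(path, mode="w") as f:
        f.write(json.dumps({"elapsed_time": elapsed, "max_duration": max_duration}))
    return elapsed, now

# A job allotted 3600s that crashed at second 2700 resumes with recorded=2700,
# so it stops roughly 900s after the restart.
```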
- """ - - def __init__(self, max_duration: float, exit_on_end: bool, save_dir: str, overwrite: bool): - self._overwrite = overwrite - self._save_dir = save_dir - self._exit_on_end = exit_on_end - self._max_duration = max_duration - self._last_time_check = datetime.now() - - # Initialize elapse time file - if overwrite: - self.elapsed_time() - - @property - def elapsed_file_path(self): - return os.path.join(self._save_dir, "early_stop_duration.txt") - - def early_stop(self) -> bool: - return self.elapsed_time() > self._max_duration - - def elapsed_time(self) -> float: - # Recorded elapsed time is 0 unless it's been recorded in a file already - recorded_elapsed_time = 0 - if tf.io.gfile.exists(self.elapsed_file_path): - with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: - recorded_elapsed_time = json.loads(file.read())["elapsed_time"] - - elapsed_time = recorded_elapsed_time + (datetime.now() - self._last_time_check).total_seconds() - self._last_time_check = datetime.now() - - if self._overwrite: - # Record the actualized new elapsed time to the file - tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) - with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: - record = { - "elapsed_time": elapsed_time, - "max_duration": self._max_duration - } - file.write(json.dumps(record, indent=2)) - - return elapsed_time - - def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: - if self.early_stop(): - message = f""" - Stopping job which now exceeded the maximum duration of {self._max_duration} seconds. + def __init__( + self, + max_duration: float, + exit_on_end: bool, + save_dir: str, + overwrite: bool, + ): + self._overwrite = overwrite + self._save_dir = save_dir + self._exit_on_end = exit_on_end + self._max_duration = max_duration + self._last_time_check = datetime.now() + + # Initialize elapse time file + if overwrite: + self.elapsed_time() + + @property + def elapsed_file_path(self): + return os.path.join(self._save_dir, "early_stop_duration.txt") + + def early_stop(self) -> bool: + return self.elapsed_time() > self._max_duration + + def elapsed_time(self) -> float: + # Recorded elapsed time is 0 unless it's been recorded in a file already + recorded_elapsed_time = 0 + if tf.io.gfile.exists(self.elapsed_file_path): + with tf.io.gfile.GFile(self.elapsed_file_path, mode="r") as file: + recorded_elapsed_time = json.loads(file.read())["elapsed_time"] + + elapsed_time = ( + recorded_elapsed_time + + (datetime.now() - self._last_time_check).total_seconds() + ) + self._last_time_check = datetime.now() + + if self._overwrite: + # Record the actualized new elapsed time to the file + tf.io.gfile.makedirs(os.path.dirname(self.elapsed_file_path)) + with tf.io.gfile.GFile(self.elapsed_file_path, mode="w") as file: + record = { + "elapsed_time": elapsed_time, + "max_duration": self._max_duration, + } + file.write(json.dumps(record, indent=2)) + + return elapsed_time + + def before_run(self, run_context: tf.estimator.SessionRunContext) -> None: + if self.early_stop(): + message = f""" + Stopping job which now exceeded the maximum duration of {self._max_duration} seconds. 
""" - logging.info(message) - run_context.request_stop() + logging.info(message) + run_context.request_stop() - if self._exit_on_end: - raise twml.errors.EarlyStopError(message) + if self._exit_on_end: + raise twml.errors.EarlyStopError(message) class StopAtStepHook(tf.train.StopAtStepHook): - """ - Overrides ``tf.train.StopAtStepHook`` so that - a ``stop_requested`` property can be accessed to determine - if this hook requested a stop. - """ + """ + Overrides ``tf.train.StopAtStepHook`` so that + a ``stop_requested`` property can be accessed to determine + if this hook requested a stop. + """ - def __init__(self, *args, **kwargs): - super(StopAtStepHook, self).__init__(*args, **kwargs) - self._stop_requested = False + def __init__(self, *args, **kwargs): + super(StopAtStepHook, self).__init__(*args, **kwargs) + self._stop_requested = False - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + @property + def stop_requested(self) -> bool: + """true if this hook requested a stop""" + return self._stop_requested - def after_run(self, run_context, run_values): - """ sets self.stop_requested to true when requesting a stop """ - super(StopAtStepHook, self).after_run(run_context, run_values) - self._stop_requested = run_context.stop_requested + def after_run( + self, + run_context: tf.estimator.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + """sets self.stop_requested to true when requesting a stop""" + super(StopAtStepHook, self).after_run(run_context, run_values) + self._stop_requested = run_context.stop_requested class StopIfExistsHook(tf.train.SessionRunHook): - """ - Hook that requests stop if a file exists. - This hook is used with the EarlyStopHook to implement - early-stopping for distributed training (tf.estimator.train_and_evaluate). - """ - - def __init__(self, file_path): """ - Arguments: - file_path: - path to file. When this hook detects that the file exists, - it requests a stop, which effectively kills this worker. + Hook that requests stop if a file exists. + This hook is used with the EarlyStopHook to implement + early-stopping for distributed training (tf.estimator.train_and_evaluate). """ - self._file_path = file_path - self._stop_requested = False - - def after_run(self, run_context, run_values): - if tf.io.gfile.exists(self._file_path): - logging.info("Early-stopping file detected; requesting stop") - run_context.request_stop() - self._stop_requested = True - - @property - def stop_requested(self): - """ true if this hook requested a stop """ - return self._stop_requested + + def __init__(self, file_path: str): + """ + Args: + file_path: + path to file. When this hook detects that the file exists, + it requests a stop, which effectively kills this worker. + """ + self._file_path = file_path + self._stop_requested = False + + def after_run( + self, + run_context: tf.estimator.SessionRunContext, + run_values: tf.train.SessionRunValues, + ) -> None: + if tf.io.gfile.exists(self._file_path): + logging.info("Early-stopping file detected; requesting stop") + run_context.request_stop() + self._stop_requested = True + + @property + def stop_requested(self) -> bool: + """true if this hook requested a stop""" + return self._stop_requested diff --git a/twml/twml/input_fns.py b/twml/twml/input_fns.py index 394fc8674..814823678 100644 --- a/twml/twml/input_fns.py +++ b/twml/twml/input_fns.py @@ -1,129 +1,131 @@ -''' +""" Contains implementations of functions to read input data. 
-''' -from .dataset import stream_block_format_dataset +""" +from typing import Callable, List, Optional import tensorflow.compat.v1 as tf +from .dataset import stream_block_format_dataset -def data_record_input_fn( - files, batch_size, parse_fn, - num_threads=2, repeat=False, dataset_fn=None, - keep_rate=None, parts_downsampling_rate=None, - shards=None, shard_index=None, shuffle=True, shuffle_files=True, interleave=True, - initializable=False, log_tf_data_summaries=False, - **kwargs): - """ - Returns a nested structure of tf.Tensors containing the next element. - Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. - By default, works with DataRecord dataset for compressed partition files. - - Args: - files: - List of files that will be parsed. - batch_size: - number of samples per batch. - parse_fn: - function passed to data loading for parsing individual data records. - Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. - num_threads (optional): - number of threads used for loading data. Defaults to 2. - repeat (optional): - Repeat the dataset indefinitely. Defaults to False. - Useful when you want to use ``train_steps`` or ``eval_steps`` - greater than the size of the dataset - (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). - dataset_fn (optional): - A function that modifies the dataset after it reads different interleaved parts files. - Defaults to: - - .. code-block:: python - - def dataset_fn(dataset, parse_fn, batch_size): - return dataset.batch(batch_size).map(parse_fn, 1) - - keep_rate (optional): - A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli - distribution with p = 1 - keep_rate. - Defaults to None (no records dropped). - - parts_downsampling_rate (optional): - A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. - For example, a value of 0.2 means only 20 percent of part files become part of the dataset. - - shards (optional): - Number of partitions to shard the dataset into. This is useful for codistillation - (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to - train on disjoint partitions of the dataset. - The dataset is not sharded by default. - - shard_index (optional): - Which partition of the dataset to use if ``shards`` is set. - - shuffle (optional): - Whether to shuffle the records. Defaults to True. - - shuffle_files (optional): - Shuffle the list of files. Defaults to True. - When False, files are iterated in the order they are passed in. - - interleave (optional): - Interleave records from multiple files in parallel. Defaults to True. - - initializable (optional): - A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value (false) - is used for most plain iterators. - - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - Iterator of elements of the dataset. 
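
A hedged usage sketch of how a trainer might bind this input function for an Estimator. The file list is a placeholder, and the parser would normally come from ``twml.parsers.get_sparse_parse_fn`` with a schema-specific feature config, so treat this as the shape of the call rather than a recipe:

```python
from functools import partial

from twml.input_fns import data_record_input_fn

def parse_fn(records):
    """Placeholder; normally built via twml.parsers.get_sparse_parse_fn."""
    raise NotImplementedError("schema-specific parser goes here")

train_input_fn = partial(
    data_record_input_fn,
    files=["hdfs://path/part-00000.lzo"],  # illustrative part files
    batch_size=512,
    parse_fn=parse_fn,
    repeat=True,   # lets train_steps exceed one pass over the data
    shards=4,      # each of 4 workers reads a disjoint partition
    shard_index=0,
)
# estimator.train(input_fn=train_input_fn, steps=10000)
```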
- """ - if not parse_fn: - raise ValueError("default_input_fn requires a parse_fn") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - dataset = stream_block_format_dataset( - files=files, - parse_fn=parse_fn, - batch_size=batch_size, - repeat=repeat, - num_threads=num_threads, - dataset_fn=dataset_fn, - keep_rate=keep_rate, - parts_downsampling_rate=parts_downsampling_rate, - shards=shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=shuffle_files, - interleave=interleave, +def data_record_input_fn( + files: List[str], + batch_size: int, + parse_fn: Callable, + num_threads: int = 2, + repeat: bool = False, + dataset_fn: Optional[Callable] = None, + keep_rate: Optional[float] = None, + parts_downsampling_rate: Optional[float] = None, + shards: Optional[int] = None, + shard_index: Optional[int] = None, + shuffle: bool = True, + shuffle_files: bool = True, + interleave: bool = True, + initializable: bool = False, + log_tf_data_summaries: bool = False, **kwargs - ) - - # Add a tf.data.experimental.StatsAggregator - # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator - if log_tf_data_summaries: - aggregator = tf.data.experimental.StatsAggregator() - options = tf.data.Options() - options.experimental_stats.aggregator = aggregator - dataset = dataset.with_options(options) - stats_summary = aggregator.get_summary() - tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) - - if initializable: - # when the data parsing dpends on some HashTable or Tensor, the iterator is initalizable and - # therefore we need to be run explicitly - iterator = dataset.make_initializable_iterator() - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) - else: - iterator = dataset.make_one_shot_iterator() - return iterator.get_next() +) -> tf.Tensor: + """ + Returns a nested structure of tf.Tensors containing the next element. + Used by ``train_input_fn`` and ``eval_input_fn`` in DataRecordTrainer. + By default, works with DataRecord dataset for compressed partition files. + + Args: + files: + List of files that will be parsed. + batch_size: + number of samples per batch. + parse_fn: + function passed to data loading for parsing individual data records. + Usually one of the decoder functions like ``parsers.get_sparse_parse_fn``. + num_threads (optional): + number of threads used for loading data. Defaults to 2. + repeat (optional): + Repeat the dataset indefinitely. Defaults to False. + Useful when you want to use ``train_steps`` or ``eval_steps`` + greater than the size of the dataset + (otherwise Estimator.[train,evaluate] stops when the end of the dataset is reached). + dataset_fn (optional): + A function that modifies the dataset after it reads different interleaved parts files. + Defaults to: + .. code-block:: python + def dataset_fn(dataset, parse_fn, batch_size): + return dataset.batch(batch_size).map(parse_fn, 1) + keep_rate (optional): + A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli + distribution with p = 1 - keep_rate. + Defaults to None (no records dropped). + parts_downsampling_rate (optional): + A float value in (0.0, 1.0] that indicates the factor by which to downsample part files. + For example, a value of 0.2 means only 20 percent of part files become part of the dataset. + shards (optional): + Number of partitions to shard the dataset into. 
This is useful for codistillation + (https://arxiv.org/pdf/1804.03235.pdf) and other techniques that require each worker to + train on disjoint partitions of the dataset. + The dataset is not sharded by default. + shard_index (optional): + Which partition of the dataset to use if ``shards`` is set. + shuffle (optional): + Whether to shuffle the records. Defaults to True. + shuffle_files (optional): + Shuffle the list of files. Defaults to True. + When False, files are iterated in the order they are passed in. + interleave (optional): + Interleave records from multiple files in parallel. Defaults to True. + initializable (optional): + A boolean indicator. When the Dataset Iterator depends on some resource, e.g. a HashTable or + a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value (false) + is used for most plain iterators. + log_tf_data_summaries (optional): + A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the + tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output + events files. This requires that `initializable` is `True` above. + + Returns: + Iterator of elements of the dataset. + """ + if not parse_fn: + raise ValueError("default_input_fn requires a parse_fn") + + if log_tf_data_summaries and not initializable: + raise ValueError("Require `initializable` if `log_tf_data_summaries`.") + + dataset = stream_block_format_dataset( + files=files, + parse_fn=parse_fn, + batch_size=batch_size, + repeat=repeat, + num_threads=num_threads, + dataset_fn=dataset_fn, + keep_rate=keep_rate, + parts_downsampling_rate=parts_downsampling_rate, + shards=shards, + shard_index=shard_index, + shuffle=shuffle, + shuffle_files=shuffle_files, + interleave=interleave, + **kwargs + ) + + # Add a tf.data.experimental.StatsAggregator + # https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/data/experimental/StatsAggregator + if log_tf_data_summaries: + aggregator = tf.data.experimental.StatsAggregator() + options = tf.data.Options() + options.experimental_stats.aggregator = aggregator + dataset = dataset.with_options(options) + stats_summary = aggregator.get_summary() + tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) + + if initializable: + # when the data parsing dpends on some HashTable or Tensor, the iterator is initalizable and + # therefore we need to be run explicitly + iterator = dataset.make_initializable_iterator() + tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) + else: + iterator = dataset.make_one_shot_iterator() + return iterator.get_next() default_input_fn = data_record_input_fn # pylint: disable=invalid-name diff --git a/twml/twml/layers/__init__.py b/twml/twml/layers/__init__.py index 917c61867..ad7e798c5 100644 --- a/twml/twml/layers/__init__.py +++ b/twml/twml/layers/__init__.py @@ -9,13 +9,13 @@ from .batch_prediction_tensor_writer import BatchPredictionTensorWriter # noqa: F401 from .batch_prediction_writer import BatchPredictionWriter # noqa: F401 from .data_record_tensor_writer import DataRecordTensorWriter # noqa: F401 -from .full_dense import full_dense, FullDense # noqa: F401 -from .full_sparse import full_sparse, FullSparse # noqa: F401 +from .full_dense import FullDense, full_dense # noqa: F401 +from .full_sparse import FullSparse, full_sparse # noqa: F401 from .isotonic import Isotonic # noqa: F401 from .layer import Layer # noqa: F401 from .mdl import MDL # noqa: F401 from .partition import Partition # noqa: F401 from 
.percentile_discretizer import PercentileDiscretizer # noqa: F401 from .sequential import Sequential # noqa: F401 -from .sparse_max_norm import MaxNorm, sparse_max_norm, SparseMaxNorm # noqa: F401 +from .sparse_max_norm import MaxNorm, SparseMaxNorm, sparse_max_norm # noqa: F401 from .stitch import Stitch # noqa: F401 diff --git a/twml/twml/layers/batch_prediction_tensor_writer.py b/twml/twml/layers/batch_prediction_tensor_writer.py index 3f6633a8e..e5ca3c718 100644 --- a/twml/twml/layers/batch_prediction_tensor_writer.py +++ b/twml/twml/layers/batch_prediction_tensor_writer.py @@ -2,50 +2,58 @@ """ Implementing Writer Layer """ -from .layer import Layer +from typing import List, Tuple import libtwml +import tensorflow.compat.v1 as tf + +from .layer import Layer class BatchPredictionTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production) when model predictions are dense tensors. - - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionTensorWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """ + A layer that packages keys and dense tensors into a BatchPredictionResponse. + Typically used at the out of an exported model for use in a the PredictionEngine + (that is, in production) when model predictions are dense tensors. Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. + keys: keys to hashmap + Output: + output: + a BatchPredictionResponse serialized using Thrift into a uint8 tensor. """ - raise NotImplementedError - - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. - Arguments: - values: - dense tensors corresponding to keys in hashmap - - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_tensor_response_writer(self.keys, values) - return write_op + def __init__(self, keys: List[str], **kwargs): + super(BatchPredictionTensorWriter, self).__init__(**kwargs) + self.keys = keys + + def compute_output_shape( + self, input_shape: Tuple[tf.TensorShape] + ): # pylint: disable=unused-argument + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raise NotImplementedError. + """ + raise NotImplementedError + + def call( + self, values: List[tf.Tensor], **kwargs + ): # pylint: disable=unused-argument + """The logic of the layer lives here. 
+ + Args: + values: + dense tensors corresponding to keys in hashmap + + Returns: + The output from the layer + """ + write_op = libtwml.ops.batch_prediction_tensor_response_writer( + self.keys, values + ) + return write_op diff --git a/twml/twml/layers/batch_prediction_writer.py b/twml/twml/layers/batch_prediction_writer.py index 118d21921..15fd1379b 100644 --- a/twml/twml/layers/batch_prediction_writer.py +++ b/twml/twml/layers/batch_prediction_writer.py @@ -2,50 +2,57 @@ """ Implementing Writer Layer """ -from .layer import Layer +from typing import List, Tuple import libtwml +import tensorflow.compat.v1 as tf + +from .layer import Layer class BatchPredictionWriter(Layer): - """ - A layer that packages keys and values into a BatchPredictionResponse. - Typically used at the out of an exported model for use in a the PredictionEngine - (that is, in production). - - Arguments: - keys: - keys to hashmap - Output: - output: - a BatchPredictionResponse serialized using Thrift into a uint8 tensor. - """ - - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(BatchPredictionWriter, self).__init__(**kwargs) - self.keys = keys - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """ + A layer that packages keys and values into a BatchPredictionResponse. + Typically used at the out of an exported model for use in a the PredictionEngine + (that is, in production). Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + keys: + keys to hashmap + Output: + output: + a BatchPredictionResponse serialized using Thrift into a uint8 tensor. + """ - Raise NotImplementedError. + def __init__( + self, keys: List[str], **kwargs + ): # pylint: disable=useless-super-delegation + super(BatchPredictionWriter, self).__init__(**kwargs) + self.keys = keys - """ - raise NotImplementedError + def compute_output_shape(self, input_shape: Tuple[tf.TensorShape]): + """Computes the output shape of the layer given the input shape. - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - Arguments: - values: - values corresponding to keys in hashmap + Raise NotImplementedError. - Returns: - The output from the layer - """ - write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values) - return write_op + """ + raise NotImplementedError + + def call( + self, values: List[tf.Tensor], **kwargs + ): # pylint: disable=unused-argument, arguments-differ + """The logic of the layer lives here. + + Args: + values: + values corresponding to keys in hashmap + + Returns: + The output from the layer + """ + write_op = libtwml.ops.batch_prediction_response_writer(self.keys, values) + return write_op diff --git a/twml/twml/layers/data_record_tensor_writer.py b/twml/twml/layers/data_record_tensor_writer.py index 0f70186b4..7b80f9fa7 100644 --- a/twml/twml/layers/data_record_tensor_writer.py +++ b/twml/twml/layers/data_record_tensor_writer.py @@ -2,49 +2,53 @@ """ Implementing Writer Layer """ -from .layer import Layer +from typing import Tuple import libtwml +import tensorflow.compat.v1 as tf +from .layer import Layer -class DataRecordTensorWriter(Layer): - """ - A layer that packages keys and dense tensors into a DataRecord. 
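
All three writer layers in this file follow one template: hold ``keys`` at construction, refuse static shape inference, and emit a single serialized tensor from ``call``. A libtwml-free sketch of that pattern — the serialization below is a stand-in for the Thrift writer ops, not their actual behavior:

```python
from typing import List

import tensorflow.compat.v1 as tf

class KeyedResponseWriter(tf.keras.layers.Layer):
    """Illustrative stand-in: pairs fixed keys with per-batch tensors."""

    def __init__(self, keys: List[str], **kwargs):
        super(KeyedResponseWriter, self).__init__(**kwargs)
        self.keys = keys  # the real ops pass these to the Thrift writer

    def compute_output_shape(self, input_shape):
        raise NotImplementedError  # serialized output has no static shape

    def call(self, values):
        # Stand-in for libtwml.ops.batch_prediction_response_writer(keys, values):
        # stack the named tensors and serialize the whole batch to one string.
        return tf.io.serialize_tensor(tf.stack(values, axis=0))

writer = KeyedResponseWriter(keys=["output", "score"])
serialized = writer([tf.zeros([4]), tf.ones([4])])  # scalar string tensor
```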
- This layer was initially added to support exporting user embeddings as tensors. - Arguments: - keys: - keys to hashmap - Output: - output: - a DataRecord serialized using Thrift into a uint8 tensor - """ +class DataRecordTensorWriter(Layer): + """ + A layer that packages keys and dense tensors into a DataRecord. + This layer was initially added to support exporting user embeddings as tensors. - def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation - super(DataRecordTensorWriter, self).__init__(**kwargs) - self.keys = keys + Args: + keys: + keys to hashmap + Output: + output: + a DataRecord serialized using Thrift into a uint8 tensor + """ - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + def __init__(self, keys, **kwargs): # pylint: disable=useless-super-delegation + super(DataRecordTensorWriter, self).__init__(**kwargs) + self.keys = keys - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + def compute_output_shape(self, input_shape: Tuple[tf.TensorShape]): + """Computes the output shape of the layer given the input shape. - Raises NotImplementedError. + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - """ - raise NotImplementedError + Raises NotImplementedError. + """ + raise NotImplementedError - def call(self, values, **kwargs): # pylint: disable=unused-argument, arguments-differ - """The logic of the layer lives here. + def call( + self, values: Tuple[tf.Tensor], **kwargs + ) -> tf.Tensor: # pylint: disable=unused-argument, arguments-differ + """The logic of the layer lives here. - Arguments: - values: - dense tensors corresponding to keys in hashmap + Args: + values: + dense tensors corresponding to keys in hashmap - Returns: - The output from the layer - """ - write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) - return write_op + Returns: + The output from the layer + """ + write_op = libtwml.ops.data_record_tensor_writer(self.keys, values) + return write_op diff --git a/twml/twml/layers/full_dense.py b/twml/twml/layers/full_dense.py index 9c354ad3e..d08559a6d 100644 --- a/twml/twml/layers/full_dense.py +++ b/twml/twml/layers/full_dense.py @@ -2,258 +2,264 @@ """ Implementing Full Dense Layer """ -from tensorflow.python.layers import core as core_layers -from tensorflow.python.ops import init_ops +from typing import Callable, Optional + +import tensorflow.compat.v1 as tf from tensorflow.python.framework import tensor_shape from tensorflow.python.keras.engine.base_layer import InputSpec -import tensorflow.compat.v1 as tf +from tensorflow.python.layers import core as core_layers +from tensorflow.python.ops import init_ops class FullDense(core_layers.Dense): - """ - Densely-connected layer class. - This is wrapping tensorflow.python.layers.core.Dense - This layer implements the operation: - - .. code-block:: python + """ + Densely-connected layer class. + This is wrapping tensorflow.python.layers.core.Dense + This layer implements the operation: - outputs = activation(inputs.weight + bias) + .. code-block:: python - Where ``activation`` is the activation function passed as the ``activation`` - argument (if not ``None``), ``weight`` is a weights matrix created by the layer, - and ``bias`` is a bias vector created by the layer. 
+ outputs = activation(inputs.weight + bias) - Arguments: - output_size: - Integer or Long, dimensionality of the output space. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - weight_initializer: - Initializer function for the weight matrix. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + Where ``activation`` is the activation function passed as the ``activation`` + argument (if not ``None``), ``weight`` is a weights matrix created by the layer, + and ``bias`` is a bias vector created by the layer. - Properties: - output_size: - Python integer, dimensionality of the output space. - activation: - Activation function (callable). - weight_initializer: - Initializer instance (or name) for the weight matrix. - bias_initializer: - Initializer instance (or name) for the bias. - weight: - Weight matrix (TensorFlow variable or tensor). (weight) - bias: - Bias vector, if applicable (TensorFlow variable or tensor). - weight_regularizer: - Regularizer instance for the weight matrix (callable) - bias_regularizer: - Regularizer instance for the bias (callable). - activity_regularizer: - Regularizer instance for the output (callable) - weight_constraint: - Constraint function for the weight matrix. - bias_constraint: - Constraint function for the bias. + Args: + output_size (int): + Integer or Long, dimensionality of the output space. + activation (callable): + Activation function (callable). Set it to None to maintain a linear activation. + weight_initializer (callable): + Initializer function for the weight matrix. + bias_initializer (callable): + Initializer function for the bias. + weight_regularizer (callable): + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer (callable): + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer (callable): + Regularizer function for the output. + weight_constraint (callable): + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). 
The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint (callable): + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable (bool): + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name (str): + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - """ + Properties: + output_size (int): + Python integer, dimensionality of the output space. + activation (callable): + Activation function (callable). + weight_initializer (Initializer): + Initializer instance (or name) for the weight matrix. + bias_initializer (Initializer): + Initializer instance (or name) for the bias. + weight (TensorFlow variable or tensor): + Weight matrix (TensorFlow variable or tensor). (weight) + bias (TensorFlow variable or tensor): + Bias vector, if applicable (TensorFlow variable or tensor). + weight_regularizer (Regularizer): + Regularizer instance for the weight matrix (callable) + bias_regularizer (Regularizer): + Regularizer instance for the bias (callable). + activity_regularizer (Regularizer): + Regularizer instance for the output (callable) + weight_constraint (Constraint): + Constraint function for the weight matrix. + bias_constraint (Constraint): + Constraint function for the bias. + """ - def __init__(self, output_size, - weight_initializer=None, - weight_regularizer=None, - weight_constraint=None, - bias_constraint=None, - num_partitions=None, - **kwargs): - super(FullDense, self).__init__(units=output_size, - kernel_initializer=weight_initializer, - kernel_regularizer=weight_regularizer, - kernel_constraint=weight_constraint, - **kwargs) - self._num_partitions = num_partitions + def __init__( + self, + output_size: int, + weight_initializer: Optional[Callable[[int], tf.Tensor]] = None, + weight_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + weight_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + bias_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + num_partitions: Optional[int] = None, + **kwargs + ): + super(FullDense, self).__init__( + units=output_size, + kernel_initializer=weight_initializer, + kernel_regularizer=weight_regularizer, + kernel_constraint=weight_constraint, + **kwargs + ) + self._num_partitions = num_partitions - def build(self, input_shape): - ''' - code adapted from TF 1.12 Keras Dense layer: - https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 - ''' - input_shape = tensor_shape.TensorShape(input_shape) - if input_shape[-1] is None: - raise ValueError('The last dimension of the inputs to `Dense` ' - 'should be defined. Found `None`.') - self.input_spec = InputSpec(min_ndim=2, - axes={-1: input_shape[-1]}) + def build(self, input_shape: tf.TensorShape): + """ + code adapted from TF 1.12 Keras Dense layer: + https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/layers/core.py#L930-L956 + """ + input_shape = tensor_shape.TensorShape(input_shape) + if input_shape[-1] is None: + raise ValueError( + "The last dimension of the inputs to `Dense` " + "should be defined. Found `None`." 
+ ) + self.input_spec = InputSpec(min_ndim=2, axes={-1: input_shape[-1]}) - partitioner = None - if self._num_partitions: - partitioner = tf.fixed_size_partitioner(self._num_partitions) + partitioner = None + if self._num_partitions: + partitioner = tf.fixed_size_partitioner(self._num_partitions) - self.kernel = self.add_weight( - 'kernel', - shape=[input_shape[-1], self.units], - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - dtype=self.dtype, - partitioner=partitioner, - trainable=True) + self.kernel = self.add_weight( + "kernel", + shape=[input_shape[-1], self.units], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + dtype=self.dtype, + partitioner=partitioner, + trainable=True, + ) - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=[self.units, ], - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - self.built = True + if self.use_bias: + self.bias = self.add_weight( + "bias", + shape=[ + self.units, + ], + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + dtype=self.dtype, + trainable=True, + ) + else: + self.bias = None + self.built = True - @property - def output_size(self): - """ - Returns output_size - """ - return self.units + @property + def output_size(self) -> int: + """Returns output_size.""" + return self.units - @property - def weight(self): - """ - Returns weight - """ - return self.kernel + @property + def weight(self) -> tf.Tensor: + """Returns weight.""" + return self.kernel - @property - def weight_regularizer(self): - """ - Returns weight_regularizer - """ - return self.kernel_regularizer + @property + def weight_regularizer(self) -> Callable[[tf.Tensor], tf.Tensor]: + """Returns weight_regularizer.""" + return self.kernel_regularizer - @property - def weight_initializer(self): - """ - Returns weight_initializer - """ - return self.kernel_initializer + @property + def weight_initializer(self) -> Callable[[int], tf.Tensor]: + """Returns weight_initializer.""" + return self.kernel_initializer - @property - def weight_constraint(self): - """ - Returns weight_constraint - """ - return self.kernel_constraint + @property + def weight_constraint(self) -> Callable[[tf.Tensor], tf.Tensor]: + """Returns weight_constraint.""" + return self.kernel_constraint -def full_dense(inputs, output_size, - activation=None, - use_bias=True, - weight_initializer=None, - bias_initializer=init_ops.zeros_initializer(), - weight_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - weight_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - num_partitions=None, - reuse=None): - """Functional interface for the densely-connected layer. - This layer implements the operation: - `outputs = activation(inputs.weight + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `weight` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). 
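
The ``num_partitions`` path in ``build`` relies on TF1 variable partitioning. A self-contained sketch of what ``tf.fixed_size_partitioner`` does to a weight variable (sizes illustrative):

```python
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # TF1-style variable partitioning is graph-mode

with tf.variable_scope("dense", partitioner=tf.fixed_size_partitioner(num_shards=4)):
    # A [1024, 256] kernel stored as four row-wise [256, 256] shards, which
    # spreads the variable across parameter servers in distributed training.
    kernel = tf.get_variable("kernel", shape=[1024, 256])

print(len(list(kernel)))  # 4 -- a PartitionedVariable is iterable over shards
```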
+def full_dense( + inputs: tf.Tensor, + output_size: int, + activation: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_bias: bool = True, + weight_initializer: Optional[Callable[[int], tf.Tensor]] = None, + bias_initializer: Callable[[int], tf.Tensor] = init_ops.zeros_initializer(), + weight_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + bias_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + activity_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + weight_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + bias_constraint: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + trainable: bool = True, + name: Optional[str] = None, + num_partitions: Optional[int] = None, + reuse: bool = False, +) -> tf.Tensor: + """ + Functional interface for the densely-connected layer. + This layer implements the operation: + `outputs = activation(inputs.weight + bias)` + Where `activation` is the activation function passed as the `activation` + argument (if not `None`), `weight` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). - Arguments: - inputs: Tensor input. - units: Integer or Long, dimensionality of the output space. - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - weight_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.get_variable`. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - activity_regularizer: - Regularizer function for the output. - weight_constraint: - An optional projection function to be applied to the - weight after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: - An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: - Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: - String, the name of the layer. - reuse: - Boolean, whether to reuse the weights of a previous layer - by the same name. + Args: + inputs: Tensor input. + units: Integer or Long, dimensionality of the output space. + activation: Activation function (callable). Set it to None to maintain a linear activation. + use_bias: Boolean, whether the layer uses a bias. + weight_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.get_variable`. + bias_initializer: + Initializer function for the bias. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. 
+ Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + activity_regularizer: + Regularizer function for the output. + weight_constraint: + An optional projection function to be applied to the + weight after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: + An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: + Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: + String, the name of the layer. + reuse: + Boolean, whether to reuse the weights of a previous layer + by the same name. - Returns: - Output tensor the same shape as `inputs` except the last dimension is of - size `units`. + Returns: + Output tensor the same shape as `inputs` except the last dimension is of + size `units`. - Raises: - ValueError: if eager execution is enabled. - """ - layer = FullDense(output_size, - activation=activation, - use_bias=use_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - weight_constraint=weight_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - dtype=inputs.dtype.base_dtype, - num_partitions=num_partitions, - _scope=name, - _reuse=reuse) - return layer.apply(inputs) + Raises: + ValueError: if eager execution is enabled. + """ + layer = FullDense( + output_size, + activation=activation, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + weight_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + weight_constraint=weight_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + dtype=inputs.dtype.base_dtype, + num_partitions=num_partitions, + _scope=name, + _reuse=reuse, + ) + return layer.apply(inputs) diff --git a/twml/twml/layers/full_sparse.py b/twml/twml/layers/full_sparse.py index 4f0f21930..27701ac49 100644 --- a/twml/twml/layers/full_sparse.py +++ b/twml/twml/layers/full_sparse.py @@ -4,367 +4,400 @@ """ import math +from typing import Callable, List, Optional, Tuple, Union +import tensorflow.compat.v1 as tf from twitter.deepbird.sparse import sparse_dense_matmul -from .layer import Layer - -import tensorflow.compat.v1 as tf import twml +from .layer import Layer -class FullSparse(Layer): - """Fully-sparse layer class. - This layer implements the operation: - - .. code-block:: python - - outputs = activation(inputs.weight + bias) - - Arguments: - output_size: - Long or Integer, dimensionality of the output space. - input_size: - The number of input units. (Deprecated) - weight_initializer: - Initializer function for the weight matrix. - This argument defaults to zeros_initializer(). - This is valid when the FullSparse is the first layer of - parameters but should be changed otherwise. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. 
- Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - This argument defaults to tf.constant_initializer(1/output_size) - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - This parameter can also be a list of binary values if `inputs` passed to `call` a list. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - use_binary_sparse_dense_matmul: - If binary sparse dense matmul op is to be used. It will only be enabled if - `use_binary_values` is set true. It only should be used for inference, best practice is - to set `use_binary_sparse_dense_matmul = not is_training`. - """ - - def __init__(self, - output_size, - input_size=None, - weight_initializer=None, - activation=None, - bias_initializer=None, - trainable=True, - name=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - bias_regularizer=None, - weight_regularizer=None, - use_compression=False, - use_binary_sparse_dense_matmul=False, - **kwargs): - super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs) - # TODO - remove input_size warning. - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - # The bias initialization and weights initialization is set to match v1's implementation. - if bias_initializer is None: - bias_initializer = tf.constant_initializer(1 / output_size) - # Weights initialization is set to 0s. This is safe for full sparse layers because - # you are supposed to learn your embedding from the label. 
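Stepping back to the full_dense wrapper above, a short usage sketch; the shapes are hypothetical and it assumes twml.layers re-exports full_dense:

import numpy as np
import tensorflow.compat.v1 as tf
from twml.layers import full_dense

tf.disable_v2_behavior()

inputs = tf.placeholder(tf.float32, shape=[None, 16])
# Project 16 input features to 8 outputs with a ReLU non-linearity.
outputs = full_dense(inputs, output_size=8, activation=tf.nn.relu, name="proj")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(outputs, {inputs: np.ones([2, 16], np.float32)}).shape)  # (2, 8)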
- if weight_initializer is None: - weight_initializer = tf.zeros_initializer() - self.weight_initializer = weight_initializer - self.bias_initializer = bias_initializer - self.output_size = output_size - self.activation = activation - self.use_sparse_grads = use_sparse_grads - self.num_partitions = num_partitions - if partition_axis != 0 and partition_axis != 1: - raise ValueError('partition_axis must be 0 or 1') - self.partition_axis = partition_axis - self.use_binary_values = use_binary_values - self.weight_regularizer = weight_regularizer - self.bias_regularizer = bias_regularizer - self._use_compression = use_compression - self._cast_indices_dtype = tf.int32 if self._use_compression else None - self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul - - def _make_weight_var(self, shape, partitioner): - self.weight = self.add_variable( - 'weight', - initializer=self.weight_initializer, - regularizer=self.weight_regularizer, - shape=shape, - dtype=self.dtype, - trainable=True, - partitioner=partitioner, - ) - def build(self, input_shapes): - """ - creates the ``bias`` and ``weight`` Variables - of shape ``[output_size]`` and ``[input_size, output_size]`` respectively. - """ +class FullSparse(Layer): + """Fully-sparse layer class. + This layer implements the operation: - if isinstance(input_shapes, (list, tuple)): - input_shape = input_shapes[0] - is_compatible = True - for other_shape in input_shapes[1:]: - is_compatible &= input_shape.is_compatible_with(other_shape) - if not is_compatible: - raise ValueError("Input shapes %s are not compatible." % input_shapes) - else: - input_shape = input_shapes - - self.bias = self.add_variable( - 'bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - shape=[self.output_size, ], - dtype=self.dtype, - trainable=True - ) + .. code-block:: python - partitioner = None - shape = [input_shape[1], self.output_size] - - # There is a 2gb limitation for each tensor because of protobuf. - # 2**30 is 1GB. 2 * (2**30) is 2GB. - dtype = tf.as_dtype(self.dtype) - num_partitions = 1 if self.num_partitions is None else self.num_partitions - in_shape = input_shape[1] - out_shape = self.output_size - - # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. - if isinstance(in_shape, tf.Dimension): - in_shape = in_shape.value - - if in_shape is None: - raise ValueError("Input tensor should have shape." 
- " You can set it using twml.util.limit_sparse_tensor_size") - - (split_dim, other_dim) = (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) - requested_size = math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size - if (requested_size >= 2**31): - raise ValueError("Weight tensor partitions cannot be larger than 2GB.\n" - "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" - "Possible solutions:\n" - "- reduce the params.output_size_bits\n" - "- reduce the output_size of the sparse_layer\n" - "- specify a larger num_partitions argument\n" - "- reduce input_size_bits" % - (in_shape, self.output_size, dtype.name, requested_size, num_partitions)) - - if self.num_partitions: - partition_axis = int(self.partition_axis) - partitioner = tf.fixed_size_partitioner(self.num_partitions, axis=partition_axis) - else: - # Regular variables do not like it when you pass both constant tensors and shape - if not callable(self.weight_initializer): - shape = None - - self._make_weight_var(shape, partitioner) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + outputs = activation(inputs.weight + bias) Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - + output_size: + Long or Integer, dimensionality of the output space. + weight_initializer: + Initializer function for the weight matrix. + This argument defaults to zeros_initializer(). + This is valid when the FullSparse is the first layer of + parameters but should be changed otherwise. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect + activation: + Activation function (callable). Set it to None to maintain a linear activation. + bias_initializer: + Initializer function for the bias. + This argument defaults to tf.constant_initializer(1/output_size) + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column) + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + This parameter can also be a list of binary values if `inputs` passed to `call` a list. 
+      use_compression:
+        Default False. Set True to enable data compression techniques for
+        optimization of network traffic for distributed training.
+      use_binary_sparse_dense_matmul:
+        If the binary sparse dense matmul op is to be used. It will only be enabled if
+        `use_binary_values` is set true. It should only be used for inference; best practice
+        is to set `use_binary_sparse_dense_matmul = not is_training`.
+    """
+
+    def __init__(
+        self,
+        output_size: int,
+        weight_initializer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        activation: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        bias_initializer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        trainable: bool = True,
+        name: Optional[str] = None,
+        use_sparse_grads: bool = True,
+        num_partitions: Optional[int] = None,
+        partition_axis: int = 0,
+        use_binary_values: bool = False,
+        bias_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        weight_regularizer: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+        use_compression: bool = False,
+        use_binary_sparse_dense_matmul: bool = False,
+        **kwargs
+    ):
+        super(FullSparse, self).__init__(trainable=trainable, name=name, **kwargs)
+
+        # The bias initialization and weights initialization are set to match v1's implementation.
+        if bias_initializer is None:
+            bias_initializer = tf.constant_initializer(1 / output_size)
+        # Weights initialization is set to 0s. This is safe for full sparse layers because
+        # you are supposed to learn your embedding from the label.
+        if weight_initializer is None:
+            weight_initializer = tf.zeros_initializer()
+        self.weight_initializer = weight_initializer
+        self.bias_initializer = bias_initializer
+        self.output_size = output_size
+        # Default to a linear activation (None), as documented above; callers that
+        # want a non-linearity pass it explicitly.
+        self.activation = activation
+        self.use_sparse_grads = use_sparse_grads
+        self.num_partitions = num_partitions
+        if partition_axis != 0 and partition_axis != 1:
+            raise ValueError("partition_axis must be 0 or 1")
+        self.partition_axis = partition_axis
+        self.use_binary_values = use_binary_values
+        self.weight_regularizer = weight_regularizer
+        self.bias_regularizer = bias_regularizer
+        self._use_compression = use_compression
+        self._cast_indices_dtype = tf.int32 if self._use_compression else None
+        self.use_binary_sparse_dense_matmul = use_binary_sparse_dense_matmul
+
+    # `partitioner` is a callable such as tf.fixed_size_partitioner (TF1 exposes no
+    # public tf.Partitioner class to annotate with), or None for no partitioning.
+    def _make_weight_var(
+        self,
+        shape: Optional[List[int]],
+        partitioner: Optional[Callable],
+    ) -> None:
+        self.weight = self.add_variable(
+            "weight",
+            initializer=self.weight_initializer,
+            regularizer=self.weight_regularizer,
+            shape=shape,
+            dtype=self.dtype,
+            trainable=True,
+            partitioner=partitioner,
+        )
+
+    def build(
+        self,
+        input_shapes: Union[tf.TensorShape, Tuple[tf.TensorShape, ...], List[tf.TensorShape]],
+    ) -> None:
+        """
+        Creates the ``bias`` and ``weight`` Variables
+        of shape ``[output_size]`` and ``[input_size, output_size]`` respectively.
+        """
+
+        if isinstance(input_shapes, (list, tuple)):
+            input_shape = input_shapes[0]
+            is_compatible = True
+            for other_shape in input_shapes[1:]:
+                is_compatible &= input_shape.is_compatible_with(other_shape)
+            if not is_compatible:
+                raise ValueError("Input shapes %s are not compatible."
% input_shapes) + else: + input_shape = input_shapes + + self.bias = self.add_variable( + "bias", + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + shape=[ + self.output_size, + ], + dtype=self.dtype, + trainable=True, + ) + + partitioner = None + shape = [input_shape[1], self.output_size] + + # There is a 2gb limitation for each tensor because of protobuf. + # 2**30 is 1GB. 2 * (2**30) is 2GB. + dtype = tf.as_dtype(self.dtype) + num_partitions = 1 if self.num_partitions is None else self.num_partitions + in_shape = input_shape[1] + out_shape = self.output_size + + # when v2 behavior is disabled, in_shape is tf.Dimension. otherwise it is int. + if isinstance(in_shape, tf.Dimension): + in_shape = in_shape.value + + if in_shape is None: + raise ValueError( + "Input tensor should have shape." + " You can set it using twml.util.limit_sparse_tensor_size" + ) + + (split_dim, other_dim) = ( + (in_shape, out_shape) if self.partition_axis == 0 else (out_shape, in_shape) + ) + requested_size = ( + math.ceil(float(split_dim) / num_partitions) * other_dim * dtype.size + ) + if requested_size >= 1 << 31: + raise ValueError( + "Weight tensor partitions cannot be larger than 2GB.\n" + "Requested Dimensions(%d, %d) of type %s (%d bytes total) over %d partitions.\n" + "Possible solutions:\n" + "- reduce the params.output_size_bits\n" + "- reduce the output_size of the sparse_layer\n" + "- specify a larger num_partitions argument\n" + "- reduce input_size_bits" + % ( + in_shape, + self.output_size, + dtype.name, + requested_size, + num_partitions, + ) + ) + + if self.num_partitions: + partition_axis = int(self.partition_axis) + partitioner = tf.fixed_size_partitioner( + self.num_partitions, axis=partition_axis + ) + else: + # Regular variables do not like it when you pass both constant tensors and shape + if not callable(self.weight_initializer): + shape = None + + self._make_weight_var(shape, partitioner) + + self.built = True + + def compute_output_shape(self, input_shape: tf.TensorShape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + """ + raise NotImplementedError + + def call( + self, inputs: Union[List[tf.SparseTensor], Tuple[tf.SparseTensor]], **kwargs + ) -> tf.Tensor: # pylint: disable=unused-argument + """The logic of the layer lives here. + + Args: + inputs: + A SparseTensor or a list of SparseTensors. + If `inputs` is a list, all tensors must have same `dense_shape`. + + Returns: + - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. + - If `inputs` is a `list[SparseTensor`, then returns + `bias + add_n([sp_a * dense_b for sp_a in inputs])`. 
+ + """ + if isinstance(inputs, (list, tuple)): + if isinstance(self.use_binary_values, (list, tuple)): + use_binary_values = self.use_binary_values + else: + use_binary_values = [self.use_binary_values] * len(inputs) + + num_inputs = len(inputs) + if num_inputs != len(use_binary_values): + raise ValueError( + "#inputs is %d while #use_binary_values is %d" + % (num_inputs, len(use_binary_values)) + ) + + outputs = [] + for n in range(num_inputs): + outputs.append( + sparse_dense_matmul( + inputs[n], + self.weight, + self.use_sparse_grads, + use_binary_values[n], + name="sparse_mm_" + str(n), + partition_axis=self.partition_axis, + num_partitions=self.num_partitions, + compress_ids=self._use_compression, + cast_indices_dtype=self._cast_indices_dtype, + use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul, + ) + ) + outputs = tf.accumulate_n(outputs) + else: + if isinstance(self.use_binary_values, (list, tuple)): + raise ValueError( + "use_binary_values can not be %s when inputs is %s" + % (type(self.use_binary_values), type(inputs)) + ) + + outputs = sparse_dense_matmul( + inputs, + self.weight, + self.use_sparse_grads, + self.use_binary_values, + name="sparse_mm", + partition_axis=self.partition_axis, + num_partitions=self.num_partitions, + compress_ids=self._use_compression, + cast_indices_dtype=self._cast_indices_dtype, + use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul, + ) + + if self.bias is not None: + outputs = tf.nn.bias_add(outputs, self.bias) + + if self.activation is not None: + return self.activation(outputs) # pylint: disable=not-callable + return outputs - Returns: - - If `inputs` is `SparseTensor`, then returns `bias + inputs * dense_b`. - - If `inputs` is a `list[SparseTensor`, then returns - `bias + add_n([sp_a * dense_b for sp_a in inputs])`. +def full_sparse( + inputs: tf.SparseTensor, + output_size: int, + activation: Optional[Callable] = None, + bias_regularizer: Optional[Callable] = None, + weight_regularizer: Optional[Callable] = None, + bias_initializer: Optional[Callable] = None, + weight_initializer: Optional[Callable] = None, + trainable: bool = True, + name: Optional[str] = None, + reuse: Optional[bool] = None, + use_sparse_grads: bool = True, + num_partitions: Optional[int] = None, + partition_axis: int = 0, + use_binary_values: bool = False, + use_compression: bool = False, +) -> FullSparse: """ - if isinstance(inputs, (list, tuple)): + Functional interface for the sparsely-connected layer. 
- if isinstance(self.use_binary_values, (list, tuple)): - use_binary_values = self.use_binary_values - else: - use_binary_values = [self.use_binary_values] * len(inputs) - - num_inputs = len(inputs) - if num_inputs != len(use_binary_values): - raise ValueError("#inputs is %d while #use_binary_values is %d" - % (num_inputs, len(use_binary_values))) - - outputs = [] - for n in range(num_inputs): - outputs.append(sparse_dense_matmul(inputs[n], self.weight, - self.use_sparse_grads, - use_binary_values[n], - name='sparse_mm_' + str(n), - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul)) - outputs = tf.accumulate_n(outputs) - else: - - if isinstance(self.use_binary_values, (list, tuple)): - raise ValueError("use_binary_values can not be %s when inputs is %s" % - (type(self.use_binary_values), type(inputs))) - - outputs = sparse_dense_matmul(inputs, self.weight, - self.use_sparse_grads, - self.use_binary_values, - name='sparse_mm', - partition_axis=self.partition_axis, - num_partitions=self.num_partitions, - compress_ids=self._use_compression, - cast_indices_dtype=self._cast_indices_dtype, - use_binary_sparse_dense_matmul=self.use_binary_sparse_dense_matmul) - - if self.bias is not None: - outputs = tf.nn.bias_add(outputs, self.bias) - - if self.activation is not None: - return self.activation(outputs) # pylint: disable=not-callable - return outputs + Args: + inputs: + A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) + output_size: + Long or Integer, dimensionality of the output space. + weight_initializer: + Initializer function for the weight matrix. + activation: + Activation function (callable). Set it to None to maintain a linear activation. + bias_initializer: + Initializer function for the bias. + weight_regularizer: + Regularizer function for the weight matrix. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + bias_regularizer: + Regularizer function for the bias. + Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. + trainable: + Boolean, if `True` also add variables to the graph collection + ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable + `_). + name: + String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require ``reuse=True`` in such cases. + use_sparse_grads: + Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will + make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial + speed up at training time when input_size is large and optimizer handles sparse gradients + correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended + to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will + be large, so it's better to set it to `True` + num_partitions: + Number of partitions to use for the weight variable. Defaults to 1. + partition_axis: + If num_partitions is specified, the partition axis for the weight variable + Defaults to 0 (partition by row). + Must be 0 (row) or 1 (column) + use_binary_values: + Assume all non zero values are 1. Defaults to False. + This can improve training if used in conjunction with MDL. + use_compression: + Default False. 
Set True to enable data compression techniques for + optimization of network traffic for distributed training. + Returns: + Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. + """ + dtype = None + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + dtype = inputs.dtype.base_dtype -def full_sparse( - inputs, output_size, - input_size=None, - activation=None, - bias_regularizer=None, - weight_regularizer=None, - bias_initializer=None, - weight_initializer=None, - trainable=True, - name=None, - reuse=None, - use_sparse_grads=True, - num_partitions=None, - partition_axis=0, - use_binary_values=False, - use_compression=False): - """Functional interface for the sparsely-connected layer. - - Arguments: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - output_size: - Long or Integer, dimensionality of the output space. - weight_initializer: - Initializer function for the weight matrix. - activation: - Activation function (callable). Set it to None to maintain a linear activation. - bias_initializer: - Initializer function for the bias. - weight_regularizer: - Regularizer function for the weight matrix. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - bias_regularizer: - Regularizer function for the bias. - Ensure to add tf.losses.get_regularization_loss() to your loss for this to take effect. - trainable: - Boolean, if `True` also add variables to the graph collection - ``GraphKeys.TRAINABLE_VARIABLES`` (see `tf.Variable - `_). - name: - String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require ``reuse=True`` in such cases. - use_sparse_grads: - Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will - make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial - speed up at training time when input_size is large and optimizer handles sparse gradients - correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended - to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will - be large, so it's better to set it to `True` - num_partitions: - Number of partitions to use for the weight variable. Defaults to 1. - partition_axis: - If num_partitions is specified, the partition axis for the weight variable - Defaults to 0 (partition by row). - Must be 0 (row) or 1 (column) - use_binary_values: - Assume all non zero values are 1. Defaults to False. - This can improve training if used in conjunction with MDL. - use_compression: - Default False. Set True to enable data compression techniques for - optimization of network traffic for distributed training. - Returns: - Outputs a ``tf.Tensor`` of size ``[batch_size x output_size]``. - """ - # TODO - remove input_size warning. 
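To make the 2 GB partition guard in FullSparse.build concrete, a worked example of the requested_size arithmetic with hypothetical dimensions:

import math

# Hypothetical: float32 weights (4 bytes), 2**28 input rows, 64 outputs,
# partitioned row-wise into 8 shards.
in_shape, out_shape, num_partitions, dtype_size = 2**28, 64, 8, 4
requested_size = math.ceil(in_shape / num_partitions) * out_shape * dtype_size
print(requested_size)              # 8589934592 bytes per shard
print(requested_size >= 1 << 31)   # True -> build() raises ValueError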
- if input_size: - raise ValueError('input_size is deprecated - it is now \ - automatically inferred from your input.') - - dtype = None - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - dtype = inputs.dtype.base_dtype - - if isinstance(inputs, (list, tuple)): - inputs = [inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs] - dtype = inputs[0].dtype.base_dtype - - layer = FullSparse(output_size=output_size, - activation=activation, - trainable=trainable, - name=name, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - weight_regularizer=weight_regularizer, - bias_regularizer=bias_regularizer, - dtype=dtype, - _scope=name, - _reuse=reuse, - use_sparse_grads=use_sparse_grads, - num_partitions=num_partitions, - partition_axis=partition_axis, - use_compression=use_compression, - use_binary_values=use_binary_values) - return layer(inputs) + if isinstance(inputs, (list, tuple)): + inputs = [ + inp.to_tf() if isinstance(inp, twml.SparseTensor) else inp for inp in inputs + ] + dtype = inputs[0].dtype.base_dtype + + layer = FullSparse( + output_size=output_size, + activation=activation, + trainable=trainable, + name=name, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + weight_regularizer=weight_regularizer, + bias_regularizer=bias_regularizer, + dtype=dtype, + _scope=name, + _reuse=reuse, + use_sparse_grads=use_sparse_grads, + num_partitions=num_partitions, + partition_axis=partition_axis, + use_compression=use_compression, + use_binary_values=use_binary_values, + ) + return layer(inputs) diff --git a/twml/twml/layers/isotonic.py b/twml/twml/layers/isotonic.py index 7113f7af4..9c1bed62a 100644 --- a/twml/twml/layers/isotonic.py +++ b/twml/twml/layers/isotonic.py @@ -3,74 +3,93 @@ Contains the Isotonic Layer """ -from .layer import Layer +from typing import Optional import libtwml import numpy as np +import tensorflow.compat.v1 as tf + +from .layer import Layer class Isotonic(Layer): - """ - This layer is created by the IsotonicCalibrator. - Typically it is used intead of sigmoid activation on the output unit. - - Arguments: - n_unit: - number of input units to the layer (same as number of output units). - n_bin: - number of bins used for isotonic calibration. - More bins means a more precise isotonic function. - Less bins means a more regularized isotonic function. - xs_input: - A tensor containing the boundaries of the bins. - ys_input: - A tensor containing calibrated values for the corresponding bins. - - Output: - output: - A layer containing calibrated probabilities with same shape and size as input. - Expected Sizes: - xs_input, ys_input: - [n_unit, n_bin]. - Expected Types: - xs_input, ys_input: - same as input. - """ - - def __init__(self, n_unit, n_bin, xs_input=None, ys_input=None, **kwargs): - super(Isotonic, self).__init__(**kwargs) - - self._n_unit = n_unit - self._n_bin = n_bin - - self.xs_input = np.empty([n_unit, n_bin], dtype=np.float32) if xs_input is None else xs_input - self.ys_input = np.empty([n_unit, n_bin], dtype=np.float32) if ys_input is None else ys_input - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + """ + This layer is created by the IsotonicCalibrator. + Typically it is used instead of sigmoid activation on the output unit. Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. 
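A usage sketch for the refactored full_sparse wrapper above; the tensors are toy values, and it assumes twml, libtwml, and the deepbird sparse ops are importable:

import tensorflow.compat.v1 as tf
from twml.layers import full_sparse

tf.disable_v2_behavior()

# A 2 x 4 sparse binary batch projected to 3 outputs.
sp = tf.SparseTensor(indices=[[0, 0], [1, 3]], values=[1.0, 1.0], dense_shape=[2, 4])
logits = full_sparse(sp, output_size=3, use_binary_values=True, name="sparse_proj")
# A list of SparseTensors sharing one dense_shape is also accepted; their
# projections are summed with tf.accumulate_n inside FullSparse.call.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(logits).shape)  # (2, 3)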
- + n_unit: + number of input units to the layer (same as number of output units). + n_bin: + number of bins used for isotonic calibration. + More bins means a more precise isotonic function. + Less bins means a more regularized isotonic function. + xs_input: + A tensor containing the boundaries of the bins. + ys_input: + A tensor containing calibrated values for the corresponding bins. + + Output: + output: + A layer containing calibrated probabilities with same shape and size as input. + Expected Sizes: + xs_input, ys_input: + [n_unit, n_bin]. + Expected Types: + xs_input, ys_input: + same as input. """ - raise NotImplementedError - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the variables of the layer.""" - - self.built = True - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: input tensor(s). - - Returns: - The output from the layer - """ - calibrate_op = libtwml.ops.isotonic_calibration(inputs, self.xs_input, self.ys_input) - return calibrate_op + def __init__( + self, + n_unit: int, + n_bin: int, + xs_input: Optional[np.ndarray] = None, + ys_input: Optional[np.ndarray] = None, + **kwargs, + ): + super(Isotonic, self).__init__(**kwargs) + + self._n_unit = n_unit + self._n_bin = n_bin + + self.xs_input = ( + np.empty([n_unit, n_bin], dtype=np.float32) + if xs_input is None + else xs_input + ) + self.ys_input = ( + np.empty([n_unit, n_bin], dtype=np.float32) + if ys_input is None + else ys_input + ) + + def compute_output_shape(self, input_shape: tf.TensorShape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + + """ + raise NotImplementedError + + def build(self, input_shape: tf.TensorShape): # pylint: disable=unused-argument + """Creates the variables of the layer.""" + self.built = True + + def call(self, inputs: tf.Tensor, **kwargs): # pylint: disable=unused-argument + """The logic of the layer lives here. + + Args: + inputs: input tensor(s). + + Returns: + The output from the layer + """ + calibrate_op = libtwml.ops.isotonic_calibration( + inputs, self.xs_input, self.ys_input + ) + return calibrate_op diff --git a/twml/twml/layers/layer.py b/twml/twml/layers/layer.py index c1b00eb13..d798c9e5a 100644 --- a/twml/twml/layers/layer.py +++ b/twml/twml/layers/layer.py @@ -2,49 +2,52 @@ """ Implementing a base layer for twml """ +from typing import List, Union + import tensorflow.compat.v1 as tf from tensorflow.python.layers import base class Layer(base.Layer): - """ - Base Layer implementation for twml. - Overloads `twml.layers.Layer - `_ - from tensorflow and adds a couple of custom methods. - """ - - @property - def init(self): - """ - Return initializer ops. By default returns tf.no_op(). - This method is overwritten by classes like twml.layers.MDL, which - uses a HashTable internally, that must be initialized with its own op. """ - return tf.no_op() - - def call(self, inputs, **kwargs): - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). - **kwargs: - additional keyword arguments. - - Returns: - Output tensor(s). + Base Layer implementation for twml. + Overloads `twml.layers.Layer + `_ + from tensorflow and adds a couple of custom methods. 
""" - raise NotImplementedError - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError + @property + def init(self) -> tf.Operation: + """ + Return initializer ops. By default returns tf.no_op(). + This method is overwritten by classes like twml.layers.MDL, which + uses a HashTable internally, that must be initialized with its own op. + """ + return tf.no_op() + + def call( + self, inputs: Union[tf.Tensor, List[tf.Tensor]], **kwargs + ) -> tf.Tensor: # pylint: disable=arguments-differ + """The logic of the layer lives here. + + Args: + inputs: + input tensor(s). + **kwargs: + additional keyword arguments. + + Returns: + Output tensor(s). + """ + raise NotImplementedError + + def compute_output_shape(self, input_shape: tf.TensorShape): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raise NotImplementedError. + """ + raise NotImplementedError diff --git a/twml/twml/layers/mdl.py b/twml/twml/layers/mdl.py index cf4018afa..7202c0a45 100644 --- a/twml/twml/layers/mdl.py +++ b/twml/twml/layers/mdl.py @@ -4,253 +4,279 @@ """ -from .layer import Layer -from .partition import Partition -from .stitch import Stitch +from typing import Optional import libtwml import numpy as np import tensorflow.compat.v1 as tf + import twml +from .layer import Layer +from .partition import Partition +from .stitch import Stitch + class MDL(Layer): # noqa: T000 - """ - MDL layer is constructed by MDLCalibrator after accumulating data - and performing minimum description length (MDL) calibration. - - MDL takes sparse continuous features and converts then to sparse - binary features. Each binary output feature is associated to an MDL bin. - Each MDL input feature is converted to n_bin bins. - Each MDL calibration tries to find bin delimiters such that the number of features values - per bin is roughly equal (for each given MDL feature). - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, **kwargs): """ - Creates a non-initialized `MDL` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during MDL calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of MDL bins used for MDL calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that MDL discretizes and knows about. 
- The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: MDL vs non-MDL - 2. transate the MDL features into a hash_feature ID that MDL understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for MDL. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the MDL features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). + MDL layer is constructed by MDLCalibrator after accumulating data + and performing minimum description length (MDL) calibration. + + MDL takes sparse continuous features and converts then to sparse + binary features. Each binary output feature is associated to an MDL bin. + Each MDL input feature is converted to n_bin bins. + Each MDL calibration tries to find bin delimiters such that the number of features values + per bin is roughly equal (for each given MDL feature). + Note that if an input feature is rarely used, so will its associated output bin/features. """ - super(MDL, self).__init__(**kwargs) - tf.logging.warning("MDL will be deprecated. Please use PercentileDiscretizer instead") - - max_mdl_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - self._hash_keys_initializer = tf.constant_initializer( - hash_keys if hash_keys is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._hash_values_initializer = tf.constant_initializer( - hash_values if hash_values is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_ids_initializer = tf.constant_initializer( - bin_ids if bin_ids is not None - else np.empty(max_mdl_feature, dtype=np.int64), - dtype=np.int64 - ) - self._bin_values_initializer = tf.constant_initializer( - bin_values if bin_values is not None - else np.empty(max_mdl_feature, dtype=np.float32), - dtype=np.float32 - ) - self._feature_offsets_initializer = tf.constant_initializer( - feature_offsets if feature_offsets is not None - else np.empty(n_feature, dtype=np.int64), - dtype=np.int64 - ) - - # note that calling build here is an exception as typically __call__ would call build(). - # We call it here because we need to initialize hash_map. - # Also note that the variable_scope is set by add_variable in build() - if not self.built: - self.build(input_shape=None) - - self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer: - hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. 
- """ - - # build layers - self.partition = Partition() - self.stitch = Stitch() - - # build variables - - hash_keys = self.add_variable( - 'hash_keys', - initializer=self._hash_keys_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - hash_values = self.add_variable( - 'hash_values', - initializer=self._hash_values_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # hashmap converts known features into range [0, n_feature) - initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) - self.hash_map = tf.lookup.StaticHashTable(initializer, -1) - - self.bin_ids = self.add_variable( - 'bin_ids', - initializer=self._bin_ids_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.int64, - trainable=False) - - self.bin_values = self.add_variable( - 'bin_values', - initializer=self._bin_values_initializer, - shape=[self._n_feature * (self._n_bin + 1)], - dtype=tf.float32, - trainable=False) - - self.feature_offsets = self.add_variable( - 'feature_offsets', - initializer=self._feature_offsets_initializer, - shape=[self._n_feature], - dtype=tf.int64, - trainable=False) - - # make sure this is last - self.built = True - - def call(self, inputs, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements MDL inference where inputs are intersected with a hash_map. - Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. - This SparseTensor is then joined with the original inputs SparseTensor, - but only for the inputs keys that did not get discretized. - - Args: - inputs: A 2D SparseTensor that is input to MDL for discretization. - It has a dense_shape of [batch_size, input_size] - name: A name for the operation (optional). - Returns: - A `SparseTensor` of the same type as `inputs`. - Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
- """ - if isinstance(inputs, tf.SparseTensor): - inputs = twml.SparseTensor.from_tf(inputs) - - assert(isinstance(inputs, twml.SparseTensor)) - - # sparse column indices - ids = inputs.ids - # sparse row indices - keys = inputs.indices - # sparse values - vals = inputs.values - - # get intersect(keys, hash_map) - hashed_keys = self.hash_map.lookup(keys) - found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64)) - partition_ids = tf.cast(found, tf.int32) - - vals, key, indices = self.partition(partition_ids, vals, tf.where(found, hashed_keys, keys)) - non_mdl_keys, mdl_in_keys = key - non_mdl_vals, mdl_in_vals = vals - - self.non_mdl_keys = non_mdl_keys - - # run MDL on the keys/values it knows about - mdl_keys, mdl_vals = libtwml.ops.mdl(mdl_in_keys, mdl_in_vals, self.bin_ids, self.bin_values, - self.feature_offsets) - - # handle output ID conflicts - mdl_size = tf.size(self.bin_ids, out_type=tf.int64) - non_mdl_size = tf.subtract(self.output_size, mdl_size) - non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size) - - # Stitch the keys and values from mdl and non mdl indices back, with help - # of the Stitch Layer - - # out for inference checking - self.mdl_out_keys = mdl_keys - - concat_data = self.stitch([non_mdl_vals, mdl_vals], - [non_mdl_keys, mdl_keys], - indices) - - concat_vals, concat_keys = concat_data - - # Generate output shape using _compute_output_shape - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self.output_size] - return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf() - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. - - """ - raise NotImplementedError + def __init__( + self, + n_feature: int, + n_bin: int, + out_bits: int, + bin_values: Optional[np.ndarray] = None, + hash_keys: Optional[np.ndarray] = None, + hash_values: Optional[np.ndarray] = None, + bin_ids: Optional[np.ndarray] = None, + feature_offsets: Optional[np.ndarray] = None, + **kwargs + ): + """ + Creates a non-initialized `MDL` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during MDL calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of MDL bins used for MDL calibration. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + hash_keys: + contains the features ID that MDL discretizes and knows about. + The hash map (hash_keys->hash_values) is used for two reasons: + 1. divide inputs into two feature spaces: MDL vs non-MDL + 2. transate the MDL features into a hash_feature ID that MDL understands. + The hash_map is expected to contain n_feature items. + hash_values: + translates the feature IDs into hash_feature IDs for MDL. 
+ bin_ids: + a 1D Tensor of size n_feature * n_bin + 1 which contains + unique IDs to which the MDL features will be translated to. + For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce + the most efficient output space. + bin_values: + a 1D Tensor aligned with bin_ids. + For a given hash_feature ID j, it's value bin's are indexed between + `j*n_bin` and `j*n_bin + n_bin-1`. + As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j + and a inputs value between + `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. + feature_offsets: + a 1D Tensor specifying the starting location of bins for a given feature id. + For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). + """ + super(MDL, self).__init__(**kwargs) + tf.logging.warning( + "MDL will be deprecated. Please use PercentileDiscretizer instead" + ) + + max_mdl_feature = n_feature * (n_bin + 1) + self._n_feature = n_feature + self._n_bin = n_bin + + self._hash_keys_initializer = tf.constant_initializer( + hash_keys if hash_keys is not None else np.empty(n_feature, dtype=np.int64), + dtype=np.int64, + ) + self._hash_values_initializer = tf.constant_initializer( + hash_values + if hash_values is not None + else np.empty(n_feature, dtype=np.int64), + dtype=np.int64, + ) + self._bin_ids_initializer = tf.constant_initializer( + bin_ids + if bin_ids is not None + else np.empty(max_mdl_feature, dtype=np.int64), + dtype=np.int64, + ) + self._bin_values_initializer = tf.constant_initializer( + bin_values + if bin_values is not None + else np.empty(max_mdl_feature, dtype=np.float32), + dtype=np.float32, + ) + self._feature_offsets_initializer = tf.constant_initializer( + feature_offsets + if feature_offsets is not None + else np.empty(n_feature, dtype=np.int64), + dtype=np.int64, + ) + + # note that calling build here is an exception as typically __call__ would call build(). + # We call it here because we need to initialize hash_map. + # Also note that the variable_scope is set by add_variable in build() + if not self.built: + self.build(input_shape=None) + + self.output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) + + def build( + self, input_shape: Optional[tf.TensorShape] = None + ): # pylint: disable=unused-argument + """ + Creates the variables of the layer: + hash_keys, hash_values, bin_ids, bin_values, feature_offsets and self.output_size. 
+ """ + + # build layers + self.partition = Partition() + self.stitch = Stitch() + + # build variables + + hash_keys = self.add_variable( + "hash_keys", + initializer=self._hash_keys_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + hash_values = self.add_variable( + "hash_values", + initializer=self._hash_values_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + # hashmap converts known features into range [0, n_feature) + initializer = tf.lookup.KeyValueTensorInitializer(hash_keys, hash_values) + self.hash_map = tf.lookup.StaticHashTable(initializer, -1) + + self.bin_ids = self.add_variable( + "bin_ids", + initializer=self._bin_ids_initializer, + shape=[self._n_feature * (self._n_bin + 1)], + dtype=tf.int64, + trainable=False, + ) + + self.bin_values = self.add_variable( + "bin_values", + initializer=self._bin_values_initializer, + shape=[self._n_feature * (self._n_bin + 1)], + dtype=tf.float32, + trainable=False, + ) + + self.feature_offsets = self.add_variable( + "feature_offsets", + initializer=self._feature_offsets_initializer, + shape=[self._n_feature], + dtype=tf.int64, + trainable=False, + ) + + # make sure this is last + self.built = True + + def call(self, inputs: twml.SparseTensor, **kwargs) -> twml.SparseTensor: + """Looks up `keys` in a table, outputs the corresponding values. + + Implements MDL inference where inputs are intersected with a hash_map. + Part of the inputs are discretized using twml.mdl to produce a mdl_output SparseTensor. + This SparseTensor is then joined with the original inputs SparseTensor, + but only for the inputs keys that did not get discretized. + + Args: + inputs: A 2D SparseTensor that is input to MDL for discretization. + It has a dense_shape of [batch_size, input_size] + name: A name for the operation (optional). + + Returns: + A `SparseTensor` of the same type as `inputs`. + Its dense_shape is [shape_input.dense_shape[0], 1 << output_bits]. 
+        """
+        # Accept tf.SparseTensor inputs by converting them first, then validate.
+        if isinstance(inputs, tf.SparseTensor):
+            inputs = twml.SparseTensor.from_tf(inputs)
+
+        assert isinstance(inputs, twml.SparseTensor)
+
+        # sparse column indices
+        ids = inputs.ids
+        # sparse row indices
+        keys = inputs.indices
+        # sparse values
+        vals = inputs.values
+
+        # get intersect(keys, hash_map)
+        hashed_keys = self.hash_map.lookup(keys)
+        found = tf.not_equal(hashed_keys, tf.constant(-1, tf.int64))
+        partition_ids = tf.cast(found, tf.int32)
+
+        vals, key, indices = self.partition(
+            partition_ids, vals, tf.where(found, hashed_keys, keys)
+        )
+        non_mdl_keys, mdl_in_keys = key
+        non_mdl_vals, mdl_in_vals = vals
+
+        self.non_mdl_keys = non_mdl_keys
+
+        # run MDL on the keys/values it knows about
+        mdl_keys, mdl_vals = libtwml.ops.mdl(
+            mdl_in_keys,
+            mdl_in_vals,
+            self.bin_ids,
+            self.bin_values,
+            self.feature_offsets,
+        )
+
+        # handle output ID conflicts
+        mdl_size = tf.size(self.bin_ids, out_type=tf.int64)
+        non_mdl_size = tf.subtract(self.output_size, mdl_size)
+        non_mdl_keys = tf.add(tf.floormod(non_mdl_keys, non_mdl_size), mdl_size)
+
+        # Stitch the keys and values from mdl and non mdl indices back, with help
+        # of the Stitch Layer
+
+        # out for inference checking
+        self.mdl_out_keys = mdl_keys
+
+        concat_data = self.stitch(
+            [non_mdl_vals, mdl_vals], [non_mdl_keys, mdl_keys], indices
+        )
+
+        concat_vals, concat_keys = concat_data
+
+        # Generate output shape using _compute_output_shape
+        batch_size = tf.to_int64(inputs.dense_shape[0])
+        output_shape = [batch_size, self.output_size]
+        return twml.SparseTensor(ids, concat_keys, concat_vals, output_shape).to_tf()
+
+    def compute_output_shape(self, input_shape: tf.TensorShape):
+        """Computes the output shape of the layer given the input shape.
+
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
+
+        Raises NotImplementedError.
+        """
+        raise NotImplementedError
diff --git a/twml/twml/layers/partition.py b/twml/twml/layers/partition.py
index 0e7c85f18..fb2c2f846 100644
--- a/twml/twml/layers/partition.py
+++ b/twml/twml/layers/partition.py
@@ -3,72 +3,86 @@
 """
 
 
-from .layer import Layer
+from typing import List, Union
 
 import tensorflow.compat.v1 as tf
 
+from .layer import Layer
 
-class Partition(Layer):
-  """
-  This layer implements:
-
-  .. code-block:: python
-
-    tf.dynamic_partition(input_vals, partition_ids, self.partitions)
-
-  Input:
-    partitions:
-      the number of partitions which we will divide the hashmap keys/bvalues
-
-  Output:
-    A layer that performs partitioning
-  """
 
-  def __init__(self, partitions=2, **kwargs):
-    self.partitions = partitions
-    super(Partition, self).__init__(**kwargs)
+class Partition(Layer):
+    """
+    This layer implements:
 
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
+    .. code-block:: python
 
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`. It need not
-        be fully defined (e.g. the batch size may be unknown).
+      tf.dynamic_partition(input_vals, partition_ids, self.partitions)
 
-    Raises NotImplementedError.
+    Input:
+      partitions:
+        the number of partitions into which we will divide the hashmap keys/values
 
+    Output:
+      A layer that performs partitioning
     """
-    raise NotImplementedError
-
-  def call(self, partition_ids, input_vals, input_keys, **kwargs):
-    """This layer is responsible for partitioning the values/keys of a hashmap
-
-    Arguments:
-      partition_ids:
-        Tensor that is equivalent to boolean (int32).
-      input_vals:
-        Tensor that represents the values of the hashmap(float).
-      input_keys:
-        Tensor that represents the keys of the hashmap(float)
-
-    Returns:
-      The output of the partition layer, which is a list of lists which looks
-      something like:
-
-      .. code-block:: python
-
-        [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]]
-
-      where:
-        vals_x:
-          values of the hashmap for partition x
-        keys_x:
-          keys of the hashmap for partition x
-        indices_x:
-          indices of the hashmap for partition x
-    """
-    partioned_val = tf.dynamic_partition(input_vals, partition_ids, self.partitions)
-    partioned_keys = tf.dynamic_partition(input_keys, partition_ids, self.partitions)
-    partioned_indices = tf.dynamic_partition(tf.range(tf.shape(partition_ids)[0]),
-                                             tf.cast(partition_ids, tf.int32), self.partitions)
-    return [partioned_val, partioned_keys, partioned_indices]
+
+    def __init__(self, partitions: int = 2, **kwargs):
+        self.partitions = partitions
+        super(Partition, self).__init__(**kwargs)
+
+    def compute_output_shape(
+        self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]
+    ):
+        """Computes the output shape of the layer given the input shape.
+
+        Args:
+            input_shape: A (possibly nested tuple of) `TensorShape`. It need not
+                be fully defined (e.g. the batch size may be unknown).
+
+        Raises NotImplementedError.
+        """
+        raise NotImplementedError
+
+    def call(
+        self,
+        partition_ids: tf.Tensor,
+        input_vals: tf.Tensor,
+        input_keys: tf.Tensor,
+        **kwargs
+    ) -> List[List[tf.Tensor]]:
+        """This layer is responsible for partitioning the values/keys of a hashmap.
+
+        Args:
+            partition_ids:
+                Tensor that is equivalent to boolean (int32).
+            input_vals:
+                Tensor that represents the values of the hashmap (float).
+            input_keys:
+                Tensor that represents the keys of the hashmap (float).
+
+        Returns:
+            The output of the partition layer, which is a list of lists which looks
+            something like:
+
+            .. code-block:: python
+
+                [[vals_0, vals_1], [keys_0, keys_1], [indices_0, indices_1]]
+
+            where:
+                vals_x:
+                    values of the hashmap for partition x
+                keys_x:
+                    keys of the hashmap for partition x
+                indices_x:
+                    indices of the hashmap for partition x
+        """
+        partitioned_vals = tf.dynamic_partition(
+            input_vals, partition_ids, self.partitions
+        )
+        partitioned_keys = tf.dynamic_partition(
+            input_keys, partition_ids, self.partitions
+        )
+        partitioned_indices = tf.dynamic_partition(
+            tf.range(tf.shape(partition_ids)[0]),
+            tf.cast(partition_ids, tf.int32),
+            self.partitions,
+        )
+        return [partitioned_vals, partitioned_keys, partitioned_indices]
diff --git a/twml/twml/layers/percentile_discretizer.py b/twml/twml/layers/percentile_discretizer.py
index 55bb4de8c..ab1734aec 100644
--- a/twml/twml/layers/percentile_discretizer.py
+++ b/twml/twml/layers/percentile_discretizer.py
@@ -4,206 +4,240 @@
 """
 
+from typing import Optional, Tuple, Union
+
 import libtwml
 import numpy as np
 import tensorflow.compat.v1 as tf
+
 import twml
 from twml.layers import Layer
 
 
 class PercentileDiscretizer(Layer):
-  """
-  PercentileDiscretizer layer is constructed by PercentileDiscretizerCalibrator after
-  accumulating data and performing percentile bucket calibration.
-
-  PercentileDiscretizer takes sparse continuous features and converts then to sparse
-  binary features. Each binary output feature is associated to an PercentileDiscretizer bin.
-  Each PercentileDiscretizer input feature is converted to n_bin bins.
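A standalone sketch of the tf.dynamic_partition split that Partition.call above performs, with toy values (partition 1 holds the keys found in the hash map):

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

partition_ids = tf.constant([1, 0, 1], tf.int32)  # found-mask from the hash map
input_vals = tf.constant([0.5, 1.5, 2.5])
parts = tf.dynamic_partition(input_vals, partition_ids, num_partitions=2)

with tf.Session() as sess:
    print(sess.run(parts))  # [array([1.5], dtype=float32), array([0.5, 2.5], dtype=float32)]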
- Each PercentileDiscretizer calibration tries to find bin delimiters such - that the number of features values per bin is roughly equal (for - each given PercentileDiscretizer feature). In other words, bins are calibrated to be approx. - equiprobable, according to the given calibration data. - Note that if an input feature is rarely used, so will its associated output bin/features. - """ - - def __init__( - self, - n_feature, n_bin, out_bits, - bin_values=None, hash_keys=None, hash_values=None, - bin_ids=None, feature_offsets=None, num_parts=1, cost_per_unit=100, **kwargs): - """ - Creates a non-initialized `PercentileDiscretizer` object. - Before using the table you will have to initialize it. After initialization - the table will be immutable. - - If there are no calibrated features, then the discretizer will only apply - twml.util.limit_bits to the the feature keys (aka "feature_ids"). Essentially, - the discretizer will be a "no-operation", other than obeying `out_bits` - - Parent class args: - see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) - for documentation of parent class arguments. - - Required args: - n_feature: - number of unique features accumulated during PercentileDiscretizer calibration. - This is the number of features in the hash map. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - n_bin: - number of PercentileDiscretizer bins used for PercentileDiscretizer calibration. - Used to initialize bin_values, hash_keys, hash_values, - bin_ids, bin_values and feature_offsets. - out_bits: - Determines the maximum value for output feature IDs. - The dense_shape of the SparseTensor returned by lookup(x) - will be [x.shape[0], 1 << output_bits]. - - Optional args: - hash_keys: - contains the features ID that PercentileDiscretizer discretizes and knows about. - The hash map (hash_keys->hash_values) is used for two reasons: - 1. divide inputs into two feature spaces: - PercentileDiscretizer vs non-PercentileDiscretizer - 2. transate the PercentileDiscretizer features into a hash_feature ID that - PercentileDiscretizer understands. - The hash_map is expected to contain n_feature items. - hash_values: - translates the feature IDs into hash_feature IDs for PercentileDiscretizer. - bin_ids: - a 1D Tensor of size n_feature * n_bin + 1 which contains - unique IDs to which the PercentileDiscretizer features will be translated to. - For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce - the most efficient output space. - bin_values: - a 1D Tensor aligned with bin_ids. - For a given hash_feature ID j, it's value bin's are indexed between - `j*n_bin` and `j*n_bin + n_bin-1`. - As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j - and a inputs value between - `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. - feature_offsets: - a 1D Tensor specifying the starting location of bins for a given feature id. - For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). 
- """ - - super(PercentileDiscretizer, self).__init__(**kwargs) - - if not self.built: - self.build(input_shape=None) - - max_discretizer_feature = n_feature * (n_bin + 1) - self._n_feature = n_feature - self._n_bin = n_bin - - # build variables - self._out_bits = out_bits - self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) - self._hash_keys = (hash_keys if hash_keys is not None else - np.empty(n_feature, dtype=np.int64)) - self._hash_values = (hash_values if hash_values is not None else - np.empty(n_feature, dtype=np.int64)) - self._bin_ids = (bin_ids if bin_ids is not None else - np.empty(max_discretizer_feature, dtype=np.int64)) - self._bin_values = (bin_values if bin_values is not None else - np.empty(max_discretizer_feature, dtype=np.float32)) - self._feature_offsets = (feature_offsets if feature_offsets is not None else - np.empty(n_feature, dtype=np.int64)) - self.num_parts = num_parts - self.cost_per_unit = cost_per_unit - - def build(self, input_shape): # pylint: disable=unused-argument - """ - Creates the variables of the layer """ - self.built = True - - def call(self, inputs, keep_inputs=False, **kwargs): - """Looks up `keys` in a table, outputs the corresponding values. - - Implements PercentileDiscretizer inference where inputs are intersected with a hash_map. - Input features that were not calibrated have their feature IDs truncated, so as - to be less than 1< 0: - discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2( - input_ids=keys, # inc key assigned to feature_id, or -1 - input_vals=vals, # the observed feature values - bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange - bin_vals=self._bin_values, # bin boundaries - feature_offsets=self._feature_offsets, # 0 : nbin_1 : max_feat - output_bits=self._out_bits, - feature_ids=tf.make_tensor_proto(self._hash_keys), # feature ids to build internal hash map - feature_indices=tf.make_tensor_proto(self._hash_values), # keys associated w/ feat. indices - start_compute=tf.constant(0, shape=[], dtype=tf.int64), - end_compute=tf.constant(-1, shape=[], dtype=tf.int64), - cost_per_unit=self.cost_per_unit - ) - else: - discretizer_keys = twml.util.limit_bits(keys, self._out_bits) - discretizer_vals = vals - # don't 2x the input. - keep_inputs = False - - batch_size = tf.to_int64(inputs.dense_shape[0]) - output_shape = [batch_size, self._output_size] - - output = twml.SparseTensor(ids, discretizer_keys, discretizer_vals, output_shape).to_tf() - - if keep_inputs: - # Note the non-discretized features will end up doubled, - # since these are already in `output` - # handle output ID conflicts - mdl_size = self._n_feature * (self._n_bin + 1) - non_mdl_size = tf.subtract(self._output_size, mdl_size) - input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size) - - new_input = twml.SparseTensor( - ids=ids, indices=input_keys, values=vals, dense_shape=output_shape).to_tf() - - # concatenate discretizer output with original input - sparse_add = tf.sparse_add(new_input, output) - output = tf.SparseTensor(sparse_add.indices, sparse_add.values, output_shape) - - return output - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raises NotImplementedError. 
- - """ - raise NotImplementedError + def __init__( + self, + n_feature: int, + n_bin: int, + out_bits: int, + bin_values: Optional[tf.Tensor] = None, + hash_keys: Optional[tf.Tensor] = None, + hash_values: Optional[tf.Tensor] = None, + bin_ids: Optional[tf.Tensor] = None, + feature_offsets: Optional[tf.Tensor] = None, + num_parts: int = 1, + cost_per_unit: int = 100, + **kwargs + ): + """ + Creates a non-initialized `PercentileDiscretizer` object. + Before using the table you will have to initialize it. After initialization + the table will be immutable. + + If there are no calibrated features, then the discretizer will only apply + twml.util.limit_bits to the the feature keys (aka "feature_ids"). Essentially, + the discretizer will be a "no-operation", other than obeying `out_bits` + + Parent class args: + see [tf.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/layers/Layer) + for documentation of parent class arguments. + + Required args: + n_feature: + number of unique features accumulated during PercentileDiscretizer calibration. + This is the number of features in the hash map. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + n_bin: + number of PercentileDiscretizer bins used for PercentileDiscretizer calibration. + Used to initialize bin_values, hash_keys, hash_values, + bin_ids, bin_values and feature_offsets. + out_bits: + Determines the maximum value for output feature IDs. + The dense_shape of the SparseTensor returned by lookup(x) + will be [x.shape[0], 1 << output_bits]. + + Optional args: + hash_keys: + contains the features ID that PercentileDiscretizer discretizes and knows about. + The hash map (hash_keys->hash_values) is used for two reasons: + 1. divide inputs into two feature spaces: + PercentileDiscretizer vs non-PercentileDiscretizer + 2. transate the PercentileDiscretizer features into a hash_feature ID that + PercentileDiscretizer understands. + The hash_map is expected to contain n_feature items. + hash_values: + translates the feature IDs into hash_feature IDs for PercentileDiscretizer. + bin_ids: + a 1D Tensor of size n_feature * n_bin + 1 which contains + unique IDs to which the PercentileDiscretizer features will be translated to. + For example, tf.Tensor(np.arange(n_feature * n_bin)) would produce + the most efficient output space. + bin_values: + a 1D Tensor aligned with bin_ids. + For a given hash_feature ID j, it's value bin's are indexed between + `j*n_bin` and `j*n_bin + n_bin-1`. + As such, bin_ids[j*n_bin+i] is translated from a hash_feature ID of j + and a inputs value between + `bin_values[j*n_bin + i]` and `bin_values[j*n_bin+i+1]`. + feature_offsets: + a 1D Tensor specifying the starting location of bins for a given feature id. + For example, tf.Tensor(np.arange(0, bin_values.size, n_bin, dtype='int64')). 
+ """ + + super(PercentileDiscretizer, self).__init__(**kwargs) + + if not self.built: + self.build(input_shape=None) + + max_discretizer_feature = n_feature * (n_bin + 1) + self._n_feature = n_feature + self._n_bin = n_bin + + # build variables + self._out_bits = out_bits + self._output_size = tf.convert_to_tensor(1 << out_bits, tf.int64) + self._hash_keys = ( + hash_keys if hash_keys is not None else np.empty(n_feature, dtype=np.int64) + ) + self._hash_values = ( + hash_values + if hash_values is not None + else np.empty(n_feature, dtype=np.int64) + ) + self._bin_ids = ( + bin_ids + if bin_ids is not None + else np.empty(max_discretizer_feature, dtype=np.int64) + ) + self._bin_values = ( + bin_values + if bin_values is not None + else np.empty(max_discretizer_feature, dtype=np.float32) + ) + self._feature_offsets = ( + feature_offsets + if feature_offsets is not None + else np.empty(n_feature, dtype=np.int64) + ) + self.num_parts = num_parts + self.cost_per_unit = cost_per_unit + + def build( + self, input_shape: Optional[tf.TensorShape] = None + ): # pylint: disable=unused-argument + """Creates the variables of the layer""" + self.built = True + + def call( + self, inputs: tf.SparseTensor, keep_inputs: bool = False, **kwargs + ) -> tf.SparseTensor: + """Looks up `keys` in a table, outputs the corresponding values. + + Implements PercentileDiscretizer inference where inputs are intersected with a hash_map. + Input features that were not calibrated have their feature IDs truncated, so as + to be less than 1< 0: + discretizer_keys, discretizer_vals = libtwml.ops.percentile_discretizer_v2( + input_ids=keys, # inc key assigned to feature_id, or -1 + input_vals=vals, # the observed feature values + bin_ids=self._bin_ids, # n_feat X (n_bin+1) 2D arange + bin_vals=self._bin_values, # bin boundaries + feature_offsets=self._feature_offsets, # 0 : nbin_1 : max_feat + output_bits=self._out_bits, + feature_ids=tf.make_tensor_proto( + self._hash_keys + ), # feature ids to build internal hash map + feature_indices=tf.make_tensor_proto( + self._hash_values + ), # keys associated w/ feat. indices + start_compute=tf.constant(0, shape=[], dtype=tf.int64), + end_compute=tf.constant(-1, shape=[], dtype=tf.int64), + cost_per_unit=self.cost_per_unit, + ) + else: + discretizer_keys = twml.util.limit_bits(keys, self._out_bits) + discretizer_vals = vals + # don't 2x the input. + keep_inputs = False + + batch_size = tf.to_int64(inputs.dense_shape[0]) + output_shape = [batch_size, self._output_size] + + output = twml.SparseTensor( + ids, discretizer_keys, discretizer_vals, output_shape + ).to_tf() + + if keep_inputs: + # Note the non-discretized features will end up doubled, + # since these are already in `output` + # handle output ID conflicts + mdl_size = self._n_feature * (self._n_bin + 1) + non_mdl_size = tf.subtract(self._output_size, mdl_size) + input_keys = tf.add(tf.floormod(keys, non_mdl_size), mdl_size) + + new_input = twml.SparseTensor( + ids=ids, indices=input_keys, values=vals, dense_shape=output_shape + ).to_tf() + + # concatenate discretizer output with original input + sparse_add = tf.sparse_add(new_input, output) + output = tf.SparseTensor( + sparse_add.indices, sparse_add.values, output_shape + ) + + return output + + def compute_output_shape(self, input_shape: Union[tf.TensorShape, Tuple[int, ...]]): + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. 
the batch size may be unknown). + + Raises NotImplementedError. + """ + raise NotImplementedError diff --git a/twml/twml/layers/sequential.py b/twml/twml/layers/sequential.py index c0d4b92cc..7b0f83d34 100644 --- a/twml/twml/layers/sequential.py +++ b/twml/twml/layers/sequential.py @@ -3,158 +3,168 @@ """ -from .layer import Layer +from typing import List, Optional, Union +import tensorflow.compat.v1 as tf from tensorflow import keras from tensorflow.python.layers import base +from .layer import Layer -class Sequential(Layer): - """ - A sequential stack of layers. - - Arguments: - layers: list of layers to add to the model. - - Output: - the output of the sequential layers - """ - - def __init__(self, layers=None, **kwargs): - self._layers = [] # Stack of layers. - self._layer_names = [] # Stack of layers names - self._layer_outputs = [] - # Add to the model any layers passed to the constructor. - if layers: - for layer in layers: - self.add(layer) - super(Sequential, self).__init__(**kwargs) - - def add(self, layer): - """Adds a layer instance on top of the layer stack. - - Arguments: - layer: - layer instance. - - Raises: - TypeError: - if the layer argument is not instance of base.Layer - """ - if not isinstance(layer, base.Layer) and not isinstance(layer, keras.layers.Layer): - raise TypeError('The added layer must be an instance of class Layer') - - if layer.name in self._layer_names: - raise ValueError('Layer with name %s already exists in sequential layer' % layer.name) - - self._layers.append(layer) - self._layer_names.append(layer.name) - - def pop(self): - """Removes the last layer in the model. - - Raises: - TypeError: - if there are no layers in the model. - """ - if not self._layers or not self._layer_names: - raise TypeError('There are no layers in the model.') - self._layers.pop() - self._layer_names.pop() - - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """The logic of the layer lives here. - - Arguments: - inputs: - input tensor(s). - - Returns: - The output of the sequential layers - """ - self._layer_outputs = [] - for layer in self._layers: - # don't use layer.call because you want to build individual layers - inputs = layer(inputs) # overwrites the current input after it has been processed - self._layer_outputs.append(inputs) - return inputs - - @property - def layers(self): - """ Return the layers in the sequential layer """ - return self._layers - - @property - def layer_names(self): - """ Return the layer names in the sequential layer """ - return self._layer_names - - @property - def layer_outputs(self): - """ Return the layer outputs in the sequential layer """ - return self._layer_outputs - - def get(self, key): - """Retrieves the n-th layer. - - Arguments: - key: - index of the layer - - Output: - The n-th layer where n is equal to the key. - """ - return self._layers[key] - - def get_output(self, key): - """Retrieves the n-th layer output. - - Arguments: - key: - index of the layer - - Output: - The intermediary output equivalent to the nth layer, where n is equal to the key. - """ - return self._layer_outputs[key] - - def get_layer_by_name(self, name): - """Retrieves the layer corresponding to the name. - - Arguments: - name: - name of the layer - Output: - list of layers that have the name desired +class Sequential(Layer): """ - return self._layers[self._layer_names.index(name)] + A sequential stack of layers. - def get_layer_output_by_name(self, name): - """Retrieves the layer output corresponding to the name. 
- - Arguments: - name: - name of the layer + Args: + layers: list of layers to add to the model. Output: - list of the output of the layers that have the desired name + the output of the sequential layers """ - return self._layer_outputs[self._layer_names.index(name)] - - @property - def init(self): - """ returns a list of initialization ops (one per layer) """ - return [layer.init for layer in self._layers] - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). - - Raise NotImplementedError. - - """ - raise NotImplementedError + def __init__(self, layers: Optional[List[Layer]] = None, **kwargs): + self._layers = [] # Stack of layers. + self._layer_names = [] # Stack of layers names + self._layer_outputs = [] + # Add to the model any layers passed to the constructor. + if layers: + for layer in layers: + self.add(layer) + super(Sequential, self).__init__(**kwargs) + + def add(self, layer: Layer) -> None: + """Adds a layer instance on top of the layer stack. + + Args: + layer: + layer instance. + + Raises: + TypeError: + if the layer argument is not instance of base.Layer + """ + if not isinstance(layer, base.Layer) and not isinstance( + layer, keras.layers.Layer + ): + raise TypeError("The added layer must be an instance of class Layer") + + if layer.name in self._layer_names: + raise ValueError( + "Layer with name %s already exists in sequential layer" % layer.name + ) + + self._layers.append(layer) + self._layer_names.append(layer.name) + + def pop(self) -> None: + """Removes the last layer in the model. + + Raises: + TypeError: + if there are no layers in the model. + """ + if not self._layers or not self._layer_names: + raise TypeError("There are no layers in the model.") + self._layers.pop() + self._layer_names.pop() + + def call(self, inputs: Layer, **kwargs) -> Layer: # pylint: disable=unused-argument + """The logic of the layer lives here. + + Args: + inputs: + input tensor(s). + + Returns: + The output of the sequential layers + """ + self._layer_outputs = [] + for layer in self._layers: + # don't use layer.call because you want to build individual layers + inputs = layer( + inputs + ) # overwrites the current input after it has been processed + self._layer_outputs.append(inputs) + return inputs + + @property + def layers(self) -> List[Layer]: + """Return the layers in the sequential layer""" + return self._layers + + @property + def layer_names(self) -> List[str]: + """Return the layer names in the sequential layer""" + return self._layer_names + + @property + def layer_outputs(self) -> List[Layer]: + """Return the layer outputs in the sequential layer""" + return self._layer_outputs + + def get(self, key: int) -> Layer: + """Retrieves the n-th layer. + + Args: + key: + index of the layer + + Output: + The n-th layer where n is equal to the key. + """ + return self._layers[key] + + def get_output(self, key: int) -> Layer: + """Retrieves the n-th layer output. + + Args: + key: + index of the layer + + Output: + The intermediary output equivalent to the nth layer, where n is equal to the key. + """ + return self._layer_outputs[key] + + def get_layer_by_name(self, name: str) -> Layer: + """Retrieves the layer corresponding to the name. 
+ + Args: + name: + name of the layer + + Output: + list of layers that have the name desired + """ + return self._layers[self._layer_names.index(name)] + + def get_layer_output_by_name(self, name: str) -> Layer: + """Retrieves the layer output corresponding to the name. + + Args: + name: + name of the layer + + Output: + list of the output of the layers that have the desired name + """ + return self._layer_outputs[self._layer_names.index(name)] + + @property + def init(self) -> List[tf.Operation]: + """returns a list of initialization ops (one per layer)""" + return [layer.init for layer in self._layers] + + def compute_output_shape( + self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]] + ) -> tf.TensorShape: + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raise NotImplementedError. + """ + raise NotImplementedError diff --git a/twml/twml/layers/sparse_max_norm.py b/twml/twml/layers/sparse_max_norm.py index e1f423fe0..04fc6b68d 100644 --- a/twml/twml/layers/sparse_max_norm.py +++ b/twml/twml/layers/sparse_max_norm.py @@ -2,220 +2,237 @@ """ Contains the twml.layers.SparseMaxNorm layer. """ -from .layer import Layer +from typing import Optional, Union -from libtwml import OPLIB import tensorflow.compat.v1 as tf -import twml - +from libtwml import OPLIB -class SparseMaxNorm(Layer): - """ - Computes a max-normalization and adds bias to the sparse_input, - forwards that through a sparse affine transform followed - by an non-linear activation on the resulting dense representation. - - This layer has two parameters, one of which learns through gradient descent: - bias_x (optional): - vector of shape [input_size]. Learned through gradient descent. - max_x: - vector of shape [input_size]. Holds the maximas of input ``x`` for normalization. - Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both. - - The pseudo-code for this layer looks like: - - .. code-block:: python - - abs_x = abs(x) - normed_x = clip_by_value(x / max_x, -1, 1) - biased_x = normed_x + bias_x - return biased - - - Args: - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - A layer representing the output of the sparse_max_norm transformation. 
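Stepping back to the Sequential layer whose diff ended just above, a hypothetical usage sketch follows; twml.layers.FullDense and the placeholder shape are assumptions for illustration and are not part of this hunk.

.. code-block:: python

    import tensorflow.compat.v1 as tf
    import twml

    tf.disable_eager_execution()

    features = tf.placeholder(tf.float32, [None, 128])

    stack = twml.layers.Sequential(
        [
            twml.layers.FullDense(64, activation=tf.nn.relu, name="hidden"),  # assumed layer
            twml.layers.FullDense(1, name="logits"),  # assumed layer
        ]
    )

    logits = stack(features)  # calls each layer in order, recording its output
    hidden = stack.get_layer_output_by_name("hidden")  # intermediate activation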
- """ - - def __init__( - self, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - **kwargs): - - super(SparseMaxNorm, self).__init__(**kwargs) - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - if max_x_initializer is None: - max_x_initializer = tf.zeros_initializer() - self.max_x_initializer = max_x_initializer - - self._use_bias = use_bias - if use_bias: - if bias_x_initializer is None: - bias_x_initializer = tf.zeros_initializer() - self.bias_x_initializer = bias_x_initializer - - self.epsilon = epsilon - self.is_training = is_training - - def build(self, input_shape): # pylint: disable=unused-argument - """Creates the max_x and bias_x tf.Variables of the layer.""" - - self.max_x = self.add_variable( - 'max_x', - initializer=self.max_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=False) - - if self._use_bias: - self.bias_x = self.add_variable( - 'bias_x', - initializer=self.bias_x_initializer, - shape=[input_shape[1]], - dtype=tf.float32, - trainable=True) - - self.built = True - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. +import twml - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). +from .layer import Layer - Raises NotImplementedError. +class SparseMaxNorm(Layer): """ - raise NotImplementedError + Computes a max-normalization and adds bias to the sparse_input, + forwards that through a sparse affine transform followed + by an non-linear activation on the resulting dense representation. - def _call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. + This layer has two parameters, one of which learns through gradient descent: + bias_x (optional): + vector of shape [input_size]. Learned through gradient descent. + max_x: + vector of shape [input_size]. Holds the maximas of input ``x`` for normalization. + Either calibrated through SparseMaxNorm calibrator, or calibrated online, or both. - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. - """ + The pseudo-code for this layer looks like: - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - elif not isinstance(inputs, tf.SparseTensor): - raise TypeError("The inputs must be of type tf.SparseTensor or twml.SparseTensor") + .. code-block:: python - indices_x = inputs.indices[:, 1] - values_x = inputs.values + abs_x = abs(x) + normed_x = clip_by_value(x / max_x, -1, 1) + biased_x = normed_x + bias_x + return biased - if self.is_training is False: - normalized_x = OPLIB.sparse_max_norm_inference(self.max_x, - indices_x, - values_x, - self.epsilon) - update_op = tf.no_op() - else: - max_x, normalized_x = OPLIB.sparse_max_norm_training(self.max_x, - indices_x, - values_x, - self.epsilon) + Args: + max_x_initializer: + initializer vector of shape [input_size] used by variable `max_x` + bias_x_initializer: + initializer vector of shape [input_size] used by parameter `bias_x` + is_training: + Are we training the layer to learn the normalization maximas. + If set to True, max_x will be able to learn. 
This is independent of bias_x + epsilon: + The minimum value used for max_x. Defaults to 1E-5. + use_bias: + Default True. Set to False to not use a bias term. - update_op = tf.assign(self.max_x, max_x) + Returns: + A layer representing the output of the sparse_max_norm transformation. + """ - with tf.control_dependencies([update_op]): - normalized_x = tf.stop_gradient(normalized_x) + def __init__( + self, + input_size: Optional[int] = None, + max_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + bias_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + is_training: bool = True, + epsilon: float = 1e-5, + use_bias: bool = True, + **kwargs + ): + super(SparseMaxNorm, self).__init__(**kwargs) + if input_size: + raise ValueError( + "input_size is deprecated - it is now automatically \ + inferred from your input." + ) + if max_x_initializer is None: + max_x_initializer = tf.zeros_initializer() + self.max_x_initializer = max_x_initializer + + self._use_bias = use_bias + if use_bias: + if bias_x_initializer is None: + bias_x_initializer = tf.zeros_initializer() + self.bias_x_initializer = bias_x_initializer + + self.epsilon = epsilon + self.is_training = is_training + + def build(self, input_shape: tf.TensorShape): + """Creates the max_x and bias_x tf.Variables of the layer.""" + + self.max_x = self.add_variable( + "max_x", + initializer=self.max_x_initializer, + shape=[input_shape[1]], + dtype=tf.float32, + trainable=False, + ) + + if self._use_bias: + self.bias_x = self.add_variable( + "bias_x", + initializer=self.bias_x_initializer, + shape=[input_shape[1]], + dtype=tf.float32, + trainable=True, + ) + + self.built = True + + def compute_output_shape( + self, input_shape: tf.TensorShape + ): # pylint: disable=unused-argument + """Computes the output shape of the layer given the input shape. + + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). + + Raises NotImplementedError. + """ + raise NotImplementedError + + def _call( + self, inputs: tf.SparseTensor, **kwargs + ) -> tf.SparseTensor: # pylint: disable=unused-argument + """ + The forward propagation logic of the layer lives here. + + Args: + sparse_input: + A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` + Returns: + A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can + be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. 
+ """ + + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + elif not isinstance(inputs, tf.SparseTensor): + raise TypeError( + "The inputs must be of type tf.SparseTensor or twml.SparseTensor" + ) + + indices_x = inputs.indices[:, 1] + values_x = inputs.values + + if self.is_training is False: + normalized_x = OPLIB.sparse_max_norm_inference( + self.max_x, indices_x, values_x, self.epsilon + ) + + update_op = tf.no_op() + else: + max_x, normalized_x = OPLIB.sparse_max_norm_training( + self.max_x, indices_x, values_x, self.epsilon + ) + + update_op = tf.assign(self.max_x, max_x) + + with tf.control_dependencies([update_op]): + normalized_x = tf.stop_gradient(normalized_x) + + # add input bias + if self._use_bias: + normalized_x = normalized_x + tf.gather(self.bias_x, indices_x) + + # convert back to sparse tensor + return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape) + + def call(self, inputs: tf.SparseTensor, **kwargs) -> tf.SparseTensor: + """ + The forward propagation logic of the layer lives here. + + Args: + sparse_input: + A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` + Returns: + A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can + be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. + """ + with tf.device(self.max_x.device): + return self._call(inputs, **kwargs) - # add input bias - if self._use_bias: - normalized_x = normalized_x + tf.gather(self.bias_x, indices_x) - # convert back to sparse tensor - return tf.SparseTensor(inputs.indices, normalized_x, inputs.dense_shape) +# For backwards compatiblity and also because I don't want to change all the tests. +MaxNorm = SparseMaxNorm - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """ - The forward propagation logic of the layer lives here. - Arguments: - sparse_input: - A 2D ``tf.SparseTensor`` of dense_shape ``[batch_size, input_size]`` - Returns: - A ``tf.SparseTensor`` representing the output of the max_norm transformation, this can - be fed into twml.layers.FullSparse in order to be transformed into a ``tf.Tensor``. +def sparse_max_norm( + inputs: Union[tf.SparseTensor, twml.SparseTensor], + input_size: Optional[int] = None, + max_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + bias_x_initializer: Optional[tf.keras.initializers.Initializer] = None, + is_training: bool = True, + epsilon: float = 1e-5, + use_bias: bool = True, + name: Optional[str] = None, + reuse: Optional[bool] = None, +) -> tf.SparseTensor: """ - with tf.device(self.max_x.device): - return self._call(inputs, **kwargs) + Functional inteface to SparseMaxNorm. -# For backwards compatiblity and also because I don't want to change all the tests. -MaxNorm = SparseMaxNorm + Args: + inputs: + A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) + input_size: + number of input units + max_x_initializer: + initializer vector of shape [input_size] used by variable `max_x` + bias_x_initializer: + initializer vector of shape [input_size] used by parameter `bias_x` + is_training: + Are we training the layer to learn the normalization maximas. + If set to True, max_x will be able to learn. This is independent of bias_x + epsilon: + The minimum value used for max_x. Defaults to 1E-5. + use_bias: + Default True. Set to False to not use a bias term. + Returns: + Output after normalizing with the max value. 
+ """ + if input_size: + raise ValueError( + "input_size is deprecated - it is now automatically \ + inferred from your input." + ) -def sparse_max_norm(inputs, - input_size=None, - max_x_initializer=None, - bias_x_initializer=None, - is_training=True, - epsilon=1E-5, - use_bias=True, - name=None, - reuse=None): - """ - Functional inteface to SparseMaxNorm. - - Args: - inputs: - A sparse tensor (can be twml.SparseTensor or tf.SparseTensor) - input_size: - number of input units - max_x_initializer: - initializer vector of shape [input_size] used by variable `max_x` - bias_x_initializer: - initializer vector of shape [input_size] used by parameter `bias_x` - is_training: - Are we training the layer to learn the normalization maximas. - If set to True, max_x will be able to learn. This is independent of bias_x - epsilon: - The minimum value used for max_x. Defaults to 1E-5. - use_bias: - Default True. Set to False to not use a bias term. - - Returns: - Output after normalizing with the max value. - """ - if input_size: - raise ValueError('input_size is deprecated - it is now automatically \ - inferred from your input.') - - if isinstance(inputs, twml.SparseTensor): - inputs = inputs.to_tf() - - layer = SparseMaxNorm(max_x_initializer=max_x_initializer, - bias_x_initializer=bias_x_initializer, - is_training=is_training, - epsilon=epsilon, - use_bias=use_bias, - name=name, - _scope=name, - _reuse=reuse) - return layer(inputs) + if isinstance(inputs, twml.SparseTensor): + inputs = inputs.to_tf() + + layer = SparseMaxNorm( + max_x_initializer=max_x_initializer, + bias_x_initializer=bias_x_initializer, + is_training=is_training, + epsilon=epsilon, + use_bias=use_bias, + name=name, + _scope=name, + _reuse=reuse, + ) + return layer(inputs) diff --git a/twml/twml/layers/stitch.py b/twml/twml/layers/stitch.py index 51dffdb8e..eba9bc318 100644 --- a/twml/twml/layers/stitch.py +++ b/twml/twml/layers/stitch.py @@ -4,51 +4,61 @@ """ -from .layer import Layer +from typing import List, Union import tensorflow.compat.v1 as tf +from .layer import Layer + class Stitch(Layer): - """ - This layer is responsible for stitching a partioned layer together. + """ + This layer is responsible for stitching a partitioned layer together. + + Output: + A layer that performs stitching + """ - Output: - A layer that performs stitching - """ + def compute_output_shape( + self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]] + ): + """Computes the output shape of the layer given the input shape. - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer given the input shape. + Args: + input_shape: A (possibly nested tuple of) `TensorShape`. It need not + be fully defined (e.g. the batch size may be unknown). - Args: - input_shape: A (possibly nested tuple of) `TensorShape`. It need not - be fully defined (e.g. the batch size may be unknown). + Raises NotImplementedError. - Raises NotImplementedError. + """ + raise NotImplementedError - """ - raise NotImplementedError + def call( + self, + partitioned_val: List[tf.Tensor], + partitioned_keys: List[tf.Tensor], + partitioned_indices: List[tf.Tensor], + **kwargs, + ) -> List[tf.Tensor]: + """ + This layer is responsible for stitching a partitioned layer together. - def call(self, partioned_val, partioned_keys, - partioned_indices, **kwargs): # pylint: disable=unused-argument, arguments-differ - """ - This layer is responsible for stitching a partioned layer together. 
- - Input: - partioned_val: - a list of partioned Tensors which represent the vals of the hashmap - partioned_keys: - a list of partioned Tensors which represent the keys of the hashmap - partioned_indices: - a list of partioned Tensors which represent the indices of the hashmap - Output: - List which contains: [output_vals, output_keys] - output_vals: - Values of the HashMap (float) - output_keys: - Keys of HashMap (float) - """ - indices = [tf.to_int32(index) for index in partioned_indices] - concat_keys = tf.dynamic_stitch(indices, partioned_keys) - concat_vals = tf.dynamic_stitch(indices, partioned_val) - return [concat_vals, concat_keys] + Input: + partitioned_val: + a list of partitioned Tensors which represent the vals of the hashmap + partitioned_keys: + a list of partitioned Tensors which represent the keys of the hashmap + partitioned_indices: + a list of partitioned Tensors which represent the indices of the hashmap + + Output: + List which contains: [output_vals, output_keys] + output_vals: + Values of the HashMap (float) + output_keys: + Keys of HashMap (float) + """ + indices = [tf.to_int32(index) for index in partitioned_indices] + concat_keys = tf.dynamic_stitch(indices, partitioned_keys) + concat_vals = tf.dynamic_stitch(indices, partitioned_val) + return [concat_vals, concat_keys] diff --git a/twml/twml/learning_rate_decay.py b/twml/twml/learning_rate_decay.py index be522d75b..9a012012c 100644 --- a/twml/twml/learning_rate_decay.py +++ b/twml/twml/learning_rate_decay.py @@ -3,166 +3,213 @@ import tensorflow.compat.v1 as tf -def get_learning_rate_decay_fn(params): - """ - Returns a learning rate decay function that takes the initial - learning_rate and global_step - as arguments and returns the current learning rate. - - Currently supports params.learning_rate_decay values of: - exponential | polynomial | piecewise_constant | cosine | cosine restarts. - See `Decaying the Leanring Rate - `_ for details. - - Arguments: - params: - a tensorflow.contrib.train.HParams object containing the relevant hyperparameters. 
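Reviewer note on Stitch: it is the inverse of Partition. A minimal graph-mode sketch of what its call computes, using the example tensors from the Partition sketch earlier (not twml code):

.. code-block:: python

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    # outputs of a 2-way Partition; indices record where each element came from
    indices = [tf.constant([0, 2]), tf.constant([1, 3])]
    keys = [tf.constant([10, 30], tf.int64), tf.constant([20, 40], tf.int64)]
    vals = [tf.constant([1.0, 3.0]), tf.constant([2.0, 4.0])]

    concat_keys = tf.dynamic_stitch(indices, keys)
    concat_vals = tf.dynamic_stitch(indices, vals)

    with tf.Session() as sess:
        # [1., 2., 3., 4.] and [10, 20, 30, 40]: original order restored
        print(sess.run([concat_vals, concat_keys]))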
- """ - paramsv = params.values() - if 'learning_rate_decay' not in paramsv or params.learning_rate_decay == 'no_learning_rate_decay': - return None - elif params.learning_rate_decay == 'exponential_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'exponential'") - if 'exponential_decay_rate' not in paramsv: - raise ValueError("Expecting params.exponential_decay_rate for " - "params.learning_rate_decay == 'exponential'") - - def exponential_decay_fn(learning_rate, global_step): - """ exponential decay function to be passed to optimize_loss """ - return tf.train.exponential_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.exponential_decay_rate - ) - return exponential_decay_fn - elif params.learning_rate_decay == 'piecewise_constant_learning_rate_decay': - if 'piecewise_constant_boundaries' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_boundaries for " - "params.learning_rate_decay == 'piecewise_constant'") - if 'piecewise_constant_values' not in paramsv: - raise ValueError("Expecting params.piecewise_constant_values for " - "params.learning_rate_decay == 'piecewise_constant'") - # pylint: disable=unused-argument - - def piecewise_constant_fn(learning_rate, global_step): - """ piecewise_constant decay function to be passed to optimize_loss """ - return tf.train.piecewise_constant( - x=global_step, - boundaries=params.piecewise_constant_boundaries, - values=params.piecewise_constant_values - ) - return piecewise_constant_fn - elif params.learning_rate_decay == 'polynomial_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'polynomial'") - if 'end_learning_rate' not in paramsv: - raise ValueError("Expecting params.end_learning_rate for " - "params.learning_rate_decay == 'polynomial'") - - def polynomial_decay_fn(learning_rate, global_step): - """ polynomial decay function to be passed to optimize_loss """ - return tf.train.polynomial_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - end_learning_rate=params.end_learning_rate, - power=params.polynomial_power if 'polynomial_power' in paramsv else 1.0, - ) - return polynomial_decay_fn - - elif params.learning_rate_decay == 'inverse_learning_rate_decay': - if 'min_learning_rate' not in paramsv: - raise ValueError("Expecting params.min_learning_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_rate' not in paramsv: - raise ValueError("Expecting params.decay_rate for " - "params.learning_rate_decay == 'inverse'") - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'inverse'") - - def bounded_inverse_time_decay_fn(learning_rate, global_step): - ''' - Returns the decayed learning_rate by applying the function: - decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), - min_learning_rate) - Arguments: - learning_rate: - A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: - A scalar `int32` or `int64` `Tensor` or a Python number. - Global step to use for the decay computation. Must not be negative. - min_learning_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Minimum possible learning_rate. 
The decayed learning_rate will not be - smaller than the min_learning_rate - decay_steps: - How often to apply decay. In dbv1, this should be 1. - decay_rate: - A scalar `int32` or `int64` `Tensor` or a Python number. - Rate in which we decay the learning rate. - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - ''' - decayed_rate = tf.train.inverse_time_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - decay_rate=params.decay_rate) - # Getting dtype of returned Tensor - dtype = decayed_rate.dtype - # Casting the min_learning rate the same dtype as decayes rate - min_learning_rate = tf.cast(params.min_learning_rate, dtype) - # Returning the maximum between the two - return tf.maximum(decayed_rate, min_learning_rate) - - return bounded_inverse_time_decay_fn - - elif params.learning_rate_decay == 'cosine_learning_rate_decay': - if 'decay_steps' not in paramsv: - raise ValueError("Expecting params.decay_steps for " - "params.learning_rate_decay == 'cosine_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_decay'") - def cosine_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay( - learning_rate=learning_rate, - global_step=global_step, - decay_steps=params.decay_steps, - alpha=params.alpha - ) - return cosine_decay_fn - elif params.learning_rate_decay == 'cosine_restarts_learning_rate_decay': - if 'first_decay_steps' not in paramsv: - raise ValueError("Expecting params.first_decay_steps for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 't_mul' not in paramsv: - raise ValueError("Expecting params.t_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if 'm_mul' not in paramsv: - raise ValueError("Expecting params.m_mul for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - if "alpha" not in paramsv: - raise ValueError("Expecting params.alpha for " - "params.learning_rate_decay == 'cosine_restarts_decay'") - def cosine_restart_decay_fn(learning_rate, global_step): - """ cosine decay function to be passed to optimize_loss """ - return tf.train.cosine_decay_restarts( - learning_rate=learning_rate, - global_step=global_step, - first_decay_steps=params.first_decay_steps, - t_mul=params.t_mul, - m_mul=params.m_mul, - alpha=params.alpha - ) - return cosine_restart_decay_fn - - raise ValueError("Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay) +def get_learning_rate_decay_fn(params: tf.contrib.training.HParams) -> tf.Tensor: + """ + Returns a learning rate decay function that takes the initial + learning_rate and global_step + as arguments and returns the current learning rate. + + Currently supports params.learning_rate_decay values of: + exponential | polynomial | piecewise_constant | cosine | cosine restarts. + See `Decaying the Learning Rate + `_ for details. + + Args: + params: + a tensorflow.contrib.train.HParams object containing the relevant hyper parameters. 
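The bounded inverse-time decay formula quoted in this docstring is worth making concrete. A back-of-envelope sketch with illustrative numbers:

.. code-block:: python

    import numpy as np

    lr, decay_rate, decay_steps, min_lr = 0.1, 0.5, 1, 0.01

    for step in (0, 1, 2, 10, 100):
        decayed = max(lr / (1 + decay_rate * np.floor(step / decay_steps)), min_lr)
        print(step, decayed)
    # 0 -> 0.1, 1 -> 0.0667, 2 -> 0.05, 10 -> 0.0167, 100 -> 0.01 (floored at min_lr)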
+ """ + paramsv = params.values() + if ( + "learning_rate_decay" not in paramsv + or params.learning_rate_decay == "no_learning_rate_decay" + ): + return None + elif params.learning_rate_decay == "exponential_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'exponential'" + ) + if "exponential_decay_rate" not in paramsv: + raise ValueError( + "Expecting params.exponential_decay_rate for " + "params.learning_rate_decay == 'exponential'" + ) + + def exponential_decay_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """exponential decay function to be passed to optimize_loss""" + return tf.train.exponential_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + decay_rate=params.exponential_decay_rate, + ) + + return exponential_decay_fn + elif params.learning_rate_decay == "piecewise_constant_learning_rate_decay": + if "piecewise_constant_boundaries" not in paramsv: + raise ValueError( + "Expecting params.piecewise_constant_boundaries for " + "params.learning_rate_decay == 'piecewise_constant'" + ) + if "piecewise_constant_values" not in paramsv: + raise ValueError( + "Expecting params.piecewise_constant_values for " + "params.learning_rate_decay == 'piecewise_constant'" + ) + # pylint: disable=unused-argument + + def piecewise_constant_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """piecewise_constant decay function to be passed to optimize_loss""" + return tf.train.piecewise_constant( + x=global_step, + boundaries=params.piecewise_constant_boundaries, + values=params.piecewise_constant_values, + ) + + return piecewise_constant_fn + elif params.learning_rate_decay == "polynomial_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'polynomial'" + ) + if "end_learning_rate" not in paramsv: + raise ValueError( + "Expecting params.end_learning_rate for " + "params.learning_rate_decay == 'polynomial'" + ) + + def polynomial_decay_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """polynomial decay function to be passed to optimize_loss""" + return tf.train.polynomial_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + end_learning_rate=params.end_learning_rate, + power=params.polynomial_power if "polynomial_power" in paramsv else 1.0, + ) + + return polynomial_decay_fn + + elif params.learning_rate_decay == "inverse_learning_rate_decay": + if "min_learning_rate" not in paramsv: + raise ValueError( + "Expecting params.min_learning_rate for " + "params.learning_rate_decay == 'inverse'" + ) + if "decay_rate" not in paramsv: + raise ValueError( + "Expecting params.decay_rate for " + "params.learning_rate_decay == 'inverse'" + ) + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'inverse'" + ) + + def bounded_inverse_time_decay_fn( + learning_rate: float, global_step: int + ) -> tf.Tensor: + """ + Returns the decayed learning_rate by applying the function: + decayed_lr = max(lr /(1 + decay_rate * floor(global_step /decay_step)), + min_learning_rate) + Args: + learning_rate: + A scalar `float32` or `float64` `Tensor` or a Python number. + The initial learning rate. + global_step: + A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. Must not be negative. 
+ min_learning_rate: + A scalar `int32` or `int64` `Tensor` or a Python number. + Minimum possible learning_rate. The decayed learning_rate will not be + smaller than the min_learning_rate + decay_steps: + How often to apply decay. In dbv1, this should be 1. + decay_rate: + A scalar `int32` or `int64` `Tensor` or a Python number. + Rate in which we decay the learning rate. + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + """ + decayed_rate = tf.train.inverse_time_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + decay_rate=params.decay_rate, + ) + # Getting dtype of returned Tensor + dtype = decayed_rate.dtype + # Casting the min_learning rate the same dtype as decayes rate + min_learning_rate = tf.cast(params.min_learning_rate, dtype) + # Returning the maximum between the two + return tf.maximum(decayed_rate, min_learning_rate) + + return bounded_inverse_time_decay_fn + + elif params.learning_rate_decay == "cosine_learning_rate_decay": + if "decay_steps" not in paramsv: + raise ValueError( + "Expecting params.decay_steps for " + "params.learning_rate_decay == 'cosine_decay'" + ) + if "alpha" not in paramsv: + raise ValueError( + "Expecting params.alpha for " + "params.learning_rate_decay == 'cosine_decay'" + ) + + def cosine_decay_fn(learning_rate: float, global_step: int) -> tf.Tensor: + """cosine decay function to be passed to optimize_loss""" + return tf.train.cosine_decay( + learning_rate=learning_rate, + global_step=global_step, + decay_steps=params.decay_steps, + alpha=params.alpha, + ) + + return cosine_decay_fn + elif params.learning_rate_decay == "cosine_restarts_learning_rate_decay": + if "first_decay_steps" not in paramsv: + raise ValueError( + "Expecting params.first_decay_steps for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "t_mul" not in paramsv: + raise ValueError( + "Expecting params.t_mul for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "m_mul" not in paramsv: + raise ValueError( + "Expecting params.m_mul for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + if "alpha" not in paramsv: + raise ValueError( + "Expecting params.alpha for " + "params.learning_rate_decay == 'cosine_restarts_decay'" + ) + + def cosine_restart_decay_fn( + learning_rate: float, global_step: int + ) -> tf.Tensor: + """cosine decay function to be passed to optimize_loss""" + return tf.train.cosine_decay_restarts( + learning_rate=learning_rate, + global_step=global_step, + first_decay_steps=params.first_decay_steps, + t_mul=params.t_mul, + m_mul=params.m_mul, + alpha=params.alpha, + ) + + return cosine_restart_decay_fn + + raise ValueError( + "Unsupported params.learning_rate_decay: %s" % params.learning_rate_decay + ) diff --git a/twml/twml/lookup/__init__.py b/twml/twml/lookup/__init__.py index 87392d719..2695fa53c 100644 --- a/twml/twml/lookup/__init__.py +++ b/twml/twml/lookup/__init__.py @@ -1,9 +1,8 @@ -from tensorflow.python.ops.lookup_ops import ( - index_table_from_file, - index_table_from_tensor, - index_to_string_table_from_file -) # noqa: F401 - +from tensorflow.python.ops.lookup_ops import ( # noqa: F401 + index_table_from_file, + index_table_from_tensor, + index_to_string_table_from_file, +) """ NOTE: Using `from tensorflow.python.ops.lookup_ops import index_table_from_tensor` in the code works. 
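End to end, the decay selector above is used roughly as follows. This is a TF 1.x-only sketch: HParams lives in tf.contrib, which does not exist in TF 2, and the module path twml.learning_rate_decay is inferred from the file path in this diff.

.. code-block:: python

    import tensorflow.compat.v1 as tf

    from twml.learning_rate_decay import get_learning_rate_decay_fn  # path assumed

    params = tf.contrib.training.HParams(
        learning_rate_decay="exponential_learning_rate_decay",
        decay_steps=1000,
        exponential_decay_rate=0.96,
    )

    decay_fn = get_learning_rate_decay_fn(params)
    global_step = tf.train.get_or_create_global_step()

    # a tensor evaluating to 0.01 * 0.96 ** (global_step / 1000)
    learning_rate = decay_fn(0.01, global_step)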
diff --git a/twml/twml/metrics.py b/twml/twml/metrics.py index ee2f82b74..663997416 100644 --- a/twml/twml/metrics.py +++ b/twml/twml/metrics.py @@ -4,1377 +4,1638 @@ """ -from collections import OrderedDict from functools import partial +from typing import Callable, Collection, Dict, List, Optional, Sequence, Tuple, Union import numpy as np import tensorboard as tb import tensorflow.compat.v1 as tf - CLAMP_EPSILON = 0.00001 def total_weight_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'total_weight', (labels, predictions, weights)): - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float64) - - if weights is None: - weights = tf.cast(tf.size(labels), total_weight.dtype, name="default_weight") - else: - weights = tf.cast(weights, total_weight.dtype) + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + with tf.variable_scope(name, "total_weight", (labels, predictions, weights)): + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float64) + + if weights is None: + weights = tf.cast( + tf.size(labels), total_weight.dtype, name="default_weight" + ) + else: + weights = tf.cast(weights, total_weight.dtype) - # add up the weights to get total weight of the eval set - update_total_weight = tf.assign_add(total_weight, tf.reduce_sum(weights), name="update_op") + # add up the weights to get total weight of the eval set + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="update_op" + ) - value_op = tf.identity(total_weight) - update_op = tf.identity(update_total_weight) + value_op = tf.identity(total_weight) + update_op = tf.identity(update_total_weight) - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) + if metrics_collections: + tf.add_to_collections(metrics_collections, value_op) - if updates_collections: - tf.add_to_collections(updates_collections, update_op) + if updates_collections: + tf.add_to_collections(updates_collections, update_op) - return value_op, update_op + return value_op, update_op def num_samples_metric( - labels, - predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - with tf.variable_scope(name, 'num_samples', (labels, predictions, weights)): - num_samples = _metric_variable(name='num_samples', shape=[], dtype=tf.float64) - update_num_samples = tf.assign_add(num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op") - - value_op = tf.identity(num_samples) - update_op = tf.identity(update_num_samples) - - if metrics_collections: - tf.add_to_collections(metrics_collections, value_op) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return value_op, update_op - - -def ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive sample ratio based on labels - (i.e. weighted average percentage of positive labels). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. 
- weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - ctr: A `Tensor` representing positive sample ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=labels, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def predicted_ctr(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - # pylint: disable=unused-argument - """ - Compute the weighted average positive ratio based on predictions, - (i.e. weighted averaged predicted positive probability). - The name `ctr` (click-through-rate) is from legacy. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - predicted_ctr: A `Tensor` representing the predicted positive ratio. - update_op: A update operation used to accumulate data into this metric. - """ - return tf.metrics.mean( - values=predictions, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=name) - - -def prediction_std_dev(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the weighted standard deviation of the predictions. - Note - this is not a confidence interval metric. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. - update_op: A update operation used to accumulate data into this metric. 
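Both ctr and predicted_ctr above reduce to tf.metrics.mean, and every metric in this file returns the same (value_op, update_op) pair. A minimal graph-mode sketch of that streaming contract:

.. code-block:: python

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    labels = tf.placeholder(tf.float32, [None])
    value_op, update_op = tf.metrics.mean(values=labels)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())  # metric state is local
        sess.run(update_op, {labels: [1.0, 0.0, 0.0]})
        sess.run(update_op, {labels: [1.0, 1.0]})
        print(sess.run(value_op))  # 0.6, the positive rate over both batches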
- """ - with tf.variable_scope(name, 'pred_std_dev', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - # State kept during streaming of examples - total_weighted_preds = _metric_variable( - name='total_weighted_preds', shape=[], dtype=tf.float64) - total_weighted_preds_sq = _metric_variable( - name='total_weighted_preds_sq', shape=[], dtype=tf.float64) - total_weights = _metric_variable( - name='total_weights', shape=[], dtype=tf.float64) - - # Update state - update_total_weighted_preds = tf.assign_add(total_weighted_preds, tf.reduce_sum(weights * predictions)) - update_total_weighted_preds_sq = tf.assign_add(total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions)) - update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) - - # Compute output - def compute_output(tot_w, tot_wp, tot_wpp): - return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) - std_dev_est = compute_output(total_weights, total_weighted_preds, total_weighted_preds_sq) - update_std_dev_est = compute_output(update_total_weights, update_total_weighted_preds, update_total_weighted_preds_sq) - - if metrics_collections: - tf.add_to_collections(metrics_collections, std_dev_est) - - if updates_collections: - tf.add_to_collections(updates_collections, update_std_dev_est) - - return std_dev_est, update_std_dev_est - - -def _get_arce_predictions(predictions, weights, label_weighted, labels, - up_weight, deprecated_rce, - total_positive, update_total_positive): - """ - Returns the ARCE predictions, total_positive, update_total_positive and weights - used by the rest of the twml.metrics.rce metric computation. 
- """ - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") - label_weighted_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(label_weighted)) - pred_weight_comp = tf.subtract(tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted)) - normalizer_comp = label_weighted_comp / pred_weight_comp - - if up_weight is False: - total_positive_unweighted = _metric_variable( - name='total_positive_unweighted', shape=[], dtype=tf.float32) - - update_total_positive_unweighted = tf.assign_add( - total_positive_unweighted, tf.reduce_sum(labels), - name="total_positive_unweighted_update") - - if deprecated_rce: - normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) - else: - # sum of labels / sum of weighted labels - normalizer = update_total_positive_unweighted / update_total_positive - - label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) - normalizer_comp = label_comp / label_weighted_comp - - # note that up_weight=True changes these for the rest of the twml.metric.rce computation - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - total_positive = total_positive_unweighted - update_total_positive = update_total_positive_unweighted - else: - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - # normalizer used for NRCE (and ARCE with up_weight=True) - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) - - # update the variable holding the sum of weighted predictions - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - - pred_comp = tf.subtract(tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions) - pred_comp_norm = tf.multiply(pred_comp, normalizer_comp, name="normalized_predictions_comp") - pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") - pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") - predictions = pred_num / pred_denom - - return predictions, total_positive, update_total_positive, weights - - -def rce(labels, predictions, - weights=None, - normalize=False, - arce=False, - up_weight=True, - metrics_collections=None, - updates_collections=None, - name=None, - deprecated_rce=False): - """ - Compute the relative cross entropy (RCE). - The RCE is a relative measurement compared to the baseline model's performance. - The baseline model always predicts average click-through-rate (CTR). - The RCE measures, in percentage, how much better the predictions are, compared - to the baseline model, in terms of cross entropy loss. - - y = label; p = prediction; - binary cross entropy = y * log(p) + (1-y) * log(1-p) - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - normalize: - if set to true, produce NRCEs used at Twitter. (normalize preds by weights first) - NOTE: if you don't understand what NRCE is, please don't use it. - arce: - if set to true, produces `ARCE `_. - This can only be activated if `normalize=True`. 
- up_weight: - if set to true, produces arce in the up_weighted space (considers CTR after up_weighting - data), while False gives arce in the original space (only considers CTR before up_weighting). - In the actual version, this flag can only be activated if arce is True. - Notice that the actual version of NRCE corresponds to up_weight=True. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - deprecated_rce: - enables the previous NRCE/ARCE calculations which calculated some label metrics - on the batch instead of on all batches seen so far. Note that the older metric - calculation is less stable, especially for smaller batch sizes. You should probably - never have to set this to True. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - .. note:: Must have at least 1 positive and 1 negative sample accumulated, - or RCE will come out as NaN. - """ - with tf.variable_scope(name, 'rce', (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + with tf.variable_scope(name, "num_samples", (labels, predictions, weights)): + num_samples = _metric_variable(name="num_samples", shape=[], dtype=tf.float64) + update_num_samples = tf.assign_add( + num_samples, tf.cast(tf.size(labels), num_samples.dtype), name="update_op" + ) + + value_op = tf.identity(num_samples) + update_op = tf.identity(update_num_samples) + + if metrics_collections: + tf.add_to_collections(metrics_collections, value_op) + + if updates_collections: + tf.add_to_collections(updates_collections, update_op) + + return value_op, update_op + + +def ctr( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: # pylint: disable=unused-argument + """ + Compute the weighted average positive sample ratio based on labels + (i.e. weighted average percentage of positive labels). + The name `ctr` (click-through-rate) is from legacy. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + ctr: A `Tensor` representing positive sample ratio. + update_op: A update operation used to accumulate data into this metric. 
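+
+    Example:
+
+      An illustrative sketch (the tensors below are made up for this docstring):
+
+      .. code-block:: python
+
+        labels = tf.constant([1.0, 0.0, 1.0, 0.0])
+        weights = tf.constant([1.0, 1.0, 2.0, 4.0])
+        # weighted positive ratio = (1*1 + 2*1) / (1 + 1 + 2 + 4) = 3/8 = 0.375
+        ctr_value, ctr_update = ctr(labels, tf.zeros_like(labels), weights=weights)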
+ """ + return tf.metrics.mean( + values=labels, + weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, + name=name, + ) - total_positive = _metric_variable(name='total_positive', shape=[], dtype=tf.float32) - total_loss = _metric_variable(name='total_loss', shape=[], dtype=tf.float32) - total_weight = _metric_variable(name='total_weight', shape=[], dtype=tf.float32) - label_weighted = tf.multiply(labels, weights, name="weighted_label") +def predicted_ctr( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: tf.Tensor = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + # pylint: disable=unused-argument + """ + Compute the weighted average positive ratio based on predictions, + (i.e. weighted averaged predicted positive probability). + The name `ctr` (click-through-rate) is from legacy. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + predicted_ctr: A `Tensor` representing the predicted positive ratio. + update_op: A update operation used to accumulate data into this metric. + """ + return tf.metrics.mean( + values=predictions, + weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, + name=name, + ) - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(label_weighted), name="total_pos_update") - if arce: - if normalize is False: - raise ValueError('This configuration of parameters is not actually allowed') +def prediction_std_dev( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: tf.Tensor = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Compute the weighted standard deviation of the predictions. + Note - this is not a confidence interval metric. + + Args: + labels: the ground truth value. + predictions: the predicted values, whose shape must match labels. Ignored for CTR computation. + weights: optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + metric value: A `Tensor` representing the value of the metric on the data accumulated so far. + update_op: A update operation used to accumulate data into this metric. 
+ """ + with tf.variable_scope(name, "pred_std_dev", (labels, predictions, weights)): + labels = tf.cast(labels, tf.float64) + predictions = tf.cast(predictions, tf.float64) + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float64, name="default_weight" + ) + else: + weights = tf.cast(weights, tf.float64) + + # State kept during streaming of examples + total_weighted_preds = _metric_variable( + name="total_weighted_preds", shape=[], dtype=tf.float64 + ) + total_weighted_preds_sq = _metric_variable( + name="total_weighted_preds_sq", shape=[], dtype=tf.float64 + ) + total_weights = _metric_variable( + name="total_weights", shape=[], dtype=tf.float64 + ) + + # Update state + update_total_weighted_preds = tf.assign_add( + total_weighted_preds, tf.reduce_sum(weights * predictions) + ) + update_total_weighted_preds_sq = tf.assign_add( + total_weighted_preds_sq, tf.reduce_sum(weights * predictions * predictions) + ) + update_total_weights = tf.assign_add(total_weights, tf.reduce_sum(weights)) + + # Compute output + def compute_output( + tot_w: tf.Tensor, tot_wp: tf.Tensor, tot_wpp: tf.Tensor + ) -> tf.Tensor: + return tf.math.sqrt(tot_wpp / tot_w - (tot_wp / tot_w) ** 2) + + std_dev_est = compute_output( + total_weights, total_weighted_preds, total_weighted_preds_sq + ) + update_std_dev_est = compute_output( + update_total_weights, + update_total_weighted_preds, + update_total_weighted_preds_sq, + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, std_dev_est) + + if updates_collections: + tf.add_to_collections(updates_collections, update_std_dev_est) + + return std_dev_est, update_std_dev_est + + +def _get_arce_predictions( + predictions: tf.Tensor, + weights: tf.Tensor, + label_weighted: tf.Tensor, + labels: tf.Tensor, + up_weight: bool, + deprecated_rce: bool, + total_positive: tf.Tensor, + update_total_positive: tf.Tensor, +) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """ + Returns the ARCE predictions, total_positive, update_total_positive and weights + used by the rest of the twml.metrics.rce metric computation. 
+ """ + predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") + label_weighted_comp = tf.subtract( + tf.reduce_sum(weights), tf.reduce_sum(label_weighted) + ) + pred_weight_comp = tf.subtract( + tf.reduce_sum(weights), tf.reduce_sum(predictions_weighted) + ) + normalizer_comp = label_weighted_comp / pred_weight_comp - predictions, total_positive, update_total_positive, weights = _get_arce_predictions( - predictions=predictions, weights=weights, deprecated_rce=deprecated_rce, - label_weighted=label_weighted, labels=labels, up_weight=up_weight, - total_positive=total_positive, update_total_positive=update_total_positive) + if up_weight is False: + total_positive_unweighted = _metric_variable( + name="total_positive_unweighted", shape=[], dtype=tf.float32 + ) - elif normalize: - predictions_weighted = tf.multiply(predictions, weights, name="weighted_preds") + update_total_positive_unweighted = tf.assign_add( + total_positive_unweighted, + tf.reduce_sum(labels), + name="total_positive_unweighted_update", + ) - if deprecated_rce: - normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - else: - total_prediction = _metric_variable(name='total_prediction', shape=[], dtype=tf.float32) + if deprecated_rce: + normalizer = tf.reduce_sum(labels) / tf.reduce_sum(label_weighted) + else: + # sum of labels / sum of weighted labels + normalizer = update_total_positive_unweighted / update_total_positive + + label_comp = tf.subtract(tf.to_float(tf.size(labels)), tf.reduce_sum(labels)) + normalizer_comp = label_comp / label_weighted_comp + + # note that up_weight=True changes these for the rest of the twml.metric.rce computation + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + total_positive = total_positive_unweighted + update_total_positive = update_total_positive_unweighted + else: + if deprecated_rce: + normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum( + predictions_weighted + ) + else: + # normalizer used for NRCE (and ARCE with up_weight=True) + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + # update the variable holding the sum of weighted predictions + update_total_prediction = tf.assign_add( + total_prediction, + tf.reduce_sum(predictions_weighted), + name="total_prediction_update", + ) + + # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) + # but it measure normalizer over batch was too flawed an approximation. + normalizer = update_total_positive / update_total_prediction + + pred_comp = tf.subtract( + tf.ones(shape=tf.shape(labels), dtype=tf.float32), predictions + ) + pred_comp_norm = tf.multiply( + pred_comp, normalizer_comp, name="normalized_predictions_comp" + ) + pred_num = tf.multiply(predictions, normalizer, name="normalized_pred_numerator") + pred_denom = tf.add(pred_num, pred_comp_norm, name="normalized_pred_denominator") + predictions = pred_num / pred_denom + + return predictions, total_positive, update_total_positive, weights + + +def rce( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: tf.Tensor = None, + normalize: bool = False, + arce: bool = False, + up_weight: bool = True, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, + deprecated_rce: bool = False, +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Compute the relative cross entropy (RCE). 
+    The RCE is a relative measurement compared to the baseline model's performance.
+    The baseline model always predicts average click-through-rate (CTR).
+    The RCE measures, in percentage, how much better the predictions are, compared
+    to the baseline model, in terms of cross entropy loss.
+
+    y = label; p = prediction;
+    binary cross entropy = -(y * log(p) + (1-y) * log(1-p))
+
+    Args:
+      labels:
+        the ground truth value.
+      predictions:
+        the predicted values, whose shape must match labels.
+      weights:
+        optional weights, whose shape must match labels. Weight is 1 if not set.
+      normalize:
+        if set to true, produce NRCEs used at Twitter. (normalize preds by weights first)
+        NOTE: if you don't understand what NRCE is, please don't use it.
+      arce:
+        if set to true, produces `ARCE `_.
+        This can only be activated if `normalize=True`.
+      up_weight:
+        if set to true, produces arce in the up_weighted space (considers CTR after up_weighting
+        data), while False gives arce in the original space (only considers CTR before up_weighting).
+        In the current version, this flag can only be activated if arce is True.
+        Note that the current version of NRCE corresponds to up_weight=True.
+      metrics_collections:
+        optional list of collections to add this metric into.
+      updates_collections:
+        optional list of collections to add the associated update_op into.
+      name:
+        an optional variable_scope name.
+      deprecated_rce:
+        enables the previous NRCE/ARCE calculations which calculated some label metrics
+        on the batch instead of on all batches seen so far. Note that the older metric
+        calculation is less stable, especially for smaller batch sizes. You should probably
+        never have to set this to True.
+
+    Return:
+      rce_value:
+        A ``Tensor`` representing the RCE.
+      update_op:
+        An update operation used to accumulate data into this metric.
+
+    .. note:: Must have at least 1 positive and 1 negative sample accumulated,
+        or RCE will come out as NaN.
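+
+    Example:
+
+      An illustrative sketch (the numbers are made up for this docstring):
+
+      .. code-block:: python
+
+        labels = tf.constant([1.0, 0.0, 0.0, 0.0])
+        predictions = tf.constant([0.9, 0.1, 0.1, 0.1])
+        # the baseline predicts ctr = 0.25 for every sample; since the model's
+        # cross entropy is lower than the baseline's, rce_value comes out positive
+        rce_value, update_op = rce(labels, predictions)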
+ """ + with tf.variable_scope(name, "rce", (labels, predictions, weights)): + labels = tf.to_float(labels, name="label_to_float") + predictions = tf.to_float(predictions, name="predictions_to_float") + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + else: + weights = tf.to_float(weights, name="weight_to_float") + + total_positive = _metric_variable( + name="total_positive", shape=[], dtype=tf.float32 + ) + total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) + + label_weighted = tf.multiply(labels, weights, name="weighted_label") + + update_total_positive = tf.assign_add( + total_positive, tf.reduce_sum(label_weighted), name="total_pos_update" + ) + + if arce: + if normalize is False: + raise ValueError( + "This configuration of parameters is not actually allowed" + ) + + ( + predictions, + total_positive, + update_total_positive, + weights, + ) = _get_arce_predictions( + predictions=predictions, + weights=weights, + deprecated_rce=deprecated_rce, + label_weighted=label_weighted, + labels=labels, + up_weight=up_weight, + total_positive=total_positive, + update_total_positive=update_total_positive, + ) + + elif normalize: + predictions_weighted = tf.multiply( + predictions, weights, name="weighted_preds" + ) + + if deprecated_rce: + normalizer = tf.reduce_sum(label_weighted) / tf.reduce_sum( + predictions_weighted + ) + else: + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + # update the variable holding the sum of weighted predictions + update_total_prediction = tf.assign_add( + total_prediction, + tf.reduce_sum(predictions_weighted), + name="total_prediction_update", + ) + + # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) + # but it measure normalizer over batch was too flawed an approximation. + normalizer = update_total_positive / update_total_prediction + + # NRCE + predictions = tf.multiply( + predictions, normalizer, name="normalized_predictions" + ) + + # clamp predictions to keep log(p) stable + clip_p = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p" + ) + logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") + + logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") + + update_total_loss = tf.assign_add( + total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update" + ) + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="total_weight_update" + ) + + # metric value retrieval subgraph + ctr1 = tf.truediv(total_positive, total_weight, name="ctr") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. + baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") + pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") + + rce_t = tf.multiply(1.0 - tf.truediv(pred_ce, baseline_ce), 100, name="rce") + + # metric update subgraph + ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. 
+        baseline_ce2 = _binary_cross_entropy(
+            pred=ctr2, target=ctr2, name="baseline_ce_update"
+        )
+        pred_ce2 = tf.truediv(
+            update_total_loss, update_total_weight, name="pred_ce_update"
+        )
+
+        update_op = tf.multiply(
+            1.0 - tf.truediv(pred_ce2, baseline_ce2), 100, name="update_op"
+        )
+
+        if metrics_collections:
+            tf.add_to_collections(metrics_collections, rce_t)
+
+        if updates_collections:
+            tf.add_to_collections(updates_collections, update_op)
+
+        return rce_t, update_op
+
+
+def ce(p_true: tf.Tensor, p_est: Optional[tf.Tensor] = None) -> tf.Tensor:
+    if p_est is None:
+        p_est = p_true
+    return _binary_cross_entropy(pred=p_est, target=p_true, name=None)
+
+
+def rce_transform(outputs: tf.Tensor, labels: tf.Tensor, weights: tf.Tensor) -> dict:
+    """
+    Construct a dict of quantities to aggregate over eval batches.
+    outputs, labels, weights are TensorFlow tensors, and are assumed to
+    be of shape [N] for batch_size = N.
+    Each entry in the output dict should also be of shape [N].
+    """
+    out_vals = dict()
+    out_vals["weighted_loss"] = weights * ce(p_true=labels, p_est=outputs)
+    out_vals["weighted_labels"] = labels * weights
+    out_vals["weight"] = weights
+    return out_vals
+
+
+def rce_metric(aggregates: dict) -> tf.Tensor:
+    """
+    input ``aggregates`` is a dict with the same keys as those created
+    by rce_transform(). The dict values are the aggregates (reduce_sum)
+    of the values produced by rce_transform(), and should be scalars.
+    output is the value of RCE
+    """
+    # cumulative weighted loss of model predictions
+    total_weighted_loss = aggregates["weighted_loss"]
+    total_weighted_labels = aggregates["weighted_labels"]
+    total_weight = aggregates["weight"]
+
+    model_average_loss = total_weighted_loss / total_weight
+    baseline_average_loss = ce(total_weighted_labels / total_weight)
+    return 100.0 * (1 - model_average_loss / baseline_average_loss)
+
+
+def metric_std_err(
+    labels: tf.Tensor,
+    predictions: tf.Tensor,
+    weights: Optional[tf.Tensor] = None,
+    transform: Callable[[tf.Tensor, tf.Tensor, tf.Tensor], dict] = rce_transform,
+    metric: Callable[[dict], tf.Tensor] = rce_metric,
+    metrics_collections: Optional[Collection[tf.Variable]] = None,
+    updates_collections: Optional[Collection[tf.Variable]] = None,
+    name: str = "rce_std_err",
+) -> Tuple[tf.Tensor, tf.Tensor]:
+    """
+    Compute the weighted standard error of the RCE metric on this eval set.
+    This can be used for confidence intervals and unpaired hypothesis tests.
+
+    Args:
+      labels: the ground truth value.
+      predictions: the predicted values, whose shape must match labels.
+      weights: optional weights, whose shape must match labels. Weight is 1 if not set.
+      transform: a function of the following form:
+
+        .. code-block:: python
+
+          def transform(outputs, labels, weights):
+            out_vals = dict()
+            ...
+            return out_vals
+
+        where outputs, labels, and weights are all tensors of shape [eval_batch_size].
+        The returned dict() should have values that are tensors of shape [eval_batch_size].
+        These will be aggregated across many batches in the eval dataset, to produce
+        one scalar value per key of out_vals.
+      metric: a function of the following form
+
+        .. code-block:: python
+
+          def metric(aggregates):
+            ...
+            return metric_value
+
+        where aggregates is a dict() having the same keys created by transform().
+        Each of the corresponding dict values is the reduce_sum of the values produced by
+        transform(), and is a TF scalar. The return value should be a scalar representing
+        the value of the desired metric.
+ metrics_collections: optional list of collections to add this metric into. + updates_collections: optional list of collections to add the associated update_op into. + name: an optional variable_scope name. + + Return: + metric value: A `Tensor` representing the value of the metric on the data accumulated so far. + update_op: A update operation used to accumulate data into this metric. + """ + with tf.variable_scope(name, "metric_std_err", (labels, predictions, weights)): + labels = tf.cast(labels, tf.float64) + predictions = tf.cast(predictions, tf.float64) - # update the variable holding the sum of weighted predictions + if weights is None: + weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") + else: + weights = tf.cast(weights, tf.float64) + + labels = tf.reshape(labels, [-1]) + predictions = tf.reshape(predictions, [-1]) + predictions = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p" + ) + weights = tf.reshape(weights, [-1]) + + # first apply the supplied transform function to the output, label, weight data + # returns an dict of 1xN tensors for N input samples + # for each sample, compute f = transform(pred, l, w) + transformed = transform(predictions, labels, weights) + + # we track 3 types of aggregate information + # 1. total number of samples + # 2. aggregated transformed samples (moment1), i.e. sum(f) + # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) + + # count total number of samples + sample_count = _metric_variable(name="sample_count", shape=[], dtype=tf.int64) + update_sample_count = tf.assign_add( + sample_count, tf.size(labels, out_type=sample_count.dtype) + ) + + # compose the ordered dict into a single vector + # so f can be treated as a single column vector rather than a collection of scalars + N = len(transformed) + transformed_vec = tf.stack(list(transformed.values()), axis=1) + + # compute and update transformed samples (1st order statistics) + # i.e. accumulate f into F as F += sum(f) + aggregates_1 = _metric_variable( + name="aggregates_1", shape=[N], dtype=tf.float64 + ) + update_aggregates_1 = tf.assign_add( + aggregates_1, tf.reduce_sum(transformed_vec, axis=0) + ) + + # compute and update crossed transformed samples (2nd order statistics) + # i.e. accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) + aggregates_2 = _metric_variable( + name="aggregates_2", shape=[N, N], dtype=tf.float64 + ) + moment_2_temp = tf.reshape(transformed_vec, shape=[-1, N, 1]) * tf.reshape( + transformed_vec, shape=[-1, 1, N] + ) + update_aggregates_2 = tf.assign_add( + aggregates_2, tf.reduce_sum(moment_2_temp, axis=0) + ) + + def compute_output( + agg_1: tf.Tensor, agg_2: tf.Tensor, samp_cnt: tf.Tensor + ) -> tf.Tensor: + """Compute the metric value and its standard error.""" + # decompose the aggregates back into a dict to pass to the user-supplied metric fn + aggregates_dict = dict() + for i, key in enumerate(transformed.keys()): + aggregates_dict[key] = agg_1[i] + + metric_value = metric(aggregates_dict) + + # derivative of metric with respect to the 1st order aggregates + # i.e. 
d M(agg1) / d agg1 + metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) + + # estimated covariance of agg_1 + # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N + # = agg_2 - (agg_1 * agg_1^T) / N + N_covariance_estimate = agg_2 - ( + tf.reshape(agg_1, shape=[-1, 1]) + @ tf.reshape(agg_1, shape=[1, -1]) + / tf.cast(samp_cnt, dtype=tf.float64) + ) + + # push N_covariance_estimate through a linearization of metric around agg_1 + # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) + metric_variance = ( + tf.reshape(metric_prime, shape=[1, -1]) + @ N_covariance_estimate + @ tf.reshape(metric_prime, shape=[-1, 1]) + ) + # result should be a single element, but the matmul is 2D + metric_variance = metric_variance[0][0] + metric_stderr = tf.sqrt(metric_variance) + return metric_stderr + + metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) + update_metric_stderr = compute_output( + update_aggregates_1, update_aggregates_2, update_sample_count + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, metric_stderr) + + if updates_collections: + tf.add_to_collections(updates_collections, update_metric_stderr) + + return metric_stderr, update_metric_stderr + + +def lolly_nrce( + labels: tf.Tensor, + predictions: tf.Tensor, + weights: Optional[tf.Tensor] = None, + metrics_collections: Optional[Collection[tf.Variable]] = None, + updates_collections: Optional[Collection[tf.Variable]] = None, + name: Optional[str] = None, +) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Compute the Lolly NRCE. + + Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, + especially when the adjusted ctr goes above 1.0. + + Calculation: + + :: + + NRCE: lolly NRCE + BCE: baseline cross entropy + NCE: normalized cross entropy + CE: cross entropy + y_i: label of example i + p_i: prediction of example i + y: ctr + p: average prediction + a: normalizer + + Assumes any p_i and a * p_i is within [0, 1) + NRCE = (1 - NCE / BCE) * 100 + BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y)) + = - (y * log(y) + (1 - y) * log(1 - y)) + a = y / p + CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) + NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i)) + = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) + - sum_i(y_i * log(a)) + + sum_i((1 - y_i) * log(1 - p_i)) + - sum_i((1 - y_i) * log(1 - a * p_i)) + ~= CE - sum_i(y_i) * log(a) + + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j))) + - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j))) + # Takes 5 items from the Taylor expansion, can be increased if needed + # Error for each example is O(p_i^6) + = CE - sum_i(y_i) * log(a) + - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j) + + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j) + = CE - sum_i(y_i) * log(a) + + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j) + + Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5. + We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that + we can get a at the end, which leads to this NRCE. + + NRCE uses ctr and average pctr to normalize the pctrs. + It removes the impact of prediction error from RCE. + Usually NRCE is higher as the prediction error impact on RCE is negative. + Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE. + + In Lolly NRCE we use ctr and average pctr of the whole dataset. + We thus remove the dataset level error in NRCE calculation. 
+ In this case, when we want to improve RCE to the level of NRCE, + it is achievable as dataset level prediction error is easy to remove by calibration. + Lolly NRCE is thus a good estimate about the potential gain by adding calibration. + + In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch level error. + This error is difficult to remove by modeling improvement, + at least not by simple calibration. + It thus cannot indicate the same opportunity as the Lolly NRCE does. + + Args: + labels: + the ground true value. + predictions: + the predicted values, whose shape must match labels. + weights: + optional weights, whose shape must match labels . Weight is 1 if not set. + metrics_collections: + optional list of collections to add this metric into. + updates_collections: + optional list of collections to add the associated update_op into. + name: + an optional variable_scope name. + + Return: + rce_value: + A ``Tensor`` representing the RCE. + update_op: + A update operation used to accumulate data into this metric. + + Note: Must have at least 1 positive and 1 negative sample accumulated, + or NRCE will come out as NaN. + """ + with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): + labels = tf.to_float(labels, name="label_to_float") + predictions = tf.to_float(predictions, name="predictions_to_float") + + if weights is None: + weights = tf.ones( + shape=tf.shape(labels), dtype=tf.float32, name="default_weight" + ) + else: + weights = tf.to_float(weights, name="weight_to_float") + + positive_weights = tf.multiply(labels, weights, name="positive_weights") + + # clamp predictions to keep log(p) stable + clip_predictions = tf.clip_by_value( + predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_predictions" + ) + weighted_predictions = tf.multiply( + predictions, weights, name="weighted_predictions" + ) + + logloss = _binary_cross_entropy( + pred=clip_predictions, target=labels, name="logloss" + ) + weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") + + negatives = tf.subtract( + tf.ones(shape=tf.shape(labels), dtype=tf.float32), labels, name="negatives" + ) + negative_predictions = tf.multiply( + predictions, negatives, name="negative_predictions" + ) + weighted_negative_predictions = tf.multiply( + negative_predictions, weights, name="weighted_negative_predictions" + ) + negative_squared_predictions = tf.multiply( + negative_predictions, + negative_predictions, + name="negative_squared_predictions", + ) + weighted_negative_squared_predictions = tf.multiply( + negative_squared_predictions, + weights, + name="weighted_negative_squared_predictions", + ) + negative_cubed_predictions = tf.multiply( + negative_squared_predictions, + negative_predictions, + name="negative_cubed_predictions", + ) + weighted_negative_cubed_predictions = tf.multiply( + negative_cubed_predictions, + weights, + name="weighted_negative_cubed_predictions", + ) + negative_quartic_predictions = tf.multiply( + negative_cubed_predictions, + negative_predictions, + name="negative_quartic_predictions", + ) + weighted_negative_quartic_predictions = tf.multiply( + negative_quartic_predictions, + weights, + name="weighted_negative_quartic_predictions", + ) + negative_quintic_predictions = tf.multiply( + negative_quartic_predictions, + negative_predictions, + name="negative_quintic_predictions", + ) + weighted_negative_quintic_predictions = tf.multiply( + negative_quintic_predictions, + weights, + name="weighted_negative_quintic_predictions", + ) + + # 
Tracked stats + total_positive = _metric_variable( + name="total_positive", shape=[], dtype=tf.float32 + ) + total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) + + total_prediction = _metric_variable( + name="total_prediction", shape=[], dtype=tf.float32 + ) + + total_negative_prediction = _metric_variable( + name="total_negative_prediction", shape=[], dtype=tf.float32 + ) + total_negative_squared_prediction = _metric_variable( + name="total_negative_squared_prediction", shape=[], dtype=tf.float32 + ) + total_negative_cubed_prediction = _metric_variable( + name="total_negative_cubed_prediction", shape=[], dtype=tf.float32 + ) + total_negative_quartic_prediction = _metric_variable( + name="total_negative_quartic_prediction", shape=[], dtype=tf.float32 + ) + total_negative_quintic_prediction = _metric_variable( + name="total_negative_quintic_prediction", shape=[], dtype=tf.float32 + ) + + total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) + + # Update tracked stats + update_total_positive = tf.assign_add( + total_positive, + tf.reduce_sum(positive_weights), + name="total_positive_update", + ) + update_total_weight = tf.assign_add( + total_weight, tf.reduce_sum(weights), name="total_weight_update" + ) update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(predictions_weighted), name="total_prediction_update") - - # this used to be tf.reduce_sum(label_weighted) / tf.reduce_sum(predictions_weighted) - # but it measure normalizer over batch was too flawed an approximation. - normalizer = update_total_positive / update_total_prediction - - # NRCE - predictions = tf.multiply(predictions, normalizer, name="normalized_predictions") - - # clamp predictions to keep log(p) stable - clip_p = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - logloss = _binary_cross_entropy(pred=clip_p, target=labels, name="logloss") - - logloss_weighted = tf.multiply(logloss, weights, name="weighted_logloss") - - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(logloss_weighted), name="total_loss_update") - update_total_weight = tf.assign_add( - total_weight, tf.reduce_sum(weights), name="total_weight_update") - - # metric value retrieval subgraph - ctr1 = tf.truediv(total_positive, total_weight, name="ctr") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_ce = _binary_cross_entropy(pred=ctr1, target=ctr1, name="baseline_ce") - pred_ce = tf.truediv(total_loss, total_weight, name="pred_ce") - - rce_t = tf.multiply( - 1.0 - tf.truediv(pred_ce, baseline_ce), - 100, - name="rce") - - # metric update subgraph - ctr2 = tf.truediv(update_total_positive, update_total_weight, name="ctr_update") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
- baseline_ce2 = _binary_cross_entropy(pred=ctr2, target=ctr2, name="baseline_ce_update") - pred_ce2 = tf.truediv(update_total_loss, update_total_weight, name="pred_ce_update") - - update_op = tf.multiply( - 1.0 - tf.truediv(pred_ce2, baseline_ce2), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, rce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return rce_t, update_op - - -def ce(p_true, p_est=None): - if p_est is None: - p_est = p_true - return _binary_cross_entropy(pred=p_est, target=p_true, name=None) - - -def rce_transform(outputs, labels, weights): - ''' - Construct an OrderedDict of quantities to aggregate over eval batches - outputs, labels, weights are TensorFlow tensors, and are assumed to - be of shape [N] for batch_size = N - Each entry in the output OrderedDict should also be of shape [N] - ''' - out_vals = OrderedDict() - out_vals['weighted_loss'] = weights * ce(p_true=labels, p_est=outputs) - out_vals['weighted_labels'] = labels * weights - out_vals['weight'] = weights - return out_vals - - -def rce_metric(aggregates): - ''' - input ``aggregates`` is an OrderedDict with the same keys as those created - by rce_transform(). The dict values are the aggregates (reduce_sum) - of the values produced by rce_transform(), and should be scalars. - output is the value of RCE - ''' - # cummulative weighted loss of model predictions - total_weighted_loss = aggregates['weighted_loss'] - total_weighted_labels = aggregates['weighted_labels'] - total_weight = aggregates['weight'] - - model_average_loss = total_weighted_loss / total_weight - baseline_average_loss = ce(total_weighted_labels / total_weight) - return 100.0 * (1 - model_average_loss / baseline_average_loss) - - -def metric_std_err(labels, predictions, - weights=None, - transform=rce_transform, metric=rce_metric, - metrics_collections=None, - updates_collections=None, - name='rce_std_err'): - """ - Compute the weighted standard error of the RCE metric on this eval set. - This can be used for confidence intervals and unpaired hypothesis tests. - - Args: - labels: the ground truth value. - predictions: the predicted values, whose shape must match labels. - weights: optional weights, whose shape must match labels . Weight is 1 if not set. - transform: a function of the following form: - - .. code-block:: python - - def transform(outputs, labels, weights): - out_vals = OrderedDict() - ... - return out_vals - - where outputs, labels, and weights are all tensors of shape [eval_batch_size]. - The returned OrderedDict() should have values that are tensors of shape [eval_batch_size]. - These will be aggregated across many batches in the eval dataset, to produce - one scalar value per key of out_vals. - metric: a function of the following form - - .. code-block:: python - - def metric(aggregates): - ... - return metric_value - - where aggregates is an OrderedDict() having the same keys created by transform(). - Each of the corresponding dict values is the reduce_sum of the values produced by - transform(), and is a TF scalar. The return value should be a scalar representing - the value of the desired metric. - metrics_collections: optional list of collections to add this metric into. - updates_collections: optional list of collections to add the associated update_op into. - name: an optional variable_scope name. - - Return: - metric value: A `Tensor` representing the value of the metric on the data accumulated so far. 
- update_op: A update operation used to accumulate data into this metric. - """ - with tf.variable_scope(name, 'metric_std_err', (labels, predictions, weights)): - labels = tf.cast(labels, tf.float64) - predictions = tf.cast(predictions, tf.float64) - - if weights is None: - weights = tf.ones_like(labels, dtype=tf.float64, name="default_weight") - else: - weights = tf.cast(weights, tf.float64) - - labels = tf.reshape(labels, [-1]) - predictions = tf.reshape(predictions, [-1]) - predictions = tf.clip_by_value(predictions, CLAMP_EPSILON, 1.0 - CLAMP_EPSILON, name="clip_p") - weights = tf.reshape(weights, [-1]) - - # first apply the supplied transform function to the output, label, weight data - # returns an OrderedDict of 1xN tensors for N input samples - # for each sample, compute f = transform(pred, l, w) - transformed = transform(predictions, labels, weights) - - # we track 3 types of aggregate information - # 1. total number of samples - # 2. aggregated transformed samples (moment1), i.e. sum(f) - # 3. aggregated crosses of transformed samples (moment2), i.e. sum(f*f^T) - - # count total number of samples - sample_count = _metric_variable( - name='sample_count', shape=[], dtype=tf.int64) - update_sample_count = tf.assign_add(sample_count, tf.size(labels, out_type=sample_count.dtype)) - - # compose the ordered dict into a single vector - # so f can be treated as a single column vector rather than a collection of scalars - N = len(transformed) - transformed_vec = tf.stack(list(transformed.values()), axis=1) - - # compute and update transformed samples (1st order statistics) - # i.e. accumulate f into F as F += sum(f) - aggregates_1 = _metric_variable( - name='aggregates_1', shape=[N], dtype=tf.float64) - update_aggregates_1 = tf.assign_add(aggregates_1, tf.reduce_sum(transformed_vec, axis=0)) - - # compute and update crossed transformed samples (2nd order statistics) - # i.e. 
accumulate f*f^T into F2 as F2 += sum(f*transpose(f)) - aggregates_2 = _metric_variable( - name='aggregates_2', shape=[N, N], dtype=tf.float64) - moment_2_temp = ( - tf.reshape(transformed_vec, shape=[-1, N, 1]) - * tf.reshape(transformed_vec, shape=[-1, 1, N]) + total_prediction, + tf.reduce_sum(weighted_predictions), + name="total_prediction_update", + ) + update_total_negative_prediction = tf.assign_add( + total_negative_prediction, + tf.reduce_sum(weighted_negative_predictions), + name="total_negative_prediction_update", + ) + update_total_negative_squared_prediction = tf.assign_add( + total_negative_squared_prediction, + tf.reduce_sum(weighted_negative_squared_predictions), + name="total_negative_squared_prediction_update", + ) + update_total_negative_cubed_prediction = tf.assign_add( + total_negative_cubed_prediction, + tf.reduce_sum(weighted_negative_cubed_predictions), + name="total_negative_cubed_prediction_update", + ) + update_total_negative_quartic_prediction = tf.assign_add( + total_negative_quartic_prediction, + tf.reduce_sum(weighted_negative_quartic_predictions), + name="total_negative_quartic_prediction_update", + ) + update_total_negative_quintic_prediction = tf.assign_add( + total_negative_quintic_prediction, + tf.reduce_sum(weighted_negative_quintic_predictions), + name="total_negative_quintic_prediction_update", + ) + update_total_loss = tf.assign_add( + total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update" + ) + + # metric value retrieval subgraph + # ctr of this batch + positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. + baseline_loss = _binary_cross_entropy( + pred=positive_rate, target=positive_rate, name="baseline_loss" + ) + + # normalizing ratio for nrce + # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr + normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") + # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) + # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) + # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) + normalized_loss = ( + total_loss + - total_positive * tf.log(normalizer) + + total_negative_prediction * (normalizer - 1) + + total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + + total_negative_cubed_prediction + * (normalizer * normalizer * normalizer - 1) + / 3 + + total_negative_quartic_prediction + * (normalizer * normalizer * normalizer * normalizer - 1) + / 4 + + total_negative_quintic_prediction + * (normalizer * normalizer * normalizer * normalizer * normalizer - 1) + / 5 + ) + + # average normalized loss + avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") + + nrce_t = tf.multiply( + 1.0 - tf.truediv(avg_loss, baseline_loss), 100, name="lolly_nrce" + ) + + # metric update subgraph + update_positive_rate = tf.truediv( + update_total_positive, update_total_weight, name="update_positive_rate" + ) + # Note: we don't have to keep running averages for computing baseline CE. Because the prediction + # is constant for every sample, we can simplify it to the formula below. 
+ update_baseline_loss = _binary_cross_entropy( + pred=update_positive_rate, + target=update_positive_rate, + name="update_baseline_loss", + ) + + update_normalizer = tf.truediv( + update_total_positive, update_total_prediction, name="update_normalizer" + ) + update_normalized_loss = ( + update_total_loss + - update_total_positive * tf.log(update_normalizer) + + update_total_negative_prediction * (update_normalizer - 1) + + update_total_negative_squared_prediction + * (update_normalizer * update_normalizer - 1) + / 2 + + update_total_negative_cubed_prediction + * (update_normalizer * update_normalizer * update_normalizer - 1) + / 3 + + update_total_negative_quartic_prediction + * ( + update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + - 1 + ) + / 4 + + update_total_negative_quintic_prediction + * ( + update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + * update_normalizer + - 1 + ) + / 5 + ) + + update_avg_loss = tf.truediv( + update_normalized_loss, update_total_weight, name="update_avg_loss" + ) + + update_op = tf.multiply( + 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), + 100, + name="update_op", + ) + + if metrics_collections: + tf.add_to_collections(metrics_collections, nrce_t) + + if updates_collections: + tf.add_to_collections(updates_collections, update_op) + + return nrce_t, update_op + + +def _binary_cross_entropy( + pred: tf.Tensor, + target: tf.Tensor, + name: Optional[str] = None, +) -> tf.Tensor: + return -tf.add( + target * tf.log(pred), (1.0 - target) * tf.log(1.0 - pred), name=name ) - update_aggregates_2 = tf.assign_add(aggregates_2, tf.reduce_sum(moment_2_temp, axis=0)) - - def compute_output(agg_1, agg_2, samp_cnt): - # decompose the aggregates back into a dict to pass to the user-supplied metric fn - aggregates_dict = OrderedDict() - for i, key in enumerate(transformed.keys()): - aggregates_dict[key] = agg_1[i] - - metric_value = metric(aggregates_dict) - - # derivative of metric with respect to the 1st order aggregates - # i.e. d M(agg1) / d agg1 - metric_prime = tf.gradients(metric_value, agg_1, stop_gradients=agg_1) - - # estimated covariance of agg_1 - # cov(F) = sum(f*f^T) - (sum(f) * sum(f)^T) / N - # = agg_2 - (agg_1 * agg_1^T) / N - N_covariance_estimate = agg_2 - ( - tf.reshape(agg_1, shape=[-1, 1]) - @ tf.reshape(agg_1, shape=[1, -1]) - / tf.cast(samp_cnt, dtype=tf.float64) - ) - - # push N_covariance_estimate through a linearization of metric around agg_1 - # metric var = transpose(d M(agg1) / d agg1) * cov(F) * (d M(agg1) / d agg1) - metric_variance = ( - tf.reshape(metric_prime, shape=[1, -1]) - @ N_covariance_estimate - @ tf.reshape(metric_prime, shape=[-1, 1]) - ) - # result should be a single element, but the matmul is 2D - metric_variance = metric_variance[0][0] - metric_stderr = tf.sqrt(metric_variance) - return metric_stderr - - metric_stderr = compute_output(aggregates_1, aggregates_2, sample_count) - update_metric_stderr = compute_output(update_aggregates_1, update_aggregates_2, update_sample_count) - - if metrics_collections: - tf.add_to_collections(metrics_collections, metric_stderr) - - if updates_collections: - tf.add_to_collections(updates_collections, update_metric_stderr) - - return metric_stderr, update_metric_stderr - - -def lolly_nrce(labels, predictions, - weights=None, - metrics_collections=None, - updates_collections=None, - name=None): - """ - Compute the Lolly NRCE. 
- - Note: As this NRCE calculation uses Taylor expansion, it becomes inaccurate when the ctr is large, - especially when the adjusted ctr goes above 1.0. - - Calculation: - - :: - - NRCE: lolly NRCE - BCE: baseline cross entropy - NCE: normalized cross entropy - CE: cross entropy - y_i: label of example i - p_i: prediction of example i - y: ctr - p: average prediction - a: normalizer - - Assumes any p_i and a * p_i is within [0, 1) - NRCE = (1 - NCE / BCE) * 100 - BCE = - sum_i(y_i * log(y) + (1 - y_i) * log(1 - y)) - = - (y * log(y) + (1 - y) * log(1 - y)) - a = y / p - CE = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - NCE = - sum_i(y_i * log(a * p_i) + (1 - y_i) * log(1 - a * p_i)) - = - sum_i(y_i * log(p_i) + (1 - y_i) * log(1 - p_i)) - - sum_i(y_i * log(a)) - + sum_i((1 - y_i) * log(1 - p_i)) - - sum_i((1 - y_i) * log(1 - a * p_i)) - ~= CE - sum_i(y_i) * log(a) - + sum_i((1 - y_i) * (- sum_{j=1~5}(p_i^j / j))) - - sum_i((1 - y_i) * (- sum_{j=1~5}(a^j * p_i^j / j))) - # Takes 5 items from the Taylor expansion, can be increased if needed - # Error for each example is O(p_i^6) - = CE - sum_i(y_i) * log(a) - - sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) / j) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * a^j / j) - = CE - sum_i(y_i) * log(a) - + sum_{j=1~5}(sum_i((1 - y_i) * p_i^j) * (a^j - 1) / j) - - Thus we keep track of CE, sum_i(y_i), sum_i((1 - y_i) * p_i^j) for j=1~5. - We also keep track of p and y by sum_i(y_i), sum_i(p_i), sum_i(1) so that - we can get a at the end, which leads to this NRCE. - - NRCE uses ctr and average pctr to normalize the pctrs. - It removes the impact of prediction error from RCE. - Usually NRCE is higher as the prediction error impact on RCE is negative. - Removing prediction error in our model can make RCE closer to NRCE and thus improve RCE. - - In Lolly NRCE we use ctr and average pctr of the whole dataset. - We thus remove the dataset level error in NRCE calculation. - In this case, when we want to improve RCE to the level of NRCE, - it is achievable as dataset level prediction error is easy to remove by calibration. - Lolly NRCE is thus a good estimate about the potential gain by adding calibration. - - In DBv2 NRCE, we use per-batch ctr and average pctr. We remove the batch level error. - This error is difficult to remove by modeling improvement, - at least not by simple calibration. - It thus cannot indicate the same opportunity as the Lolly NRCE does. - - Args: - labels: - the ground true value. - predictions: - the predicted values, whose shape must match labels. - weights: - optional weights, whose shape must match labels . Weight is 1 if not set. - metrics_collections: - optional list of collections to add this metric into. - updates_collections: - optional list of collections to add the associated update_op into. - name: - an optional variable_scope name. - - Return: - rce_value: - A ``Tensor`` representing the RCE. - update_op: - A update operation used to accumulate data into this metric. - - Note: Must have at least 1 positive and 1 negative sample accumulated, - or NRCE will come out as NaN. 
- """ - with tf.variable_scope(name, "lolly_nrce", (labels, predictions, weights)): - labels = tf.to_float(labels, name="label_to_float") - predictions = tf.to_float(predictions, name="predictions_to_float") - - if weights is None: - weights = tf.ones(shape=tf.shape(labels), dtype=tf.float32, name="default_weight") - else: - weights = tf.to_float(weights, name="weight_to_float") - - positive_weights = tf.multiply(labels, weights, name="positive_weights") - - # clamp predictions to keep log(p) stable - clip_predictions = tf.clip_by_value( - predictions, - CLAMP_EPSILON, - 1.0 - CLAMP_EPSILON, - name="clip_predictions") - weighted_predictions = tf.multiply( - predictions, weights, - name="weighted_predictions") - - logloss = _binary_cross_entropy(pred=clip_predictions, target=labels, name="logloss") - weighted_logloss = tf.multiply(logloss, weights, name="weighted_logloss") - - negatives = tf.subtract( - tf.ones(shape=tf.shape(labels), dtype=tf.float32), - labels, - name="negatives") - negative_predictions = tf.multiply( - predictions, - negatives, - name="negative_predictions") - weighted_negative_predictions = tf.multiply( - negative_predictions, weights, - name="weighted_negative_predictions") - negative_squared_predictions = tf.multiply( - negative_predictions, - negative_predictions, - name="negative_squared_predictions") - weighted_negative_squared_predictions = tf.multiply( - negative_squared_predictions, weights, - name="weighted_negative_squared_predictions") - negative_cubed_predictions = tf.multiply( - negative_squared_predictions, - negative_predictions, - name="negative_cubed_predictions") - weighted_negative_cubed_predictions = tf.multiply( - negative_cubed_predictions, weights, - name="weighted_negative_cubed_predictions") - negative_quartic_predictions = tf.multiply( - negative_cubed_predictions, - negative_predictions, - name="negative_quartic_predictions") - weighted_negative_quartic_predictions = tf.multiply( - negative_quartic_predictions, weights, - name="weighted_negative_quartic_predictions") - negative_quintic_predictions = tf.multiply( - negative_quartic_predictions, - negative_predictions, - name="negative_quintic_predictions") - weighted_negative_quintic_predictions = tf.multiply( - negative_quintic_predictions, weights, - name="weighted_negative_quintic_predictions") - - # Tracked stats - total_positive = _metric_variable(name="total_positive", shape=[], dtype=tf.float32) - total_weight = _metric_variable(name="total_weight", shape=[], dtype=tf.float32) - - total_prediction = _metric_variable(name="total_prediction", shape=[], dtype=tf.float32) - - total_negative_prediction = _metric_variable( - name="total_negative_prediction", - shape=[], dtype=tf.float32) - total_negative_squared_prediction = _metric_variable( - name="total_negative_squared_prediction", - shape=[], dtype=tf.float32) - total_negative_cubed_prediction = _metric_variable( - name="total_negative_cubed_prediction", - shape=[], dtype=tf.float32) - total_negative_quartic_prediction = _metric_variable( - name="total_negative_quartic_prediction", - shape=[], dtype=tf.float32) - total_negative_quintic_prediction = _metric_variable( - name="total_negative_quintic_prediction", - shape=[], dtype=tf.float32) - - total_loss = _metric_variable(name="total_loss", shape=[], dtype=tf.float32) - - # Update tracked stats - update_total_positive = tf.assign_add( - total_positive, tf.reduce_sum(positive_weights), name="total_positive_update") - update_total_weight = tf.assign_add( - total_weight, 
tf.reduce_sum(weights), name="total_weight_update") - update_total_prediction = tf.assign_add( - total_prediction, tf.reduce_sum(weighted_predictions), name="total_prediction_update") - update_total_negative_prediction = tf.assign_add( - total_negative_prediction, - tf.reduce_sum(weighted_negative_predictions), name="total_negative_prediction_update") - update_total_negative_squared_prediction = tf.assign_add( - total_negative_squared_prediction, - tf.reduce_sum(weighted_negative_squared_predictions), - name="total_negative_squared_prediction_update") - update_total_negative_cubed_prediction = tf.assign_add( - total_negative_cubed_prediction, - tf.reduce_sum(weighted_negative_cubed_predictions), - name="total_negative_cubed_prediction_update") - update_total_negative_quartic_prediction = tf.assign_add( - total_negative_quartic_prediction, - tf.reduce_sum(weighted_negative_quartic_predictions), - name="total_negative_quartic_prediction_update") - update_total_negative_quintic_prediction = tf.assign_add( - total_negative_quintic_prediction, - tf.reduce_sum(weighted_negative_quintic_predictions), - name="total_negative_quintic_prediction_update") - update_total_loss = tf.assign_add( - total_loss, tf.reduce_sum(weighted_logloss), name="total_loss_update") - - # metric value retrieval subgraph - # ctr of this batch - positive_rate = tf.truediv(total_positive, total_weight, name="positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. - baseline_loss = _binary_cross_entropy( - pred=positive_rate, - target=positive_rate, - name="baseline_loss") - - # normalizing ratio for nrce - # calculated using total ctr and pctr so the last batch has the dataset ctr and pctr - normalizer = tf.truediv(total_positive, total_prediction, name="normalizer") - # Taylor expansion to calculate nl = - sum(y * log(p * a) + (1 - y) * log (1 - p * a)) - # log(1 - p * a) = -sum_{i=1~+inf}(a^i * x^i / i) - # log(1 - p) = -sum_{i=1~+inf}(a^i * x^i / i) - normalized_loss = ( - total_loss - - total_positive * tf.log(normalizer) + - total_negative_prediction * (normalizer - 1) + - total_negative_squared_prediction * (normalizer * normalizer - 1) / 2 + - total_negative_cubed_prediction * - (normalizer * normalizer * normalizer - 1) / 3 + - total_negative_quartic_prediction * - (normalizer * normalizer * normalizer * normalizer - 1) / 4 + - total_negative_quintic_prediction * - (normalizer * normalizer * normalizer * normalizer * normalizer - 1) / 5) - - # average normalized loss - avg_loss = tf.truediv(normalized_loss, total_weight, name="avg_loss") - - nrce_t = tf.multiply( - 1.0 - tf.truediv(avg_loss, baseline_loss), - 100, - name="lolly_nrce") - - # metric update subgraph - update_positive_rate = tf.truediv( - update_total_positive, - update_total_weight, - name="update_positive_rate") - # Note: we don't have to keep running averages for computing baseline CE. Because the prediction - # is constant for every sample, we can simplify it to the formula below. 
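The five-term expansion used in `normalized_loss` can be sanity-checked numerically; as the docstring warns, the per-example error is O(p^6), so accuracy degrades as a * p approaches 1. A small standalone check (illustrative only):

import numpy as np

def log1m_taylor(x, terms=5):
    # log(1 - x) ~= -sum_{j=1..terms} x^j / j, accurate only for small x
    return -sum(x ** j / j for j in range(1, terms + 1))

for ap in (0.05, 0.3, 0.7, 0.95):
    exact = np.log(1.0 - ap)
    approx = log1m_taylor(ap)
    print(f"a*p={ap:.2f}  exact={exact:+.4f}  taylor={approx:+.4f}  err={abs(exact - approx):.2e}")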
- update_baseline_loss = _binary_cross_entropy( - pred=update_positive_rate, - target=update_positive_rate, - name="update_baseline_loss") - - update_normalizer = tf.truediv( - update_total_positive, - update_total_prediction, - name="update_normalizer") - update_normalized_loss = ( - update_total_loss - - update_total_positive * tf.log(update_normalizer) + - update_total_negative_prediction * - (update_normalizer - 1) + - update_total_negative_squared_prediction * - (update_normalizer * update_normalizer - 1) / 2 + - update_total_negative_cubed_prediction * - (update_normalizer * update_normalizer * update_normalizer - 1) / 3 + - update_total_negative_quartic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer - 1) / 4 + - update_total_negative_quintic_prediction * - (update_normalizer * update_normalizer * update_normalizer * - update_normalizer * update_normalizer - 1) / 5) - - update_avg_loss = tf.truediv( - update_normalized_loss, - update_total_weight, - name="update_avg_loss") - - update_op = tf.multiply( - 1.0 - tf.truediv(update_avg_loss, update_baseline_loss), - 100, - name="update_op") - - if metrics_collections: - tf.add_to_collections(metrics_collections, nrce_t) - - if updates_collections: - tf.add_to_collections(updates_collections, update_op) - - return nrce_t, update_op - - -def _binary_cross_entropy(pred, target, name): - return - tf.add( - target * tf.log(pred), - (1.0 - target) * tf.log(1.0 - pred), - name=name) # Copied from metrics_impl.py with minor modifications. # https://github.com/tensorflow/tensorflow/blob/v1.5.0/tensorflow/python/ops/metrics_impl.py#L39 -def _metric_variable(shape, dtype, validate_shape=True, name=None): - """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" +def _metric_variable( + shape: Sequence[int], + dtype: tf.dtypes.DType, + validate_shape: bool = True, + name: Optional[str] = None, +) -> tf.Variable: + """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections.""" + + return tf.Variable( + lambda: tf.zeros(shape, dtype), + trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], + validate_shape=validate_shape, + name=name, + ) - return tf.Variable( - lambda: tf.zeros(shape, dtype), - trainable=False, - collections=[tf.GraphKeys.LOCAL_VARIABLES, tf.GraphKeys.METRIC_VARIABLES], - validate_shape=validate_shape, - name=name) PERCENTILES = np.linspace(0, 1, 101, dtype=np.float32) # metric_name: (metric, requires thresholded output) SUPPORTED_BINARY_CLASS_METRICS = { - # TWML metrics - 'total_weight': (total_weight_metric, False), - 'num_samples': (num_samples_metric, False), - 'rce': (rce, False), - 'rce_std_err': (partial(metric_std_err, transform=rce_transform, metric=rce_metric, name='rce_std_err'), False), - 'nrce': (partial(rce, normalize=True), False), - 'lolly_nrce': (lolly_nrce, False), - 'arce': (partial(rce, normalize=True, arce=True), False), - 'arce_original': (partial(rce, normalize=True, arce=True, up_weight=False), False), - # CTR measures positive sample ratio. This terminology is inherited from Ads. - 'ctr': (ctr, False), - # predicted CTR measures predicted positive ratio. 
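The accumulator pattern generalizes: a streaming metric is a (value_op, update_op) pair over `_metric_variable` state. A minimal sketch of the same pattern, assuming the `_metric_variable` helper defined in this module and TF1 graph mode (`streaming_weighted_mean` is a hypothetical name):

import tensorflow.compat.v1 as tf

def streaming_weighted_mean(values, weights, name="weighted_mean"):
    # State lives in local metric variables; value_op reads the running
    # totals, update_op folds in the current batch and returns the result.
    with tf.variable_scope(name):
        total = _metric_variable(name="total", shape=[], dtype=tf.float32)
        count = _metric_variable(name="count", shape=[], dtype=tf.float32)

        update_total = tf.assign_add(total, tf.reduce_sum(values * weights))
        update_count = tf.assign_add(count, tf.reduce_sum(weights))

        value_op = tf.truediv(total, count, name="value")
        update_op = tf.truediv(update_total, update_count, name="update")
        return value_op, update_op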
- 'predicted_ctr': (predicted_ctr, False), - 'pred_std_dev': (prediction_std_dev, False), - # thresholded metrics - 'accuracy': (tf.metrics.accuracy, True), - 'precision': (tf.metrics.precision, True), - 'recall': (tf.metrics.recall, True), - - 'false_positives': (tf.metrics.false_positives, True), - 'false_negatives': (tf.metrics.false_negatives, True), - 'true_positives': (tf.metrics.true_positives, True), - 'true_negatives': (tf.metrics.true_negatives, True), - - 'precision_at_percentiles': (partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), False), - 'recall_at_percentiles': (partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), False), - 'false_positives_at_percentiles': (partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), False), - 'false_negatives_at_percentiles': (partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), False), - 'true_positives_at_percentiles': (partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), False), - 'true_negatives_at_percentiles': (partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), False), - - # tensorflow metrics - 'roc_auc': (partial(tf.metrics.auc, curve='ROC', - summation_method='careful_interpolation'), False), - 'pr_auc': (partial(tf.metrics.auc, curve='PR', - summation_method='careful_interpolation'), False), - - # tensorboard curves - 'pr_curve': (tb.summary.v1.pr_curve_streaming_op, False), - - # deprecated metrics - 'deprecated_nrce': (partial(rce, normalize=True, deprecated_rce=True), False), - 'deprecated_arce': (partial(rce, normalize=True, arce=True, deprecated_rce=True), False), - 'deprecated_arce_original': (partial(rce, normalize=True, arce=True, - up_weight=False, deprecated_rce=True), False) + # TWML metrics + "total_weight": (total_weight_metric, False), + "num_samples": (num_samples_metric, False), + "rce": (rce, False), + "rce_std_err": ( + partial( + metric_std_err, + transform=rce_transform, + metric=rce_metric, + name="rce_std_err", + ), + False, + ), + "nrce": (partial(rce, normalize=True), False), + "lolly_nrce": (lolly_nrce, False), + "arce": (partial(rce, normalize=True, arce=True), False), + "arce_original": (partial(rce, normalize=True, arce=True, up_weight=False), False), + # CTR measures positive sample ratio. This terminology is inherited from Ads. + "ctr": (ctr, False), + # predicted CTR measures predicted positive ratio. 
+ "predicted_ctr": (predicted_ctr, False), + "pred_std_dev": (prediction_std_dev, False), + # thresholded metrics + "accuracy": (tf.metrics.accuracy, True), + "precision": (tf.metrics.precision, True), + "recall": (tf.metrics.recall, True), + "false_positives": (tf.metrics.false_positives, True), + "false_negatives": (tf.metrics.false_negatives, True), + "true_positives": (tf.metrics.true_positives, True), + "true_negatives": (tf.metrics.true_negatives, True), + "precision_at_percentiles": ( + partial(tf.metrics.precision_at_thresholds, thresholds=PERCENTILES), + False, + ), + "recall_at_percentiles": ( + partial(tf.metrics.recall_at_thresholds, thresholds=PERCENTILES), + False, + ), + "false_positives_at_percentiles": ( + partial(tf.metrics.false_positives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "false_negatives_at_percentiles": ( + partial(tf.metrics.false_negatives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "true_positives_at_percentiles": ( + partial(tf.metrics.true_positives_at_thresholds, thresholds=PERCENTILES), + False, + ), + "true_negatives_at_percentiles": ( + partial(tf.metrics.true_negatives_at_thresholds, thresholds=PERCENTILES), + False, + ), + # tensorflow metrics + "roc_auc": ( + partial(tf.metrics.auc, curve="ROC", summation_method="careful_interpolation"), + False, + ), + "pr_auc": ( + partial(tf.metrics.auc, curve="PR", summation_method="careful_interpolation"), + False, + ), + # tensorboard curves + "pr_curve": (tb.summary.v1.pr_curve_streaming_op, False), + # deprecated metrics + "deprecated_nrce": (partial(rce, normalize=True, deprecated_rce=True), False), + "deprecated_arce": ( + partial(rce, normalize=True, arce=True, deprecated_rce=True), + False, + ), + "deprecated_arce_original": ( + partial(rce, normalize=True, arce=True, up_weight=False, deprecated_rce=True), + False, + ), } # default metrics provided by get_binary_class_metric_fn -DEFAULT_BINARY_CLASS_METRICS = ['total_weight', 'num_samples', 'rce', 'rce_std_err', - 'nrce', 'arce', 'ctr', 'predicted_ctr', 'pred_std_dev', - 'accuracy', 'precision', 'recall', 'roc_auc', 'pr_auc'] - - -def get_binary_class_metric_fn(metrics=None): - """ - Returns a function having signature: - - .. code-block:: python - - def get_eval_metric_ops(graph_output, labels, weights): - ... - return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for binary classification. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) 
- - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - `arce `_ (a more recent proposed improvment over NRCE) - - arce_original - - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - deprecated_arce (ARCE as it was calculated before a stability fix) - - deprecated_nrce (NRCE as it was calculated before a stability fix) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When predicting rare events roc_auc can be underestimated. Increasing num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - """ - # pylint: disable=dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): +DEFAULT_BINARY_CLASS_METRICS = [ + "total_weight", + "num_samples", + "rce", + "rce_std_err", + "nrce", + "arce", + "ctr", + "predicted_ctr", + "pred_std_dev", + "accuracy", + "precision", + "recall", + "roc_auc", + "pr_auc", +] + + +def get_binary_class_metric_fn(metrics: Optional[List[str]] = None) -> Callable: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + Returns a function having signature: + + .. code-block:: python + + def get_eval_metric_ops(graph_output, labels, weights): + ... + return eval_metric_ops + + where the returned eval_metric_ops is a dict of common evaluation metric + Ops for binary classification. See `tf.estimator.EstimatorSpec + `_ + for a description of eval_metric_ops. The graph_output is a the result + dict returned by build_graph. Labels and weights are tf.Tensors. + + The following graph_output keys are recognized: + output: + the raw predictions between 0 and 1. Required. + threshold: + A value between 0 and 1 used to threshold the output into a hard_output. + Defaults to 0.5 when threshold and hard_output are missing. + Either threshold or hard_output can be provided, but not both. + hard_output: + A thresholded output. Either threshold or hard_output can be provided, but not both. + + Args: + metrics (list of String): + a list of metrics of interest. E.g. 
['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from the following supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
+
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        Supported metrics:
+
+        - ctr (same as positive sample ratio.)
+        - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+        - nrce (normalized rce, do not use this one if you do not understand what it is)
+        - `arce `_ (a more recently proposed improvement over NRCE)
+        - arce_original
+        - lolly_nrce (NRCE as it is computed in Lolly, with Taylor expansion)
+        - pr_auc
+        - roc_auc
+        - accuracy (percentage of predictions that are correct)
+        - precision (true positives) / (true positives + false positives)
+        - recall (true positives) / (true positives + false negatives)
+        - pr_curve (precision-recall curve)
+        - deprecated_arce (ARCE as it was calculated before a stability fix)
+        - deprecated_nrce (NRCE as it was calculated before a stability fix)
+
+        Example of metrics list with mixture of string and tuple:
+        metrics = [
+          'rce','nrce',
+          'roc_auc',  # default roc_auc metric
+          (
+            'roc_auc_500',  # give this metric a name
+            partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
+            False,  # whether the metric requires thresholded output
+          )]
+
+        NOTE: When predicting rare events, roc_auc can be underestimated. Increasing num_thresholds
+        can reduce the underestimation. See go/roc-auc-pitfall for more details.
+
+        NOTE: accuracy / precision / recall apply to binary classification problems only.
+        I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+        is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+        precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+        your predictions into hard 0/1 labels.
+
+        When metrics is None (the default), it defaults to:
+        [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
    """
-
-    eval_metric_ops = OrderedDict()
-
-    preds = graph_output['output']
-
-    threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
-
-    hard_preds = graph_output.get('hard_output')
-    if hard_preds is None:
-      hard_preds = tf.greater_equal(preds, threshold)
-
-    # add metrics to eval_metric_ops dict
-    for metric in metrics:
-      if isinstance(metric, tuple) and len(metric) == 3:
-        metric_name, metric_factory, requires_threshold = metric
-        metric_name = metric_name.lower()
-      elif isinstance(metric, str):
-        metric_name = metric.lower()  # metric names are case-insensitive.
-        metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
-      else:
-        raise ValueError("Metric should be either string or tuple of length 3.")
-
-      if metric_name in eval_metric_ops:
-        # avoid adding duplicate metrics.
-        continue
-
-      if metric_factory:
-        value_op, update_op = metric_factory(
-          labels=labels,
-          predictions=(hard_preds if requires_threshold else preds),
-          weights=weights, name=metric_name)
-        eval_metric_ops[metric_name] = (value_op, update_op)
-      else:
-        raise ValueError('Cannot find the metric named ' + metric_name)
-
-    return eval_metric_ops
-
-  return get_eval_metric_ops
-
-
-def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1):
-  """
-  Returns a function having signature:
-
-  .. code-block:: python
-
-    def get_eval_metric_ops(graph_output, labels, weights):
-      ...
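A hedged usage sketch mirroring the docstring's mixed string/tuple example; `roc_auc_500` is just a display name chosen for illustration:

from functools import partial
import tensorflow.compat.v1 as tf

metric_fn = get_binary_class_metric_fn(metrics=[
    "rce",
    "ctr",
    (
        "roc_auc_500",  # display name for tensorboard
        partial(tf.metrics.auc, curve="ROC",
                summation_method="careful_interpolation", num_thresholds=500),
        False,  # consumes soft predictions, not thresholded output
    ),
])
# metric_fn(graph_output, labels, weights) returns {name: (value_op, update_op)}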
- return eval_metric_ops - - where the returned eval_metric_ops is a dict of common evaluation metric - Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec - `_ - for a description of eval_metric_ops. The graph_output is a the result - dict returned by build_graph. Labels and weights are tf.Tensors. - - In multiple binary classification problems, the - ``predictions`` (that is, ``graph_output['output']``) - are expected to have shape ``batch_size x n_classes``, - where ``n_classes`` is the number of binary classification. - Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1) - and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output`` - with binary values (0 or 1). The weights can be of size ``batch_size`` or - ``batch_size x n_classes``. The ``class_dim`` contain separate probabilities, - and need to have separate metrics. - - The following graph_output keys are recognized: - output: - the raw predictions between 0 and 1. Required. - threshold: - A value between 0 and 1 used to threshold the output into a hard_output. - Defaults to 0.5 when threshold and hard_output are missing. - Either threshold or hard_output can be provided, but not both. - hard_output: - A thresholded output. Either threshold or hard_output can be provided, but not both. - - Args: - metrics (list of Metrics): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from following supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - Supported metrics: - - - ctr (same as positive sample ratio.) - - rce (cross entropy loss compared to the baseline model of always predicting ctr) - - nrce (normalized rce, do not use this one if you do not understand what it is) - - pr_auc - - roc_auc - - accuracy (percentage of predictions that are correct) - - precision (true positives) / (true positives + false positives) - - recall (true positives) / (true positives + false negatives) - - pr_curve (precision-recall curve) - - Example of metrics list with mixture of string and tuple: - metrics = [ - 'rce','nrce', - 'roc_auc', # default roc_auc metric - ( - 'roc_auc_500', # give this metric a name - partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500), # the metric fn - False, # whether the metric requires thresholded output - )] - - NOTE: When prediction on rare events, roc_auc can be underestimated. Increase num_threshold - can reduce the underestimation. See go/roc-auc-pitfall for more details. - - NOTE: accuracy / precision / recall apply to binary classification problems only. - I.e. a prediction is only considered correct if it matches the label. E.g. if the label - is 1.0, and the prediction is 0.99, it does not get credit. If you want to use - precision / recall / accuracy metrics with soft predictions, you'll need to threshold - your predictions into hard 0/1 labels. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. 
- If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(DEFAULT_BINARY_CLASS_METRICS) - - def get_eval_metric_ops(graph_output, labels, weights): + # pylint: disable=dict-keys-not-iterating + if metrics is None: + # remove expensive metrics by default for faster eval + metrics = list(DEFAULT_BINARY_CLASS_METRICS) + + def get_eval_metric_ops( + graph_output: Dict[str, tf.Tensor], + labels: tf.Tensor, + weights: tf.Tensor, + ) -> Dict[str, tf.Tensor]: + """ + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples.. + """ + + eval_metric_ops = dict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + if hard_preds is None: + hard_preds = tf.greater_equal(preds, threshold) + + # add metrics to eval_metric_ops dict + for metric in metrics: + if isinstance(metric, tuple) and len(metric) == 3: + metric_name, metric_factory, requires_threshold = metric + metric_name = metric_name.lower() + elif isinstance(metric, str): + metric_name = metric.lower() # metric name are case insensitive. + metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get( + metric_name + ) + else: + raise ValueError("Metric should be either string or tuple of length 3.") + + if metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + if metric_factory: + value_op, update_op = metric_factory( + labels=labels, + predictions=(hard_preds if requires_threshold else preds), + weights=weights, + name=metric_name, + ) + eval_metric_ops[metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + return eval_metric_ops + + return get_eval_metric_ops + + +def get_multi_binary_class_metric_fn( + metrics: List[str], + classes: Optional[List[str]] = None, + class_dim: int = 1, +) -> Callable: """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated to batch. - weights: - weights of the samples.. + Returns a function having signature: + + .. code-block:: python + + def get_eval_metric_ops(graph_output, labels, weights): + ... + return eval_metric_ops + + where the returned eval_metric_ops is a dict of common evaluation metric + Ops for concatenated binary classifications. See `tf.estimator.EstimatorSpec + `_ + for a description of eval_metric_ops. The graph_output is a the result + dict returned by build_graph. Labels and weights are tf.Tensors. + + In multiple binary classification problems, the + ``predictions`` (that is, ``graph_output['output']``) + are expected to have shape ``batch_size x n_classes``, + where ``n_classes`` is the number of binary classification. + Binary classification at output[i] is expected to discriminate between ``classes[i]`` (1) + and NOT ``classes[i]`` (0). The labels should be of the same shape as ``graph_output`` + with binary values (0 or 1). The weights can be of size ``batch_size`` or + ``batch_size x n_classes``. 
The entries along ``class_dim`` contain separate probabilities,
+    and each needs its own metrics.
+
+    The following graph_output keys are recognized:
+      output:
+        the raw predictions between 0 and 1. Required.
+      threshold:
+        A value between 0 and 1 used to threshold the output into a hard_output.
+        Defaults to 0.5 when threshold and hard_output are missing.
+        Either threshold or hard_output can be provided, but not both.
+      hard_output:
+        A thresholded output. Either threshold or hard_output can be provided, but not both.
+
+    Args:
+      metrics (list of Metrics):
+        a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from the following supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
+
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        Supported metrics:
+
+        - ctr (same as positive sample ratio.)
+        - rce (cross entropy loss compared to the baseline model of always predicting ctr)
+        - nrce (normalized rce, do not use this one if you do not understand what it is)
+        - pr_auc
+        - roc_auc
+        - accuracy (percentage of predictions that are correct)
+        - precision (true positives) / (true positives + false positives)
+        - recall (true positives) / (true positives + false negatives)
+        - pr_curve (precision-recall curve)
+
+        Example of metrics list with mixture of string and tuple:
+        metrics = [
+          'rce','nrce',
+          'roc_auc',  # default roc_auc metric
+          (
+            'roc_auc_500',  # give this metric a name
+            partial(tf.metrics.auc, curve='ROC', summation_method='careful_interpolation', num_thresholds=500),  # the metric fn
+            False,  # whether the metric requires thresholded output
+          )]
+
+        NOTE: When predicting rare events, roc_auc can be underestimated. Increasing num_thresholds
+        can reduce the underestimation. See go/roc-auc-pitfall for more details.
+
+        NOTE: accuracy / precision / recall apply to binary classification problems only.
+        I.e. a prediction is only considered correct if it matches the label. E.g. if the label
+        is 1.0, and the prediction is 0.99, it does not get credit. If you want to use
+        precision / recall / accuracy metrics with soft predictions, you'll need to threshold
+        your predictions into hard 0/1 labels.
+
+        When metrics is None (the default), it defaults to:
+        [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
+
+      classes (list of strings):
+        In case of multiple binary class models, the names for each class or label.
+        These are used to display metrics on tensorboard.
+        If these are not specified, the index in the class or label dimension is used, and you'll
+        get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
+
+      class_dim (number):
+        Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
    """
-
-    eval_metric_ops = OrderedDict()
-
-    preds = graph_output['output']
-
-    threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
-
-    hard_preds = graph_output.get('hard_output')
-    if hard_preds is None:
-      hard_preds = tf.greater_equal(preds, threshold)
-
-    shape = labels.get_shape()
-    # basic sanity check: multi_metric dimension must exist
-    assert len(shape) > class_dim, "Dimension specified by class_dim does not exist."
-
-    num_labels = shape[class_dim]
-    # If we are doing multi-class / multi-label metric, the number of classes / labels must
-    # be known at graph construction time.
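As a hedged illustration of the slicing the per-class loop below performs, `tf.gather` with a one-element index list keeps the class axis while selecting class i (TF1 graph mode assumed; the shapes are illustrative):

import tensorflow.compat.v1 as tf

labels = tf.placeholder(tf.float32, shape=[None, 3])   # batch_size x n_classes
class_labels = tf.gather(labels, indices=[1], axis=1)  # shape: [batch_size, 1]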
- assert num_labels is not None, "The multi-metric dimension cannot be None." - assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric in metrics: - if isinstance(metric, tuple) and len(metric) == 3: - metric_name, metric_factory, requires_threshold = metric - metric_name = metric_name.lower() - elif isinstance(metric, str): - metric_name = metric.lower() # metric name are case insensitive. - metric_factory, requires_threshold = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + # pylint: disable=invalid-name,dict-keys-not-iterating + if metrics is None: + # remove expensive metrics by default for faster eval + metrics = list(DEFAULT_BINARY_CLASS_METRICS) + + def get_eval_metric_ops( + graph_output: Dict[str, tf.Tensor], + labels: tf.Tensor, + weights: tf.Tensor, + ) -> Dict: + """ + graph_output: + dict that is returned by build_graph given input features. + labels: + target labels associated to batch. + weights: + weights of the samples.. + """ + + eval_metric_ops = dict() + + preds = graph_output["output"] + + threshold = graph_output["threshold"] if "threshold" in graph_output else 0.5 + + hard_preds = graph_output.get("hard_output") + if hard_preds is None: + hard_preds = tf.greater_equal(preds, threshold) + + shape = labels.get_shape() + # basic sanity check: multi_metric dimension must exist + assert ( + len(shape) > class_dim + ), "Dimension specified by class_dim does not exist." + + num_labels = shape[class_dim] + # If we are doing multi-class / multi-label metric, the number of classes / labels must + # be know at graph construction time. This dimension cannot have size None. + assert num_labels is not None, "The multi-metric dimension cannot be None." + assert ( + classes is None or len(classes) == num_labels + ), "Number of classes must match the number of labels" + + weights_shape = weights.get_shape() if weights is not None else None + if weights_shape is None: + num_weights = None + elif len(weights_shape) > 1: + num_weights = weights_shape[class_dim] else: - raise ValueError("Metric should be either string or tuple of length 3.") - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. 
- continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - return eval_metric_ops - - return get_eval_metric_ops - + num_weights = 1 + + for i in range(num_labels): + # add metrics to eval_metric_ops dict + for metric in metrics: + if isinstance(metric, tuple) and len(metric) == 3: + metric_name, metric_factory, requires_threshold = metric + metric_name = metric_name.lower() + elif isinstance(metric, str): + metric_name = metric.lower() # metric name are case insensitive. + ( + metric_factory, + requires_threshold, + ) = SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) + else: + raise ValueError( + "Metric should be either string or tuple of length 3." + ) + + class_metric_name = ( + metric_name + "_" + (classes[i] if classes is not None else str(i)) + ) + + if class_metric_name in eval_metric_ops: + # avoid adding duplicate metrics. + continue + + class_labels = tf.gather(labels, indices=[i], axis=class_dim) + class_preds = tf.gather(preds, indices=[i], axis=class_dim) + class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) + + if num_weights is None: + class_weights = None + elif num_weights == num_labels: + class_weights = tf.gather(weights, indices=[i], axis=class_dim) + elif num_weights == 1: + class_weights = weights + else: + raise ValueError( + "num_weights (%d) and num_labels (%d) do not match" + % (num_weights, num_labels) + ) + + if metric_factory: + value_op, update_op = metric_factory( + labels=class_labels, + predictions=( + class_hard_preds if requires_threshold else class_preds + ), + weights=class_weights, + name=class_metric_name, + ) + eval_metric_ops[class_metric_name] = (value_op, update_op) + else: + raise ValueError("Cannot find the metric named " + metric_name) + + return eval_metric_ops + + return get_eval_metric_ops + + +def _get_uncalibrated_metric_fn( + calibrated_metric_fn: Callable, keep_weight: bool = True +) -> Callable: + """ + Returns a function having signature: + + .. code-block:: python + + def get_eval_metric_ops(graph_output, labels, weights): + ... + return eval_metric_ops + + where the returned eval_metric_ops is a dict of common evaluation metric + Ops with uncalibrated output. + + The following graph_output keys are recognized: + uncalibrated_output: + the uncalibrated raw predictions between 0 and 1. Required. + output: + the calibrated predictions between 0 and 1. + threshold: + A value between 0 and 1 used to threshold the output into a hard_output. + Defaults to 0.5 when threshold and hard_output are missing. + Either threshold or hard_output can be provided, but not both. + hard_output: + A thresholded output. Either threshold or hard_output can be provided, but not both. 
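A hedged usage sketch for the multi-class variant; the class names are hypothetical and only chosen to show how they appear in the resulting op names:

metric_fn = get_multi_binary_class_metric_fn(
    metrics=["rce", "roc_auc"],
    classes=["fav", "reply", "retweet"],
)
# Yields per-class ops such as rce_fav and roc_auc_reply, matching
# predictions of shape batch_size x 3.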
+
+    Args:
+      calibrated_metric_fn: metrics function with calibration and weight.
+      keep_weight: Bool indicating whether we keep weight.
+    """
+    metric_scope = "uncalibrated" if keep_weight else "unweighted"
-def _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=True):
-  """
-  Returns a function having signature:
+    def get_eval_metric_ops(graph_output, labels, weights):
+        """
+        graph_output:
+          dict that is returned by build_graph given input features.
+        labels:
+          target labels associated to batch.
+        weights:
+          weights of the samples.
+        """
+        with tf.variable_scope(metric_scope):
+            if "uncalibrated_output" not in graph_output:
+                raise Exception("Missing uncalibrated_output in graph_output!")
+            un_calibrated_weights = weights if keep_weight else tf.ones_like(weights)
+            uncalibrated_output = {
+                "output": graph_output["uncalibrated_output"],
+                "threshold": graph_output.get("threshold", 0.5),
+                "hard_output": graph_output.get("hard_output"),
+                **{
+                    k: v
+                    for k, v in graph_output.items()
+                    if k not in ["output", "threshold", "hard_output"]
+                },
+            }
+
+            eval_metrics_ops = calibrated_metric_fn(
+                uncalibrated_output, labels, un_calibrated_weights
+            )
+
+            renamed_metrics_ops = {
+                f"{metric_scope}_{k}": v for k, v in eval_metrics_ops.items()
+            }
+            return renamed_metrics_ops
+
+    return get_eval_metric_ops
-
-  .. code-block:: python
-
-    def get_eval_metric_ops(graph_output, labels, weights):
-      ...
-      return eval_metric_ops
-
-  where the returned eval_metric_ops is a dict of common evaluation metric
-  Ops with uncalibrated output.
-
-  The following graph_output keys are recognized:
-    uncalibrated_output:
-      the uncalibrated raw predictions between 0 and 1. Required.
-    output:
-      the calibrated predictions between 0 and 1.
-    threshold:
-      A value between 0 and 1 used to threshold the output into a hard_output.
-      Defaults to 0.5 when threshold and hard_output are missing.
-      Either threshold or hard_output can be provided, but not both.
-    hard_output:
-      A thresholded output. Either threshold or hard_output can be provided, but not both.
-
-  Args:
-    calibrated_metric_fn: metrics function with calibration and weight.
-    keep_weight: Bool indicating whether we keep weight.
-  """
-  metric_scope = 'uncalibrated' if keep_weight else 'unweighted'
-
-  def get_eval_metric_ops(graph_output, labels, weights):
+def get_multi_binary_class_uncalibrated_metric_fn(
+    metrics: List[Union[str, Tuple[str, Callable, bool]]],
+    classes: Optional[List[str]] = None,
+    class_dim: int = 1,
+    keep_weight: bool = True,
+) -> Callable:
    """
-    graph_output:
-      dict that is returned by build_graph given input features.
-    labels:
-      target labels associated to batch.
-    weights:
-      weights of the samples.
+    Returns a function having signature:
+
+    .. code-block:: python
+
+      def get_eval_metric_ops(graph_output, labels, weights):
+        ...
+        return eval_metric_ops
+
+    where the returned eval_metric_ops is a dict of common evaluation metric
+    Ops for concatenated binary classifications without calibration.
+
+    Note: 'uncalibrated_output' is a required key in graph_output.
+
+    The main use cases for this function are:
+
+    1) To calculate roc-auc for rare events.
+    Calibrated prediction scores for rare events will be concentrated near zero. As a result,
+    the roc-auc can be seriously underestimated with the current implementation in tf.metrics.auc.
+    Since roc-auc is invariant under calibration, we can directly use the uncalibrated score for roc-auc.
+    For more details, please refer to: go/roc-auc-invariance.
+
+    2) To set keep_weight=False and get unweighted and uncalibrated metrics.
+    This is useful for evaluating how well the model fits its actual training data, since
+    the model is often trained without weights.
+
+    Args:
+      metrics (list of String):
+        a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce']
+        Element in the list can be a string from supported metrics, or can be a tuple
+        with three items: metric name, metric function, bool for thresholded output.
+        These metrics are evaluated and reported to tensorboard *during the eval phases only*.
+        When metrics is None (the default), it defaults to:
+        [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc],
+      classes (list of strings):
+        In case of multiple binary class models, the names for each class or label.
+        These are used to display metrics on tensorboard.
+        If these are not specified, the index in the class or label dimension is used, and you'll
+        get metrics on tensorboard named like: accuracy_0, accuracy_1, etc.
+      class_dim (number):
+        Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes.
+      keep_weight (bool):
+        Whether to keep weights for the metric.
+    """
-    with tf.variable_scope(metric_scope):
-      if 'uncalibrated_output' not in graph_output:
-        raise Exception("Missing uncalibrated_output in graph_output!")
-      un_calibrated_weights = weights if keep_weight else tf.ones_like(weights)
-      uncalibrated_output = {
-        'output': graph_output['uncalibrated_output'],
-        'threshold': graph_output.get('threshold', 0.5),
-        'hard_output': graph_output.get('hard_output'),
-        **{k: v for k, v in graph_output.items() if k not in ['output', 'threshold', 'hard_output']}
-      }
+    calibrated_metric_fn = get_multi_binary_class_metric_fn(
+        metrics, classes=classes, class_dim=class_dim
+    )
+    return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight)
-      eval_metrics_ops = calibrated_metric_fn(uncalibrated_output, labels, un_calibrated_weights)
-
-      renamed_metrics_ops = {f'{metric_scope}_{k}': v for k, v in eval_metrics_ops.items()}
-      return renamed_metrics_ops
-
-  return get_eval_metric_ops
+def combine_metric_fns(*fn_list) -> Callable:
+    """
+    Combine multiple metric functions.
+    For example, we can combine metric functions generated by
+    get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn.
+    Args:
+      *fn_list: Multiple metric functions to be combined
-def get_multi_binary_class_uncalibrated_metric_fn(
-    metrics, classes=None, class_dim=1, keep_weight=True):
-  """
-  Returns a function having signature:
+    Returns:
+      Combined metric function.
+    """
-
-  .. code-block:: python
+    def combined_metric_ops(*args, **kwargs) -> dict:
+        eval_metric_ops = dict()
+        for fn in fn_list:
+            eval_metric_ops.update(fn(*args, **kwargs))
+        return eval_metric_ops
-
-    def get_eval_metric_ops(graph_output, labels, weights):
-      ...
-      return eval_metric_ops
-
-  where the returned eval_metric_ops is a dict of common evaluation metric
-  Ops for concatenated binary classifications without calibration.
-
-  Note: 'uncalibrated_output' is a required key in graph_output.
-
-  The main use cases for this function are:
-
-  1) To calculate roc-auc for rare events.
-  Calibrated prediction scores for rare events will be concentrated near zero. As a result,
-  the roc-auc can be seriously underestimated with the current implementation in tf.metrics.auc.
-  Since roc-auc is invariant under calibration, we can directly use the uncalibrated score for roc-auc.
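A hedged sketch combining calibrated and uncalibrated metric functions via combine_metric_fns; the class names are hypothetical:

metric_fn = combine_metric_fns(
    get_multi_binary_class_metric_fn(
        metrics=["rce", "roc_auc"], classes=["fav", "reply"]),
    get_multi_binary_class_uncalibrated_metric_fn(
        metrics=["roc_auc"], classes=["fav", "reply"], keep_weight=False),
)
# The uncalibrated, unweighted ops come back prefixed, e.g. unweighted_roc_auc_fav.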
- For more details, please refer to: go/roc-auc-invariance. - - 2) To set keep_weight=False and get unweighted and uncalibrated metrics. - This is useful to eval how the model is fitted to its actual training data, since - often time the model is trained without weight. - - Args: - metrics (list of String): - a list of metrics of interest. E.g. ['ctr', 'accuracy', 'rce'] - Element in the list can be a string from supported metrics, or can be a tuple - with three items: metric name, metric function, bool for thresholded output. - These metrics are evaluated and reported to tensorboard *during the eval phases only*. - - When metrics is None (the default), it defaults to: - [rce, nrce, arce, ctr, predicted_ctr, accuracy, precision, recall, prauc, roc_auc], - - classes (list of strings): - In case of multiple binary class models, the names for each class or label. - These are used to display metrics on tensorboard. - If these are not specified, the index in the class or label dimension is used, and you'll - get metrics on tensorboard named like: accuracy_0, accuracy_1, etc. - - class_dim (number): - Dimension of the classes in predictions. Defaults to 1, that is, batch_size x n_classes. - - keep_weight (bool): - Whether to keep weights for the metric. - """ - - calibrated_metric_fn = get_multi_binary_class_metric_fn( - metrics, classes=classes, class_dim=class_dim) - return _get_uncalibrated_metric_fn(calibrated_metric_fn, keep_weight=keep_weight) - - -def combine_metric_fns(*fn_list): - """ - Combine multiple metric functions. - For example, we can combine metrics function generated by - get_multi_binary_class_metric_fn and get_multi_binary_class_uncalibrated_metric_fn. - - Args: - *fn_list: Multiple metric functions to be combined - - Returns: - Combined metric function. - """ - def combined_metric_ops(*args, **kwargs): - eval_metric_ops = OrderedDict() - for fn in fn_list: - eval_metric_ops.update(fn(*args, **kwargs)) - return eval_metric_ops - return combined_metric_ops + return combined_metric_ops diff --git a/twml/twml/optimizers/__init__.py b/twml/twml/optimizers/__init__.py index eaa29883c..e96cadfdf 100644 --- a/twml/twml/optimizers/__init__.py +++ b/twml/twml/optimizers/__init__.py @@ -1,4 +1,2 @@ -from twitter.deepbird.compat.v1.optimizers import ( - LazyAdamOptimizer, - optimize_loss, - OPTIMIZER_SUMMARIES) # noqa: F401 +from twitter.deepbird.compat.v1.optimizers import OPTIMIZER_SUMMARIES # noqa: F401 +from twitter.deepbird.compat.v1.optimizers import LazyAdamOptimizer, optimize_loss diff --git a/twml/twml/parsers.py b/twml/twml/parsers.py index eac60083a..d0cb09011 100644 --- a/twml/twml/parsers.py +++ b/twml/twml/parsers.py @@ -1,20 +1,20 @@ -''' +""" Contains implementations of functions to parse training and evaluation data. Modelers can use the functions in this module as the the train/eval_parse_fn of the DataRecordTrainer constructor to customize how to parse their datasets. Modelers may also provide custom implementations of train/eval_parse_fn using these as reference. 
-''' +""" -from twitter.deepbird.io.legacy.parsers import ( - convert_to_supervised_input_receiver_fn, # noqa: F401 - get_continuous_parse_fn, # noqa: F401 - get_default_parse_fn, # noqa: F401 - get_features_as_tensor_dict, # noqa: F401 - get_labels_in_features_parse_fn, # noqa: F401 - get_serving_input_receiver_fn_feature_dict, # noqa: F401 - get_sparse_parse_fn, # noqa: F401 - get_sparse_serving_input_receiver_fn, # noqa: F401 - get_tensor_parse_fn, # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_continuous_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_default_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_features_as_tensor_dict # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_sparse_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import get_tensor_parse_fn # noqa: F401 +from twitter.deepbird.io.legacy.parsers import ( # noqa: F401 + convert_to_supervised_input_receiver_fn, + get_labels_in_features_parse_fn, + get_serving_input_receiver_fn_feature_dict, + get_sparse_serving_input_receiver_fn, ) diff --git a/twml/twml/readers/__init__.py b/twml/twml/readers/__init__.py index 06a6d79f5..2578ab0da 100644 --- a/twml/twml/readers/__init__.py +++ b/twml/twml/readers/__init__.py @@ -2,6 +2,7 @@ """ This module contains data readers """ from .batch_prediction_request import BatchPredictionRequest # noqa: F401 -from .data_record import DataRecord, SPARSE_DATA_RECORD_FEATURE_FIELDS # noqa: F401 +from .data_record import SPARSE_DATA_RECORD_FEATURE_FIELDS # noqa: F401 +from .data_record import DataRecord from .hashed_batch_prediction_request import HashedBatchPredictionRequest # noqa: F401 -from .hashed_data_record import HashedDataRecord # noqa: F401 \ No newline at end of file +from .hashed_data_record import HashedDataRecord # noqa: F401 diff --git a/twml/twml/readers/batch_prediction_request.py b/twml/twml/readers/batch_prediction_request.py index 512a8c514..f0c233d35 100644 --- a/twml/twml/readers/batch_prediction_request.py +++ b/twml/twml/readers/batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.readers.batch_prediction_request import ( - BatchPredictionRequest # noqa: F401 -) + BatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/readers/data_record.py b/twml/twml/readers/data_record.py index d1c377afd..d5d773aa0 100644 --- a/twml/twml/readers/data_record.py +++ b/twml/twml/readers/data_record.py @@ -3,13 +3,13 @@ This module includes facilities for manipulating data records. 
""" -from twitter.deepbird.io.legacy.readers.data_record import ( - _SPEC_TO_TF, # noqa: F401 - SPARSE_DATA_RECORD_FEATURE_FIELDS, # noqa: F401 - _FeaturesBase, # noqa: F401 - _Features, # noqa: F401 - _DiscreteFeatures, # noqa: F401 - _StringFeatures, # noqa: F401 - _BaseDataRecord, # noqa: F401 - DataRecord, # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _SPEC_TO_TF # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import DataRecord # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _BaseDataRecord # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _Features # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _FeaturesBase # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import _StringFeatures # noqa: F401 +from twitter.deepbird.io.legacy.readers.data_record import ( # noqa: F401 + SPARSE_DATA_RECORD_FEATURE_FIELDS, + _DiscreteFeatures, ) diff --git a/twml/twml/readers/hashed_batch_prediction_request.py b/twml/twml/readers/hashed_batch_prediction_request.py index 5850c4497..213dee734 100644 --- a/twml/twml/readers/hashed_batch_prediction_request.py +++ b/twml/twml/readers/hashed_batch_prediction_request.py @@ -4,5 +4,5 @@ """ from twitter.deepbird.io.legacy.readers.hashed_batch_prediction_request import ( - HashedBatchPredictionRequest # noqa: F401 -) + HashedBatchPredictionRequest, +) # noqa: F401 diff --git a/twml/twml/readers/hashed_data_record.py b/twml/twml/readers/hashed_data_record.py index 1ff9ce816..9f8c5bd8f 100644 --- a/twml/twml/readers/hashed_data_record.py +++ b/twml/twml/readers/hashed_data_record.py @@ -5,8 +5,8 @@ """ from twitter.deepbird.io.legacy.readers.hashed_data_record import ( - _HASHED_FIELDS, - _FEATURE_NAMES, - _FEATURE_TYPES, - HashedDataRecord, + _FEATURE_NAMES, + _FEATURE_TYPES, + _HASHED_FIELDS, + HashedDataRecord, ) diff --git a/twml/twml/saved_model_cli/__main__.py b/twml/twml/saved_model_cli/__main__.py index ad5326431..96d4409e0 100644 --- a/twml/twml/saved_model_cli/__main__.py +++ b/twml/twml/saved_model_cli/__main__.py @@ -5,5 +5,5 @@ from tensorflow.python.tools import saved_model_cli -if __name__ == '__main__': - sys.exit(saved_model_cli.main()) +if __name__ == "__main__": + sys.exit(saved_model_cli.main()) diff --git a/twml/twml/tensorboard/__main__.py b/twml/twml/tensorboard/__main__.py index c426060d1..75557b5f0 100644 --- a/twml/twml/tensorboard/__main__.py +++ b/twml/twml/tensorboard/__main__.py @@ -7,10 +7,9 @@ from tensorboard.main import run_main - -if __name__ == '__main__': - # Tensorboard relies on werkzeug for its HTTP server which logs at info level - # by default - logging.getLogger('werkzeug').setLevel(logging.WARNING) - sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) - sys.exit(run_main()) +if __name__ == "__main__": + # Tensorboard relies on werkzeug for its HTTP server which logs at info level + # by default + logging.getLogger("werkzeug").setLevel(logging.WARNING) + sys.argv[0] = re.sub(r"(-script\.pyw?|\.exe)?$", "", sys.argv[0]) + sys.exit(run_main()) diff --git a/twml/twml/tensorio.py b/twml/twml/tensorio.py index bc551ac56..9802d028d 100644 --- a/twml/twml/tensorio.py +++ b/twml/twml/tensorio.py @@ -4,11 +4,11 @@ # too-few-public-methods import os +from typing import List import numpy as np import yaml - """ Utility to load tensors serialized by Deepbird V1. @@ -19,143 +19,153 @@ # helper class used to assist hierarchical key access by remembering intermediate keys. 
class _KeyRecorder(object): - def __init__(self, tensorio, keys=[]): - self.tensorio = tensorio - self.keys = keys + def __init__( + self, + tensorio: "TensorIO", + keys: List[str] = [], + ): + self.tensorio = tensorio + self.keys = keys - def __getitem__(self, k): - new_keys = self.keys + [str(k)] - prefix = ".".join(new_keys) + def __getitem__(self, k: str): + new_keys = self.keys + [str(k)] + prefix = ".".join(new_keys) - key_list = self.tensorio.list_tensors() + key_list = self.tensorio.list_tensors() - # if we have a complete key, load the tensor. - if prefix in key_list: - return self.tensorio._load(prefix) + # if we have a complete key, load the tensor. + if prefix in key_list: + return self.tensorio._load(prefix) - # we don't have a complete key yet, but at least one tensor should start with this prefix. - for k_value in key_list: - if k_value.startswith(prefix): - return _KeyRecorder(self.tensorio, new_keys) + # we don't have a complete key yet, but at least one tensor should start with this prefix. + for k_value in key_list: + if k_value.startswith(prefix): + return _KeyRecorder(self.tensorio, new_keys) - # if no key starts with the prefix, this _key_recorder is not valid. - raise ValueError("Key not found: " + prefix) + # if no key starts with the prefix, this _key_recorder is not valid. + raise ValueError("Key not found: " + prefix) # convert tensorio tensor type to numpy data type. # also returns element size in bytes. -def _get_data_type(data_type): - if data_type == 'Double': - return (np.float64, 8) +def _get_data_type(data_type: str): + if data_type == "Double": + return (np.float64, 8) - if data_type == 'Float': - return (np.float32, 4) + if data_type == "Float": + return (np.float32, 4) - if data_type == 'Int': - return (np.int32, 4) + if data_type == "Int": + return (np.int32, 4) - if data_type == 'Long': - return (np.int64, 8) + if data_type == "Long": + return (np.int64, 8) - if data_type == 'Byte': - return (np.int8, 1) + if data_type == "Byte": + return (np.int8, 1) - raise ValueError('Unexpected tensorio data type: ' + data_type) + raise ValueError("Unexpected tensorio data type: " + data_type) class TensorIO(object): - """ - Construct a TensorIO class. - tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. - mmap_tensor: - By default, loaded tensors use mmap storage. - Set this to false to not use mmap. Useful when loading multiple tensors. - """ - - def __init__(self, tensorio_path, mmap_tensor=True): - self._tensorio_path = tensorio_path - self._mmap_tensor = mmap_tensor - - # Make sure we can locate spec.yaml. - yaml_file = os.path.join(tensorio_path, 'spec.yaml') - if not os.path.exists(yaml_file): - raise ValueError('Invalid tensorio path: no spec.yaml found.') - - # load spec.yaml. - with open(yaml_file, 'r') as file_open: - # Note that tensor names in the yaml are like this: \"weight\".\'1\' - # For user-friendliness, we remove the quotes. - _spec = yaml.safe_load(file_open) - self._spec = {k.replace("'", '').replace('"', ''): v for (k, v) in _spec.items()} - - def list_tensors(self): - """ - Returns a list of tensors saved in the given path. - """ - return self._spec.keys() - - def _load_tensor(self, name): """ - Load Tensor with the given name. - Raise value error if the named tensor is not found. - Returns a numpy array if the named tensor is found. + Construct a TensorIO class. + tensorio_path: a directory containing tensors serialized using tensorio. tar file not supported. 
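A hedged usage sketch of the hierarchical access that _KeyRecorder enables; the directory path is hypothetical and must contain a spec.yaml written by the Deepbird V1 tensorio serializer:

tio = TensorIO("/path/to/tensorio_dir")
print(list(tio.list_tensors()))  # e.g. ["weight.1", "bias.1"]
w = tio["weight"]["1"]   # hierarchical access resolved via _KeyRecorder
w2 = tio["weight.1"]     # equivalent direct lookup of the full key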
+ mmap_tensor: + By default, loaded tensors use mmap storage. + Set this to false to not use mmap. Useful when loading multiple tensors. """ - tensor_info = self._spec[name] - if tensor_info['type'] != 'tensor': - raise ValueError('Trying to load a tensor of unknown type: ' + tensor_info['type']) - - filename = os.path.join(self._tensorio_path, tensor_info['filename']) - (data_type, element_size) = _get_data_type(tensor_info['tensorType']) - - np_array = np.memmap( - filename, - dtype=data_type, - mode='r', - # -1 because lua offset is 1 based. - offset=(tensor_info['offset'] - 1) * element_size, - shape=tuple(tensor_info['size']), - order='C', - ) - - return np_array if self._mmap_tensor else np_array[:].copy() - - def _load_nontensor_data(self, name): - """ - Load non-tensor data with the given name. - Returns a python string. - """ - tensor_info = self._spec[name] - return tensor_info['data'] - def _load(self, name): - """ - Load data serialized under the given name, it could be a tensor or regular data. - """ - if name not in self._spec: - raise ValueError('The specified key {} is not found in {}'.format(name, self._tensorio_path)) - - data_type = self._spec[name]['type'] - if data_type == 'tensor': - return self._load_tensor(name) - else: - return self._load_nontensor_data(name) - - def load_all(self): - """ - Load all tensors stored in the tensorio directory. - Returns a dictionary from tensor name to numpy arrays. - """ - return {k: self._load(k) for k in self._spec} - - ########################################### - # The below are utilities for convenience # - ########################################### - def __getitem__(self, k): - """ - Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] - """ - if k in self._spec: - # We have a full tensor name, directly load it. - return self._load_tensor(k) - else: - return _KeyRecorder(self)[k] + def __init__(self, tensorio_path, mmap_tensor=True): + self._tensorio_path = tensorio_path + self._mmap_tensor = mmap_tensor + + # Make sure we can locate spec.yaml. + yaml_file = os.path.join(tensorio_path, "spec.yaml") + if not os.path.exists(yaml_file): + raise ValueError("Invalid tensorio path: no spec.yaml found.") + + # load spec.yaml. + with open(yaml_file, "r") as file_open: + # Note that tensor names in the yaml are like this: \"weight\".\'1\' + # For user-friendliness, we remove the quotes. + _spec = yaml.safe_load(file_open) + self._spec = { + k.replace("'", "").replace('"', ""): v for (k, v) in _spec.items() + } + + def list_tensors(self) -> List[str]: + """ + Returns a list of tensors saved in the given path. + """ + return self._spec.keys() + + def _load_tensor(self, name: str) -> np.ndarray: + """ + Load Tensor with the given name. + Raise value error if the named tensor is not found. + Returns a numpy array if the named tensor is found. + """ + tensor_info = self._spec[name] + if tensor_info["type"] != "tensor": + raise ValueError( + "Trying to load a tensor of unknown type: " + tensor_info["type"] + ) + + filename = os.path.join(self._tensorio_path, tensor_info["filename"]) + (data_type, element_size) = _get_data_type(tensor_info["tensorType"]) + + np_array = np.memmap( + filename, + dtype=data_type, + mode="r", + # -1 because lua offset is 1 based. 
+ offset=(tensor_info["offset"] - 1) * element_size, + shape=tuple(tensor_info["size"]), + order="C", + ) + + return np_array if self._mmap_tensor else np_array[:].copy() + + def _load_nontensor_data(self, name: str) -> str: + """ + Load non-tensor data with the given name. + Returns a python string. + """ + tensor_info = self._spec[name] + return tensor_info["data"] + + def _load(self, name: str) -> np.ndarray: + """ + Load data serialized under the given name, it could be a tensor or regular data. + """ + if name not in self._spec: + raise ValueError( + f"The specified key {name} is not found in {self._tensorio_path}" + ) + + data_type = self._spec[name]["type"] + if data_type == "tensor": + return self._load_tensor(name) + else: + return self._load_nontensor_data(name) + + def load_all(self): + """ + Load all tensors stored in the tensorio directory. + Returns a dictionary from tensor name to numpy arrays. + """ + return {k: self._load(k) for k in self._spec} + + ########################################### + # The below are utilities for convenience # + ########################################### + def __getitem__(self, k: str) -> np.ndarray: + """ + Shorthand for _load_tensor, but also supports hierarchical access like: tensorio['a']['b']['1'] + """ + if k in self._spec: + # We have a full tensor name, directly load it. + return self._load_tensor(k) + else: + return _KeyRecorder(self)[k] diff --git a/twml/twml/tracking/experiment_tracker.py b/twml/twml/tracking/experiment_tracker.py index 4f275ba4b..12bacd111 100644 --- a/twml/twml/tracking/experiment_tracker.py +++ b/twml/twml/tracking/experiment_tracker.py @@ -1,543 +1,644 @@ """ This module contains the experiment tracker for tracking training in ML Metastore """ -from contextlib import contextmanager -from datetime import datetime import getpass import hashlib import os import re import sys import time +from contextlib import contextmanager +from datetime import datetime +from typing import Any, Callable, Dict, List, Optional, Type, Union -from absl import logging import tensorflow.compat.v1 as tf -from twml.hooks import MetricsUpdateHook +from absl import logging +from twml.hooks import MetricsUpdateHook try: - from urllib import quote as encode_url + from urllib import quote as encode_url except ImportError: - from urllib.parse import quote as encode_url + from urllib.parse import quote as encode_url try: - # ML Metastore packages might not be available on GCP. - # If they are not found, tracking is disabled - import requests - from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient - from com.twitter.mlmetastore.modelrepo.core.path import ( - check_valid_id, get_components_from_id, generate_id) - from com.twitter.mlmetastore.modelrepo.core import ( - DeepbirdRun, Experiment, FeatureConfig, FeatureConfigFeature, Model, ProgressReport, Project, StatusUpdate) + # ML Metastore packages might not be available on GCP. + # If they are not found, tracking is disabled + import requests + from com.twitter.mlmetastore.modelrepo.client import ModelRepoClient + from com.twitter.mlmetastore.modelrepo.core import ( + DeepbirdRun, + Experiment, + FeatureConfig, + FeatureConfigFeature, + Model, + ProgressReport, + Project, + StatusUpdate, + ) + from com.twitter.mlmetastore.modelrepo.core.path import ( + check_valid_id, + generate_id, + get_components_from_id, + ) except ImportError: - ModelRepoClient = None + ModelRepoClient = None class ExperimentTracker(object): - """ - A tracker that records twml runs in ML Metastore. 
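Taken together, `list_tensors`, `_load`, and the `_KeyRecorder` chaining give `TensorIO` a small dict-like surface. A usage sketch (the directory path and tensor names are hypothetical, and the import depends on where this module is packaged):

    import numpy as np

    tio = TensorIO("/path/to/tensorio_dir")        # directory containing spec.yaml
    print(list(tio.list_tensors()))                # e.g. ["weight.1", "bias.1"]

    w = tio["weight.1"]         # full key: returns an np.memmap-backed array
    w_alt = tio["weight"]["1"]  # partial keys chain through _KeyRecorder
    assert isinstance(w, np.ndarray)

    everything = tio.load_all()  # {tensor_name: np.ndarray}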
- """ - - def __init__(self, params, run_config, save_dir): - """ - - Args: - params (python dict): - The trainer params. ExperimentTracker uses `params.experiment_tracking_path` (String) and - `params.disable_experiment_tracking`. - If `experiment_tracking_path` is set to None, the tracker tries to guess a path with - save_dir. - If `disable_experiment_tracking` is True, the tracker is disabled. - run_config (tf.estimator.RunConfig): - The run config used by the estimator. - save_dir (str): - save_dir of the trainer """ - if isinstance(params, dict): - self._params = params - else: - # preserving backward compatibility for people still using HParams - logging.warning("Please stop using HParams and use python dicts. HParams are removed in TF 2") - self._params = dict((k, v) for k, v in params.values().items() if v != 'null') - self._run_config = run_config - self._graceful_shutdown_port = self._params.get('health_port') - - self.tracking_path = self._params.get('experiment_tracking_path') - is_tracking_path_too_long = self.tracking_path is not None and len(self.tracking_path) > 256 - - if is_tracking_path_too_long: - raise ValueError("Experiment Tracking Path longer than 256 characters") - - self.disabled = ( - self._params.get('disable_experiment_tracking', False) or - not self._is_env_eligible_for_tracking() or - ModelRepoClient is None - ) - - self._is_hogwild = bool(os.environ.get('TWML_HOGWILD_PORTS')) + A tracker that records twml runs in ML Metastore. + """ + + def __init__(self, params: dict, run_config: tf.estimator.RunConfig, save_dir: str): + """ + Args: + params (python dict): + The trainer params. ExperimentTracker uses `params.experiment_tracking_path` (String) and + `params.disable_experiment_tracking`. + If `experiment_tracking_path` is set to None, the tracker tries to guess a path with + save_dir. + If `disable_experiment_tracking` is True, the tracker is disabled. + run_config (tf.estimator.RunConfig): + The run config used by the estimator. + save_dir (str): + save_dir of the trainer + """ + if isinstance(params, dict): + self._params = params + else: + # preserving backward compatibility for people still using HParams + logging.warning( + "Please stop using HParams and use python dicts. 
HParams are removed in TF 2" + ) + self._params = dict( + (k, v) for k, v in params.values().items() if v != "null" + ) + self._run_config = run_config + self._graceful_shutdown_port = self._params.get("health_port") + + self.tracking_path = self._params.get("experiment_tracking_path") + is_tracking_path_too_long = ( + self.tracking_path is not None and len(self.tracking_path) > 256 + ) - self._is_distributed = bool(os.environ.get('TF_CONFIG')) + if is_tracking_path_too_long: + raise ValueError("Experiment Tracking Path longer than 256 characters") - self._client = None if self.disabled else ModelRepoClient() + self.disabled = ( + self._params.get("disable_experiment_tracking", False) + or not self._is_env_eligible_for_tracking() + or ModelRepoClient is None + ) - run_name_from_environ = self.run_name_from_environ() - run_name_can_be_inferred = ( - self.tracking_path is not None or run_name_from_environ is not None) + self._is_hogwild = bool(os.environ.get("TWML_HOGWILD_PORTS")) + self._is_distributed = bool(os.environ.get("TF_CONFIG")) + self._client = None if self.disabled else ModelRepoClient() - # Turn the flags off as needed in hogwild / distributed - if self._is_hogwild or self._is_distributed: - self._env_eligible_for_recording_experiment = ( - self._run_config.task_type == "evaluator") - if run_name_can_be_inferred: - self._env_eligible_for_recording_export_metadata = ( - self._run_config.task_type == "chief") - else: - logging.info( - 'experiment_tracking_path is not set and can not be inferred. ' - 'Recording export metadata is disabled because the chief node and eval node ' - 'are setting different experiment tracking paths.') - self._env_eligible_for_recording_export_metadata = False - else: - # Defaults to True - self._env_eligible_for_recording_experiment = True - self._env_eligible_for_recording_export_metadata = True - - if not self.disabled: - # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name - # -> own:proj:exp:Run_Name - if self.tracking_path: - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error(f'Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}') - self.tracking_path = generate_id( - owner=self.path['owner'], - project_name=self.path['project_name'], - experiment_name=self.path['experiment_name'], - run_name=self.path['run_name'] - ) - logging.error(f'Generated sanitized experiment tracking path: {self.tracking_path}') - else: - logging.info( - 'No experiment_tracking_path set. Experiment Tracker will try to guess a path') - self.tracking_path = self.guess_path(save_dir, run_name_from_environ) - logging.info('Guessed path: %s', self.tracking_path) - - # additional check to see if generated path is valid - try: - check_valid_id(self.tracking_path) - except ValueError as err: - logging.error( - 'Could not generate valid experiment tracking path. Disabling tracking. 
' + - 'Error:\n{}'.format(err) + run_name_from_environ = self.run_name_from_environ() + run_name_can_be_inferred = ( + self.tracking_path is not None or run_name_from_environ is not None ) - self.disabled = True - self.project_id = None if self.disabled else '{}:{}'.format( - self.path['owner'], self.path['project_name']) - self.base_run_id = None if self.disabled else self.tracking_path - self._current_run_name_suffix = None + # Turn the flags off as needed in hogwild / distributed + if self._is_hogwild or self._is_distributed: + self._env_eligible_for_recording_experiment = ( + self._run_config.task_type == "evaluator" + ) + if run_name_can_be_inferred: + self._env_eligible_for_recording_export_metadata = ( + self._run_config.task_type == "chief" + ) + else: + logging.info( + "experiment_tracking_path is not set and can not be inferred. " + "Recording export metadata is disabled because the chief node and eval node " + "are setting different experiment tracking paths." + ) + self._env_eligible_for_recording_export_metadata = False + else: + # Defaults to True + self._env_eligible_for_recording_experiment = True + self._env_eligible_for_recording_export_metadata = True + + if not self.disabled: + # Sanitize passed in experiment tracking paths. e.g. own:proJ:exp:Run.Name + # -> own:proj:exp:Run_Name + if self.tracking_path: + try: + check_valid_id(self.tracking_path) + except ValueError as err: + logging.error( + f"Invalid experiment tracking path provided. Sanitizing: {self.tracking_path}\nError: {err}" + ) + self.tracking_path = generate_id( + owner=self.path["owner"], + project_name=self.path["project_name"], + experiment_name=self.path["experiment_name"], + run_name=self.path["run_name"], + ) + logging.error( + f"Generated sanitized experiment tracking path: {self.tracking_path}" + ) + else: + logging.info( + "No experiment_tracking_path set. Experiment Tracker will try to guess a path" + ) + self.tracking_path = self.guess_path(save_dir, run_name_from_environ) + logging.info("Guessed path: %s", self.tracking_path) + + # additional check to see if generated path is valid + try: + check_valid_id(self.tracking_path) + except ValueError as err: + logging.error( + "Could not generate valid experiment tracking path. Disabling tracking. " + + f"Error:\n{err}" + ) + self.disabled = True + + self.project_id = ( + None + if self.disabled + else f'{self.path["owner"]}:{self.path["project_name"]}' + ) + self.base_run_id = None if self.disabled else self.tracking_path + self._current_run_name_suffix = None + self._current_tracker_hook = None - self._current_tracker_hook = None + if self.disabled: + logging.info("Experiment Tracker is disabled") + else: + logging.info( + "Experiment Tracker initialized with base run id: %s", self.base_run_id + ) + + @contextmanager + def track_experiment( + self, + eval_hooks: List[tf.estimator.SessionRunHook], + get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec], + name: Optional[str] = None, + ) -> tf.estimator.SessionRunHook: + """ + A context manager for tracking experiment. It should wrap the training loop. + An experiment tracker eval hook is appended to eval_hooks to collect metrics. + + Args: + eval_hooks (list): + The list of eval_hooks to be used. When it's not None, and does not contain any , + MetricsUpdateHook an experiment tracker eval hook is appended to it. When it contains + any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo + tracker (`TrackRun`). 
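In miniature, the eval_hooks contract just described looks like the following sketch (schematic: the estimator, estimator spec, and input_fn wiring are assumed, not taken from this diff):

    tracker = ExperimentTracker(params, run_config, save_dir)
    eval_hooks = []  # contains no MetricsUpdateHook, so the tracker stays enabled

    with tracker.track_experiment(eval_hooks, lambda: estimator_spec) as hook:
        # hook is the MetricsUpdateHook the tracker appended to eval_hooks,
        # or None when tracking is disabled or eval_hooks was None.
        estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)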
+ get_estimator_spec_fn (func): + A function to get the current EstimatorSpec of the trainer, used by the eval hook. + name (str); + Name of this training or evaluation. Used as a suffix of the run_id. + + Returns: + The tracker's eval hook which is appended to eval_hooks. + """ + + # disable this tracker if legacy TrackRun hook is present + # TODO: remove this once we completely deprecate the old TrackRun interface + if eval_hooks is not None: + self.disabled = self.disabled or any( + isinstance(x, MetricsUpdateHook) for x in eval_hooks + ) - if self.disabled: - logging.info('Experiment Tracker is disabled') - else: - logging.info('Experiment Tracker initialized with base run id: %s', self.base_run_id) + logging.info( + "Is environment eligible for recording experiment: %s", + self._env_eligible_for_recording_experiment, + ) - @contextmanager - def track_experiment(self, eval_hooks, get_estimator_spec_fn, name=None): - """ - A context manager for tracking experiment. It should wrap the training loop. - An experiment tracker eval hook is appended to eval_hooks to collect metrics. - - Args: - eval_hooks (list): - The list of eval_hooks to be used. When it's not None, and does not contain any , - MetricsUpdateHook an experiment tracker eval hook is appended to it. When it contains - any MetricsUpdateHook, this tracker is disabled to avoid conflict with legacy Model Repo - tracker (`TrackRun`). - get_estimator_spec_fn (func): - A function to get the current EstimatorSpec of the trainer, used by the eval hook. - name (str); - Name of this training or evaluation. Used as a suffix of the run_id. - - Returns: - The tracker's eval hook which is appended to eval_hooks. - """ + if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port: + requests.post( + f"http://localhost:{self._graceful_shutdown_port}/track_training_start" + ) + + if self.disabled or eval_hooks is None: + yield None + else: + assert ( + self._current_tracker_hook is None + ), "experiment tracking has been started already" + + if name is not None: + self._current_run_name_suffix = "_" + name + + logging.info("Starting experiment tracking. Path: %s", self._current_run_id) + logging.info( + "Is environment eligible for recording export metadata: %s", + self._env_eligible_for_recording_export_metadata, + ) + logging.info( + "This run will be available at: http://go/mldash/experiments/%s", + encode_url(self.experiment_id), + ) + + try: + self._record_run() + self._add_run_status( + StatusUpdate(self._current_run_id, status="RUNNING") + ) + self._register_for_graceful_shutdown() + + self._current_tracker_hook = self.create_eval_hook( + get_estimator_spec_fn + ) + except Exception as err: + logging.error( + "Failed to record run. This experiment will not be tracked. Error: %s", + str(err), + ) + self._current_tracker_hook = None + + if self._current_tracker_hook is None: + yield None + else: + try: + eval_hooks.append(self._current_tracker_hook) + yield self._current_tracker_hook + except Exception as err: + self._add_run_status( + StatusUpdate( + self._current_run_id, status="FAILED", description=str(err) + ) + ) + self._deregister_for_graceful_shutdown() + self._current_tracker_hook = None + self._current_run_name_suffix = None + logging.error("Experiment tracking done. 
Experiment failed.")
+                    raise
+
+            try:
+                if self._current_tracker_hook.metric_values:
+                    self._record_update(self._current_tracker_hook.metric_values)
+                self._add_run_status(
+                    StatusUpdate(self._current_run_id, status="SUCCESS")
+                )
+                logging.info("Experiment tracking done. Experiment succeeded.")
+            except Exception as err:
+                logging.error(
+                    "Failed to mark run as successful. Error: %s", str(err)
+                )
+            finally:
+                self._deregister_for_graceful_shutdown()
+                self._current_tracker_hook = None
+                self._current_run_name_suffix = None
+
+    def create_eval_hook(
+        self, get_estimator_spec_fn: Callable[[], tf.estimator.EstimatorSpec]
+    ) -> tf.estimator.SessionRunHook:
+        """
+        Create an eval_hook to track eval metrics.
+
+        Args:
+            get_estimator_spec_fn (func):
+                A function that returns the current EstimatorSpec of the trainer.
+
+        Returns:
+            The tracker's eval hook.
+        """
+        return MetricsUpdateHook(
+            get_estimator_spec_fn=get_estimator_spec_fn,
+            add_metrics_fn=self._record_update,
+        )

-    # disable this tracker if legacy TrackRun hook is present
-    # TODO: remove this once we completely deprecate the old TrackRun interface
-    if eval_hooks is not None:
-      self.disabled = self.disabled or any(isinstance(x, MetricsUpdateHook) for x in eval_hooks)
-
-    logging.info('Is environment eligible for recording experiment: %s',
-                 self._env_eligible_for_recording_experiment)
-
-    if self._env_eligible_for_recording_experiment and self._graceful_shutdown_port:
-      requests.post('http://localhost:{}/track_training_start'.format(
-          self._graceful_shutdown_port
-      ))
-
-    if self.disabled or eval_hooks is None:
-      yield None
-    else:
-      assert self._current_tracker_hook is None, 'experiment tracking has been started already'
-
-      if name is not None:
-        self._current_run_name_suffix = '_' + name
-
-      logging.info('Starting experiment tracking. Path: %s', self._current_run_id)
-      logging.info('Is environment eligible for recording export metadata: %s',
-                   self._env_eligible_for_recording_export_metadata)
-      logging.info('This run will be available at: http://go/mldash/experiments/%s',
-                   encode_url(self.experiment_id))
-
-      try:
-        self._record_run()
-        self._add_run_status(StatusUpdate(self._current_run_id, status='RUNNING'))
-        self._register_for_graceful_shutdown()
-
-        self._current_tracker_hook = self.create_eval_hook(get_estimator_spec_fn)
-      except Exception as err:
-        logging.error(
-          'Failed to record run. This experiment will not be tracked. Error: %s', str(err))
-        self._current_tracker_hook = None
+    def register_model(self, export_path: str) -> None:
+        """
+        Record the exported model.

-      if self._current_tracker_hook is None:
-        yield None
-      else:
-        try:
-          eval_hooks.append(self._current_tracker_hook)
-          yield self._current_tracker_hook
-        except Exception as err:
-          self._add_run_status(
-            StatusUpdate(self._current_run_id, status='FAILED', description=str(err)))
-          self._deregister_for_graceful_shutdown()
-          self._current_tracker_hook = None
-          self._current_run_name_suffix = None
-          logging.error('Experiment tracking done. Experiment failed.')
-          raise
+        Args:
+            export_path (str):
+                The path to the exported model.
+        """
+        if self.disabled:
+            return None

        try:
-      if self._current_tracker_hook.metric_values:
-        self._record_update(self._current_tracker_hook.metric_values)
-        self._add_run_status(StatusUpdate(self._current_run_id, status='SUCCESS'))
-        logging.info('Experiment tracking done. Experiment succeeded.')
+            logging.info(
+                "Model is exported to %s.
Computing hash of the model.", export_path + ) + model_hash = self.compute_model_hash(export_path) + logging.info("Model hash: %s. Registering it in ML Metastore.", model_hash) + self._client.register_model( + Model(model_hash, self.path["owner"], self.base_run_id) + ) except Exception as err: - logging.error( - 'Failed to update mark run as successful. Error: %s', str(err)) - finally: - self._deregister_for_graceful_shutdown() - self._current_tracker_hook = None - self._current_run_name_suffix = None - - def create_eval_hook(self, get_estimator_spec_fn): - """ - Create an eval_hook to track eval metrics + logging.error("Failed to register model. Error: %s", str(err)) - Args: - get_estimator_spec_fn (func): - A function that returns the current EstimatorSpec of the trainer. - """ - return MetricsUpdateHook( - get_estimator_spec_fn=get_estimator_spec_fn, - add_metrics_fn=self._record_update) + def export_feature_spec(self, feature_spec_dict: Dict[str, Any]) -> None: + """ + Export feature spec to ML Metastore (go/ml-metastore). - def register_model(self, export_path): - """ - Record the exported model. + Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due + to the 1mb upper limit for values in manhattan, and more specific information (feature type, + feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset. - Args: - export_path (str): - The path to the exported model. - """ - if self.disabled: - return None - - try: - logging.info('Model is exported to %s. Computing hash of the model.', export_path) - model_hash = self.compute_model_hash(export_path) - logging.info('Model hash: %s. Registering it in ML Metastore.', model_hash) - self._client.register_model(Model(model_hash, self.path['owner'], self.base_run_id)) - except Exception as err: - logging.error('Failed to register model. Error: %s', str(err)) - - def export_feature_spec(self, feature_spec_dict): - """ - Export feature spec to ML Metastore (go/ml-metastore). + Args: + feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() + """ + if self.disabled or not self._env_eligible_for_recording_export_metadata: + return None - Please note that the feature list in FeatureConfig only keeps the list of feature hash ids due - to the 1mb upper limit for values in manhattan, and more specific information (feature type, - feature name) for each feature config feature is stored separately in FeatureConfigFeature dataset. 
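For reference, the dictionary consumed here has roughly the shape below (inferred from the accessors in the new body that follows; the hash ids and names are invented):

    feature_spec_dict = {
        "features": {
            12345: {"featureName": "user.account_age", "featureType": "CONTINUOUS"},
        },
        "labels": {
            67890: {"featureName": "engagement.is_clicked"},
        },
        "weight": {
            13579: {"featureName": "meta.record_weight", "featureType": "CONTINUOUS"},
        },
    }
    tracker.export_feature_spec(feature_spec_dict)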
+ try: + logging.info("Exporting feature spec to ML Metastore.") + feature_list = feature_spec_dict["features"] + label_list = feature_spec_dict["labels"] + weight_list = feature_spec_dict["weight"] + self._client.add_feature_config( + FeatureConfig( + self._current_run_id, + list(feature_list.keys()), + list(label_list.keys()), + list(weight_list.keys()), + ) + ) + + feature_config_features = [ + FeatureConfigFeature( + hash_id=_feature_hash_id, + feature_name=_feature["featureName"], + feature_type=_feature["featureType"], + ) + for _feature_hash_id, _feature in zip( + feature_list.keys(), feature_list.values() + ) + ] + self._client.add_feature_config_features( + list(feature_list.keys()), feature_config_features + ) + + feature_config_labels = [ + FeatureConfigFeature( + hash_id=_label_hash_id, feature_name=_label["featureName"] + ) + for _label_hash_id, _label in zip( + label_list.keys(), label_list.values() + ) + ] + self._client.add_feature_config_features( + list(label_list.keys()), feature_config_labels + ) + + feature_config_weights = [ + FeatureConfigFeature( + hash_id=_weight_hash_id, + feature_name=_weight["featureName"], + feature_type=_weight["featureType"], + ) + for _weight_hash_id, _weight in zip( + weight_list.keys(), weight_list.values() + ) + ] + self._client.add_feature_config_features( + list(weight_list.keys()), feature_config_weights + ) - Args: - feature_spec_dict (dict): A dictionary obtained from FeatureConfig.get_feature_spec() - """ - if self.disabled or not self._env_eligible_for_recording_export_metadata: - return None - - try: - logging.info('Exporting feature spec to ML Metastore.') - feature_list = feature_spec_dict['features'] - label_list = feature_spec_dict['labels'] - weight_list = feature_spec_dict['weight'] - self._client.add_feature_config(FeatureConfig(self._current_run_id, list(feature_list.keys()), - list(label_list.keys()), list(weight_list.keys()))) - - feature_config_features = [ - FeatureConfigFeature( - hash_id=_feature_hash_id, - feature_name=_feature['featureName'], - feature_type=_feature['featureType'] - ) - for _feature_hash_id, _feature in zip(feature_list.keys(), feature_list.values()) - ] - self._client.add_feature_config_features(list(feature_list.keys()), feature_config_features) - - feature_config_labels = [ - FeatureConfigFeature( - hash_id=_label_hash_id, - feature_name=_label['featureName'] - ) - for _label_hash_id, _label in zip(label_list.keys(), label_list.values()) - ] - self._client.add_feature_config_features(list(label_list.keys()), feature_config_labels) - - feature_config_weights = [ - FeatureConfigFeature( - hash_id=_weight_hash_id, - feature_name=_weight['featureName'], - feature_type=_weight['featureType'] + except Exception as err: + logging.error("Failed to export feature spec. 
Error: %s", str(err)) + + @property + def path(self) -> Optional[Dict[str, str]]: + if self.disabled: + return None + return get_components_from_id(self.tracking_path, ensure_valid_id=False) + + @property + def experiment_id(self) -> Optional[str]: + """Return the experiment id.""" + if self.disabled: + return None + return f"{self.path['owner']}:{self.path['project_name']}:{self.path['experiment_name']}" + + @property + def _current_run_name(self) -> str: + """Return the current run name.""" + if self._current_run_name_suffix is not None: + return self.path["run_name"] + self._current_run_name_suffix + return self.path["run_name"] + + @property + def _current_run_id(self) -> str: + """Return the current run id.""" + if self._current_run_name_suffix is not None: + return self.base_run_id + self._current_run_name_suffix + return self.base_run_id + + def get_run_status(self) -> Union[StatusUpdate, None]: + """Get the current run status.""" + if not self.disabled: + return self._client.get_latest_dbv2_status(self._current_run_id) + return None + + def _add_run_status(self, status: StatusUpdate) -> None: + """ + Add run status with underlying client. + + Args: + status (StatusUpdate): + The status update to add. + """ + if not self.disabled and self._env_eligible_for_recording_experiment: + self._client.add_run_status(status) + + def _record_run(self) -> None: + """Record the run in ML Metastore.""" + if self.disabled or not self._env_eligible_for_recording_experiment: + return None + + if not self._client.project_exists(self.project_id): + self._client.add_project( + Project(self.path["project_name"], self.path["owner"]) + ) + time.sleep(1) + + if not self._client.experiment_exists(self.experiment_id): + self._client.add_experiment( + Experiment( + self.path["experiment_name"], + self.path["owner"], + self.project_id, + "", + ) + ) + time.sleep(1) + + run = DeepbirdRun( + self.experiment_id, + self._current_run_name, + "", + {"raw_command": " ".join(sys.argv)}, + self._params, ) - for _weight_hash_id, _weight in zip(weight_list.keys(), weight_list.values()) - ] - self._client.add_feature_config_features(list(weight_list.keys()), feature_config_weights) - - except Exception as err: - logging.error('Failed to export feature spec. Error: %s', str(err)) - - @property - def path(self): - if self.disabled: - return None - return get_components_from_id(self.tracking_path, ensure_valid_id=False) - - @property - def experiment_id(self): - if self.disabled: - return None - return '%s:%s:%s' % (self.path['owner'], self.path['project_name'], - self.path['experiment_name']) - - @property - def _current_run_name(self): - """ - Return the current run name. - """ - if self._current_run_name_suffix is not None: - return self.path['run_name'] + self._current_run_name_suffix - else: - return self.path['run_name'] - - @property - def _current_run_id(self): - """ - Return the current run id. - """ - if self._current_run_name_suffix is not None: - return self.base_run_id + self._current_run_name_suffix - else: - return self.base_run_id - - def get_run_status(self) -> str: - if not self.disabled: - return self._client.get_latest_dbv2_status(self._current_run_id) - - def _add_run_status(self, status): - """ - Add run status with underlying client. - - Args: - status (StatusUpdate): - The status update to add. - """ - if not self.disabled and self._env_eligible_for_recording_experiment: - self._client.add_run_status(status) - - def _record_run(self): - """ - Record the run in ML Metastore. 
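The id composition implemented by these properties, as a worked example (values hypothetical):

    tracking_path = "jane:timelines:ranker_v2:run_1"  # owner:project:experiment:run

    # get_components_from_id(tracking_path) yields the path dict, from which:
    #   project_id      == "jane:timelines"
    #   experiment_id   == "jane:timelines:ranker_v2"
    #   base_run_id     == "jane:timelines:ranker_v2:run_1"
    # and inside track_experiment(..., name="eval"):
    #   _current_run_id == "jane:timelines:ranker_v2:run_1_eval"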
- """ - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - if not self._client.project_exists(self.project_id): - self._client.add_project(Project(self.path['project_name'], self.path['owner'])) - time.sleep(1) - - if not self._client.experiment_exists(self.experiment_id): - self._client.add_experiment(Experiment( - self.path['experiment_name'], self.path['owner'], self.project_id, '')) - time.sleep(1) - - run = DeepbirdRun(self.experiment_id, self._current_run_name, '', - {'raw_command': ' '.join(sys.argv)}, self._params) - self._client.add_deepbird_run(run, force=True) - time.sleep(1) - - def _record_update(self, metrics): - """ - Record metrics update in ML Metastore. - - Args: - metrics (dict): - The dict of the metrics and their values. - """ - - if self.disabled or not self._env_eligible_for_recording_experiment: - return None - - reported_metrics = {} - for k, v in metrics.items(): + self._client.add_deepbird_run(run, force=True) + time.sleep(1) - if hasattr(v, 'item'): - reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist()) - else: - logging.warning("Ignoring %s because the value (%s) is not valid" % (k, str(v))) + def _record_update(self, metrics: Dict[str, Any]) -> None: + """ + Record metrics update in ML Metastore. - report = ProgressReport(self._current_run_id, reported_metrics) - - try: - self._client.add_progress_report(report) - except Exception as err: - logging.error('Failed to record metrics in ML Metastore. Error: {}'.format(err)) - logging.error('Run ID: {}'.format(self._current_run_id)) - logging.error('Progress Report: {}'.format(report.to_json_string())) - - def _register_for_graceful_shutdown(self): - """ - Register the tracker with the health server, enabling graceful shutdown. + Args: + metrics (dict): + The dict of the metrics and their values. + """ - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/register_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) + if self.disabled or not self._env_eligible_for_recording_experiment: + return None - def _deregister_for_graceful_shutdown(self): - """ - Deregister the tracker with the health server, disabling graceful shutdown. + reported_metrics = {} + for k, v in metrics.items(): + if hasattr(v, "item"): + reported_metrics[k] = v.item() if v.size == 1 else str(v.tolist()) + else: + logging.warning( + "Ignoring %s because the value (%s) is not valid" % (k, str(v)) + ) - Returns: - (Response) health server response - """ - if self._graceful_shutdown_port and not self.disabled and self._env_eligible_for_recording_experiment: - return requests.post('http://localhost:{}/deregister_id/{}'.format( - self._graceful_shutdown_port, - self._current_run_id - )) + report = ProgressReport(self._current_run_id, reported_metrics) - def _is_env_eligible_for_tracking(self): - """ - Determine if experiment tracking should run in the env. - """ - is_unit_test = ( - os.environ.get('PYTEST_CURRENT_TEST') is not None and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - is_running_on_ci = ( - getpass.getuser() == 'scoot-service' and - os.environ.get('TEST_EXP_TRACKER') is None - ) - - return ( - not is_unit_test and - not is_running_on_ci - ) - - @classmethod - def run_name_from_environ(cls): - """ - Create run id from environment if possible. 
- """ - job_name = os.environ.get("TWML_JOB_NAME") - job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME") - - if not job_name or not job_launch_time: - return None - - try: - # job_launch_time should be in isoformat - # python2 doesnt support datetime.fromisoformat, so use hardcoded format string. - job_launch_time_formatted = datetime.strptime(job_launch_time, - "%Y-%m-%dT%H:%M:%S.%f") - except ValueError: - # Fallback in case aurora config is generating datetime in a different format. - job_launch_time_formatted = (job_launch_time - .replace("-", "_").replace("T", "_") - .replace(":", "_").replace(".", "_")) - - return '{}_{}'.format( - job_name, job_launch_time_formatted.strftime('%m_%d_%Y_%I_%M_%p')) - - @classmethod - def guess_path(cls, save_dir, run_name=None): - """ - Guess an experiment tracking path based on save_dir. - - Returns: - (str) guessed path - """ - if not run_name: - run_name = 'Unnamed_{}'.format(datetime.now().strftime('%m_%d_%Y_%I_%M_%p')) - - if save_dir.startswith('hdfs://'): - path_match = re.search(r'/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)', save_dir) - - if path_match: - groups = path_match.groups() - user = groups[0] - project_name = groups[1] - - return generate_id(user, 'default', project_name, run_name) - - user = getpass.getuser() - project_name = re.sub(r'^[a-z0-9\-_]', os.path.basename(save_dir), '') - if not project_name: - project_name = 'unnamed' - - return generate_id(user, 'default', project_name, run_name) - - @classmethod - def compute_model_hash(cls, export_path): - """ - Computes the hash of an exported model. This is a gfile version of - twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate - the same hash when given the same model. + try: + self._client.add_progress_report(report) + except Exception as err: + logging.error(f"Failed to record metrics in ML Metastore. Error: {err}") + logging.error(f"Run ID: {self._current_run_id}") + logging.error(f"Progress Report: {report.to_json_string()}") + + def _register_for_graceful_shutdown(self) -> Optional[requests.Response]: + """ + Register the tracker with the health server, enabling graceful shutdown. + + Returns: + (Response) health server response + """ + if ( + self._graceful_shutdown_port + and not self.disabled + and self._env_eligible_for_recording_experiment + ): + return requests.post( + f"http://localhost:{self._graceful_shutdown_port}/register_id/{self._current_run_id}" + ) + return None + + def _deregister_for_graceful_shutdown(self) -> Optional[requests.Response]: + """ + Deregister the tracker with the health server, disabling graceful shutdown. + + Returns: + (Response) health server response + """ + if ( + self._graceful_shutdown_port + and not self.disabled + and self._env_eligible_for_recording_experiment + ): + return requests.post( + f"http://localhost:{self._graceful_shutdown_port}/deregister_id/{self._current_run_id}" + ) + + def _is_env_eligible_for_tracking(self) -> bool: + """Determine if experiment tracking should run in the env.""" + is_unit_test = ( + os.environ.get("PYTEST_CURRENT_TEST") is not None + and os.environ.get("TEST_EXP_TRACKER") is None + ) - Args: - export_path (str): - The path to the exported model. 
+        is_running_on_ci = (
+            getpass.getuser() == "scoot-service"
+            and os.environ.get("TEST_EXP_TRACKER") is None
+        )

-    Returns:
-      (str) hash of the exported model
-    """
-    paths = []
-    for path, subdirs, files in tf.io.gfile.walk(export_path):
-      for name in sorted(files):
-        paths.append(os.path.join(path, name))
+        return (not is_unit_test) and (not is_running_on_ci)

-    paths.sort()
-    hash_object = hashlib.new('sha1')
+    @classmethod
+    def run_name_from_environ(cls: Type["ExperimentTracker"]) -> Optional[str]:
+        """
+        Create run id from environment if possible.
+        """
+        job_name = os.environ.get("TWML_JOB_NAME")
+        job_launch_time = os.environ.get("TWML_JOB_LAUNCH_TIME")

-    for path in paths:
-      with tf.io.gfile.GFile(path, "rb") as file:
-        hash_object.update(file.read())
+        if not job_name or not job_launch_time:
+            return None

-    return hash_object.hexdigest()
+        try:
+            # job_launch_time should be in isoformat
+            # python2 doesn't support datetime.fromisoformat, so use hardcoded format string.
+            job_launch_time_formatted = datetime.strptime(
+                job_launch_time, "%Y-%m-%dT%H:%M:%S.%f"
+            )
+        except ValueError:
+            # Fallback in case aurora config generates the datetime in a different format.
+            # The raw string cannot be strftime'd, so sanitize it and return directly.
+            sanitized_launch_time = (
+                job_launch_time.replace("-", "_")
+                .replace("T", "_")
+                .replace(":", "_")
+                .replace(".", "_")
+            )
+            return f"{job_name}_{sanitized_launch_time}"
+        return f"{job_name}_{job_launch_time_formatted.strftime('%m_%d_%Y_%I_%M_%p')}"
+
+    @classmethod
+    def guess_path(
+        cls: Type["ExperimentTracker"],
+        save_dir: str,
+        run_name: Optional[str] = None,
+    ) -> str:
+        """
+        Guess an experiment tracking path based on save_dir.
+
+        Args:
+            save_dir (str): save directory
+            run_name (str): run name
+
+        Returns:
+            (str) guessed path
+        """
+        if not run_name:
+            run_name = f'Unnamed_{datetime.now().strftime("%m_%d_%Y_%I_%M_%p")}'
+
+        if save_dir.startswith("hdfs://"):
+            path_match = re.search(r"/user/([a-z0-9\-_]+)/([a-z0-9\-_]+)", save_dir)
+
+            if path_match:
+                groups = path_match.groups()
+                user = groups[0]
+                project_name = groups[1]
+
+                return generate_id(user, "default", project_name, run_name)
+
+        user = getpass.getuser()
+        # Strip characters that are not allowed in project names. Note: the original
+        # call passed the re.sub arguments in the wrong order, which always produced
+        # an empty string and fell through to "unnamed".
+        project_name = re.sub(r"[^a-z0-9\-_]", "", os.path.basename(save_dir))
+        if not project_name:
+            project_name = "unnamed"
+
+        return generate_id(user, "default", project_name, run_name)
+
+    @classmethod
+    def compute_model_hash(cls, export_path: str) -> str:
+        """
+        Computes the hash of an exported model. This is a gfile version of
+        twitter.mlmetastore.common.versioning.compute_hash. The two functions should generate
+        the same hash when given the same model.
+
+        Args:
+            export_path (str): The path to the exported model.
+
+        Returns:
+            (str) hash of the exported model
+        """
+        paths = []
+        for path, subdirs, files in tf.io.gfile.walk(export_path):
+            for name in sorted(files):
+                paths.append(os.path.join(path, name))
+
+        paths.sort()
+        hash_object = hashlib.new("sha1")
+
+        for path in paths:
+            with tf.io.gfile.GFile(path, "rb") as file:
+                hash_object.update(file.read())
+
+        return hash_object.hexdigest()
diff --git a/twml/twml/trainers/__init__.py b/twml/twml/trainers/__init__.py
index e6664d9a6..9dbaf3cf4 100644
--- a/twml/twml/trainers/__init__.py
+++ b/twml/twml/trainers/__init__.py
@@ -6,5 +6,5 @@
`_.
""" -from .trainer import Trainer # noqa: F401 from .data_record_trainer import DataRecordTrainer # noqa: F401 +from .trainer import Trainer # noqa: F401 diff --git a/twml/twml/trainers/data_record_trainer.py b/twml/twml/trainers/data_record_trainer.py index 76dd16f80..30c3f9684 100644 --- a/twml/twml/trainers/data_record_trainer.py +++ b/twml/twml/trainers/data_record_trainer.py @@ -56,766 +56,970 @@ """ import datetime +from typing import Any, Callable, Dict, List, Optional import tensorflow.compat.v1 as tf +from absl import logging from twitter.deepbird.io.dal import dal_to_hdfs_path, is_dal_path + import twml -from twml.trainers import Trainer from twml.contrib.feature_importances.feature_importances import ( - compute_feature_importances, - TREE, - write_feature_importances_to_hdfs, - write_feature_importances_to_ml_dash) -from absl import logging + TREE, + compute_feature_importances, + write_feature_importances_to_hdfs, + write_feature_importances_to_ml_dash, +) +from twml.trainers import Trainer class DataRecordTrainer(Trainer): # pylint: disable=abstract-method - """ - The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases - at Twitter where only the build_graph methods needs to be overridden. - For this reason, ``Trainer.[train,eval]_input_fn`` methods - assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. - - For use-cases that differ from this common Twitter use-case, - further Trainer methods can be overridden. - If that still doesn't provide enough flexibility, the user can always - use the tf.estimator.Esimator or tf.session.run directly. - """ - - def __init__( - self, name, params, - build_graph_fn, - feature_config=None, - **kwargs): """ - The DataRecordTrainer constructor builds a - ``tf.estimator.Estimator`` and stores it in self.estimator. - For this reason, DataRecordTrainer accepts the same Estimator constructor arguments. - It also accepts additional arguments to facilitate metric evaluation and multi-phase training - (init_from_dir, init_map). - - Args: - parent arguments: - See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation - for a full list of arguments accepted by the parent class. - name, params, build_graph_fn (and other parent class args): - see documentation for twml.Trainer doc. - feature_config: - An object of type FeatureConfig describing what features to decode. - Defaults to None. But it is needed in the following cases: - - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` - - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. - - **kwargs: - further kwargs can be specified and passed to the Estimator constructor. + The ``DataRecordTrainer`` implementation is intended to satisfy the most common use cases + at Twitter where only the build_graph methods needs to be overridden. + For this reason, ``Trainer.[train,eval]_input_fn`` methods + assume a DataRecord dataset partitioned into part files stored in compressed (e.g. gzip) format. + + For use-cases that differ from this common Twitter use-case, + further Trainer methods can be overridden. + If that still doesn't provide enough flexibility, the user can always + use the tf.estimator.Esimator or tf.session.run directly. """ - # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL. 
- super(DataRecordTrainer, self).__init__( - name=name, params=params, build_graph_fn=build_graph_fn, **kwargs) - - self._feature_config = feature_config - - # date range parameters common to both training and evaluation data: - hour_resolution = self.params.get("hour_resolution", 1) - data_threads = self.params.get("data_threads", 4) - datetime_format = self.params.get("datetime_format", "%Y/%m/%d") - - # retrieve the desired training dataset files - self._train_files = self.build_files_list( - files_list_path=self.params.get("train_files_list", None), - data_dir=self.params.get("train_data_dir", None), - start_datetime=self.params.get("train_start_datetime", None), - end_datetime=self.params.get("train_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("train_overwrite_files_list", False), - ) - - # retrieve the desired evaluation dataset files - eval_name = self.params.get("eval_name", None) - - if eval_name == "train": - self._eval_files = self._train_files - else: - self._eval_files = self.build_files_list( - files_list_path=self.params.get("eval_files_list", None), - data_dir=self.params.get("eval_data_dir", None), - start_datetime=self.params.get("eval_start_datetime", None), - end_datetime=self.params.get("eval_end_datetime", None), - datetime_format=datetime_format, data_threads=data_threads, - hour_resolution=hour_resolution, maybe_save=self.is_chief(), - overwrite=self.params.get("eval_overwrite_files_list", False), - ) - - if not self.params.get("allow_train_eval_overlap"): - # if there is overlap between train and eval, error out! - if self._train_files and self._eval_files: - overlap_files = set(self._train_files) & set(self._eval_files) + def __init__( + self, + name: str, + params: Dict[str, Any], + build_graph_fn: Callable, + feature_config: twml.FeatureConfig = None, + **kwargs, + ): + """ + The DataRecordTrainer constructor builds a + ``tf.estimator.Estimator`` and stores it in self.estimator. + For this reason, DataRecordTrainer accepts the same Estimator constructor arguments. + It also accepts additional arguments to facilitate metric evaluation and multi-phase training + (init_from_dir, init_map). + + Args: + parent Args: + See the `Trainer constructor <#twml.trainers.Trainer.__init__>`_ documentation + for a full list of arguments accepted by the parent class. + name, params, build_graph_fn (and other parent class args): + see documentation for twml.Trainer doc. + feature_config: + An object of type FeatureConfig describing what features to decode. + Defaults to None. But it is needed in the following cases: + - `get_train_input_fn()` / `get_eval_input_fn()` is called without a `parse_fn` + - `learn()`, `train()`, `eval()`, `calibrate()` are called without providing `*input_fn`. + + **kwargs: + further kwargs can be specified and passed to the Estimator constructor. + """ + + # NOTE: DO NOT MODIFY `params` BEFORE THIS CALL. 
+ super(DataRecordTrainer, self).__init__( + name=name, params=params, build_graph_fn=build_graph_fn, **kwargs + ) + + self._feature_config = feature_config + + # date range parameters common to both training and evaluation data: + hour_resolution = self.params.get("hour_resolution", 1) + data_threads = self.params.get("data_threads", 4) + datetime_format = self.params.get("datetime_format", "%Y/%m/%d") + + # retrieve the desired training dataset files + self._train_files = self.build_files_list( + files_list_path=self.params.get("train_files_list", None), + data_dir=self.params.get("train_data_dir", None), + start_datetime=self.params.get("train_start_datetime", None), + end_datetime=self.params.get("train_end_datetime", None), + datetime_format=datetime_format, + data_threads=data_threads, + hour_resolution=hour_resolution, + maybe_save=self.is_chief(), + overwrite=self.params.get("train_overwrite_files_list", False), + ) + + # retrieve the desired evaluation dataset files + eval_name = self.params.get("eval_name", None) + + if eval_name == "train": + self._eval_files = self._train_files else: - overlap_files = set() - if overlap_files: - raise ValueError("There is an overlap between train and eval files:\n %s" % - (overlap_files)) - - @staticmethod - def build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - if files_list_path: - files_list_path = twml.util.preprocess_path(files_list_path) - - if isinstance(start_datetime, datetime.datetime): - start_datetime = start_datetime.strftime(datetime_format) - if isinstance(end_datetime, datetime.datetime): - end_datetime = end_datetime.strftime(datetime_format) - - list_files_by_datetime_args = { - "base_path": data_dir, - "start_datetime": start_datetime, - "end_datetime": end_datetime, - "datetime_prefix_format": datetime_format, - "extension": "lzo", - "parallelism": data_threads, - "hour_resolution": hour_resolution, - "sort": True, - } - - # no cache of data file paths, just get the list by scraping the directory - if not files_list_path or not tf.io.gfile.exists(files_list_path): - # twml.util.list_files_by_datetime returns None if data_dir is None. - # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # the cached data file paths file exists. - files_info = twml.util.read_file(files_list_path, decode="json") - # use the cached list if data params match current params, - # or if current params are None - # Not including None checks for datetime_format and hour_resolution, - # since those are shared between eval and training. 
- if (all(param is None for param in [data_dir, start_datetime, end_datetime]) or - (files_info["data_dir"] == data_dir and - files_info["start_datetime"] == start_datetime and - files_info["end_datetime"] == end_datetime and - files_info["datetime_format"] == datetime_format and - files_info["hour_resolution"] == hour_resolution)): - files_list = files_info["files"] - elif overwrite: - # current params are not none and don't match saved params - # `overwrite` indicates we should thus update the list - files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args) - else: - # dont update the cached list - raise ValueError("Information in files_list is inconsistent with provided args.\n" - "Did you intend to overwrite files_list using " - "--train.overwrite_files_list or --eval.overwrite_files_list?\n" - "If you instead want to use the paths in files_list, ensure that " - "data_dir, start_datetime, and end_datetime are None.") - - if maybe_save and files_list_path and (overwrite or not tf.io.gfile.exists(files_list_path)): - save_dict = {} - save_dict["files"] = files_list - save_dict["data_dir"] = data_dir - save_dict["start_datetime"] = start_datetime - save_dict["end_datetime"] = end_datetime - save_dict["datetime_format"] = datetime_format - save_dict["hour_resolution"] = hour_resolution - twml.util.write_file(files_list_path, save_dict, encode="json") - - return files_list - - @staticmethod - def build_files_list(files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite): - ''' - When specifying DAL datasets, only data_dir, start_dateime, and end_datetime - should be given with the format: - - dal://{cluster}/{role}/{dataset_name}/{env} - - ''' - if not data_dir or not is_dal_path(data_dir): - logging.warn(f"Please consider specifying a dal:// dataset rather than passing a physical hdfs path.") - return DataRecordTrainer.build_hdfs_files_list( - files_list_path, data_dir, - start_datetime, end_datetime, datetime_format, - data_threads, hour_resolution, maybe_save, overwrite) - - del datetime_format - del data_threads - del hour_resolution - del maybe_save - del overwrite - - return dal_to_hdfs_path( - path=data_dir, - start_datetime=start_datetime, - end_datetime=end_datetime, - ) - - @property - def train_files(self): - return self._train_files - - @property - def eval_files(self): - return self._eval_files - - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. - - See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_ - and `DataRecordTrainer code - <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_ - for a list and description of all cmd-line arguments. - - Args: - learning_rate_decay: - Defaults to False. When True, parses learning rate decay arguments. - - Returns: - argparse.ArgumentParser instance with some useful args already added. 
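The cached files-list JSON written above mirrors `save_dict`; a hypothetical instance of what lands at `files_list_path`:

    save_dict = {
        "files": ["hdfs://default/user/foo/bar/2019/01/15/part-00000.lzo"],  # invented
        "data_dir": "hdfs://default/user/foo/bar",
        "start_datetime": "2019/01/15",
        "end_datetime": "2019/01/15",
        "datetime_format": "%Y/%m/%d",
        "hour_resolution": 1,
    }
    # twml.util.write_file(files_list_path, save_dict, encode="json")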
- """ - parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments() - parser.add_argument( - "--train.files_list", "--train_files_list", type=str, default=None, - dest="train_files_list", - help="Path for a json file storing information on training data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "training files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if train_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--train.overwrite_files_list", "--train_overwrite_files_list", action="store_true", default=False, - dest="train_overwrite_files_list", - help="When the --train.files_list param is used, indicates whether to " - "overwrite the existing --train.files_list when there are differences " - "between the current and saved dataset args. Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--train.data_dir", "--train_data_dir", type=str, default=None, - dest="train_data_dir", - help="Path to the training data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--train.start_date", "--train_start_datetime", - type=str, default=None, - dest="train_start_datetime", - help="Starting date for training inside the train data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--train.end_date", "--train_end_datetime", type=str, default=None, - dest="train_end_datetime", - help="Ending date for training inside the train data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.files_list", "--eval_files_list", type=str, default=None, - dest="eval_files_list", - help="Path for a json file storing information on evaluation data.\n" - "Specifically, the file at files_list should contain the dataset parameters " - "for constructing the list of data files, and the list of data file paths.\n" - "If the json file does not exist, other args are used to construct the " - "evaluation files list, and that list will be saved to the indicated json file.\n" - "If the json file does exist, and current args are consistent with " - "saved args, or are all None, then the saved files list will be used.\n" - "If current args are not consistent with the saved args, then error out " - "if eval_overwrite_files_list==False, else overwrite files_list with " - "a newly constructed list.") - parser.add_argument( - "--eval.overwrite_files_list", "--eval_overwrite_files_list", action="store_true", default=False, - dest="eval_overwrite_files_list", - help="When the --eval.files_list param is used, indicates whether to " - "overwrite the existing --eval.files_list when there are differences " - "between the current and saved dataset args. 
Default (False) is to " - "error out if files_list exists and differs from current params.") - parser.add_argument( - "--eval.data_dir", "--eval_data_dir", type=str, default=None, - dest="eval_data_dir", - help="Path to the cross-validation data directory." - "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, " - "and HDFS (hdfs://default/ ) paths.") - parser.add_argument( - "--eval.start_date", "--eval_start_datetime", - type=str, default=None, - dest="eval_start_datetime", - help="Starting date for evaluating inside the eval data dir." - "The start datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--eval.end_date", "--eval_end_datetime", type=str, default=None, - dest="eval_end_datetime", - help="Ending date for evaluating inside the eval data dir." - "The end datetime is inclusive." - "e.g. 2019/01/15") - parser.add_argument( - "--datetime_format", type=str, default="%Y/%m/%d", - help="Date format for training and evaluation datasets." - "Has to be a format that is understood by python datetime." - "e.g. %%Y/%%m/%%d for 2019/01/15." - "Used only if {train/eval}.{start/end}_date are provided.") - parser.add_argument( - "--hour_resolution", type=int, default=None, - help="Specify the hourly resolution of the stored data.") - parser.add_argument( - "--data_spec", type=str, required=True, - help="Path to data specification JSON file. This file is used to decode DataRecords") - parser.add_argument( - "--train.keep_rate", "--train_keep_rate", type=float, default=None, - dest="train_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--eval.keep_rate", "--eval_keep_rate", type=float, default=None, - dest="eval_keep_rate", - help="A float value in (0.0, 1.0] that indicates to drop records according to the Bernoulli \ - distribution with p = 1 - keep_rate.") - parser.add_argument( - "--train.parts_downsampling_rate", "--train_parts_downsampling_rate", - dest="train_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--eval.parts_downsampling_rate", "--eval_parts_downsampling_rate", - dest="eval_parts_downsampling_rate", - type=float, default=None, - help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \ - files. For example, a value of 0.2 means only 20 percent of part files become part of the \ - dataset.") - parser.add_argument( - "--allow_train_eval_overlap", - dest="allow_train_eval_overlap", - action="store_true", - help="Allow overlap between train and eval datasets." - ) - parser.add_argument( - "--eval_name", type=str, default=None, - help="String denoting what we want to name the eval. If this is `train`, then we eval on \ - the training dataset." - ) - return parser - - def contrib_run_feature_importances(self, feature_importances_parse_fn=None, write_to_hdfs=True, extra_groups=None, datarecord_filter_fn=None, datarecord_filter_run_name=None): - """Compute feature importances on a trained model (this is a contrib feature) - Args: - feature_importances_parse_fn (fn): The same parse_fn that we use for training/evaluation. 
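Putting the flags above together, a hypothetical invocation (the argv values are invented; only `--data_spec` is marked required):

    argv = [
        "--data_spec", "/path/to/data_spec.json",
        "--train.data_dir", "dal://smf1-west/my-role/engagement_dataset/prod",
        "--train.start_date", "2019/01/15",
        "--train.end_date", "2019/01/20",
        "--eval.data_dir", "dal://smf1-west/my-role/engagement_dataset/prod",
        "--eval.start_date", "2019/01/21",
        "--eval.end_date", "2019/01/21",
        "--train.parts_downsampling_rate", "0.2",
        "--eval_name", "holdout",
    ]
    parser = DataRecordTrainer.add_parser_arguments()
    params = parser.parse_args(argv)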
- Defaults to feature_config.get_parse_fn() - write_to_hdfs (bool): Setting this to True writes the feature importance metrics to HDFS - extra_groups (dict>): A dictionary mapping the name of extra feature groups to the list of - the names of the features in the group - datarecord_filter_fn (function): a function takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format - and return a boolean value, to indicate if this data record should be kept in feature importance module or not. - """ - logging.info("Computing feature importance") - algorithm = self._params.feature_importance_algorithm - - kwargs = {} - if algorithm == TREE: - kwargs["split_feature_group_on_period"] = self._params.split_feature_group_on_period - kwargs["stopping_metric"] = self._params.feature_importance_metric - kwargs["sensitivity"] = self._params.feature_importance_sensitivity - kwargs["dont_build_tree"] = self._params.dont_build_tree - kwargs["extra_groups"] = extra_groups - if self._params.feature_importance_is_metric_larger_the_better: - # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC) - kwargs["is_metric_larger_the_better"] = True - elif self._params.feature_importance_is_metric_smaller_the_better: - # The user has specified that the stopping metric is one where smaller values are better (e.g. LOSS) - kwargs["is_metric_larger_the_better"] = False - else: - # The user has not specified which direction is better for the stopping metric - kwargs["is_metric_larger_the_better"] = None - logging.info("Using the tree algorithm with kwargs {}".format(kwargs)) - - feature_importances = compute_feature_importances( - trainer=self, - data_dir=self._params.get('feature_importance_data_dir'), - feature_config=self._feature_config, - algorithm=algorithm, - record_count=self._params.feature_importance_example_count, - parse_fn=feature_importances_parse_fn, - datarecord_filter_fn=datarecord_filter_fn, - **kwargs) - - if not feature_importances: - logging.info("Feature importances returned None") - else: - if write_to_hdfs: - logging.info("Writing feature importance to HDFS") - write_feature_importances_to_hdfs( - trainer=self, - feature_importances=feature_importances, - output_path=datarecord_filter_run_name, - metric=self._params.get('feature_importance_metric')) - else: - logging.info("Not writing feature importance to HDFS") - - logging.info("Writing feature importance to ML Metastore") - write_feature_importances_to_ml_dash( - trainer=self, feature_importances=feature_importances) - return feature_importances - - def export_model(self, serving_input_receiver_fn=None, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICT graph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Args: - serving_input_receiver_fn (Function): - function preparing the model for inference requests. - If not set; defaults to the the serving input receiver fn set by the FeatureConfig. - export_output_fn (Function): - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``. - export_dir: - directory to export a SavedModel for prediction servers. 
- Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory ``save_dir`` is chosen. - - Returns: - The export directory where the PREDICT graph is saved. - """ - if serving_input_receiver_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - serving_input_receiver_fn = self._feature_config.get_serving_input_receiver_fn() - - if feature_spec is None: - if self._feature_config is None: - raise ValueError("feature_spec can not be inferred." - "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method") - else: - feature_spec = self._feature_config.get_feature_spec() - - if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig): - raise ValueError("Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn") - elif not callable(serving_input_receiver_fn): - raise ValueError("Expecting Function for serving_input_receiver_fn") - - if export_output_fn is None: - export_output_fn = twml.export_output_fns.batch_prediction_continuous_output_fn - - return super(DataRecordTrainer, self).export_model( - export_dir=export_dir, - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path, - export_output_fn=export_output_fn, - feature_spec=feature_spec, - ) - - def get_train_input_fn( - self, parse_fn=None, repeat=None, shuffle=True, interleave=True, shuffle_files=None, - initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.train(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to the parser returned by the FeatureConfig selected - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`. - This ensures the training is run for atleast `params.train_steps`. - Toggling this to `False` results in training finishing when one of the following happens: - - The entire dataset has been trained upon once. - - `params.train_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `True`, files are shuffled, and records of each files are shuffled. - When `False`, files are read in alpha-numerical order. Also when `False` - the dataset is sharded among workers for Hogwild and distributed training - if no sharding configuration is provided in `params.train_dataset_shards`. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffle the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.train()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.train_steps > 0 or self.params.get('distributed', False) - - if not shuffle and self.num_workers > 1 and self.params.train_dataset_shards is None: - num_shards = self.num_workers - shard_index = self.worker_index - else: - num_shards = self.params.train_dataset_shards - shard_index = self.params.train_dataset_shard_index - - return lambda: twml.input_fns.default_input_fn( - files=self._train_files, - batch_size=self.params.train_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.train_keep_rate, - parts_downsampling_rate=self.params.train_parts_downsampling_rate, - shards=num_shards, - shard_index=shard_index, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs) - - def get_eval_input_fn( - self, parse_fn=None, repeat=None, - shuffle=True, interleave=True, - shuffle_files=None, initializable=False, log_tf_data_summaries=False, **kwargs): - """ - This method is used to create input function used by estimator.eval(). - - Args: - parse_fn: - Function to parse a data record into a set of features. - Defaults to twml.parsers.get_sparse_parse_fn(feature_config). - repeat (optional): - Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`. - This ensures the evaluation is run for atleast `params.eval_steps`. - Toggling this to `False` results in evaluation finishing when one of the following happens: - - The entire dataset has been evaled upon once. - - `params.eval_steps` has been reached. - shuffle (optional): - Specifies if the files and records in the files need to be shuffled. - When `False`, files are read in alpha-numerical order. - When `True`, files are shuffled, and records of each files are shuffled. - Defaults to `True`. - interleave (optional): - Specifies if records from multiple files need to be interleaved in parallel. - Defaults to `True`. - shuffle_files (optional): - Shuffles the list of files. Defaults to 'Shuffle' if not provided. - initializable (optional): - A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or - a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value - (false) is used for most plain iterators. - log_tf_data_summaries (optional): - A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the - tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output - events files. This requires that `initializable` is `True` above. - - Returns: - An input_fn that can be consumed by `estimator.eval()`. 
- """ - if parse_fn is None: - if self._feature_config is None: - raise ValueError("`feature_config` was not passed to `DataRecordTrainer`") - parse_fn = self._feature_config.get_parse_fn() - - if not self._eval_files: - raise ValueError("`eval_files` was not present in `params` passed to `DataRecordTrainer`") - - if not callable(parse_fn): - raise ValueError("Expecting parse_fn to be a function.") - - if log_tf_data_summaries and not initializable: - raise ValueError("Require `initializable` if `log_tf_data_summaries`.") - - if repeat is None: - repeat = self.params.eval_steps > 0 - - return lambda: twml.input_fns.default_input_fn( - files=self._eval_files, - batch_size=self.params.eval_batch_size, - parse_fn=parse_fn, - num_threads=self.params.num_threads, - repeat=repeat, - keep_rate=self.params.eval_keep_rate, - parts_downsampling_rate=self.params.eval_parts_downsampling_rate, - shuffle=shuffle, - shuffle_files=(shuffle if shuffle_files is None else shuffle_files), - interleave=interleave, - initializable=initializable, - log_tf_data_summaries=log_tf_data_summaries, - **kwargs - ) - - def _assert_train_files(self): - if not self._train_files: - raise ValueError("train.data_dir was not set in params passed to DataRecordTrainer.") - - def _assert_eval_files(self): - if not self._eval_files: - raise ValueError("eval.data_dir was not set in params passed to DataRecordTrainer.") - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Makes input functions optional. input_fn defaults to self.get_train_input_fn(). - See Trainer for more detailed documentation documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).train(input_fn=input_fn, steps=steps, hooks=hooks) - - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Makes input functions optional. input_fn defaults to self.get_eval_input_fn(). - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_eval_files() - input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False) - return super(DataRecordTrainer, self).evaluate( - input_fn=input_fn, - steps=steps, - hooks=hooks, - name=name - ) - - def learn(self, train_input_fn=None, eval_input_fn=None, **kwargs): - """ - Overrides ``Trainer.learn`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.learn`` for more detailed documentation. - """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).learn( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def train_and_evaluate(self, - train_input_fn=None, eval_input_fn=None, - **kwargs): - """ - Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. - Respectively, ``train_input_fn`` and ``eval_input_fn`` default to - ``self.train_input_fn`` and ``self.eval_input_fn``. - See ``Trainer.train_and_evaluate`` for detailed documentation. 
- """ - if train_input_fn is None: - self._assert_train_files() - if eval_input_fn is None: - self._assert_eval_files() - train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() - eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() - - super(DataRecordTrainer, self).train_and_evaluate( - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - **kwargs - ) - - def _model_fn(self, features, labels, mode, params, config=None): - """ - Overrides the _model_fn to correct for the features shape of the sparse features - extracted with the contrib.FeatureConfig - """ - if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig): - # Fix the shape of the features. The features dictionary will be modified to - # contain the shape changes. - twml.util.fix_shape_sparse(features, self._feature_config) - return super(DataRecordTrainer, self)._model_fn( - features=features, - labels=labels, - mode=mode, - params=params, - config=config - ) - - def calibrate(self, - calibrator, - input_fn=None, - steps=None, - save_calibrator=True, - hooks=None): - """ - Makes input functions optional. input_fn defaults to self.train_input_fn. - See Trainer for more detailed documentation. - """ - if input_fn is None: - self._assert_train_files() - input_fn = input_fn if input_fn else self.get_train_input_fn() - super(DataRecordTrainer, self).calibrate(calibrator=calibrator, - input_fn=input_fn, - steps=steps, - save_calibrator=save_calibrator, - hooks=hooks) - - def save_checkpoints_and_export_model(self, - serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, - checkpoint_path=None, - input_fn=None): - """ - Exports saved module after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See export_model for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.export_model(serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path) - - def save_checkpoints_and_evaluate(self, - input_fn=None, - steps=None, - hooks=None, - name=None): - """ - Evaluates model after saving checkpoint to save_dir. - Please note that to use this method, you need to assign a loss to the output - of the build_graph (for the train mode). - See evaluate for more detailed information. - """ - self.train(input_fn=input_fn, steps=1) - self.evaluate(input_fn, steps, hooks, name) + self._eval_files = self.build_files_list( + files_list_path=self.params.get("eval_files_list", None), + data_dir=self.params.get("eval_data_dir", None), + start_datetime=self.params.get("eval_start_datetime", None), + end_datetime=self.params.get("eval_end_datetime", None), + datetime_format=datetime_format, + data_threads=data_threads, + hour_resolution=hour_resolution, + maybe_save=self.is_chief(), + overwrite=self.params.get("eval_overwrite_files_list", False), + ) + + if not self.params.get("allow_train_eval_overlap"): + # if there is overlap between train and eval, error out! 
+            if self._train_files and self._eval_files:
+                overlap_files = set(self._train_files) & set(self._eval_files)
+            else:
+                overlap_files = set()
+            if overlap_files:
+                raise ValueError(
+                    "There is an overlap between train and eval files:\n %s"
+                    % overlap_files
+                )
+
+    @staticmethod
+    def build_hdfs_files_list(
+        files_list_path: Optional[str],
+        data_dir: Optional[str],
+        start_datetime: Optional[datetime.datetime],
+        end_datetime: Optional[datetime.datetime],
+        datetime_format: str,
+        data_threads: int,
+        hour_resolution: Optional[int],
+        maybe_save: bool,
+        overwrite: bool,
+    ) -> List[str]:
+        if files_list_path:
+            files_list_path = twml.util.preprocess_path(files_list_path)
+
+        if isinstance(start_datetime, datetime.datetime):
+            start_datetime = start_datetime.strftime(datetime_format)
+        if isinstance(end_datetime, datetime.datetime):
+            end_datetime = end_datetime.strftime(datetime_format)
+
+        list_files_by_datetime_args = {
+            "base_path": data_dir,
+            "start_datetime": start_datetime,
+            "end_datetime": end_datetime,
+            "datetime_prefix_format": datetime_format,
+            "extension": "lzo",
+            "parallelism": data_threads,
+            "hour_resolution": hour_resolution,
+            "sort": True,
+        }
+
+        # no cache of data file paths, just get the list by scraping the directory
+        if not files_list_path or not tf.io.gfile.exists(files_list_path):
+            # twml.util.list_files_by_datetime returns None if data_dir is None.
+            # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list
+            files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args)
+        else:
+            # a cached list of data file paths exists.
+            files_info = twml.util.read_file(files_list_path, decode="json")
+            # use the cached list if data params match current params,
+            # or if current params are None
+            # Not including None checks for datetime_format and hour_resolution,
+            # since those are shared between eval and training.
+            if all(
+                param is None for param in [data_dir, start_datetime, end_datetime]
+            ) or (
+                files_info["data_dir"] == data_dir
+                and files_info["start_datetime"] == start_datetime
+                and files_info["end_datetime"] == end_datetime
+                and files_info["datetime_format"] == datetime_format
+                and files_info["hour_resolution"] == hour_resolution
+            ):
+                files_list = files_info["files"]
+            elif overwrite:
+                # current params are not None and don't match saved params
+                # `overwrite` indicates we should thus update the list
+                files_list = twml.util.list_files_by_datetime(
+                    **list_files_by_datetime_args
+                )
+            else:
+                # don't update the cached list
+                raise ValueError(
+                    "Information in files_list is inconsistent with provided args.\n"
+                    "Did you intend to overwrite files_list using "
+                    "--train.overwrite_files_list or --eval.overwrite_files_list?\n"
+                    "If you instead want to use the paths in files_list, ensure that "
+                    "data_dir, start_datetime, and end_datetime are None."
+                )
+
+        if (
+            maybe_save
+            and files_list_path
+            and (overwrite or not tf.io.gfile.exists(files_list_path))
+        ):
+            save_dict = {}
+            save_dict["files"] = files_list
+            save_dict["data_dir"] = data_dir
+            save_dict["start_datetime"] = start_datetime
+            save_dict["end_datetime"] = end_datetime
+            save_dict["datetime_format"] = datetime_format
+            save_dict["hour_resolution"] = hour_resolution
+            twml.util.write_file(files_list_path, save_dict, encode="json")
+
+        return files_list
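+    # For reference: the files_list JSON written above (and read back on later
+    # runs) has roughly the following shape; the keys mirror save_dict, while the
+    # paths and dates here are hypothetical placeholders:
+    #
+    #   {
+    #     "files": ["hdfs://default/user/example/2019/01/15/part-00000.lzo"],
+    #     "data_dir": "hdfs://default/user/example",
+    #     "start_datetime": "2019/01/15",
+    #     "end_datetime": "2019/01/20",
+    #     "datetime_format": "%Y/%m/%d",
+    #     "hour_resolution": null
+    #   }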
+    @staticmethod
+    def build_files_list(
+        files_list_path: Optional[str],
+        data_dir: Optional[str],
+        start_datetime: Optional[datetime.datetime],
+        end_datetime: Optional[datetime.datetime],
+        datetime_format: str,
+        data_threads: int,
+        hour_resolution: Optional[int],
+        maybe_save: bool,
+        overwrite: bool,
+    ):
+        """
+        When specifying DAL datasets, only data_dir, start_datetime, and end_datetime
+        should be given, with data_dir in the format:
+
+        dal://{cluster}/{role}/{dataset_name}/{env}
+
+        """
+        if not data_dir or not is_dal_path(data_dir):
+            logging.warning(
+                "Please consider specifying a dal:// dataset rather than passing a physical HDFS path."
+            )
+            return DataRecordTrainer.build_hdfs_files_list(
+                files_list_path,
+                data_dir,
+                start_datetime,
+                end_datetime,
+                datetime_format,
+                data_threads,
+                hour_resolution,
+                maybe_save,
+                overwrite,
+            )
+
+        del datetime_format
+        del data_threads
+        del hour_resolution
+        del maybe_save
+        del overwrite
+
+        return dal_to_hdfs_path(
+            path=data_dir,
+            start_datetime=start_datetime,
+            end_datetime=end_datetime,
+        )
+
+    @property
+    def train_files(self) -> List[str]:
+        return self._train_files
+
+    @property
+    def eval_files(self) -> List[str]:
+        return self._eval_files
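+    # Illustrative sketch of how build_files_list is typically invoked; the dal://
+    # path below is a hypothetical placeholder, not a real dataset:
+    #
+    #   files = DataRecordTrainer.build_files_list(
+    #       files_list_path=None,
+    #       data_dir="dal://{cluster}/{role}/{dataset_name}/{env}",
+    #       start_datetime="2019/01/15",
+    #       end_datetime="2019/01/20",
+    #       datetime_format="%Y/%m/%d",
+    #       data_threads=4,
+    #       hour_resolution=None,
+    #       maybe_save=False,
+    #       overwrite=False,
+    #   )
+    #
+    # For dal:// paths, the datetime_format/data_threads/caching arguments are
+    # ignored and the path is resolved to HDFS paths via dal_to_hdfs_path; for
+    # plain directories the call falls through to build_hdfs_files_list.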
+    @staticmethod
+    def add_parser_arguments():
+        """
+        Add common commandline args to parse for the Trainer class.
+        Typically, the user calls this function and then parses cmd-line arguments
+        into an argparse.Namespace object which is then passed to the Trainer constructor
+        via the params argument.
+
+        See the `Trainer code <_modules/twml/trainers/trainer.html#Trainer.add_parser_arguments>`_
+        and `DataRecordTrainer code
+        <_modules/twml/trainers/trainer.html#DataRecordTrainer.add_parser_arguments>`_
+        for a list and description of all cmd-line arguments.
+
+        Args:
+          learning_rate_decay:
+            Defaults to False. When True, parses learning rate decay arguments.
+
+        Returns:
+          argparse.ArgumentParser instance with some useful args already added.
+        """
+        parser = super(DataRecordTrainer, DataRecordTrainer).add_parser_arguments()
+        parser.add_argument(
+            "--train.files_list",
+            "--train_files_list",
+            type=str,
+            default=None,
+            dest="train_files_list",
+            help="Path for a json file storing information on training data.\n"
+            "Specifically, the file at files_list should contain the dataset parameters "
+            "for constructing the list of data files, and the list of data file paths.\n"
+            "If the json file does not exist, other args are used to construct the "
+            "training files list, and that list will be saved to the indicated json file.\n"
+            "If the json file does exist, and current args are consistent with "
+            "saved args, or are all None, then the saved files list will be used.\n"
+            "If current args are not consistent with the saved args, then error out "
+            "if train_overwrite_files_list==False, else overwrite files_list with "
+            "a newly constructed list.",
+        )
+        parser.add_argument(
+            "--train.overwrite_files_list",
+            "--train_overwrite_files_list",
+            action="store_true",
+            default=False,
+            dest="train_overwrite_files_list",
+            help="When the --train.files_list param is used, indicates whether to "
+            "overwrite the existing --train.files_list when there are differences "
+            "between the current and saved dataset args. Default (False) is to "
+            "error out if files_list exists and differs from current params.",
+        )
+        parser.add_argument(
+            "--train.data_dir",
+            "--train_data_dir",
+            type=str,
+            default=None,
+            dest="train_data_dir",
+            help="Path to the training data directory. "
+            "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, "
+            "and HDFS (hdfs://default/ ) paths.",
+        )
+        parser.add_argument(
+            "--train.start_date",
+            "--train_start_datetime",
+            type=str,
+            default=None,
+            dest="train_start_datetime",
+            help="Starting date for training inside the train data dir. "
+            "The start datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--train.end_date",
+            "--train_end_datetime",
+            type=str,
+            default=None,
+            dest="train_end_datetime",
+            help="Ending date for training inside the train data dir. "
+            "The end datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--eval.files_list",
+            "--eval_files_list",
+            type=str,
+            default=None,
+            dest="eval_files_list",
+            help="Path for a json file storing information on evaluation data.\n"
+            "Specifically, the file at files_list should contain the dataset parameters "
+            "for constructing the list of data files, and the list of data file paths.\n"
+            "If the json file does not exist, other args are used to construct the "
+            "evaluation files list, and that list will be saved to the indicated json file.\n"
+            "If the json file does exist, and current args are consistent with "
+            "saved args, or are all None, then the saved files list will be used.\n"
+            "If current args are not consistent with the saved args, then error out "
+            "if eval_overwrite_files_list==False, else overwrite files_list with "
+            "a newly constructed list.",
+        )
+        parser.add_argument(
+            "--eval.overwrite_files_list",
+            "--eval_overwrite_files_list",
+            action="store_true",
+            default=False,
+            dest="eval_overwrite_files_list",
+            help="When the --eval.files_list param is used, indicates whether to "
+            "overwrite the existing --eval.files_list when there are differences "
+            "between the current and saved dataset args. Default (False) is to "
+            "error out if files_list exists and differs from current params.",
+        )
+        parser.add_argument(
+            "--eval.data_dir",
+            "--eval_data_dir",
+            type=str,
+            default=None,
+            dest="eval_data_dir",
+            help="Path to the cross-validation data directory. "
+            "Supports local, dal://{cluster}-{region}/{role}/{dataset_name}/{environment}, "
+            "and HDFS (hdfs://default/ ) paths.",
+        )
+        parser.add_argument(
+            "--eval.start_date",
+            "--eval_start_datetime",
+            type=str,
+            default=None,
+            dest="eval_start_datetime",
+            help="Starting date for evaluating inside the eval data dir. "
+            "The start datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--eval.end_date",
+            "--eval_end_datetime",
+            type=str,
+            default=None,
+            dest="eval_end_datetime",
+            help="Ending date for evaluating inside the eval data dir. "
+            "The end datetime is inclusive. "
+            "e.g. 2019/01/15",
+        )
+        parser.add_argument(
+            "--datetime_format",
+            type=str,
+            default="%Y/%m/%d",
+            help="Date format for training and evaluation datasets. "
+            "Has to be a format that is understood by python datetime. "
+            "e.g. %%Y/%%m/%%d for 2019/01/15. "
+            "Used only if {train/eval}.{start/end}_date are provided.",
+        )
+        parser.add_argument(
+            "--hour_resolution",
+            type=int,
+            default=None,
+            help="Specify the hourly resolution of the stored data.",
+        )
+        parser.add_argument(
+            "--data_spec",
+            type=str,
+            required=True,
+            help="Path to data specification JSON file. This file is used to decode DataRecords.",
+        )
+        parser.add_argument(
+            "--train.keep_rate",
+            "--train_keep_rate",
+            type=float,
+            default=None,
+            dest="train_keep_rate",
+            help="A float value in (0.0, 1.0] that indicates the rate at which records are kept; \
+            records are dropped according to the Bernoulli distribution with p = 1 - keep_rate.",
+        )
+        parser.add_argument(
+            "--eval.keep_rate",
+            "--eval_keep_rate",
+            type=float,
+            default=None,
+            dest="eval_keep_rate",
+            help="A float value in (0.0, 1.0] that indicates the rate at which records are kept; \
+            records are dropped according to the Bernoulli distribution with p = 1 - keep_rate.",
+        )
+        parser.add_argument(
+            "--train.parts_downsampling_rate",
+            "--train_parts_downsampling_rate",
+            dest="train_parts_downsampling_rate",
+            type=float,
+            default=None,
+            help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \
+            files. For example, a value of 0.2 means only 20 percent of part files become part of the \
+            dataset.",
+        )
+        parser.add_argument(
+            "--eval.parts_downsampling_rate",
+            "--eval_parts_downsampling_rate",
+            dest="eval_parts_downsampling_rate",
+            type=float,
+            default=None,
+            help="A float value in (0.0, 1.0] that indicates the factor by which to downsample part \
+            files. For example, a value of 0.2 means only 20 percent of part files become part of the \
+            dataset.",
+        )
+        parser.add_argument(
+            "--allow_train_eval_overlap",
+            dest="allow_train_eval_overlap",
+            action="store_true",
+            help="Allow overlap between train and eval datasets.",
+        )
+        parser.add_argument(
+            "--eval_name",
+            type=str,
+            default=None,
+            help="String denoting what we want to name the eval. If this is `train`, then we eval on \
+            the training dataset.",
+        )
+        return parser
+    def contrib_run_feature_importances(
+        self,
+        feature_importances_parse_fn: Optional[Callable] = None,
+        write_to_hdfs: bool = True,
+        extra_groups: Optional[Dict[str, List[str]]] = None,
+        datarecord_filter_fn: Optional[Callable] = None,
+        datarecord_filter_run_name: Optional[str] = None,
+    ):
+        """
+        Compute feature importances on a trained model (this is a contrib feature).
+
+        Args:
+          feature_importances_parse_fn (fn):
+            The same parse_fn that we use for training/evaluation.
+            Defaults to feature_config.get_parse_fn().
+          write_to_hdfs (bool):
+            Setting this to True writes the feature importance metrics to HDFS.
+          extra_groups (Dict[str, List[str]]):
+            A dictionary mapping the name of extra feature groups to the list of
+            the names of the features in the group.
+          datarecord_filter_fn (function):
+            a function that takes a single data sample in com.twitter.ml.api.ttypes.DataRecord format
+            and returns a boolean indicating whether this data record should be kept in the feature
+            importance module or not.
+        """
+        logging.info("Computing feature importance")
+        algorithm = self._params.feature_importance_algorithm
+
+        kwargs = {}
+        if algorithm == TREE:
+            kwargs[
+                "split_feature_group_on_period"
+            ] = self._params.split_feature_group_on_period
+            kwargs["stopping_metric"] = self._params.feature_importance_metric
+            kwargs["sensitivity"] = self._params.feature_importance_sensitivity
+            kwargs["dont_build_tree"] = self._params.dont_build_tree
+            kwargs["extra_groups"] = extra_groups
+            if self._params.feature_importance_is_metric_larger_the_better:
+                # The user has specified that the stopping metric is one where larger values are better (e.g. ROC_AUC)
+                kwargs["is_metric_larger_the_better"] = True
+            elif self._params.feature_importance_is_metric_smaller_the_better:
+                # The user has specified that the stopping metric is one where smaller values are better (e.g. LOSS)
+                kwargs["is_metric_larger_the_better"] = False
+            else:
+                # The user has not specified which direction is better for the stopping metric
+                kwargs["is_metric_larger_the_better"] = None
+            logging.info(f"Using the tree algorithm with kwargs {kwargs}")
+
+        feature_importances = compute_feature_importances(
+            trainer=self,
+            data_dir=self._params.get("feature_importance_data_dir"),
+            feature_config=self._feature_config,
+            algorithm=algorithm,
+            record_count=self._params.feature_importance_example_count,
+            parse_fn=feature_importances_parse_fn,
+            datarecord_filter_fn=datarecord_filter_fn,
+            **kwargs,
+        )
+
+        if not feature_importances:
+            logging.info("Feature importances returned None")
+        else:
+            if write_to_hdfs:
+                logging.info("Writing feature importance to HDFS")
+                write_feature_importances_to_hdfs(
+                    trainer=self,
+                    feature_importances=feature_importances,
+                    output_path=datarecord_filter_run_name,
+                    metric=self._params.get("feature_importance_metric"),
+                )
+            else:
+                logging.info("Not writing feature importance to HDFS")
+
+            logging.info("Writing feature importance to ML Metastore")
+            write_feature_importances_to_ml_dash(
+                trainer=self, feature_importances=feature_importances
+            )
+        return feature_importances
+    def export_model(
+        self,
+        serving_input_receiver_fn: Optional[Callable] = None,
+        export_output_fn: Optional[Callable] = None,
+        export_dir: Optional[str] = None,
+        checkpoint_path: Optional[str] = None,
+        feature_spec: Optional[Dict[str, tf.io.FixedLenFeature]] = None,
+    ) -> str:
+        """
+        Export the model for prediction. Typically, the exported model
+        will later be run in production servers. This method is called
+        by the user to export the PREDICT graph to disk.
+
+        Internally, this method calls ``tf.estimator.Estimator.export_savedmodel``.
+
+        Args:
+          serving_input_receiver_fn (Function):
+            function preparing the model for inference requests.
+            If not set, defaults to the serving input receiver fn set by the FeatureConfig.
+          export_output_fn (Function):
+            Function to export the graph_output (output of build_graph) for
+            prediction. Takes a graph_output dict as sole argument and returns
+            the export_output_fns dict.
+            Defaults to ``twml.export_output_fns.batch_prediction_continuous_output_fn``.
+          export_dir:
+            directory to export a SavedModel for prediction servers.
+            Defaults to ``[save_dir]/exported_models``.
+          checkpoint_path:
+            the checkpoint path to export. If None (the default), the most recent checkpoint
+            found within the model directory ``save_dir`` is chosen.
+
+        Returns:
+          The export directory where the PREDICT graph is saved.
+        """
+        if serving_input_receiver_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            serving_input_receiver_fn = (
+                self._feature_config.get_serving_input_receiver_fn()
+            )
+
+        if feature_spec is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "feature_spec cannot be inferred. "
+                    "Please pass feature_spec=feature_config.get_feature_spec() to the trainer.export_model method"
+                )
+            else:
+                feature_spec = self._feature_config.get_feature_spec()
+
+        if isinstance(serving_input_receiver_fn, twml.feature_config.FeatureConfig):
+            raise ValueError(
+                "Cannot pass FeatureConfig as a parameter to serving_input_receiver_fn"
+            )
+        elif not callable(serving_input_receiver_fn):
+            raise ValueError("Expecting Function for serving_input_receiver_fn")
+
+        if export_output_fn is None:
+            export_output_fn = (
+                twml.export_output_fns.batch_prediction_continuous_output_fn
+            )
+
+        return super(DataRecordTrainer, self).export_model(
+            export_dir=export_dir,
+            serving_input_receiver_fn=serving_input_receiver_fn,
+            checkpoint_path=checkpoint_path,
+            export_output_fn=export_output_fn,
+            feature_spec=feature_spec,
+        )
+    def get_train_input_fn(
+        self,
+        parse_fn: Optional[Callable] = None,
+        repeat: Optional[bool] = None,
+        shuffle: bool = True,
+        interleave: bool = True,
+        shuffle_files: Optional[bool] = None,
+        initializable: bool = False,
+        log_tf_data_summaries: bool = False,
+        **kwargs,
+    ) -> Callable:
+        """
+        This method is used to create input function used by estimator.train().
+
+        Args:
+          parse_fn:
+            Function to parse a data record into a set of features.
+            Defaults to the parser returned by the FeatureConfig selected.
+          repeat (optional):
+            Specifies if the dataset is to be repeated. Defaults to `params.train_steps > 0`.
+            This ensures the training is run for at least `params.train_steps`.
+            Toggling this to `False` results in training finishing when one of the following happens:
+              - The entire dataset has been trained upon once.
+              - `params.train_steps` has been reached.
+          shuffle (optional):
+            Specifies if the files and records in the files need to be shuffled.
+            When `True`, files are shuffled, and records of each files are shuffled.
+            When `False`, files are read in alpha-numerical order. Also when `False`
+            the dataset is sharded among workers for Hogwild and distributed training
+            if no sharding configuration is provided in `params.train_dataset_shards`.
+            Defaults to `True`.
+          interleave (optional):
+            Specifies if records from multiple files need to be interleaved in parallel.
+            Defaults to `True`.
+          shuffle_files (optional):
+            Shuffle the list of files. Defaults to the value of `shuffle` if not provided.
+          initializable (optional):
+            A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or
+            a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value
+            (false) is used for most plain iterators.
+          log_tf_data_summaries (optional):
+            A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the
+            tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output
+            events files. This requires that `initializable` is `True` above.
+
+        Returns:
+          An input_fn that can be consumed by `estimator.train()`.
+        """
+        if parse_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            parse_fn = self._feature_config.get_parse_fn()
+
+        if not callable(parse_fn):
+            raise ValueError("Expecting parse_fn to be a function.")
+
+        if log_tf_data_summaries and not initializable:
+            raise ValueError("Require `initializable` if `log_tf_data_summaries`.")
+
+        if repeat is None:
+            repeat = self.params.train_steps > 0 or self.params.get(
+                "distributed", False
+            )
+
+        if (
+            not shuffle
+            and self.num_workers > 1
+            and self.params.train_dataset_shards is None
+        ):
+            num_shards = self.num_workers
+            shard_index = self.worker_index
+        else:
+            num_shards = self.params.train_dataset_shards
+            shard_index = self.params.train_dataset_shard_index
+
+        return lambda: twml.input_fns.default_input_fn(
+            files=self._train_files,
+            batch_size=self.params.train_batch_size,
+            parse_fn=parse_fn,
+            num_threads=self.params.num_threads,
+            repeat=repeat,
+            keep_rate=self.params.train_keep_rate,
+            parts_downsampling_rate=self.params.train_parts_downsampling_rate,
+            shards=num_shards,
+            shard_index=shard_index,
+            shuffle=shuffle,
+            shuffle_files=(shuffle if shuffle_files is None else shuffle_files),
+            interleave=interleave,
+            initializable=initializable,
+            log_tf_data_summaries=log_tf_data_summaries,
+            **kwargs,
+        )
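+    # Usage sketch: with repeat=None (the default) the input_fn repeats only when
+    # params.train_steps > 0 or distributed training is enabled, so a bounded run
+    # still stops at the end of the dataset:
+    #
+    #   input_fn = trainer.get_train_input_fn(shuffle=False)  # deterministic file order
+    #   trainer.train(input_fn=input_fn, steps=1000)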
+    def get_eval_input_fn(
+        self,
+        parse_fn: Optional[Callable] = None,
+        repeat: Optional[bool] = None,
+        shuffle: bool = True,
+        interleave: bool = True,
+        shuffle_files: Optional[bool] = None,
+        initializable: bool = False,
+        log_tf_data_summaries: bool = False,
+        **kwargs,
+    ) -> Callable:
+        """
+        This method is used to create input function used by estimator.eval().
+
+        Args:
+          parse_fn:
+            Function to parse a data record into a set of features.
+            Defaults to twml.parsers.get_sparse_parse_fn(feature_config).
+          repeat (optional):
+            Specifies if the dataset is to be repeated. Defaults to `params.eval_steps > 0`.
+            This ensures the evaluation is run for at least `params.eval_steps`.
+            Toggling this to `False` results in evaluation finishing when one of the following happens:
+              - The entire dataset has been evaluated upon once.
+              - `params.eval_steps` has been reached.
+          shuffle (optional):
+            Specifies if the files and records in the files need to be shuffled.
+            When `False`, files are read in alpha-numerical order.
+            When `True`, files are shuffled, and records of each files are shuffled.
+            Defaults to `True`.
+          interleave (optional):
+            Specifies if records from multiple files need to be interleaved in parallel.
+            Defaults to `True`.
+          shuffle_files (optional):
+            Shuffles the list of files. Defaults to the value of `shuffle` if not provided.
+          initializable (optional):
+            A boolean indicator. When the parsing function depends on some resource, e.g. a HashTable or
+            a Tensor, i.e. it's an initializable iterator, set it to True. Otherwise, default value
+            (false) is used for most plain iterators.
+          log_tf_data_summaries (optional):
+            A boolean indicator denoting whether to add a `tf.data.experimental.StatsAggregator` to the
+            tf.data pipeline. This adds summaries of pipeline utilization and buffer sizes to the output
+            events files. This requires that `initializable` is `True` above.
+
+        Returns:
+          An input_fn that can be consumed by `estimator.eval()`.
+        """
+        if parse_fn is None:
+            if self._feature_config is None:
+                raise ValueError(
+                    "`feature_config` was not passed to `DataRecordTrainer`"
+                )
+            parse_fn = self._feature_config.get_parse_fn()
+
+        if not self._eval_files:
+            raise ValueError(
+                "`eval_files` was not present in `params` passed to `DataRecordTrainer`"
+            )
+
+        if not callable(parse_fn):
+            raise ValueError("Expecting parse_fn to be a function.")
+
+        if log_tf_data_summaries and not initializable:
+            raise ValueError("Require `initializable` if `log_tf_data_summaries`.")
+
+        if repeat is None:
+            repeat = self.params.eval_steps > 0
+
+        return lambda: twml.input_fns.default_input_fn(
+            files=self._eval_files,
+            batch_size=self.params.eval_batch_size,
+            parse_fn=parse_fn,
+            num_threads=self.params.num_threads,
+            repeat=repeat,
+            keep_rate=self.params.eval_keep_rate,
+            parts_downsampling_rate=self.params.eval_parts_downsampling_rate,
+            shuffle=shuffle,
+            shuffle_files=(shuffle if shuffle_files is None else shuffle_files),
+            interleave=interleave,
+            initializable=initializable,
+            log_tf_data_summaries=log_tf_data_summaries,
+            **kwargs,
+        )
+
+    def _assert_train_files(self) -> None:
+        if not self._train_files:
+            raise ValueError(
+                "train.data_dir was not set in params passed to DataRecordTrainer."
+            )
+
+    def _assert_eval_files(self) -> None:
+        if not self._eval_files:
+            raise ValueError(
+                "eval.data_dir was not set in params passed to DataRecordTrainer."
+            )
+
+    def train(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+    ) -> None:
+        """
+        Makes input functions optional. input_fn defaults to self.get_train_input_fn().
+        See Trainer for more detailed documentation.
+        """
+        if input_fn is None:
+            self._assert_train_files()
+        input_fn = input_fn if input_fn else self.get_train_input_fn()
+        super(DataRecordTrainer, self).train(
+            input_fn=input_fn, steps=steps, hooks=hooks
+        )
+
+    def evaluate(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+        name: Optional[str] = None,
+    ) -> Dict[str, float]:
+        """
+        Makes input functions optional. input_fn defaults to self.get_eval_input_fn().
+        See Trainer for more detailed documentation.
+        """
+        if input_fn is None:
+            self._assert_eval_files()
+        input_fn = input_fn if input_fn else self.get_eval_input_fn(repeat=False)
+        return super(DataRecordTrainer, self).evaluate(
+            input_fn=input_fn, steps=steps, hooks=hooks, name=name
+        )
+
+    def learn(
+        self,
+        train_input_fn: Optional[Callable] = None,
+        eval_input_fn: Optional[Callable] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Overrides ``Trainer.learn`` to make ``input_fn`` functions optional.
+        Respectively, ``train_input_fn`` and ``eval_input_fn`` default to
+        ``self.get_train_input_fn()`` and ``self.get_eval_input_fn()``.
+ See ``Trainer.learn`` for more detailed documentation. + """ + if train_input_fn is None: + self._assert_train_files() + if eval_input_fn is None: + self._assert_eval_files() + train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() + eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() + + super(DataRecordTrainer, self).learn( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, **kwargs + ) + + def train_and_evaluate( + self, + train_input_fn: Optional[Callable] = None, + eval_input_fn: Optional[Callable] = None, + **kwargs, + ) -> None: + """ + Overrides ``Trainer.train_and_evaluate`` to make ``input_fn`` functions optional. + Respectively, ``train_input_fn`` and ``eval_input_fn`` default to + ``self.train_input_fn`` and ``self.eval_input_fn``. + See ``Trainer.train_and_evaluate`` for detailed documentation. + """ + if train_input_fn is None: + self._assert_train_files() + if eval_input_fn is None: + self._assert_eval_files() + train_input_fn = train_input_fn if train_input_fn else self.get_train_input_fn() + eval_input_fn = eval_input_fn if eval_input_fn else self.get_eval_input_fn() + + super(DataRecordTrainer, self).train_and_evaluate( + train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, **kwargs + ) + + def _model_fn( + self, + features: Dict[str, tf.Tensor], + labels: tf.Tensor, + mode: tf.estimator.ModeKeys, + params: Dict[str, Any], + config: Optional[tf.estimator.RunConfig] = None, + ) -> tf.estimator.EstimatorSpec: + """ + Overrides the _model_fn to correct for the features shape of the sparse features + extracted with the contrib.FeatureConfig + """ + if isinstance(self._feature_config, twml.contrib.feature_config.FeatureConfig): + # Fix the shape of the features. The features dictionary will be modified to + # contain the shape changes. + twml.util.fix_shape_sparse(features, self._feature_config) + return super(DataRecordTrainer, self)._model_fn( + features=features, labels=labels, mode=mode, params=params, config=config + ) + + def calibrate( + self, + calibrator, + input_fn: Optional[Callable] = None, + steps: Optional[int] = None, + save_calibrator: bool = True, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + ) -> None: + """ + Makes input functions optional. input_fn defaults to self.train_input_fn. + See Trainer for more detailed documentation. + """ + if input_fn is None: + self._assert_train_files() + input_fn = input_fn if input_fn else self.get_train_input_fn() + super(DataRecordTrainer, self).calibrate( + calibrator=calibrator, + input_fn=input_fn, + steps=steps, + save_calibrator=save_calibrator, + hooks=hooks, + ) + + def save_checkpoints_and_export_model( + self, + serving_input_receiver_fn: Callable[ + [], tf.estimator.export.ServingInputReceiver + ], + export_output_fn: Optional[Callable] = None, + export_dir: Optional[str] = None, + checkpoint_path: Optional[str] = None, + input_fn: Optional[Callable] = None, + ) -> None: + """ + Exports saved module after saving checkpoint to save_dir. + Please note that to use this method, you need to assign a loss to the output + of the build_graph (for the train mode). + See export_model for more detailed information. 
+ """ + self.train(input_fn=input_fn, steps=1) + self.export_model( + serving_input_receiver_fn, export_output_fn, export_dir, checkpoint_path + ) + + def save_checkpoints_and_evaluate( + self, + input_fn: Optional[Callable] = None, + steps: Optional[int] = None, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + name: Optional[str] = None, + ) -> Dict[str, float]: + """ + Evaluates model after saving checkpoint to save_dir. + Please note that to use this method, you need to assign a loss to the output + of the build_graph (for the train mode). + See evaluate for more detailed information. + """ + self.train(input_fn=input_fn, steps=1) + return self.evaluate(input_fn, steps, hooks, name) diff --git a/twml/twml/trainers/trainer.py b/twml/twml/trainers/trainer.py index e51b4e0fd..7178b3a83 100644 --- a/twml/twml/trainers/trainer.py +++ b/twml/twml/trainers/trainer.py @@ -66,49 +66,58 @@ """ +import argparse import datetime import functools import math -from operator import itemgetter import os import pprint as pp import random -from string import Template import subprocess import sys import time +from operator import itemgetter +from string import Template from threading import Thread +from typing import Any, Callable, Dict, List, Optional, Union +from absl import logging from twitter.common.metrics import AtomicGauge from twitter.deepbird.stats_server import utils as stats_server_utils from twitter.deepbird.stats_server.stats_exporter import StatsExporter from twitter.ml.common import metrics -from twitter.ml.common.kubernetes import kubectl_delete_by_name, Resource -from twitter.ml.twml.status import get_distributed_training_job_status, TrainingJobStatus +from twitter.ml.common.kubernetes import Resource, kubectl_delete_by_name +from twitter.ml.twml.status import ( + TrainingJobStatus, + get_distributed_training_job_status, +) -from absl import logging -from twml.optimizers import LazyAdamOptimizer, optimize_loss, OPTIMIZER_SUMMARIES from twml.contrib.optimizers import DeepGradientCompressionOptimizer +from twml.optimizers import OPTIMIZER_SUMMARIES, LazyAdamOptimizer, optimize_loss from twml.tracking import ExperimentTracker -from twml.util import (delete_file_or_dir, - get_distributed_training_job_path, - sanitize_hdfs_path) +from twml.util import ( + delete_file_or_dir, + get_distributed_training_job_path, + sanitize_hdfs_path, +) + try: - from urllib import quote as encode_url + from urllib import quote as encode_url except ImportError: - from urllib.parse import quote as encode_url -import tensorflow.compat.v1 as tf + from urllib.parse import quote as encode_url + import tensorflow +import tensorflow.compat.v1 as tf import tensorflow_hub as hub - import twitter.ml.twml.kubernetes.status as k8s_status + import twml import twml.export_output_fns import twml.learning_rate_decay import twml.metrics - -_CLUSTER_TEMPLATE = Template('''{ +_CLUSTER_TEMPLATE = Template( + """{ "cluster": { "ps": [$PS], "chief": [$CHIEF], @@ -116,1662 +125,1872 @@ }, "task": {"type": "$TYPE", "index": $INDEX} } -''') +""" +) -def init_from_checkpoint(init_dir, init_map): - """ - Wrapper around tf.train.init_from_checkpoint - """ - if init_dir: - init_dir = sanitize_hdfs_path(init_dir) - tf.train.init_from_checkpoint(init_dir, init_map) +def init_from_checkpoint(init_dir: str, init_map: Dict[str, str] = None) -> None: + """ + Wrapper around tf.train.init_from_checkpoint + """ + if init_dir: + init_dir = sanitize_hdfs_path(init_dir) + tf.train.init_from_checkpoint(init_dir, init_map) class 
Trainer(object): - """ - This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier. - Supports multi-phase training (for example, use a Trainer for MDL calibration, then - another for training the rest of the model, then another for isotonic calibration). - The Trainer also implements a training and evaluation loop via the ``learn()`` method. - Each Trainer is associated to a fixed set of hyper parameters (params), and a single model - specified by ``build_graph``. Given these constraints, a single Trainer can be called - multiple times for training and evaluation over multiple epochs. - - However, if you intend to try different sets of hyper-parameters, we recommend you instantiate - a different Trainer for each such experiment. That way, each experiment can be tracked - in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain - checkpoints of the model (its graph, and variables), and the history of metrics (for example, - evaluation accuracy at each epoch), and other store observations like the average time per step. - The latter metrics can be viewed by pointing - TensorBoard to the save_dir and accessing TensorBoard via your browser. - """ - - def __init__(self, name, params, build_graph_fn, - metric_fn=None, - optimize_loss_fn=None, - run_config=None, - save_dir=None, - init_from_dir=None, - init_map=None, - warm_start_from=None, - profiler_steps=None, - **kwargs): + """ + This class wraps ``tf.estimator.Estimator`` to make construction, saving, and loading easier. + Supports multi-phase training (for example, use a Trainer for MDL calibration, then + another for training the rest of the model, then another for isotonic calibration). + The Trainer also implements a training and evaluation loop via the ``learn()`` method. + Each Trainer is associated to a fixed set of hyper parameters (params), and a single model + specified by ``build_graph``. Given these constraints, a single Trainer can be called + multiple times for training and evaluation over multiple epochs. + + However, if you intend to try different sets of hyper-parameters, we recommend you instantiate + a different Trainer for each such experiment. That way, each experiment can be tracked + in a different ``save_dir``. Indeed, after calling ``learn``, a Trainer's save_dir will contain + checkpoints of the model (its graph, and variables), and the history of metrics (for example, + evaluation accuracy at each epoch), and other store observations like the average time per step. + The latter metrics can be viewed by pointing + TensorBoard to the save_dir and accessing TensorBoard via your browser. """ - Args: - name (String): - string name of this estimator; used as scope names for variables and tensors. - params (HParams, Namespace, or Dict): - hyper-parameters to be passed to Estimator constructor. - Must include params.train_batch_size and params.eval_batch_size. - Note that params is passed to twml.util.convert_to_hparams() to produce an HParams. - build_graph_fn: - A function for building tensorflow graphs. - This matches TensorFlow Estimator's model_fn signature. - For example, - - .. 
code-block:: python + def __init__( + self, + name: str, + params: Union[Dict, tf.contrib.training.HParams, argparse.Namespace], + build_graph_fn: Callable, + metric_fn: Callable[[tf.Tensor, tf.Tensor], tf.Tensor] = None, + optimize_loss_fn: Callable = optimize_loss, + run_config: tf.estimator.RunConfig = None, + save_dir: str = None, + init_from_dir: str = None, + init_map: Dict[str, str] = None, + warm_start_from_dir: str = None, + profiler_steps: int = 0, + **kwargs, + ): + """ - def build_graph(features, label, mode, params, config=None): - # Implements a simple binary logistic regression model - sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) + Args: + name (String): + string name of this estimator; used as scope names for variables and tensors. + params (HParams, Namespace, or Dict): + hyper-parameters to be passed to Estimator constructor. + Must include params.train_batch_size and params.eval_batch_size. + Note that params is passed to twml.util.convert_to_hparams() to produce an HParams. + build_graph_fn: + A function for building tensorflow graphs. + This matches TensorFlow Estimator's model_fn signature. + For example, + + .. code-block:: python + + def build_graph(features, label, mode, params, config=None): + # Implements a simple binary logistic regression model + sparse_tf = twml.util.convert_to_sparse(features, params.input_size_bits) + logits = twml.layers.full_sparse(sparse_tf, 1 << params.input_size_bits, 1) + if mode == 'infer': + loss = None + else: + loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) + loss = twml.util.weighted_average(loss, features['weights']) + output = tf.nn.sigmoid(logits) + return {'output': output, 'loss': loss} + + features (dict of Tensor keyed by a string name): + input tensors. + mode (tf.estimator.ModeKeys / String): + one of 'train', 'eval', 'infer'. + label (Tensor): + if in ``mode == 'train'`` mode, these contain the corresponding labels for input. + params (HParams): + hyper parameters that control how to build a graph. + config: + the RunConfig object passed to Estimator constructor. + + This function is expected to return a dictionary containing the following keys: + + * 'output': a node representing model output; required. + * 'loss': (required) a loss node used for optimization; required for training and evaluation. + * 'train_op': (optional) an operation that minimizes the loss (as output by `tf.train.Optimizer.minimize`). + If train_op is specified, train_op is used for optimization as opposed to loss. Loss is always logged to tensorboard. + + Notes: + + * any tf.summary written inside build graph are logged to tensorboard during training. + * the ``build_graph_fn`` is called once or twice per epoch (once per training, + once per evaluation). All data loading (and preprocessing) logic not required + for serving should be in the ``input_fn`` passed to ``learn``, ``train``, + ``evaluate``, etc. + + optimize_loss_fn: + Defaults to Trainer.get_train_op. A function that takes params and loss as arguments + and returns a training op. The training op is used to update parameters (that is, to learn). + metric_fn: + A function that returns the eval_metric_ops dict given graph_output, labels and weights. + Defaults to None. + Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn`` + which implements many binary classification metrics. + run_config (RunConfig): + optional configuration to be passed to Estimator constructor. Defaults to None. 
+          save_dir (String):
+            optional directory where to save model checkpoints,
+            tensorboard event files and trained parameters.
+            Overwrites and defaults to run_config.model_dir.
+          init_from_dir (String):
+            optional directory to load weights from.
+            if set to None (the default), do not init from any directory.
+          init_map (map from String to String):
+            Must be specified if init_from_dir is specified.
+            Defines which scopes and variables to load.
+            Keys are the variables and scopes to load from the directory.
+            Values are the destinations (in the current graph) to load into.
+            See tf.init_from_checkpoint for more information.
+            Note that the trainer prepends name_scope of the form `name`/model/ to the name_scope
+            of any variable defined inside `build_graph_fn` and this should be taken into account when
+            defining the values.
+          warm_start_from:
+            Optional string filepath to a checkpoint to warm-start from,
+            or a tf.estimator.WarmStartSettings object to fully configure warm-starting.
+            If the string filepath is provided instead of a WarmStartSettings,
+            then all variables are warm-started, and it is assumed that
+            vocabularies and Tensor names are unchanged.
+          profiler_steps (Integer):
+            Defaults to None. If set, defines the number of steps in the
+            ``tf.train.ProfilerHook``.
+            Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds.
+            When executing ``learn``, ``train`` or ``predict`` methods,
+            with ``profiler_steps`` set to a number,
+            a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data
+            stored in Chrome trace format. To view stored data, use the Chrome browser to follow
+            these steps:
+
+            1) Go to the page chrome://tracing.
+            2) In the upper left corner, you will find the Load button.
+            3) Press it and load the JSON file, which can be found in the ``save_dir``
+
+            *Warning*: This could create too many of these JSON files, which can be a potential problem,
+            e.g. for HDFS there is normally a quota on file count, so use with caution.
+
+            Note: this argument is ignored when a non-None ``hooks`` argument is passed to
+            ``train``, ``learn``, or ``predict`` methods. The hook can be added manually by passing
+            ``trainer.train(..., hooks=myhooks.extend(trainer.get_train_hooks()))``, for example.
+        """
+
+        if tensorflow.__version__ >= "2.0":
+            raise RuntimeError("Trainer not yet supported for Tensorflow >= 2.0")
+
+        self._name = name
+        self._build_graph_fn = build_graph_fn
+        self._metric_fn = metric_fn
+        self._tensorboard_handle = None
+        self._current_estimator_spec = None  # holds the current estimator spec
+        self._profiler_steps = profiler_steps
+        self._export_output_fn = None
+        self._is_early_stopping = False
+
+        # NOTE: Sanitize all HDFS paths first.
+        save_dir = sanitize_hdfs_path(save_dir)
+        init_from_dir = sanitize_hdfs_path(init_from_dir)
+
+        # warm_start_from can be of type tf.estimator.WarmStartSettings.
+        if isinstance(warm_start_from, str):
+            warm_start_from = sanitize_hdfs_path(warm_start_from)
+
+        # convert to twitter.deepbird.hparam.hparam.HParams object
+        params = twml.util.convert_to_hparams(params)
+
+        # keep a copy of the params because calling self._estimator.params creates a deepcopy
+        self._params = params
+        self.check_params()
+
+        self._using_hogwild = True if os.environ.get("TWML_HOGWILD_PORTS") else False
+        # configure Hogwild (needs to be called before RunConfig is created)
+        self._hogwild_setup()
+
+        if not run_config:
+            session_config = tf.ConfigProto()
+            # By default each process tries to allocate (almost) all of the memory.
+            # This option ensures the gpu memory grows dynamically instead.
+            session_config.gpu_options.allow_growth = True  # pylint: disable=no-member
+
+            if "TWML_NUM_CPUS" in os.environ:
+                num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8"))
+                if params.num_mkl_threads > 1:
+                    os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads)
+                    os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads)
+                    session_config.inter_op_parallelism_threads = (
+                        num_available_cpus // params.num_mkl_threads
+                    )
+                    session_config.intra_op_parallelism_threads = params.num_mkl_threads
+
+            run_config = tf.estimator.RunConfig(
+                session_config=session_config,
+                keep_checkpoint_max=self._params.get("keep_checkpoint_max", 20),
+                log_step_count_steps=10000,
+                save_checkpoints_secs=self._params.get("save_checkpoints_secs", 600),
+                tf_random_seed=self._tf_random_seed(),
+            )
+        elif not isinstance(run_config, tf.estimator.RunConfig):
+            raise ValueError(
+                "Expecting run_config argument of type None or tf.estimator.RunConfig. "
+                "Got %s instead." % type(run_config).__name__
+            )
+        elif os.environ.get("TWML_HOGWILD_PORTS"):
+            raise ValueError("Custom RunConfig not supported with Hogwild")
+
+        if run_config.model_dir is None and save_dir is None:
+            raise ValueError(
+                "Expecting either save_dir or run_config.model_dir to be specified. Got None for each."
+            )
+        elif run_config.model_dir is None:
+            run_config = run_config.replace(model_dir=save_dir)
+        elif save_dir is None:
+            save_dir = run_config.model_dir
+
+        self._save_dir = save_dir
+        self.experiment_tracker = ExperimentTracker(
+            self._params, run_config, self._save_dir
+        )
-
+        # Check if we should delete the tsd running this training job. In certain use cases where
+        # there are other tf operations following trainer.train_and_evaluate (or trainer.learn),
+        # additional state files need to be specified to ensure those steps are executed after job restart.
+        kwargs["gke_state_files"] = kwargs.get("gke_state_files", ["_SUCCESS"])
+        self._maybe_del_tsd_exit(kwargs["gke_state_files"])
+        logging.info(
+            "Checkpoint and event files will be saved at save_dir=%s", save_dir
+        )
+        self._optimize_loss_fn = (
+            self.get_train_op if optimize_loss_fn is None else optimize_loss_fn
+        )
-    if mode == 'infer':
-      loss = None
+        # overwrite the current save_dir
+        if self._params.get("overwrite_save_dir") and tf.io.gfile.exists(
+            self._save_dir
+        ):
+            logging.info(
+                "Trainer overwriting existing save directory: %s (params.overwrite_save_dir)"
+                % self._save_dir
+            )
+            # if distributed or hogwild:
+            if self._params.get("distributed", False):
+                # sleep for 30 seconds to allow each worker to get to this point.
+ time.sleep(30) + if run_config.is_chief: + logging.info("Chief deleting the save_dir now") + delete_file_or_dir(self._save_dir) + # sleep for 30 seconds to allow each worker to get to this point. + time.sleep(30) else: - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logits) - loss = twml.util.weighted_average(loss, features['weights']) + delete_file_or_dir(self._save_dir) + + # Exposing stats to a /vars.json endpoint that will be collected + # by the absorber + if self._params.get("stats_port"): + try: + stats_server_utils.start_stats_server( + self._params.get("stats_port"), self._save_dir + ) + except Exception as err: + logging.error("Failed to start the stats server. Error: %s", str(err)) + + checkpoint = os.path.join(self._save_dir, "checkpoint") + if tf.io.gfile.exists(checkpoint): + logging.info( + "The provided save_dir directory %s already exists." + " Training will be resumed." % checkpoint + ) + + self._maybe_restore_checkpoint = lambda: init_from_checkpoint( + init_from_dir, init_map + ) - output = tf.nn.sigmoid(logits) + if init_from_dir is not None and init_map is None: + raise ValueError("Need to provide init_map when init_from_dir is provided.") - return {'output': output, 'loss': loss} + if not tf.io.gfile.exists(self._save_dir): + # so tensorboard can point to a directory that exists + tf.io.gfile.mkdir(self._save_dir) - Args: - features (dict of Tensor keyed by a string name): - input tensors. - mode (tf.estimator.ModeKeys / String): - one of 'train', 'eval', 'infer'. - label (Tensor): - if in ``mode == 'train'`` mode, these contain the corresponding labels for input. - params (HParams): - hyper parameters that control how to build a graph. - config: - the RunConfig object passed to Estimator constructor. - - This function is expected to return a dictionary containing the following keys: - - * 'output': a node representing model output; required. - * 'loss': (required) a loss node used for optimization; required for training and - evaluation. - * 'train_op': (optional) an operation that minimizes the loss (as output by - `tf.train.Optimizer.minimize`). If train_op is specified, train_op is used - for optimization as opposed to loss. Loss is always logged to tensorboard. - - Notes: - - * any tf.summary written inside build graph are logged to tensorboard during training. - * the ``build_graph_fn`` is called once or twice per epoch (once per training, - once per evaluation). All data loading (and preprocessing) logic not required - for serving should be in the ``input_fn`` passed to ``learn``, ``train``, - ``evalulate``, etc. - - optimize_loss_fn: - Defaults to Trainer.get_train_op. A function that takes params and loss as arguments - and returns a training op. The training op is used to update parameters (that is, to learn). - metric_fn: - A function that returns the eval_metric_ops dict given graph_output, labels and weights. - Defaults to None. - Use ``twml.metrics.get_binary_class_metric_fn()`` to return a ``metric_fn`` - which implements many binary classification metrics. - run_config (RunConfig): - optional configuration to be passed to Estimator constructor. Defaults to None. - save_dir (String): - optional directory where to save model checkpoints, - tensorboard event files and trained parameters. - Overwrites and defaults to run_config.model_dir. - init_from_dir (String): - optional directory to load weights from. - if set to None (the default), do not init from any directory. 
- init_map (map from String to String): - Must be specified if init_from_dir is specified. - Defines which scopes and variables to load. - Keys are the variables and scopes to load from the directory. - Values are the destinations (in the current graph) to load into. - See tf.init_from_checkpoint for more information. - Note that the the trainer prepends name_scope of the form `name`/model/ to the name_scope - of any variable defined inside `build_graph_fn` and this should be taken into account when - defining the values. - warm_start_from: - Optional string filepath to a checkpoint to warm-start from, - or a tf.estimator.WarmStartSettings object to fully configure warm-starting. - If the string filepath is provided instead of a WarmStartSettings, - then all variables are warm-started, and it is assumed that - vocabularies and Tensor names are unchanged. - profiler_steps (Integer): - Defaults to None. If set defines the number of steps in the - `tf.train.ProfileHook `_. - Captures CPU/GPU profiling information every ``profiler_steps`` steps or seconds. - When executing ``learn``, ``train`` or ``predict`` methods, - with ``profiler_steps`` set to a number, - a ``timeline_X.json`` file is created in the save_dir. This file contains profiling data - storedin Chrome trace format. To view stored data, use the Chrome browser to follow - these steps: - - 1) Go to the page chrome://tracing. - 2) In the upper left corner, you will find Load button. - 3) Press it and load our JSON file, which can be found in the ``save_dir`` - - *Warning*: This could create too many these json files which can be a potential problem, - e.g. for HDFS there is normally quota forfile count, so use with caution. - - Note: this argument is ignored when a non-None ``hooks`` argument is pasesd to - ``train``, ``learn``, or ``predict`` methods. The hook can be added manually by passing - ``trainer.train(..., hooks=myhooks.extend(trainer.get_train_hooks()))``, for example. - """ - - if tensorflow.__version__ >= "2.0": - RuntimeError("Trainer not yet supported for Tensorflow >= 2.0") - - self._name = name - self._build_graph_fn = build_graph_fn - self._metric_fn = metric_fn - self._tensorboard_handle = None - self._current_estimator_spec = None # holds the current estimator spec - self._profiler_steps = profiler_steps - self._export_output_fn = None - self._is_early_stopping = False - - # NOTE: Sanitize all HDFS paths first. - save_dir = sanitize_hdfs_path(save_dir) - init_from_dir = sanitize_hdfs_path(init_from_dir) - - # warm_start_from can be of type tf.estimator.WarmStartSettings. - if isinstance(warm_start_from, str): - warm_start_from = sanitize_hdfs_path(warm_start_from) - - # convert to twitter.deepbird.hparam.hparam.HParams object - params = twml.util.convert_to_hparams(params) - - # keep a copy of the params because calling self._estimator.params creates a deepcopy - self._params = params - self.check_params() - - self._using_hogwild = True if os.environ.get('TWML_HOGWILD_PORTS') else False - # configure Hogwild (needs to be called before RunConfig is created) - self._hogwild_setup() - - if not run_config: - session_config = tf.ConfigProto() - # By default each process tries to allocate (almost) all of the memory. - # This option ensures the gpu memory grows dynamically instead. 
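# When run_config is None, the constructor above builds a RunConfig equivalent to
# the sketch below (the numeric values are the defaults visible in this diff).
# Note that, per the checks above, a custom RunConfig is rejected when Hogwild
# ports are set.
import tensorflow as tf

session_config = tf.ConfigProto()
session_config.gpu_options.allow_growth = True  # grow GPU memory on demand

run_config = tf.estimator.RunConfig(
    session_config=session_config,
    keep_checkpoint_max=20,
    log_step_count_steps=10000,
    save_checkpoints_secs=600,
)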
- session_config.gpu_options.allow_growth = True # pylint: disable=no-member - - if 'TWML_NUM_CPUS' in os.environ: - num_available_cpus = int(os.environ.get("TWML_MESOS_CPU", "8")) - if params.num_mkl_threads > 1: - os.environ["OMP_NUM_THREADS"] = str(params.num_mkl_threads) - os.environ["MKL_NUM_THREADS"] = str(params.num_mkl_threads) - session_config.inter_op_parallelism_threads = num_available_cpus // params.num_mkl_threads - session_config.intra_op_parallelism_threads = params.num_mkl_threads - - run_config = tf.estimator.RunConfig( - session_config=session_config, - keep_checkpoint_max=self._params.get('keep_checkpoint_max', 20), - log_step_count_steps=10000, - save_checkpoints_secs=self._params.get('save_checkpoints_secs', 600), - tf_random_seed=self._tf_random_seed()) - elif not isinstance(run_config, tf.estimator.RunConfig): - raise ValueError("Expecting run_config argument of type None or tf.estimator.RunConfig" - "Got %s instead." % type(run_config).__name__) - elif os.environ.get('TWML_HOGWILD_PORTS'): - raise ValueError("Custom RunConfig not supported with Hogwild") - - if run_config.model_dir is None and save_dir is None: - raise ValueError( - "Expecting either save_dir or run_config.model_dir to be specified. Got None for each.") - elif run_config.model_dir is None: - run_config = run_config.replace(model_dir=save_dir) - elif save_dir is None: - save_dir = run_config.model_dir - - self._save_dir = save_dir - self.experiment_tracker = ExperimentTracker(self._params, run_config, self._save_dir) - - # Check if should delete the tsd running this training job. In certain use case when - # there are other tf operations following trainer.train_and_evaluate (or trainer.learn), - # additional state files need to be specified to ensure those steps are executed after job restart. - kwargs['gke_state_files'] = kwargs.get('gke_state_files', ['_SUCCESS']) - self._maybe_del_tsd_exit(kwargs['gke_state_files']) - logging.info("Checkpoint and event files will be saved at save_dir=%s", save_dir) - self._optimize_loss_fn = self.get_train_op if optimize_loss_fn is None else optimize_loss_fn - - # overwrite the current save_dir - if self._params.get('overwrite_save_dir') and tf.io.gfile.exists(self._save_dir): - logging.info("Trainer overwriting existing save directory: %s (params.overwrite_save_dir)" - % self._save_dir) - # if distributed or hogwild: - if self._params.get('distributed', False): - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - if run_config.is_chief: - logging.info("Chief deleting the save_dir now") - delete_file_or_dir(self._save_dir) - # sleep for 30 seconds to allow each worker to get to this point. - time.sleep(30) - else: - delete_file_or_dir(self._save_dir) - - # Exposing stats to a /vars.json endpoint that will be collected - # by the absorber - if self._params.get('stats_port'): - try: - stats_server_utils.start_stats_server(self._params.get('stats_port'), self._save_dir) - except Exception as err: - logging.error('Failed to start the stats server. Error: %s', str(err)) - - checkpoint = os.path.join(self._save_dir, 'checkpoint') - if tf.io.gfile.exists(checkpoint): - logging.info("The provided save_dir directory %s already exists." - " Training will be resumed." 
- % checkpoint) - - self._maybe_restore_checkpoint = lambda: init_from_checkpoint(init_from_dir, init_map) - - if init_from_dir is not None and init_map is None: - raise ValueError("Need to provide init_map when init_from_dir is provided.") - - if not tf.io.gfile.exists(self._save_dir): - # so tensorboard can point to a directory that exists - tf.io.gfile.mkdir(self._save_dir) - - self._estimator = tf.estimator.Estimator( - model_fn=self._model_fn, - params=self._params, # HParams - config=run_config, # RunConfig - warm_start_from=warm_start_from, - model_dir=self._save_dir, # By this point it is same as run_config.model_dir - ) - - # Log parameters that are used to construct trainer. This allows people to see default values. - logging.info("Trainer constructed using the following parameters: ") - pp_params = pp.pformat(self._params.values()) - logging.info(pp_params) - - # Start TensorBoard - if self._params.get('disable_tensorboard', False): - logging.info("Skipping launching TensorBoard [--disable_tensorboard is set]") - elif "tensorboard_port" in self._params.values() and self._params.tensorboard_port is not None: - self.start_tensorboard(self._params.tensorboard_port) - - # Export gauge that will track whether a model was exported - self.stats_exporter = StatsExporter("twml.trainer") - self.export_gauge = AtomicGauge('export_model') - self.stats_exporter.register_metrics(self.export_gauge) - - def _hogwild_setup(self): - """ - Setup the parameters required for hogwild. - """ - self._num_workers = self._params.get('num_workers') or 1 - logging.info("NUM_WORKERS: %d", self._num_workers) - if self._num_workers <= 1: - self._ports = None - return - - # a hogwild job is considered distributed - if 'distributed' in self._params: - self._params.set_hparam('distributed', True) - else: - self._params.add_hparam('distributed', True) - - ports = os.environ.get('TWML_HOGWILD_PORTS') - if ports: - self._ports = [int(port) for port in ports.strip().split(",")] - if (self._num_workers + 1!= len(self._ports)): - raise ValueError("Number of (workers + PS) and ports need to match") - else: - if self._num_workers > 1: - raise ValueError("TWML_HOGWILD_PORTS needs to be set to use hogwild training") - - # Split the number of data threads across multiple workers - num_threads = self._params.get('num_threads') - num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers)) - self._params.set_hparam('num_threads', num_threads_per_worker) - - hogwild_task_type = os.environ.get('TWML_HOGWILD_TASK_TYPE') - hogwild_task_id = int(os.environ.get('TWML_HOGWILD_TASK_ID')) - os.environ['TF_CONFIG'] = self._get_cluster_config(hogwild_task_type, hogwild_task_id) - - def _tf_random_seed(self): - """ Returns user set seed and deal with Hogwild multiple seeds """ - tf_random_seed = self._params.get('tf_random_seed', None) - if tf_random_seed is None: - return None - elif self.using_hogwild and os.environ.get('TWML_HOGWILD_TASK_TYPE') == 'worker': - # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)... 
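# The Hogwild seed rule in the comment above amounts to an offset of 1 + task_id
# for worker tasks; a worked sketch (values hypothetical):
def derived_seed(tf_random_seed, task_type, task_id):
    # chief keeps the base seed; worker_i gets base + 1 + i
    if task_type == "worker":
        return tf_random_seed + 1 + task_id
    return tf_random_seed

assert derived_seed(42, "chief", 0) == 42
assert derived_seed(42, "worker", 1) == 44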
- return tf_random_seed + 1 + int(os.environ.get('TWML_HOGWILD_TASK_ID')) - else: - return tf_random_seed - - def check_params(self): - """ Verify that params has the correct key,values """ - param_values = self._params.values() - - if 'train_batch_size' in param_values: - if not isinstance(self._params.train_batch_size, int): - raise ValueError("Expecting params.train_batch_size to be an integer.") - if self._params.train_batch_size <= 0: - raise ValueError("train_batch_size needs to be positive") - else: - raise ValueError("train_batch_size needs to be present in params") - - if 'eval_batch_size' in param_values: - if not isinstance(self._params.eval_batch_size, int): - raise ValueError("Expecting params.eval_batch_size to be an integer.") - if self._params.eval_batch_size <= 0: - raise ValueError("eval_batch_size needs to be positive.") - else: - self._params.add_hparam('eval_batch_size', self._params.train_batch_size) - - if (self._params.get('distributed_training_cleanup') and - not self._params.get('distributed')): - # we only need to support training discontinuation for distributed training - # bc we are still using TSDs on GKE for distributed training - raise ValueError( - "Expecting params.distributed to be set if " - "params.distributed_training_cleanup is set." - ) - - def _get_cluster_config(self, name, index): - """Create a tensorflow cluster config from ports, name and index""" - host = '"localhost:%d"' - ps = host % self._ports[0] - chief = host % self._ports[1] - workers = ", ".join([host % port for port in self._ports[2:]]) - config = _CLUSTER_TEMPLATE.substitute( - PS=ps, - CHIEF=chief, - WORKER=workers, - TYPE=name, - INDEX=index, - ) - return config - - @property - def current_estimator_spec(self): - """ - returns the current estimator (warning: often reset) - """ - return self._current_estimator_spec - - @property - def estimator(self): - """ returns estimator encapsulated by Trainer """ - return self._estimator + self._estimator = tf.estimator.Estimator( + model_fn=self._model_fn, + params=self._params, # HParams + config=run_config, # RunConfig + warm_start_from=warm_start_from, + model_dir=self._save_dir, # By this point it is same as run_config.model_dir + ) - @property - def num_workers(self): - """ returns number of workers """ - return self._estimator.config.num_worker_replicas + # Log parameters that are used to construct trainer. This allows people to see default values. + logging.info("Trainer constructed using the following parameters: ") + pp_params = pp.pformat(self._params.values()) + logging.info(pp_params) + + # Start TensorBoard + if self._params.get("disable_tensorboard", False): + logging.info( + "Skipping launching TensorBoard [--disable_tensorboard is set]" + ) + elif ( + "tensorboard_port" in self._params.values() + and self._params.tensorboard_port is not None + ): + self.start_tensorboard(self._params.tensorboard_port) + + # Export gauge that will track whether a model was exported + self.stats_exporter = StatsExporter("twml.trainer") + self.export_gauge = AtomicGauge("export_model") + self.stats_exporter.register_metrics(self.export_gauge) + + def _hogwild_setup(self) -> None: + """ + Setup the parameters required for hogwild. 
+ """ + self._num_workers = self._params.get("num_workers") or 1 + logging.info("NUM_WORKERS: %d", self._num_workers) + if self._num_workers <= 1: + self._ports = None + return + + # a hogwild job is considered distributed + if "distributed" in self._params: + self._params.set_hparam("distributed", True) + else: + self._params.add_hparam("distributed", True) + + ports = os.environ.get("TWML_HOGWILD_PORTS") + if ports: + self._ports = [int(port) for port in ports.strip().split(",")] + if self._num_workers + 1 != len(self._ports): + raise ValueError("Number of (workers + PS) and ports need to match") + else: + if self._num_workers > 1: + raise ValueError( + "TWML_HOGWILD_PORTS needs to be set to use hogwild training" + ) + + # Split the number of data threads across multiple workers + num_threads = self._params.get("num_threads") + num_threads_per_worker = int(math.ceil(float(num_threads) / self._num_workers)) + self._params.set_hparam("num_threads", num_threads_per_worker) + + hogwild_task_type = os.environ.get("TWML_HOGWILD_TASK_TYPE") + hogwild_task_id = int(os.environ.get("TWML_HOGWILD_TASK_ID")) + os.environ["TF_CONFIG"] = self._get_cluster_config( + hogwild_task_type, hogwild_task_id + ) - @property - def worker_index(self): - """ - returns index of worker in the cluster - chief has index 0 - non-chief workers have indices 1 through (num_workers - 1) - """ - return self._estimator.config.global_id_in_cluster - - @property - def using_hogwild(self): - """ returns a bool indicating whether hogwild is being used """ - return self._using_hogwild - - def set_estimator(self, estimator): - """ sets the estimator used internally by Trainer """ - if not isinstance(estimator, tf.estimator.Estimator): - raise ValueError("Expecting tf.estimator.Estimator") - self._estimator = estimator - self._params = self.estimator.params - - @property - def params(self): - """ - returns the hyper-parameters passed to the constructor. - """ - return self._params + def _tf_random_seed(self) -> int: + """Returns user set seed and deal with Hogwild multiple seeds""" + tf_random_seed = self._params.get("tf_random_seed", None) + if tf_random_seed is None: + return None + elif ( + self.using_hogwild and os.environ.get("TWML_HOGWILD_TASK_TYPE") == "worker" + ): + # chief (tf_random_seed), worker_0 (tf_random_seed + 1), worker_1 (tf_random_seed + 2)... 
+            return tf_random_seed + 1 + int(os.environ.get("TWML_HOGWILD_TASK_ID"))
+        else:
+            return tf_random_seed
+
+    def check_params(self) -> None:
+        """Verify that params has the correct keys and values."""
+        param_values = self._params.values()
+
+        if "train_batch_size" in param_values:
+            if not isinstance(self._params.train_batch_size, int):
+                raise ValueError("Expecting params.train_batch_size to be an integer.")
+            if self._params.train_batch_size <= 0:
+                raise ValueError("train_batch_size needs to be positive")
+        else:
+            raise ValueError("train_batch_size needs to be present in params")
+
+        if "eval_batch_size" in param_values:
+            if not isinstance(self._params.eval_batch_size, int):
+                raise ValueError("Expecting params.eval_batch_size to be an integer.")
+            if self._params.eval_batch_size <= 0:
+                raise ValueError("eval_batch_size needs to be positive.")
+        else:
+            self._params.add_hparam("eval_batch_size", self._params.train_batch_size)
+
+        if self._params.get("distributed_training_cleanup") and not self._params.get(
+            "distributed"
+        ):
+            # we only need to support training discontinuation for distributed training
+            # bc we are still using TSDs on GKE for distributed training
+            raise ValueError(
+                "Expecting params.distributed to be set if "
+                "params.distributed_training_cleanup is set."
+            )
+
+    def _get_cluster_config(self, name, index) -> str:
+        """Create a tensorflow cluster config from ports, name and index"""
+        host = '"localhost:%d"'
+        ps = host % self._ports[0]
+        chief = host % self._ports[1]
+        workers = ", ".join([host % port for port in self._ports[2:]])
+        config = _CLUSTER_TEMPLATE.substitute(
+            PS=ps,
+            CHIEF=chief,
+            WORKER=workers,
+            TYPE=name,
+            INDEX=index,
+        )
+        return config
+
+    @property
+    def current_estimator_spec(self) -> tf.estimator.EstimatorSpec:
+        """returns the current estimator spec (warning: often reset)"""
+        return self._current_estimator_spec
+
+    @property
+    def estimator(self) -> tf.estimator.Estimator:
+        """returns estimator encapsulated by Trainer"""
+        return self._estimator
+
+    @property
+    def num_workers(self) -> int:
+        """returns number of workers"""
+        return self._estimator.config.num_worker_replicas
+
+    @property
+    def worker_index(self) -> int:
+        """
+        returns index of worker in the cluster; the chief has index 0,
+        non-chief workers have indices 1 through (num_workers - 1)
+        """
+        return self._estimator.config.global_id_in_cluster
+
+    @property
+    def using_hogwild(self) -> bool:
+        """returns a bool indicating whether hogwild is being used"""
+        return self._using_hogwild
+
+    def set_estimator(self, estimator: tf.estimator.Estimator) -> None:
+        """sets the estimator used internally by Trainer"""
+        if not isinstance(estimator, tf.estimator.Estimator):
+            raise ValueError("Expecting tf.estimator.Estimator")
+        self._estimator = estimator
+        self._params = self.estimator.params
+
+    @property
+    def params(self):
+        """returns the hyper-parameters passed to the constructor."""
+        return self._params
+
+    @staticmethod
+    def add_parser_arguments() -> argparse.ArgumentParser:
+        """
+        Add common commandline args to parse for the Trainer class.
+        Typically, the user calls this function and then parses cmd-line arguments
+        into an argparse.Namespace object which is then passed to the Trainer constructor
+        via the params argument.
+
+        See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_
+        for a list and description of all cmd-line arguments.
+
+        Returns:
+            argparse.ArgumentParser instance with some useful args already added.
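# The flow this docstring describes, sketched end to end; the module path
# twml.trainer and the extra argument are hypothetical.
import twml

parser = twml.trainer.Trainer.add_parser_arguments()
parser.add_argument("--input_size_bits", type=int, default=18)  # model-specific extra
args = parser.parse_args()

trainer = twml.trainer.Trainer(
    name="my_model",             # hypothetical scope name
    params=args,                 # Namespace is converted via twml.util.convert_to_hparams
    build_graph_fn=build_graph,  # as in the constructor docstring example
)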
+ """ + return twml.argument_parser.get_trainer_parser() + + @staticmethod + def get_train_op( + params: tf.contrib.training.HParams, loss: tf.Tensor + ) -> tf.Operation: + """ + Return a training Op, that is, a `twml.optimizers.optimize_loss + `_ + instance given params and loss. + This method can be overwritten by passing the optimize_loss_fn to the Trainer + constructor. - @staticmethod - def add_parser_arguments(): - """ - Add common commandline args to parse for the Trainer class. - Typically, the user calls this function and then parses cmd-line arguments - into an argparse.Namespace object which is then passed to the Trainer constructor - via the params argument. + Args: + params: + tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries, + gradient_noise_scale, clip_gradients and learning_rate_decay (including + other learning rate decay arguments). + loss: + scalar Op returned by the build_graph that specifies the training loss to + be minimized. + """ + optimizer = params.get("optimizer") + + if not optimizer: + optimizer = "SGD" + + if optimizer == "LazyAdam": + optimizer = LazyAdamOptimizer + + if optimizer == "DGC": + optimizer = DeepGradientCompressionOptimizer( + learning_rate=params.learning_rate, + use_locking=False, + name="Sparse", + density=params.get("dgc_density"), + density_decay=params.get("dgc_density_decay"), + density_decay_steps=params.get("dgc_density_decay_steps"), + density_decay_rate=params.get("dgc_density_decay_rate"), + min_density=params.get("dgc_min_density"), + accumulation=params.get("dgc_accumulation"), + ) + + summaries = ["loss"] + if params.get("show_optimizer_summaries"): + summaries = OPTIMIZER_SUMMARIES + + train_op = optimize_loss( + loss=loss, + global_step=tf.train.get_global_step(), + optimizer=optimizer, + learning_rate=params.learning_rate, + summaries=summaries, + colocate_gradients_with_ops=True, + gradient_noise_scale=params.get("gradient_noise_scale"), + clip_gradients=params.get("clip_gradients"), + learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn( + params + ), + ) + return train_op + + def export_model_effects( + self, + export_path: str, + feature_spec: Dict[str, Any] = None, + log_features: bool = True, + ) -> None: + """Export model effects to disk.""" + if feature_spec: + if log_features: + features = feature_spec["features"] + feature_names = [ + ".".join(features[fid]["featureName"].split(".")[1:]) + for fid in features.keys() + ] + features_to_log = ",".join(feature_names) + try: + model_hash = self.experiment_tracker.compute_model_hash(export_path) + metrics.log_usage( + "dbv2", + "export_model_effects", + "v1", + custom_attrs=[ + model_hash, + "feature config present", + features_to_log, + ], + ) + except: # noqa: T803 + logging.info("Failed to log Feature Config features") + + twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) + export_start_time = time.time() + self.experiment_tracker.export_feature_spec(feature_spec) + logging.info( + "Exported feature spec to ML Metastore in %s seconds.", + time.time() - export_start_time, + ) + + self.experiment_tracker.register_model(str(export_path)) + self.export_gauge.increment() + + @property + def best_or_latest_checkpoint(self) -> str: + if self._is_early_stopping: + best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") + checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) + # Return best checkpoint if necessary + if checkpoint_path: + return 
+            else:
+                raise ValueError(
+                    "Best checkpoint not found at %s." % best_checkpoint_path
+                )
+        else:  # Fallback to latest checkpoint from save directory
+            return str(self.latest_checkpoint)
+
+    @property
+    def latest_checkpoint(self) -> str:
+        return str(self.estimator.latest_checkpoint())
+
+    def export_model(
+        self,
+        serving_input_receiver_fn: Callable[
+            [], tf.estimator.export.ServingInputReceiver
+        ],
+        export_output_fn: Callable[
+            [tf.estimator.EstimatorSpec], tf.estimator.export.ExportOutput
+        ] = None,
+        export_dir: str = None,
+        checkpoint_path: str = None,
+        feature_spec: Dict[str, Any] = None,
+        log_features: bool = True,
+    ) -> Optional[str]:
+        """
+        Export the model for prediction. Typically, the exported model
+        will later be run in production servers. This method is called
+        by the user to export the PREDICT graph to disk.
+
+        Internally, this method calls ``tf.estimator.Estimator.export_savedmodel``.
+
+        Note that a valid self._export_output_fn is required.
+        If export_output_fn is provided, it is used to set the self._export_output_fn.

-    See the `code <_modules/twml/argument_parser.html#get_trainer_parser>`_
-    for a list and description of all cmd-line arguments.

+        Args:
+            serving_input_receiver_fn:
+                function preparing the model for inference requests.
+                This function returns the ``features`` dict passed to ``build_graph``.
+            export_dir:
+                directory to export a SavedModel for prediction servers.
+                Defaults to ``[save_dir]/exported_models``.
+            checkpoint_path:
+                the checkpoint path to export. If None (the default), the most recent checkpoint
+                found within the model directory is chosen.
+            export_output_fn:
+                Function to export the graph_output (output of build_graph) for
+                prediction. Takes a graph_output dict as sole argument and returns
+                the export_output_fns dict.
+                Defaults to `twml.export_output_fns.default_output_fn`.
+
+        Returns:
+            a string path to the exported directory.
+        """
+        if not self.is_chief():
+            logging.info(
+                "Trainer.export_model ignored due to the process not being chief."
+            )
+            return

-    Returns:
-      argparse.ArgumentParser instance with some useful args already added.
-    """
-    return twml.argument_parser.get_trainer_parser()

+        self._export_output_fn = (
+            export_output_fn or twml.export_output_fns.default_output_fn
+        )

-  @staticmethod
-  def get_train_op(params, loss):
-    """
-    Return a training Op, that is, a `twml.optimizers.optimize_loss
-    `_
-    instance given params and loss.
-    This method can be overwritten by passing the optimize_loss_fn to the Trainer
-    constructor.
-
-    Args:
-      params:
-        tensorflow.contrib.training.HParams instance. Recognizes the optimizer, optimizer_summaries,
-        gradient_noise_scale, clip_gradients and learning_rate_decay (including
-        other learning rate decay arguments).
-      loss:
-        scalar Op returned by the build_graph that specifies the training loss to
-        be minimized.
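# get_train_op recognizes params.optimizer values "SGD" (the default), "LazyAdam",
# and "DGC", plus the optimize_loss knobs listed above. A sketch of params that
# exercise them (values hypothetical):
params = {
    "learning_rate": 0.01,
    "optimizer": "LazyAdam",            # or "SGD" / "DGC"
    "clip_gradients": 5.0,              # forwarded to optimize_loss
    "gradient_noise_scale": None,
    "show_optimizer_summaries": False,  # True switches to OPTIMIZER_SUMMARIES
}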
- """ - optimizer = params.get('optimizer') - - if not optimizer: - optimizer = 'SGD' - - if optimizer == 'LazyAdam': - optimizer = LazyAdamOptimizer - - if optimizer == 'DGC': - optimizer = DeepGradientCompressionOptimizer( - learning_rate=params.learning_rate, - use_locking=False, - name="Sparse", - density=params.get('dgc_density'), - density_decay=params.get('dgc_density_decay'), - density_decay_steps=params.get('dgc_density_decay_steps'), - density_decay_rate=params.get('dgc_density_decay_rate'), - min_density=params.get('dgc_min_density'), - accumulation=params.get('dgc_accumulation') - ) - - summaries = ['loss'] - if params.get('show_optimizer_summaries'): - summaries = OPTIMIZER_SUMMARIES - - train_op = optimize_loss( - loss=loss, - global_step=tf.train.get_global_step(), - optimizer=optimizer, - learning_rate=params.learning_rate, - summaries=summaries, - colocate_gradients_with_ops=True, - gradient_noise_scale=params.get('gradient_noise_scale'), - clip_gradients=params.get('clip_gradients'), - learning_rate_decay_fn=twml.learning_rate_decay.get_learning_rate_decay_fn(params) - ) - return train_op - - def export_model_effects(self, export_path, feature_spec=None, log_features=True): - - # DO NOT CHANGE THE ORDER. - # This needs to be done before registering the model. - if feature_spec: - if log_features: - features = feature_spec['features'] - feature_names = ['.'.join(features[fid]['featureName'].split('.')[1:]) for fid in features.keys()] - features_to_log = ','.join(feature_names) - try: - model_hash = self.experiment_tracker.compute_model_hash(export_path) - metrics.log_usage('dbv2', 'export_model_effects', 'v1', custom_attrs=[model_hash, "feature config present", features_to_log]) - except: # noqa: T803 - logging.info("Failed to log Feature Config features") - - twml.contrib.export.export_fn.export_feature_spec(export_path, feature_spec) - export_start_time = time.time() - self.experiment_tracker.export_feature_spec(feature_spec) - logging.info("Exported feature spec to ML Metastore in %s seconds.", time.time() - export_start_time) - - self.experiment_tracker.register_model(str(export_path)) - self.export_gauge.increment() - - @property - def best_or_latest_checkpoint(self): - if self._is_early_stopping: - best_checkpoint_path = os.path.join(self._save_dir, "best_checkpoint") - checkpoint_path = tf.train.latest_checkpoint(best_checkpoint_path) - # Return best checkpoint if necessary - if checkpoint_path: - return checkpoint_path - else: - raise ValueError("Best checkpoint not found at %s." % best_checkpoint_path) - else: # Fallback to latest checkpoint from save directory - return self.latest_checkpoint - - @property - def latest_checkpoint(self): - return self.estimator.latest_checkpoint() - - def export_model(self, serving_input_receiver_fn, - export_output_fn=None, - export_dir=None, checkpoint_path=None, - feature_spec=None, - log_features=True): - """ - Export the model for prediction. Typically, the exported model - will later be run in production servers. This method is called - by the user to export the PREDICTgraph to disk. - - Internally, this method calls `tf.estimator.Estimator.export_savedmodel - `_. - - Note that a valid self._export_output_fn is required. - If export_ouput_fn is provided, it is used to set the self._export_output_fn. - - Args: - serving_input_receiver_fn: - function preparing the model for inference requests. - This funtion returns the ``features`` dict passed to ``build_graph``. 
- export_dir: - directory to export a SavedModel for prediction servers. - Defaults to ``[save_dir]/exported_models``. - checkpoint_path: - the checkpoint path to export. If None (the default), the most recent checkpoint - found within the model directory is chosen. - export_output_fn: - Function to export the graph_output (output of build_graph) for - prediction. Takes a graph_output dict as sole argument and returns - the export_output_fns dict. - Defaults to `twml.export_output_fns.default_output_fn`. - - Return: - returns a string path to exported directory. - - # set the export output function - """ - if not self.is_chief(): - logging.info("Trainer.export_model ignored due to the process not being chief.") - return + if not callable(self._export_output_fn): + raise RuntimeError( + "Expecting export_output_fn function. Got %s." + % type(self._export_output_fn).__name__ + ) + + if export_dir: + export_dir = sanitize_hdfs_path(export_dir) + + if checkpoint_path: + checkpoint_path = sanitize_hdfs_path(checkpoint_path) + else: + checkpoint_path = self.best_or_latest_checkpoint + + # actually export the model using the Estimator API + export_path = self._estimator.export_savedmodel( + export_dir_base=export_dir + or os.path.join(self._save_dir, "exported_models"), + serving_input_receiver_fn=serving_input_receiver_fn, + checkpoint_path=checkpoint_path, + ) - self._export_output_fn = export_output_fn or twml.export_output_fns.default_output_fn + # export_path is bytes, need to convert to string for python3 to work. + logging.info("The exported model path is: " + str(export_path)) - if not callable(self._export_output_fn): - raise RuntimeError( - "Expecting export_output_fn function. Got %s." - % type(self._export_output_fn).__name__) + self.export_model_effects(export_path, feature_spec, log_features) - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) + return export_path - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint + def _model_fn( + self, + features: Dict[str, tf.Tensor], + labels: tf.Tensor, + mode: tf.estimator.ModeKeys, + params: tf.contrib.training.HParams, + config=None, + ) -> tf.estimator.EstimatorSpec: + """ + returns tf.estimator.EstimatorSpec that can be used with tf.estimator.Estimators. + You would probably never need to modify this method. + Instead, you should override build_graph, which this method calls. - # actually export the model using the Estimator API - export_path = self._estimator.export_savedmodel( - export_dir_base=export_dir or os.path.join(self._save_dir, 'exported_models'), - serving_input_receiver_fn=serving_input_receiver_fn, - checkpoint_path=checkpoint_path) + Args: + features: + Dict of input tensors. + labels: + Tensor of target labels. + mode: + an instance of tf.estimator.ModeKeys. + Typically used to toggle TRAINing or EVALuation. + params: + HParams object containing hyper-parameters. 
+ """ + # pylint: disable=too-many-branches + if isinstance(features, dict): + weights = features.get("weights", None) + else: + weights = None + + with tf.variable_scope(self._name + "/model"): + graph_output = self._build_graph_fn(features, labels, mode, params, config) + loss = graph_output["loss"] if "loss" in graph_output else None + + self._maybe_restore_checkpoint() + + with tf.variable_scope(self._name + "/optim"): + train_op = None + if mode == tf.estimator.ModeKeys.TRAIN: + if "train_op" in graph_output: + train_op = graph_output["train_op"] + graph_output[ + "train_op" + ] = None # remove from preds to prevent error + elif loss is not None: + train_op = self._optimize_loss_fn(params, loss) + + if params.get("train_log_metrics") and self._metric_fn: + metric_ops = self._metric_fn( + graph_output=graph_output, labels=labels, weights=weights + ) + for metric_name in metric_ops: + tf.summary.scalar( + name="training_metric_" + metric_name, + tensor=metric_ops[metric_name][1], + ) # index 0 contains value_op, 1 contains update_op + + if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None: + # note that this is ignored by the predict method. + # Estimator only uses export_output_fn for export_model. + export_outputs = self._export_output_fn(graph_output) + else: + export_outputs = None + + if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn: + eval_metric_ops = self._metric_fn( + graph_output=graph_output, labels=labels, weights=weights + ) + else: + eval_metric_ops = None + + # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output + preds = { + key: graph_output[key] + for key in graph_output + if (graph_output[key] is not None) and (key is not "loss") + } + + init_feed_dict = twml.contrib.initializers.get_init_feed_dict() + scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict) + + # Clear the init feed collection to avoid serializing the initializers. + twml.contrib.initializers.clear_init_feed_collection() + + # save estimator for use by later methods and hooks (warning: often reset) + self._current_estimator_spec = tf.estimator.EstimatorSpec( + mode=mode, + predictions=preds, + export_outputs=export_outputs, + loss=loss, + train_op=train_op, + eval_metric_ops=eval_metric_ops, + scaffold=scaffold, + ) - # export_path is bytes, need to convert to string for python3 to work. - logging.info("The exported model path is: " + str(export_path)) + return self._current_estimator_spec - self.export_model_effects(export_path, feature_spec, log_features) + def get_train_hooks(self) -> List[tf.train.SessionRunHook]: + """Return SessionRunHooks used during training. - return export_path + By default training uses one hooks `tf.train.StepCounterHook` for monitoring step speed. - def _model_fn(self, features, labels, mode, params, config=None): - """ - returns tf.estimator.EstimatorSpec that can be used with tf.estimator.Estimators. - You would probably never need to modify this method. - Instead, you should override build_graph, which this method calls. - - Args: - features: - Dict of input tensors. - labels: - Tensor of target labels. - mode: - an instance of tf.estimator.ModeKeys. - Typically used to toggle TRAINing or EVALuation. - params: - HParams object containing hyper-parameters. 
- """ - # pylint: disable=too-many-branches - if isinstance(features, dict): - weights = features.get('weights', None) - else: - weights = None - - with tf.variable_scope(self._name + '/model'): - graph_output = self._build_graph_fn(features, labels, mode, params, config) - loss = graph_output['loss'] if 'loss' in graph_output else None - - self._maybe_restore_checkpoint() - - with tf.variable_scope(self._name + '/optim'): - train_op = None - if mode == tf.estimator.ModeKeys.TRAIN: - if 'train_op' in graph_output: - train_op = graph_output['train_op'] - graph_output['train_op'] = None # remove from preds to prevent error - elif loss is not None: - train_op = self._optimize_loss_fn(params, loss) - - if params.get('train_log_metrics') and self._metric_fn: - metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - for metric_name in metric_ops: - tf.summary.scalar( - name="training_metric_" + metric_name, - tensor=metric_ops[metric_name][1]) # index 0 contains value_op, 1 contains update_op - - if mode == tf.estimator.ModeKeys.PREDICT and self._export_output_fn is not None: - # note that this is ignored by the predict method. - # Estimator only uses export_output_fn for export_model. - export_outputs = self._export_output_fn(graph_output) - else: - export_outputs = None - - if mode == tf.estimator.ModeKeys.EVAL and self._metric_fn: - eval_metric_ops = self._metric_fn(graph_output=graph_output, labels=labels, weights=weights) - else: - eval_metric_ops = None - - # None and loss (scalar, not sliceable by TFMA) should be removed from the graph_output - preds = {key: graph_output[key] for key in graph_output if (graph_output[key] is not None) and (key is not 'loss')} - - init_feed_dict = twml.contrib.initializers.get_init_feed_dict() - scaffold = tf.train.Scaffold(init_feed_dict=init_feed_dict) - - # Clear the init feed collection to avoid serializing the initializers. - twml.contrib.initializers.clear_init_feed_collection() - - # save estimator for use by later methods and hooks (warning: often reset) - self._current_estimator_spec = tf.estimator.EstimatorSpec( - mode=mode, - predictions=preds, - export_outputs=export_outputs, - loss=loss, - train_op=train_op, - eval_metric_ops=eval_metric_ops, - scaffold=scaffold, - ) - - return self._current_estimator_spec - - def get_train_hooks(self): - """Return SessionRunHooks used during training. - - By default training uses one hooks `tf.train.StepCounterHook` for monitoring step speed. - - If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook` - for monitoring the profile. + If self._profiler_steps is set then we also use the ProfilerHook `tf.train.ProfilerHook` + for monitoring the profile. - """ - # Instead of having every_n_steps be a constant number, - # change it dynamically based on batch size. - # Ideally we should be using every_n_secs, but that seems buggy as of 1.7. - # The every_n_steps = 20K / batch_size - every_n_steps = ((2048 * 100) // self._params.train_batch_size) - step_counter = tf.train.StepCounterHook( - every_n_steps=every_n_steps, output_dir=self._save_dir - ) - train_hooks = [step_counter] - - if self._profiler_steps is not None: - if not self._params.get('distributed') or self._estimator.config.is_chief: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir + """ + # Instead of having every_n_steps be a constant number, + # change it dynamically based on batch size. 
+        # Ideally we should be using every_n_secs, but that seems buggy as of 1.7.
+        # The every_n_steps = 200K / batch_size
+        every_n_steps = (2048 * 100) // self._params.train_batch_size
+        step_counter = tf.train.StepCounterHook(
+            every_n_steps=every_n_steps, output_dir=self._save_dir
+        )
+        train_hooks = [step_counter]
+
+        if self._profiler_steps is not None:
+            if not self._params.get("distributed") or self._estimator.config.is_chief:
+                profiler = tf.train.ProfilerHook(
+                    save_steps=self._profiler_steps, output_dir=self._save_dir
+                )
+                train_hooks.append(profiler)
+
+        return train_hooks
+
+    def is_task_type(self, name: str) -> bool:
+        """
+        Helper function to specify if the current process is of the given worker type.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        if os.environ.get("TF_CONFIG"):
+            if self._estimator.config.task_type == name:
+                return True
+            else:
+                return False
+        return True

-  def is_evaluator(self):
-    """
-    Helper function to let you know if the worker is evaluator.
-    Note: This an only be called *after* self._hogwild_setup() is called in __init__()
-    """
-    return self.is_task_type("evaluator")
-
-  def is_chief(self):
-    """
-    Helper function to let you know if the worker is chief.
-    Note: This an only be called *after* self._hogwild_setup() is called in __init__()
-    """
-    return self.is_task_type("chief") or self.is_task_type("master")
-
-  def is_ps(self):
-    """
-    Helper function to let you know if the task is parameter server.
-    """
-    if os.environ.get('TF_CONFIG') and self._estimator.config.task_type == 'ps':
-      return True
-    return False

+    def is_evaluator(self) -> bool:
+        """
+        Helper function to let you know if the worker is evaluator.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        return self.is_task_type("evaluator")
+
+    def is_chief(self) -> bool:
+        """
+        Helper function to let you know if the worker is chief.
+        Note: This can only be called *after* self._hogwild_setup() is called in __init__()
+        """
+        return self.is_task_type("chief") or self.is_task_type("master")
+
+    def is_ps(self) -> bool:
+        """
+        Helper function to let you know if the task is parameter server.
+        """
+        if os.environ.get("TF_CONFIG") and self._estimator.config.task_type == "ps":
+            return True
+        return False

-  def _exit_ps_after_training_complete(self):
-    """
-    Helper function to shutdown parameter server after training job complete (either succeed or failed).
- """ - if not self.is_ps(): - return - - # No need to exit ps if on the same machine - if os.environ.get('TWML_HOGWILD_PORTS'): - return - - if self._params.get('disable_auto_ps_shutdown', False): - logging.info("Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]") - return - - # checking job status is different on gke vs aurora - if self._is_on_gke(): - get_job_status = functools.partial( - k8s_status.get_training_job_status, - cluster=None, - namespace=os.environ['TWML_JOB_ROLE'], - environment=os.environ['TWML_JOB_ENV'], - job_name=os.environ['TWML_JOB_NAME'], - using_tsd=True) - else: - get_job_status = functools.partial( - get_distributed_training_job_path, - base_job_path=get_distributed_training_job_path() - ) - - def wait_complete_then_exit(): - retry_max = 60 - retry = 0 - while True: - try: - training_status = get_job_status() - if training_status == TrainingJobStatus.FINISHED: - logging.info("Distributed training job succeed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.FAILED: - logging.info("Distributed training job failed, shutting down parameter server.") - os._exit(0) - elif training_status == TrainingJobStatus.NOT_FOUND: - raise Exception("Distributed training job status not found.") - else: - poke_interval = random.randrange(60, 90) # prevent spike QPS to aurora endpoint - time.sleep(poke_interval) + def _exit_ps_after_training_complete(self): + """Helper function to shutdown parameter server after training job complete (either succeed or failed).""" + if not self.is_ps(): + return + + # No need to exit ps if on the same machine + if os.environ.get("TWML_HOGWILD_PORTS"): + return + + if self._params.get("disable_auto_ps_shutdown", False): + logging.info( + "Skip shutting down parameter server after training complete [--disable_auto_ps_shutdown is set]" + ) + return + + # checking job status is different on gke vs aurora + if self._is_on_gke(): + get_job_status = functools.partial( + k8s_status.get_training_job_status, + cluster=None, + namespace=os.environ["TWML_JOB_ROLE"], + environment=os.environ["TWML_JOB_ENV"], + job_name=os.environ["TWML_JOB_NAME"], + using_tsd=True, + ) + else: + get_job_status = functools.partial( + get_distributed_training_job_path, + base_job_path=get_distributed_training_job_path(), + ) + + def wait_complete_then_exit() -> None: + """Wait for distributed training job to complete, then exit parameter server.""" + retry_max = 60 retry = 0 - except Exception as e: - if retry >= retry_max: - raise e # only exception in this thread, won't fail parameter server thread - retry += 1 - poke_interval = random.randrange(60, 90) + retry * 10 - logging.warn("Error getting distributed training job status, will retry after %s seconds." % poke_interval) - time.sleep(poke_interval) - Thread(target=wait_complete_then_exit).start() - - def get_eval_hooks(self): # pylint: disable=no-self-use - """ Return SessionRunHooks used during evaluation.""" - return None - - def get_predict_hooks(self): - """ Return hooks used during prediction. - If profiler_steps is set in the constructor to the Trainer, - we pass a tf.Train.ProfilerHook to the estimator's predict function. 
- """ - hooks = [] - if self._profiler_steps is not None: - profiler = tf.train.ProfilerHook( - save_steps=self._profiler_steps, - output_dir=self._save_dir - ) - hooks.append(profiler) - return hooks - - def learn(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, - train_steps=None, eval_steps=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, start_epoch=0, - exporters=None, export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` steps. - Each epoch involves ``train_steps`` training steps followed - by ``eval_steps`` evaluation steps. Note that each step - is a ``session.run()``, that is, each batch is a step. - - Args: - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - None-values cause learn() to terminate after *one* call to train() and evaluate(), - which is usually useful when using train_steps=-1 - Non-positive values trains indefinitely in a loop (use with caution), - which is usually useful when used with early stopping. - train_steps: - number of training steps per epoch. For example, 100 means each - training epoch will end after processing 100 batches. - Defaults to params.train_steps. - Non-positive values and None-values go through the entire training set each epoch. - eval_steps: - number of evaluation steps per epoch. - Defaults to params.eval_steps. - Non-positive values and None-values go through the entire evaluation set each epoch. - train_input_fn: - Function to iterate through training set. It is passed to estimator.train. - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - train_hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - eval_hooks: - List of SessionRunHooks uses for evaluation. Defaults to self.get_eval_hooks() - start_epoch: - The epoch from which to start learn. If you want to do training and evaluation - for N epochs, you can call ``learn()`` in a loop as follows: - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - .. code-block:: python - - for epoch in range(1,max_epoch): - trainer.learn(start_epoch=epoch) - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. - Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). 
Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - That is, save_dir. - You can point TensorBoard to this directory to get metrics, - or pass it to another Trainer via ``init_from_dir`` when doing - multi-phase training. - """ - # pylint: disable=too-many-branches - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - if os.environ.get('TF_CONFIG'): - raise ValueError("trainer.learn() can not be used with distributed / hogwild setups") - - if exporters and export_output_fn: - self._export_output_fn = export_output_fn - - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if train_steps is None: - train_steps = self.params.train_steps - if train_steps <= 0: - train_steps = None - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if early_stop_patience > 0: - assert train_max_steps is not None, "Early stopping and max_steps=None are not compatible." - # prepare early stopping hook (which also handles logic here) - self._is_early_stopping = True - early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - start_epoch=start_epoch) - # add early stop hook to eval hooks - eval_hooks.append(early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - train_hooks.append(train_early_stop_duration_hook) - - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=True, - ) - eval_hooks.append(eval_early_stop_duration_hook) - - if not self._is_early_stopping: - if (train_max_steps is not None) and (train_max_steps <= 0): - if ((max_duration is not None) and (max_duration < 0)) or (max_duration is None): - logging.warn("train.max_steps is non-positive, and no early or duration stopping is configured. " - "Training job will loop forever.") - - if train_max_steps is not None and train_max_steps > 0: - # we can't pass max_steps AND steps to estimator.train. - # so we pass steps to estimator.train and max_steps to this hook instead... 
-      stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps)
-      train_hooks.append(stop_at_step_hook)
-
-    with self.experiment_tracker.track_experiment(eval_hooks,
-                                                  lambda: self.current_estimator_spec):
-      # alternate training and evaluation epochs
-      epoch = start_epoch
-      while True:
-        logging.info("Training epoch %d", epoch)
-        self._estimator.train(train_input_fn, steps=train_steps, hooks=train_hooks)
-
-        logging.info("Evaluating epoch %d", epoch)
-        eval_result = self._estimator.evaluate(
-          eval_input_fn, steps=eval_steps, hooks=eval_hooks)
-
-        if exporters:
-          checkpoint_path = self.estimator.latest_checkpoint()
-          for exporter in exporters:
-            export_path = os.path.join(self._save_dir, "export", exporter.name)
-            exporter.export(
-              estimator=self.estimator, export_path=export_path,
-              checkpoint_path=checkpoint_path, eval_result=eval_result,
-              is_the_final_export=False)
-
-        # If train_max_step is none. Terminate after one loop.
-        if train_max_steps is None:
-          break
-
-        # If stop_at_step_hook requested a stop, break
-        if train_max_steps > 0 and stop_at_step_hook.stop_requested:
-          break
+        while True:
+            try:
+                training_status = get_job_status()
+                if training_status == TrainingJobStatus.FINISHED:
+                    logging.info(
+                        "Distributed training job succeeded, shutting down parameter server."
+                    )
+                    os._exit(0)
+                elif training_status == TrainingJobStatus.FAILED:
+                    logging.info(
+                        "Distributed training job failed, shutting down parameter server."
+                    )
+                    os._exit(0)
+                elif training_status == TrainingJobStatus.NOT_FOUND:
+                    raise Exception("Distributed training job status not found.")
+                else:
+                    poke_interval = random.randrange(
+                        60, 90
+                    )  # avoid spiking QPS against the aurora endpoint
+                    time.sleep(poke_interval)
+                    retry = 0
+            except Exception as e:
+                if retry >= retry_max:
+                    raise e  # the only exception in this thread; won't fail the parameter server thread
+                retry += 1
+                poke_interval = random.randrange(60, 90) + retry * 10
+                logging.warn(
+                    "Error getting distributed training job status, will retry after %s seconds."
+                    % poke_interval
+                )
+                time.sleep(poke_interval)
+
+        Thread(target=wait_complete_then_exit).start()
+
+    def get_eval_hooks(self) -> None:  # pylint: disable=no-self-use
+        """Return SessionRunHooks used during evaluation."""
+        return None
+
+    def get_predict_hooks(self) -> List[tf.train.SessionRunHook]:
+        """Return hooks used during prediction.
+        If profiler_steps is set in the constructor of the Trainer,
+        we pass a tf.train.ProfilerHook to the estimator's predict function.
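+
+        For instance (a sketch, not from this changeset: the constructor
+        arguments besides ``profiler_steps`` and the ``predict_input_fn``/
+        ``handle`` names are assumed), a Trainer built with ``profiler_steps=100``
+        writes a profiler timeline to ``save_dir`` every 100 prediction steps:
+
+        .. code-block:: python
+
+            trainer = Trainer(name="demo", params=params,
+                              build_graph_fn=build_graph, profiler_steps=100)
+            for prediction in trainer.predict(input_fn=predict_input_fn):
+                handle(prediction)  # each dict is one estimator.predict output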
+ """ + hooks = [] + if self._profiler_steps is not None: + profiler = tf.train.ProfilerHook( + save_steps=self._profiler_steps, output_dir=self._save_dir + ) + hooks.append(profiler) + return hooks + + def learn( + self, + train_input_fn: Optional[Callable[[], tf.data.Dataset]] = None, + eval_input_fn: Optional[Callable[[], tf.data.Dataset]] = None, + train_max_steps: Optional[int] = None, + train_steps: Optional[int] = None, + eval_steps: Optional[int] = None, + train_hooks: Optional[List[tf.train.SessionRunHook]] = None, + eval_hooks: Optional[List[tf.train.SessionRunHook]] = None, + early_stop_metric: Optional[str] = None, + early_stop_patience: Optional[int] = -1, + early_stop_minimize: Optional[bool] = True, + early_stop_tolerance: Optional[int] = 0, + start_epoch: Optional[int] = 0, + exporters: Optional[List[tf.estimator.Exporter]] = None, + export_output_fn: Optional[ + Callable[[tf.estimator.Estimator], tf.estimator.ExportOutput] + ] = None, + max_duration: Optional[int] = None, + ) -> None: + """ + Train and evaluate the estimator for ``train_max_steps`` steps. + Each epoch involves ``train_steps`` training steps followed + by ``eval_steps`` evaluation steps. Note that each step + is a ``session.run()``, that is, each batch is a step. - # early-stopping logic is handled internally by the hook - if early_stop_patience > 0 and early_stop_hook.should_stop: - # but we still need to break here - break - epoch += 1 - - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir + Args: + train_max_steps: + maximum number of global steps of training to run. + Defaults to params.train_max_steps. + None-values cause learn() to terminate after *one* call to train() and evaluate(), + which is usually useful when using train_steps=-1 + Non-positive values trains indefinitely in a loop (use with caution), + which is usually useful when used with early stopping. + train_steps: + number of training steps per epoch. For example, 100 means each + training epoch will end after processing 100 batches. + Defaults to params.train_steps. + Non-positive values and None-values go through the entire training set each epoch. + eval_steps: + number of evaluation steps per epoch. + Defaults to params.eval_steps. + Non-positive values and None-values go through the entire evaluation set each epoch. + train_input_fn: + Function to iterate through training set. It is passed to estimator.train. + eval_input_fn: + Function to iterate through evaluation set. It is passed to estimator.evaluate. + train_hooks: + List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). + eval_hooks: + List of SessionRunHooks uses for evaluation. Defaults to self.get_eval_hooks() + start_epoch: + The epoch from which to start learn. If you want to do training and evaluation + for N epochs, you can call ``learn()`` in a loop as follows: + exporters: + List of exporters called at the end of each evaluation run. + Defaults to none. + export_output_fn: + The output format to use for exported models. + Only used if exporters is not None. + + .. code-block:: python + + for epoch in range(1,max_epoch): + trainer.learn(start_epoch=epoch) + + Early-stopping Args: + early_stop_metric: + String specifying the metric to early-stop on. Required with positive + ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. + The string is used to extract the relevant tensor Op from the dict returned by + the get_eval_metric_ops method. 
+            For ``metrics`` passed to the constructor,
+            the string is one of those. For multi-class (that is, multi-metric)
+            metrics, the string may be appended with a ``_0``, ``_1``, etc. or one
+            of the ``multi_metric_names`` (one per class).
+          early_stop_patience:
+            Maximum number of epochs to wait for an improvement in the early_stop_metric
+            before breaking off training. For example, a patience of 10 means that
+            training will have 10 epochs to improve the metric before it is killed.
+            Whenever the metric is improved before running out of patience,
+            patience is reset to ``early_stop_patience``.
+            Defaults to -1 (that is, no early-stopping).
+          early_stop_minimize:
+            Set this to True (the default) for metrics that need to be minimized
+            (like ``loss``). Metrics like ``accuracy`` that need to be maximized
+            should set this to False.
+          early_stop_tolerance:
+            A non-negative tolerance for comparing early_stop_metric.
+            E.g., when maximizing, the condition is current_metric > best_metric + tolerance.
+            Defaults to 0.
+          max_duration:
+            A float. When this argument is defined, the job will automatically terminate after
+            `max_duration` seconds if it has not already completed.
+
+        Returns:
+          The directory where the checkpoints were saved. That is, save_dir.
+          You can point TensorBoard to this directory to get metrics,
+          or pass it to another Trainer via ``init_from_dir`` when doing
+          multi-phase training.
+        """
+        # pylint: disable=too-many-branches
+
+        if not callable(train_input_fn):
+            raise ValueError("Expecting callable train_input_fn function")
+        if not callable(eval_input_fn):
+            raise ValueError("Expecting callable eval_input_fn function")
+
+        if os.environ.get("TF_CONFIG"):
+            raise ValueError(
+                "trainer.learn() can not be used with distributed / hogwild setups"
+            )
+
+        if exporters and export_output_fn:
+            self._export_output_fn = export_output_fn
+
+        train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks
+        eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks
+        eval_hooks = [] if eval_hooks is None else eval_hooks
 
-  def get_train_spec(self, input_fn, max_steps=None, hooks=None):
-    """Get the TrainSpec used by ``tf.train.train_and_evaluate``."""
-    if not callable(input_fn):
-      raise ValueError("Expecting callable train_input_fn")
 
+        if train_max_steps is None:
+            train_max_steps = self.params.get("train_max_steps")
+
+        if train_steps is None:
+            train_steps = self.params.train_steps
+        if train_steps <= 0:
+            train_steps = None
+
+        if eval_steps is None:
+            eval_steps = self.params.eval_steps
+        if eval_steps <= 0:
+            eval_steps = None
+
+        if early_stop_patience > 0:
+            assert (
+                train_max_steps is not None
+            ), "Early stopping and max_steps=None are not compatible."
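+            # The EarlyStopHook below checks the metric once per evaluation epoch;
+            # train_max_steps must still be set so the train/evaluate loop keeps a
+            # hard upper bound even when the metric never stops improving.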
+ # prepare early stopping hook (which also handles logic here) + self._is_early_stopping = True + early_stop_hook = twml.hooks.EarlyStopHook( + metric=early_stop_metric, + checkpoint_dir=self._save_dir, + patience=early_stop_patience, + minimize=early_stop_minimize, + tolerance=early_stop_tolerance, + get_estimator_spec_fn=lambda: self.current_estimator_spec, + start_epoch=start_epoch, + ) + # add early stop hook to eval hooks + eval_hooks.append(early_stop_hook) + + if max_duration is not None: + train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=False, + save_dir=self._save_dir, + overwrite=True, + ) + train_hooks.append(train_early_stop_duration_hook) + + eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=False, + save_dir=self._save_dir, + overwrite=True, + ) + eval_hooks.append(eval_early_stop_duration_hook) + + if not self._is_early_stopping: + if (train_max_steps is not None) and (train_max_steps <= 0): + if ((max_duration is not None) and (max_duration < 0)) or ( + max_duration is None + ): + logging.warn( + "train.max_steps is non-positive, and no early or duration stopping is configured. " + "Training job will loop forever." + ) + + if train_max_steps is not None and train_max_steps > 0: + # we can't pass max_steps AND steps to estimator.train. + # so we pass steps to estimator.train and max_steps to this hook instead... + stop_at_step_hook = twml.hooks.StopAtStepHook(last_step=train_max_steps) + train_hooks.append(stop_at_step_hook) + + with self.experiment_tracker.track_experiment( + eval_hooks, lambda: self.current_estimator_spec + ): + # alternate training and evaluation epochs + epoch = start_epoch + while True: + logging.info("Training epoch %d", epoch) + self._estimator.train( + train_input_fn, steps=train_steps, hooks=train_hooks + ) + + logging.info("Evaluating epoch %d", epoch) + eval_result = self._estimator.evaluate( + eval_input_fn, steps=eval_steps, hooks=eval_hooks + ) + + if exporters: + checkpoint_path = self.estimator.latest_checkpoint() + for exporter in exporters: + export_path = os.path.join( + self._save_dir, "export", exporter.name + ) + exporter.export( + estimator=self.estimator, + export_path=export_path, + checkpoint_path=checkpoint_path, + eval_result=eval_result, + is_the_final_export=False, + ) + + # If train_max_step is none. Terminate after one loop. 
+ if train_max_steps is None: + break + + # If stop_at_step_hook requested a stop, break + if train_max_steps > 0 and stop_at_step_hook.stop_requested: + break + + # early-stopping logic is handled internally by the hook + if early_stop_patience > 0 and early_stop_hook.should_stop: + # but we still need to break here + break + epoch += 1 + + self.write_state_to_disk(save_dir=self._save_dir, filename="_SUCCESS") + + return self._save_dir + + def get_train_spec( + self, + input_fn: Callable[[], tf.data.Dataset], + max_steps: Optional[int] = None, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + ) -> tf.estimator.TrainSpec: + """Get the TrainSpec used by ``tf.train.train_and_evaluate``.""" + if not callable(input_fn): + raise ValueError("Expecting callable train_input_fn") + + if max_steps is None: + max_steps = self.params.train_max_steps + + if max_steps is not None and max_steps <= 0: + max_steps = None + + hooks = self.get_train_hooks() if hooks is None else hooks + + return tf.estimator.TrainSpec( + input_fn=input_fn, max_steps=max_steps, hooks=hooks + ) - if max_steps is None: - max_steps = self.params.train_max_steps + def get_eval_spec( + self, + input_fn: Callable[[], tf.data.Dataset], + steps: Optional[int] = None, + delay: Optional[int] = None, + period: Optional[int] = None, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + exporters: Optional[List[tf.estimator.Exporter]] = None, + ) -> tf.estimator.EvalSpec: + """Get the EvalSpec used by ``tf.train.train_and_evaluate``.""" + if not callable(input_fn): + raise ValueError("Expecting callable eval_input_fn") + + if steps is None: + steps = self.params.eval_steps + + if steps <= 0: + steps = None + + if delay is None: + delay = self.params.eval_delay + + if period is None: + period = self.params.eval_period + + hooks = self.get_eval_hooks() if hooks is None else hooks + + eval_name = self.params.get("eval_name", None) + + return tf.estimator.EvalSpec( + input_fn=input_fn, + steps=steps, + name=eval_name, + start_delay_secs=delay, + throttle_secs=period, + hooks=hooks, + exporters=exporters, + ) - if max_steps is not None and max_steps <= 0: - max_steps = None + def train_and_evaluate( + self, + train_input_fn: Callable[[], tf.data.Dataset] = None, + eval_input_fn: Callable[[], tf.data.Dataset] = None, + train_max_steps: Optional[int] = None, + eval_steps: Optional[int] = None, + eval_delay: Optional[int] = None, + eval_period: Optional[int] = None, + train_hooks: Optional[List[tf.train.SessionRunHook]] = None, + eval_hooks: Optional[List[tf.train.SessionRunHook]] = None, + early_stop_metric: Optional[str] = None, + early_stop_patience: Optional[int] = -1, + early_stop_minimize: Optional[bool] = True, + early_stop_tolerance: Optional[float] = 0.0, + exporters: Optional[List[tf.estimator.Exporter]] = None, + export_output_fn: Optional[Callable[[tf.estimator.Estimator], None]] = None, + max_duration: Optional[int] = None, + ) -> str: + """ + Train and evaluate the estimator for ``train_max_steps`` + using ``tf.estimator.train_and_evaluate``. + With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method + can be used for distributed training (multi-node or multi-process). + Unlike the ``learn`` method, training is continuous with ``train_max_steps``. + For distributed use case, evaluation happens periodically. + That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_step`` steps + occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint. 
+        TF defaults to saving checkpoints every 10 mins.
+        For the local use case, training occurs for train_max_steps steps followed by a
+        single evaluation. For the local use case we therefore recommend using learn() instead,
+        as it provides early-stopping and multiple evaluations.
+
+        ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds.
+        It will stop after ``train_max_steps`` is reached.
+
+        You must ensure that all workers/servers are assigned the same `save_dir`.
+
+        .. Note::
+
+          If the TF_CONFIG environment variable is set, this function assumes it is running a distributed job.
 
-    hooks = self.get_train_hooks() if hooks is None else hooks
 
+        Args:
+          train_input_fn:
+            Function to iterate through training set. It is passed to estimator.train_and_evaluate.
+          eval_input_fn:
+            Function to iterate through evaluation set. It is passed to estimator.train_and_evaluate.
+          train_max_steps:
+            maximum number of global steps of training to run.
+            Defaults to params.train_max_steps.
+            Non-positive values and None-values train indefinitely (use with caution).
+          eval_steps:
+            number of steps per evaluation.
+            Defaults to params.eval_steps.
+            Non-positive values and None-values go through
+            the entire evaluation set for each evaluation.
+            Note that the number of eval_steps should be high enough to minimize noise.
+            This is especially true for early-stopping.
+          eval_delay:
+            Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s.
+          eval_period:
+            Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s.
+          exporters:
+            List of exporters called at the end of each evaluation run.
+            Defaults to None.
+          export_output_fn:
+            The output format to use for exported models.
+            Only used if exporters is not None.
+
+        Early-stopping Args:
+          early_stop_metric:
+            String specifying the metric to early-stop on. Required with positive
+            ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc.
+            The string is used to extract the relevant tensor Op from the dict returned by
+            the get_eval_metric_ops method. For ``metrics`` passed to the constructor,
+            the string is one of those. For multi-class (that is, multi-metric)
+            metrics, the string may be appended with a ``_0``, ``_1``, etc. or one
+            of the ``multi_metric_names`` (one per class).
+          early_stop_patience:
+            Maximum number of epochs to wait for an improvement in the early_stop_metric
+            before breaking off training. For example, a patience of 10 means that
+            training will have 10 epochs to improve the metric before it is killed.
+            Whenever the metric is improved before running out of patience,
+            patience is reset to ``early_stop_patience``.
+            Defaults to -1 (that is, no early-stopping).
+          early_stop_minimize:
+            Set this to True (the default) for metrics that need to be minimized
+            (like ``loss``). Metrics like ``accuracy`` that need to be maximized
+            should set this to False.
+          early_stop_tolerance:
+            A non-negative tolerance for comparing early_stop_metric.
+            E.g., when maximizing, the condition is current_metric > best_metric + tolerance.
+            Defaults to 0.
+          max_duration:
+            A float. When this argument is defined, the job will automatically terminate after
+            `max_duration` seconds if it has not already completed.
+
+        Returns:
+          The directory where the checkpoints were saved.
+        """
+
+        logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.")
+        logging.info(
+            "Trainer.train_and_evaluate may change or be removed in future versions."
+ ) - return tf.estimator.TrainSpec(input_fn=input_fn, - max_steps=max_steps, - hooks=hooks) + if not callable(train_input_fn): + raise ValueError("Expecting callable train_input_fn function") + if not callable(eval_input_fn): + raise ValueError("Expecting callable eval_input_fn function") + + self._exit_ps_after_training_complete() + + # Maybe export in eval processes. + if self.is_evaluator(): + if self.params.get("eval_name") is not None: + # Do not export if running special eval. + exporters = None + export_output_fn = None + elif exporters and export_output_fn: + self._export_output_fn = export_output_fn + else: + # Default option. + self._export_output_fn = None - def get_eval_spec(self, input_fn, steps=None, delay=None, period=None, - hooks=None, exporters=None): - """Get the EvalSpec used by ``tf.train.train_and_evaluate``.""" - if not callable(input_fn): - raise ValueError("Expecting callable eval_input_fn") + train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks + train_hooks = [] if train_hooks is None else train_hooks - if steps is None: - steps = self.params.eval_steps + eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks + eval_hooks = [] if eval_hooks is None else eval_hooks - if steps <= 0: - steps = None + if train_max_steps is None: + train_max_steps = self.params.get("train_max_steps") + + if eval_steps is None: + eval_steps = self.params.eval_steps + if eval_steps <= 0: + eval_steps = None + + if eval_delay is None: + eval_delay = self.params.eval_delay + if eval_period is None: + eval_period = self.params.eval_period + + if early_stop_patience > 0: + # when training hooks detect this file, they request a stop to training + early_stop_path = os.path.join(self._save_dir, "earlystop_now.txt") + # prepare early stopping hook (which also handles logic here) + + self._is_early_stopping = True + + eval_early_stop_hook = twml.hooks.EarlyStopHook( + metric=early_stop_metric, + checkpoint_dir=self._save_dir, + patience=early_stop_patience, + minimize=early_stop_minimize, + tolerance=early_stop_tolerance, + get_estimator_spec_fn=lambda: self.current_estimator_spec, + file_path=early_stop_path, + exit_on_end=os.environ.get("TF_CONFIG") is not None, + ) # only exit for distributed jobs + # add early stop hook to eval hooks + eval_hooks.append(eval_early_stop_hook) + + # prepare the commensurate training hook + train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path) + train_hooks.append(train_early_stop_hook) + + if max_duration is not None: + train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=False, + save_dir=self._save_dir, + overwrite=self.is_chief(), + ) + eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( + max_duration=max_duration, + exit_on_end=os.environ.get("TF_CONFIG") is not None, + save_dir=self._save_dir, + overwrite=False, + ) # only exit for distributed jobs + + train_hooks.append(train_early_stop_duration_hook) + eval_hooks.append(eval_early_stop_duration_hook) + + with self.experiment_tracker.track_experiment( + eval_hooks, lambda: self.current_estimator_spec + ): + train_spec = self.get_train_spec( + train_input_fn, train_max_steps, train_hooks + ) + eval_spec = self.get_eval_spec( + eval_input_fn, + eval_steps, + eval_delay, + eval_period, + eval_hooks, + exporters, + ) + self._train_and_evaluate(train_spec, eval_spec) + + if self.is_chief(): + self.write_state_to_disk(save_dir=self._save_dir, filename="_SUCCESS") + + return self._save_dir + 
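+    # A distributed-usage sketch for train_and_evaluate (illustrative only;
+    # `trainer`, the input functions and the TF_CONFIG cluster spec all come
+    # from user code, not from this file):
+    #
+    #     os.environ["TF_CONFIG"] = json.dumps(cluster_spec)  # one entry per task
+    #     save_dir = trainer.train_and_evaluate(
+    #         train_input_fn=train_input_fn,
+    #         eval_input_fn=eval_input_fn,
+    #         train_max_steps=100000,
+    #         eval_period=600,  # evaluate the newest checkpoint every 10 minutes
+    #     )
+    #     # all workers/evaluators must point at the same save_dir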
+    def _train_and_evaluate(
+        self, train_spec: tf.estimator.TrainSpec, eval_spec: tf.estimator.EvalSpec
+    ) -> None:
+        """
+        Private method that calls
+        ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``.
+        """
+        try:
+            tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)
+        except twml.errors.EarlyStopError:
+            # Ignore the exception if on evaluator.
+            if self.is_evaluator():
+                pass
+            else:
+                raise
 
-    if delay is None:
-      delay = self.params.eval_delay
 
+    def train(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+    ) -> "Trainer":
+        """
+        Train the estimator for `steps` training steps.
 
-    if period is None:
-      period = self.params.eval_period
 
+        Args:
+          steps:
+            number of steps for which to perform training. For example, 100 means
+            training will end after processing 100 batches.
+            Defaults to None, i.e. trains on the entire dataset a single time.
+            Non-positive values and None-values go through the entire training set once.
+          input_fn:
+            Function to iterate through training set. It is passed to estimator.train.
+          hooks:
+            List of SessionRunHooks used for training. Defaults to self.get_train_hooks().
+
+        Returns:
+          The Trainer instance (self), to allow call chaining.
+        """
+        if os.environ.get("TF_CONFIG") and "is_calibrating" not in self.params:
+            raise ValueError(
+                "trainer.train() can not be used with distributed / hogwild setups"
+            )
+
+        if not callable(input_fn):
+            raise ValueError("Expecting callable input_fn function")
+
+        if self._is_early_stopping:
+            raise ValueError(
+                "Can not call train() after learn() when using early stopping."
+            )
+
+        hooks = self.get_train_hooks() if hooks is None else hooks
+        self._estimator.train(input_fn, steps=steps, hooks=hooks)
+        return self
+
+    def evaluate(
+        self,
+        input_fn: Optional[Callable] = None,
+        steps: Optional[int] = None,
+        hooks: Optional[List[tf.train.SessionRunHook]] = None,
+        name: Optional[str] = None,
+    ) -> Optional[Dict[str, float]]:
+        """
+        Evaluate the estimator for `steps` evaluation steps.
 
-    hooks = self.get_eval_hooks() if hooks is None else hooks
 
+        Args:
+          steps:
+            number of steps for which to perform evaluation. For example, 100 means each
+            evaluation will end after processing 100 batches.
+            Defaults to None, i.e. evaluates on the entire dataset a single time.
+            Negative values and None-values go through the entire evaluation set.
+          input_fn:
+            Function to iterate through evaluation set. It is passed to estimator.evaluate.
+          hooks:
+            List of SessionRunHooks used for evaluation. Defaults to None.
+            Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks()
+            as the latter may implement early-stopping, which isn't necessarily the desired
+            behavior when calling evaluate() on its own.
+          name:
+            Name of the evaluation if the user needs to run multiple evaluations on different data sets.
+            Metrics for different evaluations are saved in separate folders,
+            and appear separately in tensorboard.
+
+        Returns:
+          If `is_evaluator()`, returns a dict containing the evaluation metrics specified
+          in `metric_fn` keyed by name, as well as an entry `global_step` that contains
+          the value of the global step for which this evaluation was performed.
+          Otherwise (i.e. `is_evaluator() == False`), returns None.
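+
+        For example (a sketch; ``trainer`` and a held-out ``test_input_fn`` are
+        assumed to exist in the calling code):
+
+        .. code-block:: python
+
+            metrics = trainer.evaluate(input_fn=test_input_fn, name="test")
+            if metrics is not None:  # None on non-evaluator processes
+                print(metrics["loss"], metrics["global_step"])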
+ """ + if not self.is_evaluator(): + return None + + if not callable(input_fn): + raise ValueError("Expecting callable input_fn function") + + hooks = self.get_eval_hooks() if hooks is None else hooks + hooks = [] if hooks is None else hooks + + # for consistency with train/learn + eval_steps = None if steps is not None and steps < 0 else steps + + with self.experiment_tracker.track_experiment( + hooks, lambda: self.current_estimator_spec, name=name + ): + checkpoint = self.best_or_latest_checkpoint + computed_metrics = self._estimator.evaluate( + input_fn, + steps=eval_steps, + hooks=hooks, + checkpoint_path=checkpoint, + name=name, + ) + + return computed_metrics + + def start_tensorboard(self, port: Optional[int] = None) -> None: + """ + Start tensorboard process to visualize logs in save_dir. + """ + logging.info("Starting tensorboard.") + if self._tensorboard_handle: + logging.warn("Tensorboard already running. Nothing done.") + return + + if port is None: + if "tensorboard_port" not in self.params.values(): + raise ValueError("You must specify a port for tensorboard to run on.") + elif self.params.tensorboard_port is None: + return + else: + port = self.params.tensorboard_port - eval_name = self.params.get("eval_name", None) + mldash_path = "experiments" + if self.experiment_tracker.path: + mldash_path += "/%s" % encode_url(self.experiment_tracker.experiment_id) + tensorboard_args = ["--logdir=%s" % self._save_dir, "--port=%d" % port] - return tf.estimator.EvalSpec(input_fn=input_fn, - steps=steps, - name=eval_name, - start_delay_secs=delay, - throttle_secs=period, - hooks=hooks, - exporters=exporters) + try: + args = [ + "email_and_launch_tensorboard", + mldash_path, + "--", + ] + tensorboard_args + self._tensorboard_handle = subprocess.Popen(args) + except OSError: + try: + self._tensorboard_handle = subprocess.Popen( + ["tensorboard"] + tensorboard_args + ) + except OSError: + try: + # this will work with Twitter internal pants build when run locally + args = [ + "./pants", + "run", + "twml:tensorboard", + "--", + ] + tensorboard_args + self._tensorboard_handle = subprocess.Popen(args) + except OSError: + logging.error( + "No tensorboard installed, won't able to visualize training in tensorboard." + ) + + def stop_tensorboard(self) -> None: + """ + Shutdown this Trainer's associated Tensorboard. + """ + if self._tensorboard_handle: + logging.info("Shutting down tensorboard.") + self._tensorboard_handle.kill() + else: + logging.warn("No known tensorboard process. Nothing done.") + + def calibrate( + self, + calibrator: Union[Calibrator, Dict[str, Calibrator]], + steps: Optional[int] = None, + input_fn: Optional[Callable[[], tf.data.Dataset]] = None, + save_calibrator: bool = True, + hooks: Optional[List[tf.train.SessionRunHook]] = None, + ) -> None: + """ + Calibrate the calibrator for `steps` calibration steps using the estimator.train method. + The build_graph passed to the Trainer constructor should + call calibrator.accumulate using something like tf.py_func. + That way, when this method calls estimator.train the calibrator will + accumulate one epoch of samples. After which, this method calls calibrator.calibrate(). + It is up to the user to then call calibrator.save() to save the calibrated Layer + and other information to disk for multi-phase training. 
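+
+        A typical flow (a sketch — ``MyCalibrator`` stands in for any concrete
+        ``twml.contrib.calibrators.Calibrator`` subclass, and ``trainer`` and
+        ``train_input_fn`` come from the calling code):
+
+        .. code-block:: python
+
+            calibrator = MyCalibrator()
+            trainer.calibrate(calibrator=calibrator, input_fn=train_input_fn,
+                              steps=1000, save_calibrator=False)
+            # persist the calibrated layer for the next training phase
+            calibrator.save(trainer.params.save_dir, name=calibrator.name)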
- def train_and_evaluate(self, train_input_fn=None, eval_input_fn=None, - train_max_steps=None, eval_steps=None, - eval_delay=None, eval_period=None, - train_hooks=None, eval_hooks=None, - early_stop_metric=None, early_stop_patience=-1, - early_stop_minimize=True, early_stop_tolerance=0, exporters=None, - export_output_fn=None, max_duration=None): - """ - Train and evaluate the estimator for ``train_max_steps`` - using ``tf.estimator.train_and_evaluate``. - With a cluster configuration provided in the ``TF_CONFIG`` environment variable, this method - can be used for distributed training (multi-node or multi-process). - Unlike the ``learn`` method, training is continuous with ``train_max_steps``. - For distributed use case, evaluation happens periodically. - That is, after ``eval_delay`` seconds, an evaluation epoch of ``eval_step`` steps - occurs every ``eval_period`` seconds. Evaluation happens on the most recent checkpoint. - TF defaults to saving checkpoints every 10 mins. - For local use case, training occurs for train_max_steps epochs followed by a - single evaluation. For local use case we therefore recommend using learn() instead - as it provides early-stopping and multiple evaluations. - - ``train_and_evaluate`` will evaluate for ``eval_steps`` every ``eval_period`` seconds. - It will stop after ``train_steps`` is reached. - - You must ensure that all workers/servers are assigned the same `save_dir`. - - .. Note:: - - If the TF_CONFIG environment variable is set, this function assumes its running a distribute job. - - Args: - train_input_fn: - Function to iterate through training set. It is passed to estimator.train_and_evalute - eval_input_fn: - Function to iterate through evaluation set. It is passed to estimator.train_and_evalute. - train_max_steps: - maximum number of global steps of training to run. - Defaults to params.train_max_steps. - Non-positive values and None-values train indefinitely (use with caution). - eval_steps: - number of steps per evaluation. - Defaults to params.eval_steps. - Non-positive values and None-values go through - the entire evaluation set for each evaluation. - Note that the number of eval_steps should be high enough to minimize noise. - This is especially true for early-stopping. - eval_delay: - Start the first evaluation after eval_delay. Defaults to params.eval_delay or 2*60s. - eval_period: - Run an evaluation every eval_period seconds. Defaults to params.eval_period or 10*60s. - exporters: - List of exporters called at the end of each evaluation run. - Defaults to none. - export_output_fn: - The output format to use for exported models. - Only used if exporters is not None. - - Early-stopping arguments: - early_stop_metric: - String specifying the metric to early-stop on. Required with positive - ``early_stop_patience``. For example, 'accuracy', 'accuracy_0', 'loss', etc. - The string is used to extract the relevant tensor Op from the dict returned by - the get_eval_metric_ops method. For ``metrics`` pass to the constructor, - the string is one of those. For multi-class (that is, multi-metric) - metrics, the string may be appended with a ``_0``, ``_1``, etc. or one - of the ``multi_metric_names`` (one per class). - early_stop_patience: - Maximum number of epochs to wait for an improvement in the early_stop_metric - before breaking off training. For example, a patience of 10 means that - training will have 10 epochs to improve the metric before it is killed. 
- Whenever the metric is improved before running out of patience, - patience is reset to ``early_stop_patience``. - Defaults to -1 (that is, no early-stopping). - early_stop_minimize: - Set this to True (the default) for metrics that need to be minimized - (like ``loss``). Metrics like ``accuracy`` that need to be maximized - should set this to False. - early_stop_tolerance: - A non-negative tolerance for comparing early_stop_metric. - E.g. when maximizing the condition is current_metric > best_metric + tolerance. - Defaults to 0. - max_duration: - A float. When this argument is defined, the job will automatically terminate after - `max_duration` seconds if it has not already compeleted. - - Returns: - The directory where the checkpoints were saved. - """ + Args: + calibrator: + a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}. + steps: + Maximum steps to accumulate examples for calibration. Optional. + If not specified, examples will be accumulated until all downsampled parts are processed. + input_fn: + Function to iterate through training set. It is passed to estimator.train. + hooks: + List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). + save_calibrator: + Boolean (default: True). If set to True it will save the calibrator layer. + """ + + if not callable(input_fn): + raise ValueError("Expecting callable input_fn function") + + # making everything a dict to avoid multiple ifs + if isinstance(calibrator, twml.contrib.calibrators.Calibrator): + calibrator = {"default": calibrator} + + # This is a dummy call to train, since we cannot predict without training + # from the Estimator API + self._estimator.train(input_fn, steps=1) + max_steps = steps if steps is not None else -1 + for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)): + count = 0 + for out in self._estimator.predict( + input_fn, hooks=hooks, yield_single_examples=False + ): + if max_steps > 0 and count > max_steps: + break + clbrt.accumulate_feature(out) + count += 1 + clbrt.calibrate() + + # this step is done to allow us to keep the current phases event file for + # visualization on Tensorboard. It removes all files that + # are not event files. This piece of code should be deprecated when + # we deprecate the MDL calibrator (CX-12329) + for fname in tf.io.gfile.listdir(self._save_dir): + if not fname.startswith("events"): + tf.io.gfile.remove(os.path.join(self._save_dir, fname)) + + if save_calibrator: + # If we only have one calibrator, the calibrator signature + # will be set to default + if len(calibrator) == 1: + calibrator = calibrator["default"] + calibrator.save( + self.params.save_dir, name=calibrator.name, verbose=True + ) + else: + for name, clbrt in calibrator.items(): + clbrt.save( + self.params.save_dir, name=clbrt.name + str(name), verbose=True + ) + + def predict(self, *args, **kwargs) -> tf.estimator.EstimatorSpec: + """ + Wrapper over the tensorflow `Estimator.predict + `_. + method. See that documentation for description of arguments accepted. + + If hooks is passed as an argument, the specified hooks are used. + Else when profiler_steps is specified in the constructor of the Trainer, a + tf.train.ProfilerHook is passed to the predict interface. + Otherwise, hooks is set to an empty list. + """ + if "hooks" not in kwargs and len(args) < 3: + # If hooks is not specified as a keyword argument, nor as a positional argument + # add hooks as a keyword argument. 
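+            # (len(args) < 3 because Estimator.predict takes input_fn and
+            # predict_keys before hooks, its third positional argument)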
+ kwargs["hooks"] = self.get_predict_hooks() + + return self.estimator.predict(*args, **kwargs) + + def hub_export( + self, + name: str, + serving_input_receiver_fn: Callable[ + [], tf.estimator.export.ServingInputReceiver + ], + export_dir: Optional[str] = None, + checkpoint_path: Optional[str] = None, + export_task_type_overrider: Optional[str] = None, + ) -> None: + """ + Exports registered modules into a save directory. + + This method creates a directory under export_path with the save TF Hub. + One sub-directory (named export_name) per module registered via register_module_for_export. - logging.info("WARNING: Trainer.train_and_evaluate is an EXPERIMENTAL API.") - logging.info("Trainer.train_and_evaluate may change or be removed in future versions.") - - if not callable(train_input_fn): - raise ValueError("Expecting callable train_input_fn function") - if not callable(eval_input_fn): - raise ValueError("Expecting callable eval_input_fn function") - - self._exit_ps_after_training_complete() - - # Maybe export in eval processes. - if self.is_evaluator(): - if self.params.get("eval_name") is not None: - # Do not export if running special eval. - exporters = None - export_output_fn = None - elif exporters and export_output_fn: - self._export_output_fn = export_output_fn - else: - # Default option. - self._export_output_fn = None + Args: + name: + unique name of the module to export. + serving_input_receiver_fn: + A function with no arguments that returns a ServingInputReceiver. + This is used with the estimator passed to export() to build the graph (in PREDICT mode) + that registers the modules for export. The model in that graph is never run, + so the actual data provided by this input fn does not matter. + export_dir: + A string containing a directory where to write the export directories. + Defaults to the save_dir. + checkpoint_path: + The checkpoint path to export. Defaults to the latest. + export_task_type_overrider: + Specifies the task type that will override the default task type used for export + (hogwild training defaults to evaluator, otherwise, defaults to chief) + """ + if export_task_type_overrider: + if not self.is_task_type(export_task_type_overrider): + logging.info( + f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}" + ) + return + else: + if self._using_hogwild: + if not self.is_evaluator(): + logging.info( + "Trainer.hub_export ignored due to the process not being evaluator." + ) + return + else: + if not self.is_chief(): + logging.info( + "Trainer.hub_export ignored due to the process not being chief." + ) + return + + if export_dir: + export_dir = sanitize_hdfs_path(export_dir) + + if checkpoint_path: + checkpoint_path = sanitize_hdfs_path(checkpoint_path) + else: + checkpoint_path = self.best_or_latest_checkpoint + + export_dir = export_dir if export_dir is not None else self._save_dir + exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) + # The path_exporter by default contains a timestamp directory in its path. 
+ path_exporter = exporter.export( + estimator=self.estimator, + export_path=export_dir, + checkpoint_path=checkpoint_path, + ) - train_hooks = self.get_train_hooks() if train_hooks is None else train_hooks - train_hooks = [] if train_hooks is None else train_hooks - - eval_hooks = self.get_eval_hooks() if eval_hooks is None else eval_hooks - eval_hooks = [] if eval_hooks is None else eval_hooks - - if train_max_steps is None: - train_max_steps = self.params.get('train_max_steps') - - if eval_steps is None: - eval_steps = self.params.eval_steps - if eval_steps <= 0: - eval_steps = None - - if eval_delay is None: - eval_delay = self.params.eval_delay - if eval_period is None: - eval_period = self.params.eval_period - - if early_stop_patience > 0: - # when training hooks detect this file, they request a stop to training - early_stop_path = os.path.join(self._save_dir, 'earlystop_now.txt') - # prepare early stopping hook (which also handles logic here) - - self._is_early_stopping = True - - eval_early_stop_hook = twml.hooks.EarlyStopHook( - metric=early_stop_metric, - checkpoint_dir=self._save_dir, - patience=early_stop_patience, - minimize=early_stop_minimize, - tolerance=early_stop_tolerance, - get_estimator_spec_fn=lambda: self.current_estimator_spec, - file_path=early_stop_path, - exit_on_end=os.environ.get('TF_CONFIG') is not None) # only exit for distributed jobs - # add early stop hook to eval hooks - eval_hooks.append(eval_early_stop_hook) - - # prepare the commensurate training hook - train_early_stop_hook = twml.hooks.StopIfExistsHook(early_stop_path) - train_hooks.append(train_early_stop_hook) - - if max_duration is not None: - train_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=False, - save_dir=self._save_dir, - overwrite=self.is_chief() - ) - eval_early_stop_duration_hook = twml.hooks.EarlyStopDuration( - max_duration=max_duration, - exit_on_end=os.environ.get('TF_CONFIG') is not None, - save_dir=self._save_dir, - overwrite=False - ) # only exit for distributed jobs - - train_hooks.append(train_early_stop_duration_hook) - eval_hooks.append(eval_early_stop_duration_hook) - - with self.experiment_tracker.track_experiment(eval_hooks, lambda: self.current_estimator_spec): - train_spec = self.get_train_spec(train_input_fn, train_max_steps, train_hooks) - eval_spec = self.get_eval_spec(eval_input_fn, eval_steps, - eval_delay, eval_period, - eval_hooks, exporters) - self._train_and_evaluate(train_spec, eval_spec) - - if self.is_chief(): - self.write_state_to_disk(save_dir=self._save_dir, filename='_SUCCESS') - - return self._save_dir - - def _train_and_evaluate(self, train_spec, eval_spec): - """ - Private method that calls - ``tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec)``. - """ - try: - tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec) - except twml.errors.EarlyStopError: - # Ignore the exception if on evaluator. - if self.is_evaluator(): - pass - else: - raise - - def train(self, input_fn=None, steps=None, hooks=None): - """ - Train the estimator for `steps` training steps. - - Args: - steps: - number of steps for which to perform training. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. trains on the entire dataset a single time. - Non-positive values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through training set. It is passed to estimator.train. 
- hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - """ - if os.environ.get('TF_CONFIG') and "is_calibrating" not in self.params: - raise ValueError("trainer.train() can not be used with distributed / hogwild setups") + # LatestModuleExporter.export() returns a binary string on Cloud ML Engine + # but tf.io.gfile.listdir() does not; this is an issue when joining paths + if isinstance(path_exporter, bytes): + path_exporter = path_exporter.decode() - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") + # Copying the saved hub module to export_dir so we don't need to specify + # the timestamp when loading the module. + # This is a workaround due to the current implementation of hub.LatestModuleExporter. + # This works for multiple hub modules. + hub_exported_modules = tf.io.gfile.listdir(path_exporter) - if self._is_early_stopping: - raise ValueError("Can not call train() after learn() when using early stopping.") + backup_dir = os.path.join( + export_dir, "backups", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ) - hooks = self.get_train_hooks() if hooks is None else hooks - self._estimator.train(input_fn, steps=steps, hooks=hooks) - return self + for folder in hub_exported_modules: + hub_module_oldpath = os.path.join(path_exporter, folder) + hub_module_newpath = os.path.join(export_dir, folder) - def evaluate(self, input_fn=None, steps=None, hooks=None, name=None): - """ - Evaluate the estimator for `steps` evaluation steps. - - Args: - steps: - number of steps for which to perform evaluation. For example, 100 means each - evaluation will end after processing 100 batches. - Defaults to None. i.e. evaluates on the entire dataset a single time. - Negative values and None-values go through the entire training set each epoch. - input_fn: - Function to iterate through evaluation set. It is passed to estimator.evaluate. - hooks: - List of SessionRunHooks used for evaluation. Defaults to None. - Note that, unlike learn(), hooks defaults to None instead of self.get_eval_hooks() - as the latter may implement early-stopping, which isn't necessarilty the desired - behavior when calling evaluate() on its own. - name: - Name of the evaluation if user needs to run multiple evaluations on different data sets. - Metrics for different evaluations are saved in separate folders, - and appear separately in tensorboard. - - Returns: - If `is_evaluator()`, returns a dict containing the evaluation metrics specified - in `metric_fn` keyed by name, as well as an entry `global_step` that contains - the value of the global step for which this evaluation was performed. - Otherwise (i.e. `is_evaluator() == False`), returns None. - """ - if not self.is_evaluator(): - return None + # If the destination already exists, move to backup + if tf.io.gfile.exists(hub_module_newpath): + # Ensure backup_dir exists + tf.io.gfile.makedirs(backup_dir) + hub_module_backup = os.path.join(backup_dir, folder) + tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") + tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - hooks = self.get_eval_hooks() if hooks is None else hooks - hooks = [] if hooks is None else hooks + # Since the timestamped folder exists but is empty, we can delete it. 
+ tf.io.gfile.rmtree(path_exporter) - # for consistency with train/learn - eval_steps = None if steps is not None and steps < 0 else steps + def _is_on_gke(self) -> bool: + """Returns True if running on gke.""" + cluster = os.environ.get("TWML_JOB_CLUSTER") + if not cluster or cluster in {"smf1", "atla"}: + return False + return True - with self.experiment_tracker.track_experiment(hooks, lambda: self.current_estimator_spec, name=name): - checkpoint = self.best_or_latest_checkpoint - computed_metrics = self._estimator.evaluate( - input_fn, - steps=eval_steps, - hooks=hooks, - checkpoint_path=checkpoint, - name=name - ) + def _maybe_del_tsd_exit(self, state_files: List[str]) -> None: + """Handle potential early exit and TwitterSetDeployment deletion. - return computed_metrics + If: + - distributed training + - running GKE + - training is finished (all state_files exists) + we will exit early and not restart work - def start_tensorboard(self, port=None): - """ - Start tensorboard process to visualize logs in save_dir. - """ - logging.info("Starting tensorboard.") - if self._tensorboard_handle: - logging.warn("Tensorboard already running. Nothing done.") - return - - if port is None: - if 'tensorboard_port' not in self.params.values(): - raise ValueError('You must specify a port for tensorboard to run on.') - elif self.params.tensorboard_port is None: - return - else: - port = self.params.tensorboard_port - - mldash_path = 'experiments' - if self.experiment_tracker.path: - mldash_path += '/%s' % encode_url(self.experiment_tracker.experiment_id) - tensorboard_args = ['--logdir=%s' % self._save_dir, '--port=%d' % port] - - try: - args = ['email_and_launch_tensorboard', mldash_path, '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - try: - self._tensorboard_handle = subprocess.Popen(['tensorboard'] + tensorboard_args) - except OSError: - try: - # this will work with Twitter internal pants build when run locally - args = ['./pants', 'run', 'twml:tensorboard', '--'] + tensorboard_args - self._tensorboard_handle = subprocess.Popen(args) - except OSError: - logging.error("No tensorboard installed, won't able to visualize training in tensorboard.") + If --distributed_training_cleanup = True then we will also handle + cleaning up the TwitterSetDeployments. - def stop_tensorboard(self): - """ - Shutdown this Trainer's associated Tensorboard. - """ - if self._tensorboard_handle: - logging.info("Shutting down tensorboard.") - self._tensorboard_handle.kill() - else: - logging.warn("No known tensorboard process. Nothing done.") - - def calibrate(self, - calibrator, - steps=None, - input_fn=None, - save_calibrator=True, - hooks=None): - """ - Calibrate the calibrator for `steps` calibration steps using the estimator.train method. - The build_graph passed to the Trainer constructor should - call calibrator.accumulate using something like tf.py_func. - That way, when this method calls estimator.train the calibrator will - accumulate one epoch of samples. After which, this method calls calibrator.calibrate(). - It is up to the user to then call calibrator.save() to save the calibrated Layer - and other information to disk for multi-phase training. - - Args: - calibrator: - a twml.Calibrator instance or a dict of the form {name(str): twml.Calibrator}. - steps: - Maximum steps to accumulate examples for calibration. Optional. - If not specified, examples will be accumulated until all downsampled parts are processed. 
- input_fn: - Function to iterate through training set. It is passed to estimator.train. - hooks: - List of SessionRunHooks uses for training. Defaults to self.get_train_hooks(). - save_calibrator: - Boolean (default: True). If set to True it will save the calibrator layer. - """ + Args: + state_files: + A python list indicate state files to determine the finish state of the job. + """ + # job type that is responsible for experiment tracking will remain alive + # until it marks the experiment as finished. + if self.experiment_tracker._env_eligible_for_recording_experiment: + exp_status = self.experiment_tracker.get_run_status() + if exp_status and exp_status not in {"Success", "Failed"}: + logging.info( + f"Not exiting early because experiment is still {exp_status}." + ) + return + + # do not bother if we are on prem + if not self._is_on_gke(): + logging.info("No need to exit early because running on prem.") + return + + states = [ + twml.util.file_exist_in_dir(self._save_dir, state_file) + for state_file in state_files + ] + do_not_restart = self._params.get("distributed") and all(states) + if not do_not_restart: + return - if not callable(input_fn): - raise ValueError("Expecting callable input_fn function") - - # making everything a dict to avoid multiple ifs - if isinstance(calibrator, twml.contrib.calibrators.Calibrator): - calibrator = {"default": calibrator} - - # This is a dummy call to train, since we cannot predict without training - # from the Estimator API - self._estimator.train(input_fn, steps=1) - max_steps = steps if steps is not None else -1 - for name, clbrt in sorted(calibrator.items(), key=itemgetter(0)): - count = 0 - for out in self._estimator.predict(input_fn, hooks=hooks, yield_single_examples=False): - if max_steps > 0 and count > max_steps: - break - clbrt.accumulate_feature(out) - count += 1 - clbrt.calibrate() - - # this step is done to allow us to keep the current phases event file for - # visualization on Tensorboard. It removes all files that - # are not event files. This piece of code should be deprecated when - # we deprecate the MDL calibrator (CX-12329) - for fname in tf.io.gfile.listdir(self._save_dir): - if not fname.startswith("events"): - tf.io.gfile.remove(os.path.join(self._save_dir, fname)) - - if save_calibrator: - # If we only have one calibrator, the calibrator signature - # will be set to default - if len(calibrator) == 1: - calibrator = calibrator['default'] - calibrator.save( - self.params.save_dir, - name=calibrator.name, - verbose=True - ) - else: - for name, clbrt in calibrator.items(): - clbrt.save( - self.params.save_dir, - name=clbrt.name + str(name), - verbose=True - ) - - def predict(self, *args, **kwargs): - """ - Wrapper over the tensorflow `Estimator.predict - `_. - method. See that documentation for description of arguments accepted. - - If hooks is passed as an argument, the specified hooks are used. - Else when profiler_steps is specified in the constructor of the Trainer, a - tf.train.ProfilerHook is passed to the predict interface. - Otherwise, hooks is set to an empty list. - """ - if 'hooks' not in kwargs and len(args) < 3: - # If hooks is not specified as a keyword argument, nor as a positional argument - # add hooks as a keyword argument. 
- kwargs['hooks'] = self.get_predict_hooks() - - return self.estimator.predict(*args, **kwargs) - - def hub_export(self, - name, - serving_input_receiver_fn, - export_dir=None, - checkpoint_path=None, - export_task_type_overrider=None): - """ - Exports registered modules into a save directory. - - This method creates a directory under export_path with the save TF Hub. - One sub-directory (named export_name) per module registered via register_module_for_export. - - Arguments: - name: - unique name of the module to export. - serving_input_receiver_fn: - A function with no arguments that returns a ServingInputReceiver. - This is used with the estimator passed to export() to build the graph (in PREDICT mode) - that registers the modules for export. The model in that graph is never run, - so the actual data provided by this input fn does not matter. - export_dir: - A string containing a directory where to write the export directories. - Defaults to the save_dir. - checkpoint_path: - The checkpoint path to export. Defaults to the latest. - export_task_type_overrider: - Specifies the task type that will override the default task type used for export - (hogwild training defaults to evaluator, otherwise, defaults to chief) - """ - if export_task_type_overrider: - if not self.is_task_type(export_task_type_overrider): - logging.info( - f"Trainer.hub_export ignored due to process not being {export_task_type_overrider}") - return - else: - if self._using_hogwild: - if not self.is_evaluator(): - logging.info("Trainer.hub_export ignored due to the process not being evaluator.") - return - else: - if not self.is_chief(): - logging.info("Trainer.hub_export ignored due to the process not being chief.") - return - - if export_dir: - export_dir = sanitize_hdfs_path(export_dir) - - if checkpoint_path: - checkpoint_path = sanitize_hdfs_path(checkpoint_path) - else: - checkpoint_path = self.best_or_latest_checkpoint - - export_dir = export_dir if export_dir is not None else self._save_dir - exporter = hub.LatestModuleExporter(name, serving_input_receiver_fn) - # The path_exporter by default contains a timestamp directory in its path. - path_exporter = exporter.export(estimator=self.estimator, - export_path=export_dir, - checkpoint_path=checkpoint_path) - - # LatestModuleExporter.export() returns a binary string on Cloud ML Engine - # but tf.io.gfile.listdir() does not; this is an issue when joining paths - if isinstance(path_exporter, bytes): - path_exporter = path_exporter.decode() - - # Copying the saved hub module to export_dir so we don't need to specify - # the timestamp when loading the module. - # This is a workaround due to the current implementation of hub.LatestModuleExporter. - # This works for multiple hub modules. - hub_exported_modules = tf.io.gfile.listdir(path_exporter) - - backup_dir = os.path.join(export_dir, "backups", - datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) - - for folder in hub_exported_modules: - hub_module_oldpath = os.path.join(path_exporter, folder) - hub_module_newpath = os.path.join(export_dir, folder) - - # If the destination already exists, move to backup - if tf.io.gfile.exists(hub_module_newpath): - # Ensure backup_dir exists - tf.io.gfile.makedirs(backup_dir) - hub_module_backup = os.path.join(backup_dir, folder) - tf.io.gfile.rename(hub_module_newpath, hub_module_backup) - - tf.io.gfile.rename(hub_module_oldpath, hub_module_newpath) - - # Since the timestamped folder exists but is empty, we can delete it. 
- tf.io.gfile.rmtree(path_exporter) - - def _is_on_gke(self) -> bool: - """Returns True if running on gke.""" - cluster = os.environ.get('TWML_JOB_CLUSTER') - if not cluster or cluster in {'smf1', 'atla'}: - return False - return True - - def _maybe_del_tsd_exit(self, state_files) -> None: - """Handle potential early exit and TwitterSetDeployment deletion. - - If: - - distributed training - - running GKE - - training is finished (all state_files exists) - we will exit early and not restart work - - If --distributed_training_cleanup = True then we will also handle - cleaning up the TwitterSetDeployments. - - Args: - state_files: A python list indicate state files to determine the finish - state of the job. - """ - # job type that is responsible for experiment tracking will remain alive - # until it marks the experiment as finished. - if self.experiment_tracker._env_eligible_for_recording_experiment: - exp_status = self.experiment_tracker.get_run_status() - if exp_status and exp_status not in {'Success', 'Failed'}: logging.info( - f"Not exiting early because experiment is still {exp_status}." + f"Exiting early because a _SUCCESS file already exists in {self._save_dir}" ) - return - - # do not bother if we are on prem - if not self._is_on_gke(): - logging.info("No need to exit early because running on prem.") - return - - states = [ - twml.util.file_exist_in_dir(self._save_dir, state_file) for state_file in state_files] - do_not_restart = (self._params.get('distributed') and all(states)) - if not do_not_restart: - return - - logging.info( - f"Exiting early because a _SUCCESS file already exists in {self._save_dir}") - if self._params.get('distributed_training_cleanup'): - resource_name = '-'.join([ - os.environ['TWML_JOB_NAME'], - os.environ['TWML_DISTRIBUTED_JOB_TYPE'], - os.environ['TWML_JOB_ENV'], - ]) - logging.info(f"Deleting TwitterSetDeployment {resource_name}") - # each job type will manage its own deletion so that deletion happens - # in the trainer init call for every job type - # otherwise we may kill another job type during an important - # process like experiment tracking management (handled by the evaluator - kubectl_delete_by_name( - zone=None, - namespace=os.environ['TWML_JOB_ROLE'], - resource_type=Resource.TWITTERSETDEPLOYMENTS.value, - resource_name=resource_name, - wait=False, - ) - sys.exit(0) - - def write_state_to_disk(self, save_dir, filename='_SUCCESS') -> None: - """Write state file to disk to indicate the state of training process. This is usually used - to mark the state of training progress and determine the start when job restarts/resumes. - Args: - save_dir: A str of local/gcs/hdfs dir to write the state file. - file_name: A str indicate the state file. Default to `_SUCCESS`. 
- """ - file_path = os.path.join(save_dir, filename) - if tf.io.gfile.exists(file_path): - tf.logging.warn(f'{file_path} already exist.') - return + if self._params.get("distributed_training_cleanup"): + resource_name = "-".join( + [ + os.environ["TWML_JOB_NAME"], + os.environ["TWML_DISTRIBUTED_JOB_TYPE"], + os.environ["TWML_JOB_ENV"], + ] + ) + logging.info(f"Deleting TwitterSetDeployment {resource_name}") + # each job type will manage its own deletion so that deletion happens + # in the trainer init call for every job type + # otherwise we may kill another job type during an important + # process like experiment tracking management (handled by the evaluator + kubectl_delete_by_name( + zone=None, + namespace=os.environ["TWML_JOB_ROLE"], + resource_type=Resource.TWITTERSETDEPLOYMENTS.value, + resource_name=resource_name, + wait=False, + ) + sys.exit(0) + + def write_state_to_disk(self, save_dir, filename="_SUCCESS") -> None: + """ + Write state file to disk to indicate the state of training process. This is usually used + to mark the state of training progress and determine the start when job restarts/resumes. - with tf.io.gfile.GFile(file_path, 'w') as f: - f.write('') \ No newline at end of file + Args: + save_dir: A str of local/gcs/hdfs dir to write the state file. + file_name: A str indicate the state file. Default to `_SUCCESS`. + """ + file_path = os.path.join(save_dir, filename) + if tf.io.gfile.exists(file_path): + tf.logging.warn(file_path + " already exist.") + return + + with tf.io.gfile.GFile(file_path, "w") as f: + f.write("") diff --git a/twml/twml/util.py b/twml/twml/util.py index cd7679a6f..271cb284b 100644 --- a/twml/twml/util.py +++ b/twml/twml/util.py @@ -3,940 +3,1073 @@ """ import argparse -from datetime import datetime import itertools import json import logging as _logging import os import re +from datetime import datetime +from typing import Any, Callable, Dict, Iterable, List, Optional, Union -from twitter.ml.common.resources import AuroraPath -from twitter.deepbird.hparam import HParams -from twitter.deepbird.io.util import ( - _get_feature_id, # noqa: F401 - feature_id, # noqa: F401 - preprocess_feature_regex, # noqa: F401 - preprocess_path, # noqa: F401 - sanitize_hdfs_path, # noqa: F401 - is_string, # noqa: F401 - list_files, # noqa: F401 - match_files, # noqa: F401 -) -from twitter.deepbird.io.legacy.util import ( - batch_apply, # noqa: F401 - boolean_mask, # noqa: F401 - fixed_length_tensor, # noqa: F401 -) -from twitter.deepbird.sparse.util import ( - convert_to_sparse, # noqa: F401 - limit_bits, # noqa: F401 -) - -from dateutil import rrule -from joblib import delayed, Parallel -from six import string_types - +import tensorflow.compat.v1 as tf from absl import logging +from dateutil import rrule +from joblib import Parallel, delayed from libtwml import CLIB, OPLIB # noqa: F401 -import tensorflow.compat.v1 as tf +from six import string_types from tensorflow.python.platform import tf_logging +from twitter.deepbird.hparam import HParams +from twitter.deepbird.io.legacy.util import batch_apply # noqa: F401 +from twitter.deepbird.io.legacy.util import boolean_mask # noqa: F401 +from twitter.deepbird.io.legacy.util import fixed_length_tensor # noqa: F401 +from twitter.deepbird.io.util import _get_feature_id # noqa: F401 +from twitter.deepbird.io.util import feature_id # noqa: F401 +from twitter.deepbird.io.util import is_string # noqa: F401 +from twitter.deepbird.io.util import list_files # noqa: F401 +from twitter.deepbird.io.util import match_files # noqa: 
F401 +from twitter.deepbird.io.util import preprocess_feature_regex # noqa: F401 +from twitter.deepbird.io.util import preprocess_path # noqa: F401 +from twitter.deepbird.io.util import sanitize_hdfs_path # noqa: F401 +from twitter.deepbird.sparse.util import convert_to_sparse # noqa: F401 +from twitter.deepbird.sparse.util import limit_bits # noqa: F401 +from twitter.ml.common.resources import AuroraPath + import twml from twml.feature_config import FeatureConfigBuilder - # big_prime is less than 2**32 # This just needs to be co-prime with powers of 2 # any large prime is sufficient, but it's not necessary. HASHING_PRIME = 2479700537 -def multiplicative_hash(input, hash_constant=HASHING_PRIME): - return input * hash_constant - - -def _return_tensors_from_checkpoint_folder(init_dir, model_name=None): - """Returns tensors list from a checkpoint folder - - Args: - init_dir: Name of the checkpoint directory. - model_name: the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000) if set to None it will default to the - latest model saved in the checkpont file. - - """ - if model_name is None: - # gets the most recently generated model.cpkt file - model_path = tf.train.latest_checkpoint(init_dir) - if model_path is None: - raise ValueError("Could not find a valid model checkpoint inside the directory") - else: - model_path = os.path.join(init_dir, model_name) - reader = tf.train.NewCheckpointReader(model_path) - try: - return (reader.debug_string().decode("utf-8")) - except OSError: - logging.error('Could not decode the string') - - -def get_scope_dict(init_dir, incoming_scope_name, current_scope_name, model_name=None): - """Returns tensors map from a checkpoint file. - - Args: - file_name: - Name of the checkpoint directory. - incoming_scope_name: - scope name of the previous phase - current_scope_name: - scope name of current phase - model_name: - the model which we will use to obtain the checkpoint - (e.g. model.ckpt-50000) if set to None it will default - to the latest model saved in the checkpoint file. - Returns: - init_map: - init_map which will be inputted to the checkpoint - """ - init_map = {} - reader_dump = _return_tensors_from_checkpoint_folder(init_dir=init_dir, - model_name=model_name).splitlines() - for member in reader_dump: - # remove global_step since it is not necessary - if 'global_step' not in member: - saved_variables = str(member.split(" ")[0]) - saved_scope = saved_variables.rsplit('/', 1)[0] + "/" - new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1) - # create key in init_map - if saved_scope not in init_map.keys(): # pylint: disable=dict-keys-not-iterating - init_map[saved_scope] = new_scope - return init_map +def multiplicative_hash(input: int, hash_constant: int = HASHING_PRIME) -> int: + return input * hash_constant + + +def _return_tensors_from_checkpoint_folder( + init_dir: str, model_name: Optional[str] = None +) -> Optional[str]: + """Returns tensors list from a checkpoint folder + + Args: + init_dir: Name of the checkpoint directory. + model_name: the model which we will use to obtain the checkpoint + (e.g. model.ckpt-50000) if set to None it will default to the + latest model saved in the checkpoint file. 
+ + Returns: + debug_string (str): + debug string of the checkpoint file + """ + if model_name is None: + # gets the most recently generated model.ckpt file + model_path = tf.train.latest_checkpoint(init_dir) + if model_path is None: + raise ValueError( + "Could not find a valid model checkpoint inside the directory" + ) + else: + model_path = os.path.join(init_dir, model_name) + reader = tf.train.NewCheckpointReader(model_path) + try: + return reader.debug_string().decode("utf-8") + except OSError: + logging.error("Could not decode the string") + + +def get_scope_dict( + init_dir: str, + incoming_scope_name: str, + current_scope_name: str, + model_name: Optional[str] = None, +) -> Dict[str, str]: + """Returns tensors map from a checkpoint file. + + Args: + init_dir: str + Name of the checkpoint directory. + incoming_scope_name: str + scope name of the previous phase + current_scope_name: str + scope name of current phase + model_name: str + the model which we will use to obtain the checkpoint + (e.g. model.ckpt-50000) if set to None it will default + to the latest model saved in the checkpoint file. + Returns: + init_map (dict): + init_map which will be passed to the checkpoint initialization + """ + init_map = {} + reader_dump = _return_tensors_from_checkpoint_folder( + init_dir=init_dir, model_name=model_name + ).splitlines() + for member in reader_dump: + # remove global_step since it is not necessary + if "global_step" not in member: + saved_variables = str(member.split(" ")[0]) + saved_scope = saved_variables.rsplit("/", 1)[0] + "/" + new_scope = saved_scope.replace(incoming_scope_name, current_scope_name, 1) + # create key in init_map + if ( + saved_scope not in init_map.keys() + ): # pylint: disable=dict-keys-not-iterating + init_map[saved_scope] = new_scope + return init_map def get_init_map( + init_from_dir: str, + exclude_var_names: Optional[List[str]] = None, + exclude_name_scopes: Optional[List[str]] = None, + name_scope_to_remove: Optional[str] = None, + name_scope_to_prepend: Optional[str] = None, +) -> Dict[str, str]: + """ + Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). + + It assumes that the latter part of the variable names are consistent between the checkpoint and + the new model, but their name_scopes may be different. If the checkpoint model has variable names + of the form old/scope/var/foo, and the corresponding variable names for the new model should be + my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and + name_scope_to_prepend = 'my/new/'. + + This function can be used to + + 1. Generate an ``init_map`` that can be passed to the ``Trainer`` init, or + 2. Generate an ``init_map`` directly inside ``build_graph_fn`` and pass it to + ``tf.train.init_from_checkpoint`` there, in which case you do not also need to specify the + ``init_map`` argument to the trainer. + + Parameters + ---------- + init_from_dir: str + Directory containing checkpoint + exclude_var_names: list[str] + List of variables in the checkpoint that should be excluded from the map. + exclude_name_scopes: list[str] + List of name_scopes in the checkpoint model that should be excluded from the map. + name_scope_to_remove: str + portion of name_scope for checkpoint variables that should not be included in variable names + for new model. + name_scope_to_prepend: str + name_scope to prepend to variable names in checkpoint to give variable names for new model.
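The scope-to-scope map returned by `get_scope_dict` is consumed by `tf.train.init_from_checkpoint`; a minimal sketch with hypothetical directories and scope names:

```python
import tensorflow.compat.v1 as tf

init_dir = "hdfs://default/user/me/phase1_ckpt"  # hypothetical

# Map every phase1/* scope in the checkpoint onto phase2/* in the new graph.
init_map = get_scope_dict(
    init_dir,
    incoming_scope_name="phase1",
    current_scope_name="phase2",
)
# Call inside the new model's graph, after its variables are defined.
tf.train.init_from_checkpoint(init_dir, init_map)
```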
+ + Returns + ------- + dict + keys are variable names in the checkpoint and values are variable names in the new model, + into which the checkpoint parameters should be loaded. + """ + vars_to_restore = get_checkpoint_variable_names( init_from_dir, - exclude_var_names=None, - exclude_name_scopes=None, - name_scope_to_remove=None, - name_scope_to_prepend=None): - """ - Builds a map for initializing from a checkpoint (see tf.train.init_from_checkpoint). - - It assumes that the latter part of the variable names are consistent between the checkpoint and - the new model, but their name_scopes may be different. If the checkpoint model has variable names - of the form old/scope/var/foo, and the corresponding variable names for the new model should be - my/new/scope/var/foo, then you should set name_scope_to_remove = 'old/' and - name_scope_to_prepend = 'my/new/'. - - This function can be used to - - 1. Generate an ``init_map`` map that can be passed to the ``Trainer`` init or - 2. Used to generate an ``init_map`` directly inside ``build_graph_fn``, in - which case it should be passed directly to ``tf.train.init_from_checkpoint`` inside - ``build_graph_fn``, in which case you do not also need to specify the ``init_map`` argument to - the trainer. - - Parameters - ---------- - init_from_dir: Directory containing checkpoint - exclude_var_names: list[str] - List of variables in the checkpoint that should be excluded from the map. - exclude_name_scopes: list[str] - List of name_scopes in the checkpoint model that should be excluded from the map. - name_scope_to_remove: str - portion of name_scope for checkpoint variables that should not be included in variable names - for new model. - name_scope_to_prepend: str - name_scope to prepend to variable names in checkpoint to give variable names for new model. - - Returns - ------- - dict - keys are variable names in the checkpoint and values are variable names in the new model, - into which the checkpoint parameters should be loaded. - """ - vars_to_restore = get_checkpoint_variable_names( - init_from_dir, - exclude_var_names=exclude_var_names, - exclude_scopes=exclude_name_scopes, - ) - - if name_scope_to_prepend is not None: - if not name_scope_to_prepend.endswith('/'): - name_scope_to_prepend += '/' - - if name_scope_to_remove is not None: - if not name_scope_to_remove.endswith('/'): - name_scope_to_remove += '/' - - init_map = {} - - for var_name in vars_to_restore: - var_name_checkpoint = var_name - - if name_scope_to_remove is not None: - var_name = var_name.replace(name_scope_to_remove, '') - - var_name_new_model = var_name + exclude_var_names=exclude_var_names, + exclude_scopes=exclude_name_scopes, + ) if name_scope_to_prepend is not None: - var_name_new_model = name_scope_to_prepend + var_name_new_model - - init_map[var_name_checkpoint] = var_name_new_model - - return init_map - - -def get_checkpoint_variable_names(model_dir, exclude_var_names=None, exclude_scopes=None): - """ - Gets a list of variable names from the latest checkpoint in model_dir. - Removes variables with scope defined by exclude_scopes, and/or with names defined by - exclude_var_names. 
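The docstring's `old/scope/var/foo` example, traced through the two string operations `get_init_map` applies to each checkpoint variable name:

```python
name_scope_to_remove = "old/"
name_scope_to_prepend = "my/new/"

ckpt_name = "old/scope/var/foo"
new_name = name_scope_to_prepend + ckpt_name.replace(name_scope_to_remove, "")
assert new_name == "my/new/scope/var/foo"
```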
- - Args: - model_dir (str): Directory containing checkpoint file for the pre-trained model - exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) - exclude_scopes (list): Optional scopes to exclude - - Returns: - list: variable names - """ - checkpoint_path = tf.train.latest_checkpoint(model_dir) - variables_and_shapes = tf.train.list_variables(checkpoint_path) - - def _keep(name): - if exclude_scopes and any(name.startswith(exc_scope) for exc_scope in exclude_scopes): - return False - if exclude_var_names and any(name.endswith(exc_var) for exc_var in exclude_var_names): - return False - return True - - names = [x[0] for x in variables_and_shapes if _keep(x[0])] - - return names - - -def to_snake_case(name): - """ - Changes name to snake case - """ - intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) - insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() - # If the class is private the name starts with "_" which is not secure - # for creating scopes. We prefix the name with "private" in this case. - if insecure[0] != '_': - return insecure - return 'private' + insecure - - -def copy_phase_inputs(init_dir, dest_dir): - """Automatically copies the .json.tf from the init_dir to save_dir - so we can load multiple parameters at the same time. - - Args: - init_dir: - Name of the checkpoint directory. - dest_dir: - Name of the output directory. - """ - if init_dir is not None: - # we are using tf.io.gfile so we can use it with both local and hdfs paths - for files in tf.io.gfile.listdir(init_dir): - if files.endswith(".json.tf"): - src_file = os.path.join(init_dir, files) - dest_file = os.path.join(dest_dir, files) - if not tf.io.gfile.exists(dest_dir): - # creates the folder - try: - tf.io.gfile.makedirs(dest_dir) - # to prevent racing condition - except OSError: - if not tf.io.gfile.isdir(dest_dir): - raise - # dest_file may be old if it exists and - # dest_file gets copied several times in distributed training - tf.io.gfile.copy(src_file, dest_file, overwrite=True) - - -def rehash_sparse_features_nbits(sp_a, nbits, hash_fn=multiplicative_hash): - """ - Rehash the feature ids of the sparse tensor, - and limit the output to n bits. - - This is useful for making the distribution of - feature_ids more uniform, which may improve performance - in some situations. - - This would typically be used on the output of - PercentileDiscretizer, since it assigns many - bins to low-valued output feature ids. - - Input feature IDs should take values less than 2**32, - and nbits should be less than 32 - - Args: - sp_a: - a tf.SparseTensor object - nbits: - integer number of bits to mask output feature_ids - hash_fn: - Function that takes integer values and returns hashes of these values. - The output does not need to be masked to the desired number of bits, - as this masking will be taken care of. Default value = multiplicative_hash. - - Returns: - a new tf.SparseTensor - """ - - feature_ids = sp_a.indices[:, 1] - feature_ids = hash_fn(feature_ids) - - sample_ids = sp_a.indices[:, 0] - values = sp_a.values - dense_shape = sp_a.dense_shape - - indices = tf.stack([sample_ids, feature_ids], axis=1) - - sp_a = tf.SparseTensor(indices, values, dense_shape) - - # note - we need 2**nbits >= batch size - # otherwise, sample_ids will be squashed by the mask. - return limit_sparse_tensor_size(sp_a, nbits) - - -def convert_to_hparams(opt): - """ - Converts argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams. 
- Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported - tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hapram.HParams. - - NOTE: If you are using estimators, please don't call this method and directly pass python dict - to TensorFlow estimator. Starting TensorFlow 2.0, Estimator will only accept dicts. - """ - - # Convert to dict so we can iterate through it cleanly. - if isinstance(opt, argparse.Namespace): - params_dict = vars(opt) - elif isinstance(opt, dict): - params_dict = opt - elif isinstance(opt, HParams): - logging.warning('If you are using Estimator, please pass python dict directly to Estimator.') - params_dict = opt.values() - else: - raise ValueError("Input can not be of type %s. " - "It can be one of { argparse.Namespace, dict, " - "twitter.deepbird.hparam.HParams}." - % type(opt)) - - params = HParams() - # Hack to convert all parameters from hdfs:/// format to hdfs://default/ - # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical. - for key, val in params_dict.items(): - val = params_dict[key] - # Fix the path if the value is a string - if isinstance(val, str): - params.add_hparam(key, sanitize_hdfs_path(val)) - else: - params.add_hparam(key, val) - - return params + if not name_scope_to_prepend.endswith("/"): + name_scope_to_prepend += "/" - -def dynamic_partition(features, partitions, num_partitions=2, name=None): - """ - Partitions each of the tensor in features using the provided mask. - - Args: - features: - A single tensor or an iterable of tensors (list, tuple, dict) - partitions: - A bool or integer tensor representing the partitions. - - Returns partitioned outputs as a list. Each element of the list is the same type as features. - - This uses tf.dynamic_partition but adds the following niceties: - - features can be a list or dict of different tensor types. - - only a partition tensor is used to partition all the feature tensors recursively. - - the partition tensor is automatically converted into an integer tensor. - - defaults to num_partitions == 2 - """ - - if not isinstance(features, (dict, list, tuple, tf.Tensor)): - raise AssertionError("features container must be a dict, list, or tuple, tf.Tensor") - - if isinstance(partitions, tf.Tensor): - partitions = tf.cast(partitions, tf.int32) - - if isinstance(features, tf.Tensor): - return tf.dynamic_partition(features, partitions, num_partitions, name) - - outputs = [] - for _ in range(num_partitions): - if isinstance(features, (tuple, list)): - # Create an empty list of lists first, will be converted to right type afterwards. - outputs.append([None for _ in range(len(features))]) + if name_scope_to_remove is not None: + if not name_scope_to_remove.endswith("/"): + name_scope_to_remove += "/" + + init_map = {} + + for var_name in vars_to_restore: + var_name_checkpoint = var_name + + if name_scope_to_remove is not None: + var_name = var_name.replace(name_scope_to_remove, "") + + var_name_new_model = var_name + + if name_scope_to_prepend is not None: + var_name_new_model = name_scope_to_prepend + var_name_new_model + + init_map[var_name_checkpoint] = var_name_new_model + + return init_map + + +def get_checkpoint_variable_names( + model_dir: str, + exclude_var_names: Optional[List[str]] = None, + exclude_scopes: Optional[List[str]] = None, +) -> List[str]: + """ + Gets a list of variable names from the latest checkpoint in model_dir. 
+ Removes variables with scope defined by exclude_scopes, and/or with names defined by + exclude_var_names. + + Args: + model_dir (str): Directory containing checkpoint file for the pre-trained model + exclude_var_names (list): Optional variable names to exclude (can include full/partial scope) + exclude_scopes (list): Optional scopes to exclude + + Returns: + list: variable names + """ + checkpoint_path = tf.train.latest_checkpoint(model_dir) + variables_and_shapes = tf.train.list_variables(checkpoint_path) + + def _keep(name: str) -> bool: + if exclude_scopes and any( + name.startswith(exc_scope) for exc_scope in exclude_scopes + ): + return False + if exclude_var_names and any( + name.endswith(exc_var) for exc_var in exclude_var_names + ): + return False + return True + + names = [str(x[0]) for x in variables_and_shapes if _keep(x[0])] + + return names + + +def to_snake_case(name: str) -> str: + """ + Changes name to snake case + """ + intermediate = re.sub("(.)([A-Z][a-z0-9]+)", r"\1_\2", name) + insecure = re.sub("([a-z])([A-Z])", r"\1_\2", intermediate).lower() + # If the class is private the name starts with "_" which is not secure + # for creating scopes. We prefix the name with "private" in this case. + if insecure[0] != "_": + return insecure + return "private" + insecure + + +def copy_phase_inputs(init_dir: str, dest_dir: str): + """Automatically copies the .json.tf from the init_dir to save_dir + so we can load multiple parameters at the same time. + + Args: + init_dir (str): + Name of the checkpoint directory. + dest_dir (str): + Name of the output directory. + """ + if init_dir is not None: + # we are using tf.io.gfile so we can use it with both local and hdfs paths + for files in tf.io.gfile.listdir(init_dir): + if files.endswith(".json.tf"): + src_file = os.path.join(init_dir, files) + dest_file = os.path.join(dest_dir, files) + if not tf.io.gfile.exists(dest_dir): + # creates the folder + try: + tf.io.gfile.makedirs(dest_dir) + # to prevent racing condition + except OSError: + if not tf.io.gfile.isdir(dest_dir): + raise + # dest_file may be old if it exists and + # dest_file gets copied several times in distributed training + tf.io.gfile.copy(src_file, dest_file, overwrite=True) + + +def rehash_sparse_features_nbits( + sp_a: tf.SparseTensor, + nbits: int, + hash_fn: Callable[[int], int] = multiplicative_hash, +) -> tf.SparseTensor: + """ + Rehash the feature ids of the sparse tensor, + and limit the output to n bits. + + This is useful for making the distribution of + feature_ids more uniform, which may improve performance + in some situations. + + This would typically be used on the output of + PercentileDiscretizer, since it assigns many + bins to low-valued output feature ids. + + Input feature IDs should take values less than 2**32, + and nbits should be less than 32 + + Args: + sp_a: + a tf.SparseTensor object + nbits: + integer number of bits to mask output feature_ids + hash_fn: + Function that takes integer values and returns hashes of these values. + The output does not need to be masked to the desired number of bits, + as this masking will be taken care of. Default value = multiplicative_hash. 
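The exclusion semantics of `get_checkpoint_variable_names` reduce to a prefix test on scopes and a suffix test on variable names; a pure-Python illustration on hypothetical names:

```python
names = ["encoder/kernel", "encoder/bias", "head/kernel", "global_step"]
exclude_scopes = ["global_step"]  # matched as a name prefix
exclude_var_names = ["/bias"]     # matched as a name suffix

kept = [
    n
    for n in names
    if not any(n.startswith(scope) for scope in exclude_scopes)
    and not any(n.endswith(var) for var in exclude_var_names)
]
assert kept == ["encoder/kernel", "head/kernel"]
```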
+ + Returns: + a new tf.SparseTensor + """ + + feature_ids = sp_a.indices[:, 1] + feature_ids = hash_fn(feature_ids) + + sample_ids = sp_a.indices[:, 0] + values = sp_a.values + dense_shape = sp_a.dense_shape + + indices = tf.stack([sample_ids, feature_ids], axis=1) + + sp_a = tf.SparseTensor(indices, values, dense_shape) + + # note - we need 2**nbits >= batch size + # otherwise, sample_ids will be squashed by the mask. + return limit_sparse_tensor_size(sp_a, nbits) + + +def convert_to_hparams(opt: Union[argparse.Namespace, dict, HParams]) -> HParams: + """ + Converts argparse.Namespace object to twitter.deepbird.hparam.hparam.HParams. + Note that tensorflow.contrib.training.HParams is gone in TF 2.x, and we forward ported + tensorflow.contrib.training.HParams to twitter.deepbird.hparam.hparam.HParams. + + NOTE: If you are using estimators, please don't call this method and directly pass python dict + to TensorFlow estimator. Starting TensorFlow 2.0, Estimator will only accept dicts. + """ + + # Convert to dict so we can iterate through it cleanly. + if isinstance(opt, argparse.Namespace): + params_dict = vars(opt) + elif isinstance(opt, dict): + params_dict = opt + elif isinstance(opt, HParams): + logging.warning( + "If you are using Estimator, please pass python dict directly to Estimator." + ) + params_dict = opt.values() else: - outputs.append(dict()) + raise ValueError( + "Input can not be of type %s. " + "It can be one of { argparse.Namespace, dict, " + "twitter.deepbird.hparam.HParams}." % type(opt) + ) + + params = HParams() + # Hack to convert all parameters from hdfs:/// format to hdfs://default/ + # Note: .items() makes a copy in python 2.7, but that is fine since the performance isn't critical. + for key, val in params_dict.items(): + val = params_dict[key] + # Fix the path if the value is a string + if isinstance(val, str): + params.add_hparam(key, sanitize_hdfs_path(val)) + else: + params.add_hparam(key, val) + + return params + + +def dynamic_partition( + features: Iterable, + partitions: tf.Tensor, + num_partitions: int = 2, + name: Optional[str] = None, +) -> list: + """ + Partitions each of the tensors in features using the provided mask. + + Args: + features: + A single tensor or an iterable of tensors (list, tuple, dict) + partitions: + A bool or integer tensor representing the partitions. + + Returns partitioned outputs as a list. Each element of the list is the same type as features. + + This uses tf.dynamic_partition but adds the following niceties: + - features can be a list or dict of different tensor types. + - only a partition tensor is used to partition all the feature tensors recursively. + - the partition tensor is automatically converted into an integer tensor.
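`rehash_sparse_features_nbits` composes the multiplicative hash defined earlier in this module with the bit mask that `limit_sparse_tensor_size` applies; the integer arithmetic, sketched outside TensorFlow:

```python
HASHING_PRIME = 2479700537  # co-prime with powers of 2, as in this module

def multiplicative_hash(x, hash_constant=HASHING_PRIME):
    return x * hash_constant

nbits = 22
mask = (1 << nbits) - 1  # the mask twml.limit_bits applies to tensor indices

feature_id = 7
rehashed = multiplicative_hash(feature_id) & mask  # keep the low nbits bits
assert 0 <= rehashed < (1 << nbits)
```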
+ - defaults to num_partitions == 2 + """ + + if not isinstance(features, (dict, list, tuple, tf.Tensor)): + raise AssertionError( + "features container must be a dict, list, tuple, or tf.Tensor" + ) - iterable = features.items() if isinstance(features, dict) else enumerate(features) - - # Handling partitions of nested classes handled here: - # Recursively call dynamic_partition for containers - for key, feature in iterable: - name_key = None if name is None else name + "_" + str(key) if isinstance(partitions, tf.Tensor): - results = tf.dynamic_partition(feature, partitions, num_partitions, name_key) - else: - results = tf.dynamic_partition(feature, partitions[key], num_partitions[key], name_key) - # Append the result to the proper output container - for idx, result in enumerate(results): - outputs[idx][key] = result - - # if input is tuple, convert list of lists back to list of tuples - if isinstance(features, tuple): - outputs = [type(features)(output) for output in outputs] - - return outputs - - -def write_file(filename, contents, encode=False): - ''' - Optionally encodes contents and writes contents to a file. - - Arguments: - filename: - path to file where the contents will be saved. - Accepts HDFS and local paths. - contents: - contents to save to the file. - Must be a string when encode is False. - encode: - False | 'json'. When encode='json', contents is encoded - with json.dumps. - ''' - if encode == 'json': - contents = json.dumps(contents) - elif not is_string(contents): - raise ValueError("Expecting string for encode=False") - - graph = tf.Graph() - with graph.as_default(): - write = tf.write_file(filename, contents) - - with tf.Session(graph=graph) as sess: - sess.run(write) - - -def read_file(filename, decode=False): - ''' - Reads contents from a file and optionally decodes it. - - Arguments: - filename: - path to file where the contents will be loaded from. - Accepts HDFS and local paths. - decode: - False | 'json'. When decode='json', contents is decoded - with json.loads. When False, contents is returned as is. - - Returns: - contents - ''' - graph = tf.Graph() - with graph.as_default(): - read = tf.read_file(filename) - - with tf.Session(graph=graph) as sess: - contents = (sess.run(read)) - # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str - if not isinstance(contents, str): - contents = contents.decode() - - if decode == 'json': - contents = json.loads(contents) - - return contents - -def setup_tf_logging_formatter(): - formatter = _logging.Formatter( - '%(asctime)s [%(levelname)s] %(name)s: %(message)s', - None) - # Setting up absl logging verbosity - logging.set_verbosity('info') - logging.set_stderrthreshold('info') - logging.get_absl_handler().setFormatter(formatter) - tf.logging.set_verbosity(tf.logging.INFO) - # Set tensorflow logging handler format - if len(tf_logging.get_logger().handlers) > 0: - tf_logging.get_logger().handlers[0].setFormatter(formatter) - - -def set_tensorflow_log_level(log_level): - """ - Sets tensorflow's default logging level. - - 0. all logs are shown. - 1. filter out INFO logs. - 2. filter out WARNINGs and INFOs. - 3. filter out ERRORs, WARNINGs, and INFOs. - - Note that tf.Print output are INFO logs, so setting log_level above 0 would hide - output from tf.Print.
- """ - assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 - os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(log_level) - - -def weighted_average(values, weights): - """ - Compute a weighted average using the given values and weights. - E.g. this is usually used to compute a weighted loss given sample weights. - """ - return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) - - -def backup_checkpoint(checkpoint_path_prefix, - backup_path='backup', - empty_backup=True): - """ - Creates a backup copy of a checkpoint in backup_dir. - This function is used by the Trainer for early-stopping. - - Arguments: - checkpoint_path_prefix: - Prefix of the path to the checkpoint files. - backup_path: - path to a directory where checkpoint files will be backed up. - empty_backup: - When True (the default), the current contents of the backup directory - are removed before the backup is performed. - - Returns: - The number of backed up files. - """ - checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) - - if tf.io.gfile.exists(backup_path) and empty_backup: - tf.io.gfile.rmtree(backup_path) - - tf.io.gfile.mkdir(backup_path) - - n_backup = 0 - # copy all checkpoint files to backup directory (TODO use gfile.glob instead) - try: - checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") - if len(checkpoint_files) == 0: - raise twml.errors.CheckpointNotFoundError("%s not found" % checkpoint_path_prefix) - for filename in checkpoint_files: - n_backup += 1 - tf.io.gfile.copy( - src=filename, - dst=os.path.join(backup_path, os.path.basename(filename)) - ) - except tf.errors.OpError as ex: - raise twml.errors.CheckpointNotFoundError( - f"{str(ex)}\n {checkpoint_path_prefix} not found." + partitions = tf.cast(partitions, tf.int32) + + if isinstance(features, tf.Tensor): + return tf.dynamic_partition(features, partitions, num_partitions, name) + + outputs = [] + for _ in range(num_partitions): + if isinstance(features, (tuple, list)): + # Create an empty list of lists first, will be converted to right type afterwards. + outputs.append([None for _ in range(len(features))]) + else: + outputs.append(dict()) + + iterable = features.items() if isinstance(features, dict) else enumerate(features) + + # Handling partitions of nested classes handled here: + # Recursively call dynamic_partition for containers + for key, feature in iterable: + name_key = None if name is None else name + "_" + str(key) + if isinstance(partitions, tf.Tensor): + results = tf.dynamic_partition( + feature, partitions, num_partitions, name_key + ) + else: + results = tf.dynamic_partition( + feature, partitions[key], num_partitions[key], name_key + ) + # Append the result to the proper output container + for idx, result in enumerate(results): + outputs[idx][key] = result + + # if input is tuple, convert list of lists back to list of tuples + if isinstance(features, tuple): + outputs = [type(features)(output) for output in outputs] + + return outputs + + +def write_file(filename: str, contents: str, encode: bool = False) -> None: + """ + Optionally encodes contents and writes contents to a file. + + Args: + filename: + path to file where the contents will be saved. + Accepts HDFS and local paths. + contents: + contents to save to the file. + Must be a string when encode is False. + encode: + False | 'json'. When encode='json', contents is encoded + with json.dumps. 
+ """ + if encode == "json": + contents = json.dumps(contents) + elif not is_string(contents): + raise ValueError("Expecting string for encode=False") + + graph = tf.Graph() + with graph.as_default(): + write = tf.write_file(filename, contents) + + with tf.Session(graph=graph) as sess: + sess.run(write) + + +def read_file(filename: str, decode: bool = False) -> str: + """ + Reads contents from a file and optionally decodes it. + + Args: + filename: + path to file where the contents will be loaded from. + Accepts HDFS and local paths. + decode: + False | 'json'. When decode='json', contents is decoded + with json.loads. When False, contents is returned as is. + + Returns: + contents + """ + graph = tf.Graph() + with graph.as_default(): + read = tf.read_file(filename) + + with tf.Session(graph=graph) as sess: + contents = sess.run(read) + # particular version of TF and/or Python may or may not perform decoding step from utf-8 to str + if not isinstance(contents, str): + contents = contents.decode() + + if decode == "json": + contents = json.loads(contents) + + return contents + + +def setup_tf_logging_formatter() -> None: + formatter = _logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s", None ) - - # tf.train.latest_checkpoint needs the 'checkpoint' file. - with tf.io.gfile.GFile(os.path.join(backup_path, 'checkpoint'), 'w') as f: - f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) - - return n_backup - - -def set_only_checkpoint(source_path, dest_path, remove_source=True): - """ - Removes the checkpoint and model.ckpt* files from dest_path. - Moves the latest checkpoint from source_path to dest_path. - - Arguments: - source_path: - path to directory containing the latest checkpoint. - Should contain a valid checkpoint file and model.ckpt files. - For early-stopping, this should be the save_dir/best_checkpoint dir. - dest_path: - path to directory where the latest checkpoint files will be moved. - All its checkpoint and model.ckpt* files will be removed. - For early-stopping, this should be the save_dir. - remove_source: - When True (the default), deletes the source directory. - Note that even when False, its checkpoint files are moved to - dest_path anyway. - This deletes the source directory (and any remaining contents). - """ - # make it so that source_path checkpoint is the only checkpoint - source_path_prefix = tf.train.latest_checkpoint(source_path) - if source_path_prefix is not None: - # remove intermediate checkpoints - for filename in tf.io.gfile.listdir(dest_path): - if filename.startswith("model.ckpt"): - tf.io.gfile.Remove(os.path.join(dest_path, filename)) - # move contents of source_path to dest_path - for filename in tf.io.gfile.listdir(source_path): - tf.io.gfile.rename( - oldname=os.path.join(source_path, filename), - newname=os.path.join(dest_path, filename), - overwrite=True) # overwrite "checkpoint" file - # delete the source_path dir - if remove_source: - tf.io.gfile.rmtree(source_path) + # Setting up absl logging verbosity + logging.set_verbosity("info") + logging.set_stderrthreshold("info") + logging.get_absl_handler().setFormatter(formatter) + tf.logging.set_verbosity(tf.logging.INFO) + # Set tensorflow logging handler format + if len(tf_logging.get_logger().handlers) > 0: + tf_logging.get_logger().handlers[0].setFormatter(formatter) + + +def set_tensorflow_log_level(log_level: object) -> None: + """ + Sets tensorflow's default logging level. + + 0. all logs are shown. + 1. filter out INFO logs. + 2. 
filter out WARNINGs and INFOs. + 3. filter out ERRORs, WARNINGs, and INFOs. + + Note that tf.Print output are INFO logs, so setting log_level above 0 would hide + output from tf.Print. + """ + assert isinstance(log_level, int) and log_level >= 0 and log_level <= 3 + os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(log_level) + + +def weighted_average(values, weights) -> tf.Tensor: + """ + Compute a weighted average using the given values and weights. + E.g. this is usually used to compute a weighted loss given sample weights. + """ + return tf.reduce_sum(tf.multiply(values, weights)) / tf.reduce_sum(weights) + + +def backup_checkpoint( + checkpoint_path_prefix: str, + backup_path: str = "backup", + empty_backup: bool = True, +) -> int: + """ + Creates a backup copy of a checkpoint in backup_dir. + This function is used by the Trainer for early-stopping. + + Args: + checkpoint_path_prefix: + Prefix of the path to the checkpoint files. + backup_path: + path to a directory where checkpoint files will be backed up. + empty_backup: + When True (the default), the current contents of the backup directory + are removed before the backup is performed. + + Returns: + The number of backed up files. + """ + checkpoint_file_prefix = os.path.basename(checkpoint_path_prefix) + + if tf.io.gfile.exists(backup_path) and empty_backup: + tf.io.gfile.rmtree(backup_path) + + tf.io.gfile.mkdir(backup_path) + + n_backup = 0 + # copy all checkpoint files to backup directory (TODO use gfile.glob instead) + try: + checkpoint_files = tf.io.gfile.glob(checkpoint_path_prefix + "*") + if len(checkpoint_files) == 0: + raise twml.errors.CheckpointNotFoundError( + "%s not found" % checkpoint_path_prefix + ) + for filename in checkpoint_files: + n_backup += 1 + tf.io.gfile.copy( + src=filename, dst=os.path.join(backup_path, os.path.basename(filename)) + ) + except tf.errors.OpError as ex: + raise twml.errors.CheckpointNotFoundError( + f"{str(ex)}\n {checkpoint_path_prefix} not found." + ) + + # tf.train.latest_checkpoint needs the 'checkpoint' file. + with tf.io.gfile.GFile(os.path.join(backup_path, "checkpoint"), "w") as f: + f.write('model_checkpoint_path: "%s"\n' % checkpoint_file_prefix) + + return n_backup + + +def set_only_checkpoint( + source_path: str, + dest_path: str, + remove_source: bool = True, +) -> None: + """ + Removes the checkpoint and model.ckpt* files from dest_path. + Moves the latest checkpoint from source_path to dest_path. + + Args: + source_path: + path to directory containing the latest checkpoint. + Should contain a valid checkpoint file and model.ckpt files. + For early-stopping, this should be the save_dir/best_checkpoint dir. + dest_path: + path to directory where the latest checkpoint files will be moved. + All its checkpoint and model.ckpt* files will be removed. + For early-stopping, this should be the save_dir. + remove_source: + When True (the default), deletes the source directory. + Note that even when False, its checkpoint files are moved to + dest_path anyway. + This deletes the source directory (and any remaining contents). 
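Together, `backup_checkpoint` and `set_only_checkpoint` (defined next) implement the Trainer's early-stopping bookkeeping; a usage sketch with hypothetical paths:

```python
# When evaluation improves, snapshot the current best checkpoint:
n_files = backup_checkpoint(
    checkpoint_path_prefix="/models/run1/model.ckpt-4200",
    backup_path="/models/run1/best_checkpoint",
)

# At the end of training, promote the best checkpoint back to save_dir:
set_only_checkpoint(
    source_path="/models/run1/best_checkpoint",
    dest_path="/models/run1",
)
```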
+ """ + # make it so that source_path checkpoint is the only checkpoint + source_path_prefix = tf.train.latest_checkpoint(source_path) + if source_path_prefix is not None: + # remove intermediate checkpoints + for filename in tf.io.gfile.listdir(dest_path): + if filename.startswith("model.ckpt"): + tf.io.gfile.Remove(os.path.join(dest_path, filename)) + # move contents of source_path to dest_path + for filename in tf.io.gfile.listdir(source_path): + tf.io.gfile.rename( + oldname=os.path.join(source_path, filename), + newname=os.path.join(dest_path, filename), + overwrite=True, + ) # overwrite "checkpoint" file + # delete the source_path dir + if remove_source: + tf.io.gfile.rmtree(source_path) def list_files_by_datetime( - base_path, - start_datetime, - end_datetime=None, - datetime_prefix_format='%Y/%m/%d/%H', - extension='lzo', - parallelism=1, - hour_resolution=1, - sort=False + base_path: str, + start_datetime: Optional[datetime] = None, + end_datetime: Optional[datetime] = None, + datetime_prefix_format: str = "%Y/%m/%d/%H", + extension: str = "lzo", + parallelism: int = 1, + hour_resolution: int = 1, + sort: bool = False, +) -> List[str]: + """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. + + Args: + base_path: + The base path. If `None`, returns `None`. + start_datetime: + A `datetime.datetime` or string representing the start of the range (inclusive). + If `None`, it returns `list_files(base_path, extension, sort)`. + end_datetime: + A `datetime.datetime` or string representing the end of the range (inclusive). + If `None`, assumed to be the same as start_datetime. + datetime_prefix_format: + Format compatible with `datetime.datetime.strftime` + (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). + extension: + The extension of the files composing the dataset (e.g. 'lzo'). + parallelism: + The number of threads used to process list patterns (this is mostly useful + when dealing with filesystems such as HDFS in which listing files is a potentially expensive + operation). + hour_resolution: + The separation between consecutive hours. The default value is 1. + sort: + bool, whether to return a sorted list of files. Default False. + + Returns: + A list with all the matching files. + + Raises: + errors.OpError: If there are filesystem / directory listing errors. + """ + if hour_resolution is None: + hour_resolution = 1 + + if base_path is None: + return None + + if start_datetime is None: + return list_files(base_path, extension, sort) + + # Do this in case people want to use a single day for training. 
+ if end_datetime is None: + end_datetime = start_datetime + + assert parallelism > 0 + assert start_datetime <= end_datetime + + if isinstance(start_datetime, str): + start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) + + if isinstance(end_datetime, str): + end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) + + assert isinstance(start_datetime, datetime) + assert isinstance(end_datetime, datetime) + + base_path = preprocess_path(base_path) + + def _handle_missing_globs(pattern: str) -> List[str]: + try: + return tf.io.gfile.glob(pattern) + except tf.errors.NotFoundError as e: + tf.logging.warning(e.message) + return [] + + # a set is used because there might be some repeated globs depending on dt_prefix_format + globs = { + os.path.join(base_path, dt.strftime(datetime_prefix_format), "*.%s" % extension) + for dt in rrule.rrule( + freq=rrule.HOURLY, + interval=hour_resolution, + dtstart=start_datetime, + until=end_datetime, + ) + } + nested_files = Parallel(n_jobs=parallelism, backend="threading")( + delayed(_handle_missing_globs)(p) for p in globs + ) + flattened_files = list(itertools.chain.from_iterable(nested_files)) + + if not flattened_files: + error_msg = f"Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}" + raise OSError(error_msg) + + if sort: + flattened_files = sorted(flattened_files) + + return flattened_files + + +def limit_sparse_tensor_size( + sparse_tf: Union[twml.SparseTensor, tf.SparseTensor], + input_size_bits: int, + mask_indices: bool = True, +) -> tf.SparseTensor: + """ + Returns a ``tf.SparseTensor`` which is the input SparseTensor + limited to the specified input_size_bits + + Args: + sparse_tf: + twml.SparseTensor or tf.SparseTensor + input_size_bits: + The number of bits allocated to the input size. + Input size will be power(2,input_size_bits). + Note that twml.limit_bits truncates any feature keys that + exceed the input size. + mask_indices: + If mask indices is False; only the shape is changed. Defaults to True. + + Returns: + (tf.SparseTensor) The limited sparse tensor + """ + if isinstance(sparse_tf, twml.SparseTensor): + sparse_tf = sparse_tf.to_tf() + if not isinstance(sparse_tf, tf.SparseTensor): + raise TypeError( + "Input argument `sparse_tf` should either be of type" + f"twml.SparseTensor of tf.SparseTensor. Found type: {type(sparse_tf)}" + ) + if mask_indices: + indices = twml.limit_bits(sparse_tf.indices, input_size_bits) + else: + indices = sparse_tf.indices + dense_shape = tf.stack([sparse_tf.dense_shape[0], 1 << input_size_bits]) + return tf.SparseTensor( + indices=indices, values=sparse_tf.values, dense_shape=dense_shape + ) + + +def create_module_spec( + mlp_fn: Callable[ + [tf.estimator.ModeKeys, Dict[str, Any]], tf.estimator.EstimatorSpec + ], + mode: tf.estimator.ModeKeys, + params: Dict[str, Any], + drop_collections: Optional[List[str]] = None, ): - """List files matching `base_path/dt_prefix_format/*.extension` for the requested datetime range. - - Args: - base_path: - The base path. If `None`, returns `None`. - start_datetime: - A `datetime.datetime` or string representing the start of the range (inclusive). - If `None`, it returns `list_files(base_path, extension, sort)`. - end_datetime: - A `datetime.datetime` or string representing the end of the range (inclusive). - If `None`, assumed to be the same as start_datetime. 
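A sketch of calling `list_files_by_datetime` to gather one day of hourly LZO shards (the base path is hypothetical):

```python
from datetime import datetime

files = list_files_by_datetime(
    base_path="hdfs://default/datasets/my_dataset",
    start_datetime=datetime(2023, 1, 1, 0),
    end_datetime=datetime(2023, 1, 1, 23),
    datetime_prefix_format="%Y/%m/%d/%H",
    extension="lzo",
    parallelism=4,
    sort=True,
)
# Matches base_path/2023/01/01/00/*.lzo ... base_path/2023/01/01/23/*.lzo
```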
- datetime_prefix_format: - Format compatible with `datetime.datetime.strftime` - (https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior). - extension: - The extension of the files composing the dataset (e.g. 'lzo'). - parallelism: - The number of threads used to process list patterns (this is mostly useful - when dealing with filesystems such as HDFS in which listing files is a potentially expensive - operation). - hour_resolution: - The separation between consecutive hours. The default value is 1. - sort: - bool, whether to return a sorted list of files. Default False. - - Returns: - A list with all the matching files. - - Raises: - errors.OpError: If there are filesystem / directory listing errors. - """ - if hour_resolution is None: - hour_resolution = 1 - - if base_path is None: - return None - - if start_datetime is None: - return list_files(base_path, extension, sort) - - # Do this in case people want to use a single day for training. - if end_datetime is None: - end_datetime = start_datetime - - assert parallelism > 0 - assert start_datetime <= end_datetime - - if isinstance(start_datetime, str): - start_datetime = datetime.strptime(start_datetime, datetime_prefix_format) - - if isinstance(end_datetime, str): - end_datetime = datetime.strptime(end_datetime, datetime_prefix_format) - - assert isinstance(start_datetime, datetime) - assert isinstance(end_datetime, datetime) - - base_path = preprocess_path(base_path) - - def _handle_missing_globs(pattern): - try: - return tf.io.gfile.glob(pattern) - except tf.errors.NotFoundError as e: - tf.logging.warning(e.message) - return [] - - # a set is used because there might be some repeated globs depending on dt_prefix_format - globs = { - os.path.join(base_path, dt.strftime(datetime_prefix_format), '*.%s' % extension) - for dt in rrule.rrule( - freq=rrule.HOURLY, interval=hour_resolution, dtstart=start_datetime, until=end_datetime) - } - nested_files = Parallel(n_jobs=parallelism, backend='threading')( - delayed(_handle_missing_globs)(p) for p in globs - ) - flattened_files = list(itertools.chain.from_iterable(nested_files)) - - if not flattened_files: - error_msg = "Files list is empty: base_path={base_path}, start_datetime={start_datetime}, end_datetime={end_datetime}".format( - base_path=base_path, start_datetime=start_datetime, end_datetime=end_datetime + """ + Creates a standard tags_and_args which should be passed to the create_module_spec + spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args). + + Args: + module_fn: + a function to build a graph for the Module. + mode: + mode in which the Estimator is run + params: + parameters passed to the Estimator + """ + import tensorflow_hub as hub # noqa: F402 + + tags_and_args = [ + (set(), {"params": params, "mode": mode}), # serving graph + ({"train"}, {"params": params, "mode": mode}), # training graph + ] + spec = hub.create_module_spec( + mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections ) - raise OSError(error_msg) - - if sort: - flattened_files = sorted(flattened_files) - - return flattened_files - - -def limit_sparse_tensor_size(sparse_tf, input_size_bits, mask_indices=True): - """ - Returns a ``tf.SparseTensor`` which is the input SparseTensor - limited to the specified input_size_bits - - Args: - sparse_tf: - twml.SparseTensor or tf.SparseTensor - input_size_bits: - The number of bits allocated to the input size. - Input size will be power(2,input_size_bits). 
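A minimal `module_fn` compatible with `create_module_spec` above (the docstring calls it `module_fn`; the parameter itself is named `mlp_fn`): it must accept the `params` and `mode` kwargs injected via `tags_and_args` and register a signature with `hub.add_signature`. Shapes and names here are hypothetical:

```python
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

def mlp_fn(params, mode):
    inputs = tf.placeholder(tf.float32, shape=[None, params["input_dim"]])
    outputs = tf.layers.dense(inputs, params["output_dim"])
    hub.add_signature(inputs=inputs, outputs=outputs)

spec = create_module_spec(
    mlp_fn,
    mode=tf.estimator.ModeKeys.TRAIN,
    params={"input_dim": 8, "output_dim": 2},
)
module = hub.Module(spec, trainable=True, tags={"train"})
```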
- Note that twml.limit_bits truncates any feature keys that - exceed the input size. - mask_indices: - If mask indices is False; only the shape is changed. Defaults to True. - """ - if isinstance(sparse_tf, twml.SparseTensor): - sparse_tf = sparse_tf.to_tf() - if not isinstance(sparse_tf, tf.SparseTensor): - raise TypeError('Input argument `sparse_tf` should either be of type' - 'twml.SparseTensor of tf.SparseTensor. Found type: {}'. - format(type(sparse_tf))) - if mask_indices: - indices = twml.limit_bits(sparse_tf.indices, input_size_bits) - else: - indices = sparse_tf.indices - dense_shape = tf.stack([sparse_tf.dense_shape[0], 1 << input_size_bits]) - return tf.SparseTensor(indices=indices, values=sparse_tf.values, - dense_shape=dense_shape) - - -def create_module_spec(mlp_fn, mode, params, drop_collections=None): - """ - Creates a standard tags_and_args which should be passed to the create_module_spec - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args). - - Args: - module_fn: - a function to build a graph for the Module. - mode: - mode in which the Estimator is run - params: - parameters passed to the Estimator - """ - import tensorflow_hub as hub # noqa: F402 - tags_and_args = [(set(), {"params": params, "mode": mode}), # serving graph - ({"train"}, {"params": params, "mode": mode}) # training graph - ] - spec = hub.create_module_spec(mlp_fn, tags_and_args=tags_and_args, drop_collections=drop_collections) - return spec - - -def change_name_scope_from_dir(init_scope_name, final_scope_name, save_dir): - """ - Changes the name of the saved scope to the desired name and saves it - to the same save_dir. - - Args: - init_scope_name: - initial scope name - final_scope_name: - desired (final) scope name - save_dir: - directory which the scopes are saved - - In the follwing section we: - - Read all the variables from the latest checkpoint. - - Make a copy of the variables with new name scope. - - Store both sets of variables into the latest checkpoint. - This essentially doubles up the size of the checkpoint. - But when a job is restarted after this part is done, the checkpoint size doubles again. - To avoid doing this, we create a copy in backup if a backup isn't found. - This allows us always read (from backup) and write same sized checkpoint files. 
- """ - - # Create a backup_checkpoints dir - backup_dir = os.path.join(save_dir, "change_name_scope_backups") - tf.io.gfile.makedirs(backup_dir) - - latest_checkpoint = tf.train.latest_checkpoint(save_dir) - - if latest_checkpoint is None: - raise OSError("No checkpoints found in save_dir: %s" % save_dir) - - latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir) - - if (latest_backup_checkpoint is None or - (os.path.basename(latest_checkpoint) != - os.path.basename(latest_backup_checkpoint))): - backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False) - - variables = tf.train.list_variables(backup_dir) - with tf.Graph().as_default(), tf.Session().as_default() as sess: - new_variables = [] - for name, _ in variables: - var = tf.train.load_variable(backup_dir, name) - # Append both the rename and the original variable - new_variables.append( - tf.Variable(var, name=name.replace(init_scope_name, final_scope_name))) - new_variables.append(tf.Variable(var, name=name)) - # Save this to the checkpoint in the save_dir - saver = tf.train.Saver(new_variables) - sess.run(tf.global_variables_initializer()) - saver.save(sess, latest_checkpoint) # pylint: disable=no-member - - -def hub_import(input, module, module_name, trainable=False): - """ - Loads exported hub module. - - Args: - input: - input to hub module - module: - module path - module_name: - signature of the exported hub module - """ - import tensorflow_hub as hub # noqa: F402 - hub_module = hub.Module(module, trainable=trainable) - output = hub_module(input, signature=module_name) - return output - - -def _extract_hash_space_bits(feature_config): - """ - Extract Sparse Shapes for contrib.FeatureConfig. - Arguments: - feature_config: - Feature Configuration of the type contrib.FeatureConfig - Returns: - Dictionary of tensor names and hash space bits. - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - fc_type = type(feature_config) - raise TypeError(f"Feature config must be of type contrib.FeatureConfig: {fc_type}") - sparse_shapes_dict = {} - for config in feature_config.sparse_extraction_configs: - sparse_shapes_dict[config.output_name] = config.hash_space_bits - return sparse_shapes_dict - - -def fix_shape_sparse(features, feature_config): - """ - Modifies the shape of features which are extracted using the hashing trick. - Features itself is changed by this function. - Arguments: - features: - Feature dictionary extracted by the feature config - feature_config: - Feature Configuration of the type contrib.FeatureConfig - """ - if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig): - raise TypeError(f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}") - sparse_shape = _extract_hash_space_bits(feature_config) - if not isinstance(features, dict): - raise TypeError(f"features must be of dictionary type, it is of {type(features)} type") - for key in set(features) & set(sparse_shape): - features[key] = limit_sparse_tensor_size(features[key], sparse_shape[key], mask_indices=False) - - -def touch_file_in_dir(directory, filename): - """ - Creates a file named filename in directory. 
- - Arguments: - filename: (str) - directory: (str) - """ - file_path = os.path.join(directory, filename) - with tf.io.gfile.GFile(file_path, "w") as f: - f.write("") + return spec + + +def change_name_scope_from_dir( + init_scope_name: str, + final_scope_name: str, + save_dir: str, +) -> None: + """ + Changes the name of the saved scope to the desired name and saves it + to the same save_dir. + + Args: + init_scope_name: + initial scope name + final_scope_name: + desired (final) scope name + save_dir: + directory in which the scopes are saved + + In the following section we: + - Read all the variables from the latest checkpoint. + - Make a copy of the variables with new name scope. + - Store both sets of variables into the latest checkpoint. + This essentially doubles up the size of the checkpoint. + But when a job is restarted after this part is done, the checkpoint size doubles again. + To avoid doing this, we create a copy in backup if a backup isn't found. + This allows us to always read (from backup) and write same-sized checkpoint files. + """ + + # Create a backup_checkpoints dir + backup_dir = os.path.join(save_dir, "change_name_scope_backups") + tf.io.gfile.makedirs(backup_dir) + + latest_checkpoint = tf.train.latest_checkpoint(save_dir) + + if latest_checkpoint is None: + raise OSError("No checkpoints found in save_dir: %s" % save_dir) + + latest_backup_checkpoint = tf.train.latest_checkpoint(backup_dir) + + if latest_backup_checkpoint is None or ( + os.path.basename(latest_checkpoint) + != os.path.basename(latest_backup_checkpoint) + ): + backup_checkpoint(latest_checkpoint, backup_dir, empty_backup=False) + + variables = tf.train.list_variables(backup_dir) + with tf.Graph().as_default(), tf.Session().as_default() as sess: + new_variables = [] + for name, _ in variables: + var = tf.train.load_variable(backup_dir, name) + # Append both the rename and the original variable + new_variables.append( + tf.Variable(var, name=name.replace(init_scope_name, final_scope_name)) + ) + new_variables.append(tf.Variable(var, name=name)) + # Save this to the checkpoint in the save_dir + saver = tf.train.Saver(new_variables) + sess.run(tf.global_variables_initializer()) + saver.save(sess, latest_checkpoint) # pylint: disable=no-member + + +def hub_import( + input: tf.Tensor, module: str, module_name: str, trainable: bool = False +) -> tf.Tensor: + """ + Loads exported hub module. + + Args: + input: + input to hub module + module: + module path + module_name: + signature of the exported hub module + + Returns: + output of the hub module + """ + import tensorflow_hub as hub # noqa: F402 + + hub_module = hub.Module(module, trainable=trainable) + output = hub_module(input, signature=module_name) + return output + + +def _extract_hash_space_bits( + feature_config: twml.contrib.feature_config.FeatureConfig, +) -> Dict[str, int]: + """ + Extract Sparse Shapes for contrib.FeatureConfig. + + Args: + feature_config: + Feature Configuration of the type contrib.FeatureConfig + + Returns: + Dictionary of tensor names and hash space bits.
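A sketch of applying a previously exported module with `hub_import` (the module path and signature name are hypothetical):

```python
import tensorflow.compat.v1 as tf

inputs = tf.placeholder(tf.float32, shape=[None, 8])
embedding = hub_import(
    inputs,
    module="hdfs://default/user/me/exports/my_encoder",
    module_name="my_encoder",
)
```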
+    """
+    if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
+        fc_type = type(feature_config)
+        raise TypeError(
+            f"Feature config must be of type contrib.FeatureConfig: {fc_type}"
+        )
+    sparse_shapes_dict = {}
+    for config in feature_config.sparse_extraction_configs:
+        sparse_shapes_dict[config.output_name] = config.hash_space_bits
+    return sparse_shapes_dict
+
+
+def fix_shape_sparse(
+    features: dict, feature_config: twml.contrib.feature_config.FeatureConfig
+) -> None:
+    """
+    Modifies the shape of features which are extracted using the hashing trick.
+    The features dictionary is modified in place.
+
+    Args:
+      features:
+        Feature dictionary extracted by the feature config
+      feature_config:
+        Feature Configuration of the type contrib.FeatureConfig
+    """
+    if not isinstance(feature_config, twml.contrib.feature_config.FeatureConfig):
+        raise TypeError(
+            f"Feature config must be of type contrib.FeatureConfig, currently of {type(feature_config)}"
+        )
+    sparse_shape = _extract_hash_space_bits(feature_config)
+    if not isinstance(features, dict):
+        raise TypeError(f"features must be a dictionary, got {type(features)}")
+    for key in set(features) & set(sparse_shape):
+        features[key] = limit_sparse_tensor_size(
+            features[key], sparse_shape[key], mask_indices=False
+        )
+
+
+def touch_file_in_dir(directory: str, filename: str) -> None:
+    """
+    Creates a file named filename in directory.
+
+    Args:
+      directory: (str)
+        the directory in which to create the file
+      filename: (str)
+        the name of the file to create
+    """
+    file_path = os.path.join(directory, filename)
+    with tf.io.gfile.GFile(file_path, "w") as f:
+        f.write("")
 
 
 def file_exist_in_dir(directory: str, filename: str) -> bool:
-  file_path = os.path.join(directory, filename)
-  return tf.io.gfile.exists(file_path)
-
-
-def copy_to_local(remote, local, filename, overwrite=False):
-  """Function to file from remote directory to local directory."""
-  assert "hdfs://" not in local
-  tf.io.gfile.makedirs(local)
-  return tf.io.gfile.copy(
-    os.path.join(remote, filename),
-    os.path.join(local, filename),
-    overwrite=overwrite,
-  )
-
-
-def copy_recursive(src, dst, overwrite=False):
-  """
-  Function to copy a directory recursively.
-
-  Arguments:
-    src: Source directory.
-    dst: Destination directory.
-    overwrite: Specifies if files are to be overwritten if they exist.
-  """
-
-  src = src.rstrip("/")
-  dst = dst.rstrip("/")
-
-  for dirname, subdirs, files in tf.io.gfile.walk(src):
-    dst_dirname = dirname.replace(src, dst)
-    tf.io.gfile.makedirs(dst_dirname)
-
-    for f in files:
-      src_f = os.path.join(dirname, f)
-      dst_f = os.path.join(dst_dirname, f)
-
-      tf.logging.info(f"Copying {src_f} to {dst_f}")
-      tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite)
-
-
-def delete_file_or_dir(path):
-  """
-  Delete the file or directory given by `path`
-  Arguments:
-    path:
-      string indicating path of file or directory to remove
-  """
-  if tf.io.gfile.isdir(path):
-    tf.io.gfile.rmtree(path)
-  else:
-    tf.io.gfile.remove(path)
-
-
-def get_distributed_training_job_path():
-  """
-  Function to get distributed training job path.
-  Note: distributed training has three jobs, one parameter server job,
-  one worker job and one evaluator job. All of these three jobs' name
-  share a common base job name.
-  """
-  job_path = AuroraPath(dc=os.environ.get("TWML_JOB_CLUSTER"),
-                        role=os.environ.get("TWML_JOB_ROLE"),
-                        env=os.environ.get("TWML_JOB_ENV"),
-                        job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME"))
-  return job_path
-
-def do_every_n_steps(action, num_steps):
-  """
-  Execute a sequence of TensorFlow operations only once in a while.
-  Specifically, `action` is performed if `global_step` is a
-  multiple of `num_steps`
-
-  Args:
-    action: callable to be performed at regular intervals. This callable
-      must return a TF op with no output tensors.
-    num_steps: period of performing the action, as measured
-      in number of training steps
-
-  Returns:
-    A TensorFlow op with no output tensors, like a tf.print() or tf.no_op().
-    You must use tf.control_dependencies() to execute the op.
-
-  """
-  global_step = tf.train.get_or_create_global_step()
-  condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0)
-  return tf.cond(condition, action, lambda: tf.no_op())
+    """
+    Checks if a file exists in a directory.
+
+    Args:
+      directory: (str)
+        the directory where the file is located
+      filename: (str)
+        the name of the file
+
+    Returns:
+      bool: True if the file exists, False otherwise
+    """
+    file_path = os.path.join(directory, filename)
+    return tf.io.gfile.exists(file_path)
+
+
+def copy_to_local(
+    remote: str,
+    local: str,
+    filename: str,
+    overwrite: bool = False,
+) -> None:
+    """
+    Copies a file from a remote directory to a local directory.
+
+    Args:
+      remote (str): Remote directory.
+      local (str): Local directory.
+      filename (str): Name of the file to be copied.
+      overwrite (bool): Specifies if files are to be overwritten if they exist.
+    """
+    assert "hdfs://" not in local
+    tf.io.gfile.makedirs(local)
+    tf.io.gfile.copy(
+        os.path.join(remote, filename),
+        os.path.join(local, filename),
+        overwrite=overwrite,
+    )
+
+
+def copy_recursive(
+    src: str,
+    dst: str,
+    overwrite: bool = False,
+) -> None:
+    """
+    Function to copy a directory recursively.
+
+    Args:
+      src (str): Source directory.
+      dst (str): Destination directory.
+      overwrite (bool): Specifies if files are to be overwritten if they exist.
+    """
+
+    src = src.rstrip("/")
+    dst = dst.rstrip("/")
+
+    for dirname, subdirs, files in tf.io.gfile.walk(src):
+        dst_dirname = dirname.replace(src, dst)
+        tf.io.gfile.makedirs(dst_dirname)
+
+        for f in files:
+            src_f = os.path.join(dirname, f)
+            dst_f = os.path.join(dst_dirname, f)
+
+            tf.logging.info(f"Copying {src_f} to {dst_f}")
+            tf.io.gfile.copy(src_f, dst_f, overwrite=overwrite)
+
+
+def delete_file_or_dir(path: str) -> None:
+    """
+    Delete the file or directory given by `path`.
+
+    Args:
+      path (str):
+        string indicating path of file or directory to remove
+    """
+    if tf.io.gfile.isdir(path):
+        tf.io.gfile.rmtree(path)
+    else:
+        tf.io.gfile.remove(path)
+
+
+def get_distributed_training_job_path() -> AuroraPath:
+    """
+    Function to get distributed training job path.
+    Note: distributed training has three jobs: one parameter server job,
+    one worker job, and one evaluator job. All three jobs share a common
+    base job name.
+
+    Returns:
+      AuroraPath: The distributed training job path.
+    """
+    job_path = AuroraPath(
+        dc=os.environ.get("TWML_JOB_CLUSTER"),
+        role=os.environ.get("TWML_JOB_ROLE"),
+        env=os.environ.get("TWML_JOB_ENV"),
+        job_name=os.environ.get("TWML_DISTRIBUTED_BASE_JOBNAME"),
+    )
+    return job_path
+
+
+def do_every_n_steps(
+    action: Callable[[], tf.Operation],
+    num_steps: int,
+) -> tf.Operation:
+    """
+    Execute a sequence of TensorFlow operations only once in a while.
+    Specifically, `action` is performed if `global_step` is a multiple of `num_steps`.
+
+    Args:
+      action: callable to be performed at regular intervals. This callable
+        must return a TF op with no output tensors.
+      num_steps: period of performing the action, as measured
+        in number of training steps
+
+    Returns:
+      A TensorFlow op with no output tensors, like a tf.print() or tf.no_op().
+      You must use tf.control_dependencies() to execute the op.
+    """
+
+    global_step = tf.train.get_or_create_global_step()
+    condition = tf.math.equal(tf.math.floormod(global_step, num_steps), 0)
+    return tf.cond(condition, action, lambda: tf.no_op())
diff --git a/twml/twml_common/initializer.py b/twml/twml_common/initializer.py
index 7a9c734c7..31c3c1407 100644
--- a/twml/twml_common/initializer.py
+++ b/twml/twml_common/initializer.py
@@ -1,14 +1,18 @@
+import numpy as np
 import tensorflow.compat.v1 as tf
 
 
 class PartitionInitializer(tf.keras.initializers.Initializer):
-  """Required to initialize partitioned weight with numpy array for tests"""
+    """Required to initialize partitioned weight with numpy array for tests"""
 
-  def __init__(self, np_array):
-    self.np_array = np_array
+    def __init__(self, np_array: np.ndarray):
+        self.np_array = np_array
 
-  def __call__(self, shape, dtype=None, partition_info=None):
-    offset = partition_info.var_offset
-    ix0, ix1 = offset[0], offset[0] + shape[0]
-    iy0, iy1 = offset[1], offset[1] + shape[1]
-    return self.np_array[ix0:ix1, iy0:iy1]
+    def __call__(self, shape, dtype=None, partition_info=None) -> np.ndarray:
+        """Returns the slice of the numpy array for this partition, cast to dtype."""
+        offset = partition_info.var_offset
+        ix0, ix1 = offset[0], offset[0] + shape[0]
+        iy0, iy1 = offset[1], offset[1] + shape[1]
+        if dtype is not None:
+            # dtype may be a tf.DType; convert it to the matching numpy dtype first.
+            np_dtype = tf.as_dtype(dtype).as_numpy_dtype
+            return self.np_array[ix0:ix1, iy0:iy1].astype(np_dtype)
+        return self.np_array[ix0:ix1, iy0:iy1]
diff --git a/twml/twml_common/serialize.py b/twml/twml_common/serialize.py
index 36c53881e..e7210bc0e 100644
--- a/twml/twml_common/serialize.py
+++ b/twml/twml_common/serialize.py
@@ -2,15 +2,36 @@ from thrift.transport import TTransport
 
 
-def serialize(obj):
-  tbuf = TTransport.TMemoryBuffer()
-  iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
-  obj.write(iproto)
-  return tbuf.getvalue()
-
-
-def deserialize(record, bytes):
-  tbuf = TTransport.TMemoryBuffer(bytes)
-  iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
-  record.read(iproto)
-  return record
+def serialize(obj) -> bytes:
+    """
+    Serialize a thrift object into a byte string.
+
+    Args:
+      obj: the thrift object to serialize
+
+    Returns:
+      The serialized object as a byte string.
+    """
+    tbuf = TTransport.TMemoryBuffer()
+    iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
+    obj.write(iproto)
+    return tbuf.getvalue()
+
+
+def deserialize(record, bytes: bytes):
+    """
+    Deserialize a thrift object from a byte string.
+
+    Args:
+      record: the thrift object to deserialize into
+      bytes: the byte string to deserialize from
+
+    Returns:
+      The deserialized thrift object (the same `record` instance).
+    """
+    tbuf = TTransport.TMemoryBuffer(bytes)
+    iproto = TBinaryProtocol.TBinaryProtocol(tbuf)
+    record.read(iproto)
+    return record
diff --git a/twml/twml_common/sparse_inputs.py b/twml/twml_common/sparse_inputs.py
index b8f7939e5..fc2d61e2b 100644
--- a/twml/twml_common/sparse_inputs.py
+++ b/twml/twml_common/sparse_inputs.py
@@ -2,23 +2,59 @@ import tensorflow.compat.v1 as tf
 
 
-def create_sparse_tensor(batch_size, input_size, num_values, dtype=tf.float32):
-  random_indices = np.sort(np.random.randint(batch_size * input_size, size=num_values))
-  test_indices_i = random_indices // input_size
-  test_indices_j = random_indices % input_size
-  test_indices = np.stack([test_indices_i, test_indices_j], axis=1)
-  test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype)
-
-  return tf.SparseTensor(indices=tf.constant(test_indices),
-                         values=tf.constant(test_values),
-                         dense_shape=(batch_size, input_size))
-
-
-def create_reference_input(sparse_input, use_binary_values):
-  if use_binary_values:
-    sp_a = tf.SparseTensor(indices=sparse_input.indices,
-                           values=tf.ones_like(sparse_input.values),
-                           dense_shape=sparse_input.dense_shape)
-  else:
-    sp_a = sparse_input
-  return sp_a
+def create_sparse_tensor(
+    batch_size: int,
+    input_size: int,
+    num_values: int,
+    dtype: tf.DType = tf.float32,
+) -> tf.SparseTensor:
+    """
+    Creates a sparse tensor with `num_values` random values and a dense shape
+    of (`batch_size`, `input_size`).
+
+    Args:
+      batch_size (int): The batch size of the sparse tensor.
+      input_size (int): The input size of the sparse tensor.
+      num_values (int): The number of values in the sparse tensor.
+      dtype (tf.DType): The dtype of the sparse tensor.
+
+    Returns:
+      A sparse tensor with the given batch size, input size, and number of values.
+    """
+    random_indices = np.sort(
+        np.random.randint(batch_size * input_size, size=num_values)
+    )
+    test_indices_i = random_indices // input_size
+    test_indices_j = random_indices % input_size
+    test_indices = np.stack([test_indices_i, test_indices_j], axis=1)
+    test_values = np.random.random(num_values).astype(dtype.as_numpy_dtype)
+
+    return tf.SparseTensor(
+        indices=tf.constant(test_indices),
+        values=tf.constant(test_values),
+        dense_shape=(batch_size, input_size),
+    )
+
+
+def create_reference_input(
+    sparse_input: tf.SparseTensor, use_binary_values: bool
+) -> tf.SparseTensor:
+    """
+    Returns the sparse input unchanged, or a copy whose values are all ones
+    when `use_binary_values` is True.
+
+    Args:
+      sparse_input (tf.SparseTensor): The sparse input.
+      use_binary_values (bool): Whether to use binary values.
+
+    Returns:
+      A reference input for the sparse input.
+    """
+
+    if use_binary_values:
+        sp_a = tf.SparseTensor(
+            indices=sparse_input.indices,
+            values=tf.ones_like(sparse_input.values),
+            dense_shape=sparse_input.dense_shape,
+        )
+    else:
+        sp_a = sparse_input
+    return sp_a
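
Editor's note (not part of the diff): the sketches below show how two of the helpers touched above are intended to be used. They are illustrative only; the import paths (twml.util, twml.twml_common.sparse_inputs), the sizes, and the variable names are assumptions, and both snippets assume TF1 graph mode via tensorflow.compat.v1.

# --- Sketch 1: wiring do_every_n_steps into a train op via tf.control_dependencies ---
import tensorflow.compat.v1 as tf

from twml.util import do_every_n_steps  # assumed module path

tf.disable_eager_execution()

x = tf.placeholder(tf.float32, shape=[None, 4])
w = tf.get_variable("w", shape=[4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

global_step = tf.train.get_or_create_global_step()
minimize = tf.train.GradientDescentOptimizer(0.1).minimize(loss, global_step=global_step)

# Print a heartbeat every 100 steps. Per the docstring, the returned op only
# runs when it is attached through tf.control_dependencies().
log_op = do_every_n_steps(lambda: tf.print("step:", global_step), num_steps=100)
with tf.control_dependencies([minimize, log_op]):
    train_op = tf.no_op(name="train_op")

# --- Sketch 2: composing the sparse-input test helpers ---
from twml.twml_common.sparse_inputs import create_reference_input, create_sparse_tensor  # assumed module path

sp = create_sparse_tensor(batch_size=4, input_size=16, num_values=8)
sp_binary = create_reference_input(sp, use_binary_values=True)  # same sparsity pattern, values all ones

with tf.Session() as sess:
    # validate_indices=False because the randomly drawn indices may contain duplicates.
    dense = sess.run(tf.sparse.to_dense(sp_binary, validate_indices=False))
    print(dense.shape)  # (4, 16)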