Commit d1fb58b
Quantization tool: Allow user to override calibrator's session EP (#23559)
### Description

The quantization calibrators have `execution_providers` attributes, but there is no way for a user to provide their own providers when using the `quantize` or `quantize_static` functions. This PR adds a `calibration_providers` parameter so users can specify the execution providers to use during calibration. It is helpful when quantizing large models that are slow to calibrate on the CPU.

- Chose `calibration_providers` as the name because the docstrings already mention an `execution_provider` option
  https://github.com/microsoft/onnxruntime/blob/169917b1e7f69daa687a5448526c189d1f7a4e2b/onnxruntime/python/tools/quantization/quantize.py#L204
  https://github.com/microsoft/onnxruntime/blob/169917b1e7f69daa687a5448526c189d1f7a4e2b/onnxruntime/python/tools/quantization/quantize.py#L415
  that is not present anywhere in the code.
- The name can be changed if needed (e.g. `calibrator_providers`), and/or the parameter could be made a string instead of a list of providers.
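A minimal usage sketch of the new parameter (not part of the PR itself): the model paths, input name, and data reader below are placeholders, while the call shape follows the updated `quantize_static` signature.

```python
import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantFormat, quantize_static


class DummyDataReader(CalibrationDataReader):
    """Placeholder reader; a real one yields representative model inputs."""

    def __init__(self, num_samples=8):
        # "input" and the shape are hypothetical; match your model's actual inputs.
        self._data = iter(
            [{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)} for _ in range(num_samples)]
        )

    def get_next(self):
        return next(self._data, None)


quantize_static(
    "model_fp32.onnx",
    "model_int8.onnx",
    DummyDataReader(),
    quant_format=QuantFormat.QDQ,
    # New parameter: run the calibration session on these EPs instead of the default CPU EP.
    calibration_providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
```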
1 parent 649ced4 commit d1fb58b

File tree

3 files changed: +17 −1 lines changed

onnxruntime/python/tools/quantization/calibrate.py

+4 −1

@@ -380,7 +380,7 @@ def add_reduce_min_max(tensor_name, reduce_op_name):
         else:
             raise ValueError(
                 f"Unable to guess tensor type for tensor {tensor_name!r}, "
-                f"running shape inference before quantization may resolve this issue."
+                "running shape inference before quantization may resolve this issue."
             )

         # Include axes in reduce_op when per_channel, always keeping axis=1
@@ -1177,6 +1177,7 @@ def create_calibrator(
     augmented_model_path="augmented_model.onnx",
     calibrate_method=CalibrationMethod.MinMax,
     use_external_data_format=False,
+    providers=None,
     extra_options={},  # noqa: B006
 ):
     calibrator = None
@@ -1243,6 +1244,8 @@ def create_calibrator(

     if calibrator:
         calibrator.augment_graph()
+        if providers:
+            calibrator.execution_providers = providers
         calibrator.create_inference_session()
         return calibrator
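For reference, a hedged sketch of calling `create_calibrator` directly with the new `providers` argument; the model path is a placeholder and the leading positional parameters are assumed from the existing signature.

```python
from onnxruntime.quantization.calibrate import CalibrationMethod, create_calibrator

# "model_fp32.onnx" is a placeholder path.
calibrator = create_calibrator(
    "model_fp32.onnx",
    op_types_to_calibrate=None,  # None means calibrate all supported op types
    augmented_model_path="augmented_model.onnx",
    calibrate_method=CalibrationMethod.MinMax,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
# Per the change above, the providers override replaces calibrator.execution_providers
# before create_inference_session() is called inside create_calibrator().
```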

onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py

+4

@@ -53,6 +53,7 @@ def get_qnn_qdq_config(
     weight_symmetric: bool | None = None,
     keep_removable_activations: bool = False,
     stride: int | None = None,
+    calibration_providers: list[str] | None = None,
 ) -> StaticQuantConfig:
     """
     Returns a static quantization configuration suitable for running QDQ models on QNN EP.
@@ -117,6 +118,8 @@ def get_qnn_qdq_config(
             are automatically removed if activations are asymmetrically quantized. Keeping these activations
             is necessary if optimizations or EP transformations will later remove
             QuantizeLinear/DequantizeLinear operators from the model.
+        calibration_providers: Execution providers to run the session during calibration. Default is None which uses
+            [ "CPUExecutionProvider" ].

     Returns:
         A StaticQuantConfig object
@@ -192,6 +195,7 @@ def get_qnn_qdq_config(
         op_types_to_quantize=list(op_types.difference(OP_TYPES_TO_EXCLUDE)),
         per_channel=per_channel,
         use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
+        calibration_providers=calibration_providers,
         extra_options=extra_options,
     )
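A hedged usage sketch of the QNN config path; `data_reader`, the model paths, and the chosen activation/weight types are placeholders for illustration only.

```python
from onnxruntime.quantization import QuantType
from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config
from onnxruntime.quantization.quantize import quantize

data_reader = ...  # placeholder: a CalibrationDataReader yielding representative inputs

qnn_config = get_qnn_qdq_config(
    "model_fp32.onnx",
    data_reader,
    activation_type=QuantType.QUInt16,
    weight_type=QuantType.QUInt8,
    # Forwarded to StaticQuantConfig so the calibration session runs on these EPs.
    calibration_providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
quantize("model_fp32.onnx", "model_qdq.onnx", qnn_config)
```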

onnxruntime/python/tools/quantization/quantize.py

+9

@@ -99,6 +99,7 @@ def __init__(
         per_channel=False,
         reduce_range=False,
         use_external_data_format=False,
+        calibration_providers=None,
         extra_options=None,
     ):
         """
@@ -112,6 +113,8 @@ def __init__(
             quant_format: QuantFormat{QOperator, QDQ}.
                 QOperator format quantizes the model with quantized operators directly.
                 QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
+            calibration_providers: Execution providers to run the session during calibration. Default is None which uses
+                [ "CPUExecutionProvider" ].
             extra_options:
                 key value pair dictionary for various options in different case. Current used:
                     extra.Sigmoid.nnapi = True/False  (Default is False)
@@ -219,6 +222,7 @@ def __init__(
         self.calibration_data_reader = calibration_data_reader
         self.calibrate_method = calibrate_method
         self.quant_format = quant_format
+        self.calibration_providers = calibration_providers
         self.extra_options = extra_options or {}


@@ -473,6 +477,7 @@ def quantize_static(
     nodes_to_exclude=None,
     use_external_data_format=False,
     calibrate_method=CalibrationMethod.MinMax,
+    calibration_providers=None,
     extra_options=None,
 ):
     """
@@ -520,6 +525,8 @@ def quantize_static(
             List of nodes names to exclude. The nodes in this list will be excluded from quantization
             when it is not None.
         use_external_data_format: option used for large size (>2GB) model. Set to False by default.
+        calibration_providers: Execution providers to run the session during calibration. Default is None which uses
+            [ "CPUExecutionProvider" ]
         extra_options:
             key value pair dictionary for various options in different case. Current used:
                 extra.Sigmoid.nnapi = True/False  (Default is False)
@@ -697,6 +704,7 @@ def inc_dataloader():
         augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
         calibrate_method=calibrate_method,
         use_external_data_format=use_external_data_format,
+        providers=calibration_providers,
         extra_options=calib_extra_options,
     )

@@ -890,6 +898,7 @@ def quantize(
         per_channel=quant_config.per_channel,
         reduce_range=quant_config.reduce_range,
         use_external_data_format=quant_config.use_external_data_format,
+        calibration_providers=quant_config.calibration_providers,
         extra_options=quant_config.extra_options,
     )
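The equivalent flow through `StaticQuantConfig` plus `quantize()`, which is the path patched above; again, the paths and data reader are placeholders, not values from the PR.

```python
from onnxruntime.quantization import QuantFormat, QuantType
from onnxruntime.quantization.quantize import StaticQuantConfig, quantize

data_reader = ...  # placeholder: a CalibrationDataReader instance

config = StaticQuantConfig(
    data_reader,
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
    # Stored as self.calibration_providers and passed by quantize() into quantize_static(),
    # which hands it to create_calibrator(providers=...).
    calibration_providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
quantize("model_fp32.onnx", "model_int8.onnx", config)
```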
