From b5eeaa4e479338747018b1408d5827e1c2f848c8 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 21 Oct 2024 11:37:39 +0200 Subject: [PATCH 01/43] update --- src/diffusers/loaders/gguf.py | 114 ++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 src/diffusers/loaders/gguf.py diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py new file mode 100644 index 000000000000..4d381b80766a --- /dev/null +++ b/src/diffusers/loaders/gguf.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991) +# https://github.com/99991/pygguf +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from array import array + +from tqdm import tqdm + +from ..utils import is_torch_available +from ..utils.logging import get_logger + + +TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} + +if is_torch_available(): + pass + +logger = get_logger(__name__) + + +GGUF_TO_DIFFUSERS_MAPPING = { + "ignore": { + "GGUF": { + "version": "version", + "tensor_count": "tensor_count", + "kv_count": "kv_count", + }, + "general": {"file_type": "file_type", "quantization_version": "quantization_version"}, + }, +} + + +def _gguf_parse_value(_value, data_type): + if not isinstance(data_type, list): + data_type = [data_type] + if len(data_type) == 1: + data_type = data_type[0] + array_data_type = None + else: + if data_type[0] != 9: + raise ValueError("Received multiple types, therefore expected the first type to indicate an array.") + data_type, array_data_type = data_type + + if data_type in [0, 1, 2, 3, 4, 5, 10, 11]: + _value = int(_value[0]) + elif data_type in [6, 12]: + _value = float(_value[0]) + elif data_type in [7]: + _value = bool(_value[0]) + elif data_type in [8]: + _value = array("B", list(_value)).tobytes().decode() + elif data_type in [9]: + _value = _gguf_parse_value(_value, array_data_type) + return _value + + +def read_field(reader, field): + value = reader.fields[field] + return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] + + +def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): + """ + Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed + tokenizer and config attributes. + + Args: + gguf_checkpoint_path (`str`): + The path the to GGUF file to load + return_tensors (`bool`, defaults to `True`): + Whether to read the tensors from the file and return them. Not doing so is faster + and only loads the metadata in memory. + """ + + """ + if is_gguf_available() and is_torch_available(): + from gguf import GGUFReader, dequantize + else: + logger.error( + "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " + "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." 
+ ) + raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") + """ + from gguf import GGUFReader, dequantize + + reader = GGUFReader(gguf_checkpoint_path) + fields = reader.fields + reader_keys = list(fields.keys()) + + parsed_parameters = {} + for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): + name = tensor.name + weights = dequantize(tensor.data, tensor.tensor_type) + + parsed_parameters[name] = weights + + if len(reader_keys) > 0: + logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") + + return parsed_parameters From 71897b1df13b0f4b35932181a6657d73f8156d77 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 21 Oct 2024 18:47:30 +0200 Subject: [PATCH 02/43] update --- src/diffusers/utils/import_utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f1323bf00ea4..f440bf67cb6c 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -339,6 +339,14 @@ def is_timm_available(): except importlib_metadata.PackageNotFoundError: _imageio_available = False +_is_gguf_available = importlib.util.find_spec("gguf") is not None +if _is_gguf_available: + try: + _gguf_version = importlib_metadata.version("gguf") + logger.debug(f"Successfully import gguf version {_gguf_version}") + except importlib_metadata.PackageNotFoundError: + _is_gguf_available = False + def is_torch_available(): return _torch_available @@ -460,6 +468,10 @@ def is_imageio_available(): return _imageio_available +def is_gguf_available(): + return _is_gguf_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -593,6 +605,11 @@ def is_imageio_available(): {0} requires the imageio library and ffmpeg but it was not found in your environment. You can install it with pip: `pip install imageio imageio-ffmpeg` """ +# docstyle-ignore +GGUF_IMPORT_ERROR = """ +{0} requires the gguf library but it was not found in your environment. You can install it with pip: `pip install gguf` +""" + BACKENDS_MAPPING = OrderedDict( [ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), @@ -618,6 +635,7 @@ def is_imageio_available(): ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ("imageio", (is_imageio_available, IMAGEIO_IMPORT_ERROR)), + ("gguf", (is_gguf_available, GGUF_IMPORT_ERROR)), ] ) From 89ea1eeb2a66ec55a22467bb8f54787c4c6932a1 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 24 Oct 2024 12:52:29 +0200 Subject: [PATCH 03/43] update --- src/diffusers/loaders/gguf.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py index 4d381b80766a..d85e2895f85d 100644 --- a/src/diffusers/loaders/gguf.py +++ b/src/diffusers/loaders/gguf.py @@ -14,14 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- +import torch from array import array - +import gguf from tqdm import tqdm from ..utils import is_torch_available from ..utils.logging import get_logger - +from ..utils.import_utils import is_gguf_available TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} @@ -43,6 +43,29 @@ } +class GGMLTensor(torch.Tensor): + def __init__(self, dtype, axis): + self._dtype = dtype + self._axis = axis + + @property + def axis(self): + return self._axis + + @property + def dtype(self): + return self._dtype + + def numpy(self): + return self.dequantize().cpu().numpy() + + def clone(self, *args, **kwargs): + return self + + def detach(self, *args, **kwargs): + return self + + def _gguf_parse_value(_value, data_type): if not isinstance(data_type, list): data_type = [data_type] @@ -95,7 +118,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): ) raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") """ - from gguf import GGUFReader, dequantize + if is_torch_available(): + from gguf import GGUFReader, dequantize reader = GGUFReader(gguf_checkpoint_path) fields = reader.fields @@ -104,7 +128,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): parsed_parameters = {} for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): name = tensor.name - weights = dequantize(tensor.data, tensor.tensor_type) + weights = torch.from_numpy(tensor.data) parsed_parameters[name] = weights From f0bcd94d43036610f63d84c5f65f902ceca216d8 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 24 Oct 2024 18:08:17 +0200 Subject: [PATCH 04/43] update --- .../quantizers/gguf/gguf_quantizer.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 src/diffusers/quantizers/gguf/gguf_quantizer.py diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py new file mode 100644 index 000000000000..bc6c87ca09ce --- /dev/null +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -0,0 +1,118 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from ...utils import get_module_from_name +from ..base import DiffusersQuantizer + + +if TYPE_CHECKING: + from ...models.modeling_utils import ModelMixin + +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + is_bitsandbytes_available, + is_bitsandbytes_version, + is_torch_available, + logging, +) + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class GGUFQuantizer(DiffusersQuantizer) + use_keep_in_fp32_modules = True + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + def check_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + + return + + def create_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + + old_value = getattr(module, tensor_name) + + if tensor_name == "bias": + if param_value is None: + new_value = 
old_value.to(target_device) + else: + new_value = param_value.to(target_device) + + new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad) + module._parameters[tensor_name] = new_value + return + + if not isinstance(module._parameters[tensor_name], bnb.nn.Params4bit): + raise ValueError("this function only loads `Linear4bit components`") + if ( + old_value.device == torch.device("meta") + and target_device not in ["meta", torch.device("meta")] + and param_value is None + ): + raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") + + # construct `new_value` for the module._parameters[tensor_name]: + if self.pre_quantized: + # 4bit loading. Collecting components for restoring quantized weight + # This can be expanded to make a universal call for any quantized weight loading + + if not self.is_serializable: + raise ValueError( + "Detected int4 weights but the version of bitsandbytes is not compatible with int4 serialization. " + "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." + ) + + if (param_name + ".quant_state.bitsandbytes__fp4" not in state_dict) and ( + param_name + ".quant_state.bitsandbytes__nf4" not in state_dict + ): + raise ValueError( + f"Supplied state dict for {param_name} does not contain `bitsandbytes__*` and possibly other `quantized_stats` components." + ) + + quantized_stats = {} + for k, v in state_dict.items(): + # `startswith` to counter for edge cases where `param_name` + # substring can be present in multiple places in the `state_dict` + if param_name + "." in k and k.startswith(param_name): + quantized_stats[k] = v + if unexpected_keys is not None and k in unexpected_keys: + unexpected_keys.remove(k) + + new_value = bnb.nn.Params4bit.from_prequantized( + data=param_value, + quantized_stats=quantized_stats, + requires_grad=False, + device=target_device, + ) + else: + new_value = param_value.to("cpu") + kwargs = old_value.__dict__ + new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device) + + module._parameters[tensor_name] = new_value From 60d1385876db9eb64e096855580f6cc139d811f5 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 29 Oct 2024 19:09:38 +0100 Subject: [PATCH 05/43] update --- src/diffusers/loaders/gguf.py | 12 +++--- src/diffusers/loaders/single_file_model.py | 48 +++++++++++++++++++++- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py index d85e2895f85d..f305cf8eac06 100644 --- a/src/diffusers/loaders/gguf.py +++ b/src/diffusers/loaders/gguf.py @@ -14,14 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from array import array + import gguf +import torch from tqdm import tqdm from ..utils import is_torch_available -from ..utils.logging import get_logger from ..utils.import_utils import is_gguf_available +from ..utils.logging import get_logger + TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} @@ -108,18 +110,14 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): and only loads the metadata in memory. """ - """ if is_gguf_available() and is_torch_available(): - from gguf import GGUFReader, dequantize + from gguf import GGUFReader else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. 
Please see " "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." ) raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") - """ - if is_torch_available(): - from gguf import GGUFReader, dequantize reader = GGUFReader(gguf_checkpoint_path) fields = reader.fields diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 3fe1abfbead5..fceef2cb0ca3 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -17,8 +17,10 @@ from contextlib import nullcontext from typing import Optional +import torch from huggingface_hub.utils import validate_hf_hub_args +from ..quantizers import DiffusersAutoQuantizer from ..utils import deprecate, is_accelerate_available, logging from .single_file_utils import ( SingleFileComponentError, @@ -202,6 +204,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = subfolder = kwargs.pop("subfolder", None) revision = kwargs.pop("revision", None) torch_dtype = kwargs.pop("torch_dtype", None) + quantization_config = kwargs.pop("quantization_config", None) if isinstance(pretrained_model_link_or_path_or_dict, dict): checkpoint = pretrained_model_link_or_path_or_dict @@ -216,6 +219,36 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = revision=revision, ) + pre_quantized = "quantization_config" in config and config["quantization_config"] is not None + if pre_quantized or quantization_config is not None: + if pre_quantized: + config["quantization_config"] = DiffusersAutoQuantizer.merge_quantization_configs( + config["quantization_config"], quantization_config + ) + else: + config["quantization_config"] = quantization_config + hf_quantizer = DiffusersAutoQuantizer.from_config( + config["quantization_config"], pre_quantized=pre_quantized + ) + else: + hf_quantizer = None + + if hf_quantizer is not None: + hf_quantizer.validate_environment(torch_dtype=torch_dtype) + torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) + + # Check if `_keep_in_fp32_modules` is not None + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( + (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") + ) + if use_keep_in_fp32_modules: + keep_in_fp32_modules = cls._keep_in_fp32_modules + if not isinstance(keep_in_fp32_modules, list): + keep_in_fp32_modules = [keep_in_fp32_modules] + + else: + keep_in_fp32_modules = [] + mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name] checkpoint_mapping_fn = mapping_functions["checkpoint_mapping_fn"] @@ -295,8 +328,17 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = with ctx(): model = cls.from_config(diffusers_model_config) + if hf_quantizer is not None: + hf_quantizer.preprocess_model(model=model, keep_in_fp32_modules=keep_in_fp32_modules) + if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype) + unexpected_keys = load_model_dict_into_meta( + model, + diffusers_format_checkpoint, + dtype=torch_dtype, + hf_quantizer=hf_quantizer, + keep_in_fp32_modules=keep_in_fp32_modules, + ) else: _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False) @@ -310,6 +352,10 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = f"Some weights of the model checkpoint were not used 
when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" ) + if hf_quantizer is not None: + hf_quantizer.postprocess_model(model) + model.hf_quantizer = hf_quantizer + if torch_dtype is not None: model.to(torch_dtype) From 22ed0b054a472d98e307e6760ab7fffdc1f67b9d Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 31 Oct 2024 12:23:13 +0100 Subject: [PATCH 06/43] update --- src/diffusers/loaders/gguf.py | 136 -------------------- src/diffusers/loaders/single_file_model.py | 20 ++- src/diffusers/loaders/single_file_utils.py | 9 +- src/diffusers/models/model_loading_utils.py | 83 ++++++++++++ src/diffusers/quantizers/gguf/utils.py | 6 + src/diffusers/utils/constants.py | 1 + tests/models/test_attention_processor.py | 3 + 7 files changed, 108 insertions(+), 150 deletions(-) delete mode 100644 src/diffusers/loaders/gguf.py create mode 100644 src/diffusers/quantizers/gguf/utils.py diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py deleted file mode 100644 index f305cf8eac06..000000000000 --- a/src/diffusers/loaders/gguf.py +++ /dev/null @@ -1,136 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991) -# https://github.com/99991/pygguf -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from array import array - -import gguf -import torch -from tqdm import tqdm - -from ..utils import is_torch_available -from ..utils.import_utils import is_gguf_available -from ..utils.logging import get_logger - - -TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} - -if is_torch_available(): - pass - -logger = get_logger(__name__) - - -GGUF_TO_DIFFUSERS_MAPPING = { - "ignore": { - "GGUF": { - "version": "version", - "tensor_count": "tensor_count", - "kv_count": "kv_count", - }, - "general": {"file_type": "file_type", "quantization_version": "quantization_version"}, - }, -} - - -class GGMLTensor(torch.Tensor): - def __init__(self, dtype, axis): - self._dtype = dtype - self._axis = axis - - @property - def axis(self): - return self._axis - - @property - def dtype(self): - return self._dtype - - def numpy(self): - return self.dequantize().cpu().numpy() - - def clone(self, *args, **kwargs): - return self - - def detach(self, *args, **kwargs): - return self - - -def _gguf_parse_value(_value, data_type): - if not isinstance(data_type, list): - data_type = [data_type] - if len(data_type) == 1: - data_type = data_type[0] - array_data_type = None - else: - if data_type[0] != 9: - raise ValueError("Received multiple types, therefore expected the first type to indicate an array.") - data_type, array_data_type = data_type - - if data_type in [0, 1, 2, 3, 4, 5, 10, 11]: - _value = int(_value[0]) - elif data_type in [6, 12]: - _value = float(_value[0]) - elif data_type in [7]: - _value = bool(_value[0]) - elif data_type in [8]: - _value = array("B", list(_value)).tobytes().decode() - elif data_type in [9]: - _value = _gguf_parse_value(_value, array_data_type) - return _value - - -def read_field(reader, field): - value = reader.fields[field] - return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] - - -def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): - """ - Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed - tokenizer and config attributes. - - Args: - gguf_checkpoint_path (`str`): - The path the to GGUF file to load - return_tensors (`bool`, defaults to `True`): - Whether to read the tensors from the file and return them. Not doing so is faster - and only loads the metadata in memory. - """ - - if is_gguf_available() and is_torch_available(): - from gguf import GGUFReader - else: - logger.error( - "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " - "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." 
- ) - raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") - - reader = GGUFReader(gguf_checkpoint_path) - fields = reader.fields - reader_keys = list(fields.keys()) - - parsed_parameters = {} - for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): - name = tensor.name - weights = torch.from_numpy(tensor.data) - - parsed_parameters[name] = weights - - if len(reader_keys) > 0: - logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - - return parsed_parameters diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index fceef2cb0ca3..dd00cd4c116e 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -17,6 +17,7 @@ from contextlib import nullcontext from typing import Optional +from huggingface_hub import QuestionAnsweringInput import torch from huggingface_hub.utils import validate_hf_hub_args @@ -218,18 +219,15 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = local_files_only=local_files_only, revision=revision, ) + is_gguf = "gguf_metadata" in checkpoint + gguf_metadata = checkpoint["gguf_metadata"] if is_gguf else None - pre_quantized = "quantization_config" in config and config["quantization_config"] is not None - if pre_quantized or quantization_config is not None: - if pre_quantized: - config["quantization_config"] = DiffusersAutoQuantizer.merge_quantization_configs( - config["quantization_config"], quantization_config - ) - else: - config["quantization_config"] = quantization_config - hf_quantizer = DiffusersAutoQuantizer.from_config( - config["quantization_config"], pre_quantized=pre_quantized - ) + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + if is_gguf: + quantization_config = GGUFConfig() + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=False) else: hf_quantizer = None diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 236fbd0c2295..4c50e36dc4f8 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -353,9 +353,12 @@ def load_single_file_checkpoint( checkpoint = load_state_dict(pretrained_model_link_or_path) - # some checkpoints contain the model state dict under a "state_dict" key - while "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] + if "gguf_qtypes" in checkpoint: + return checkpoint + else: + # some checkpoints contain the model state dict under a "state_dict" key + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] return checkpoint diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 5277ad2f9389..0a7522591382 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -25,6 +25,17 @@ import torch from huggingface_hub.utils import EntryNotFoundError +from diffusers.utils.constants import GGUF_FILE_EXTENSION +from array import array + +import torch +from tqdm import tqdm + +from ..utils import is_torch_available +from ..utils.import_utils import is_gguf_available +from ..utils.logging import get_logger + + from ..quantizers.quantization_config import QuantizationMethod from ..utils import ( SAFE_WEIGHTS_INDEX_NAME, @@ -140,6 +151,8 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[ 
file_extension = os.path.basename(checkpoint_file).split(".")[-1] if file_extension == SAFETENSORS_FILE_EXTENSION: return safetensors.torch.load_file(checkpoint_file, device="cpu") + elif file_extension == GGUF_FILE_EXTENSION: + return load_gguf_checkpoint(checkpoint_file) else: weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {} return torch.load( @@ -389,3 +402,73 @@ def _fetch_index_file_legacy( index_file = None return index_file + + +def _gguf_parse_value(_value, data_type): + if not isinstance(data_type, list): + data_type = [data_type] + if len(data_type) == 1: + data_type = data_type[0] + array_data_type = None + else: + if data_type[0] != 9: + raise ValueError("Received multiple types, therefore expected the first type to indicate an array.") + data_type, array_data_type = data_type + + if data_type in [0, 1, 2, 3, 4, 5, 10, 11]: + _value = int(_value[0]) + elif data_type in [6, 12]: + _value = float(_value[0]) + elif data_type in [7]: + _value = bool(_value[0]) + elif data_type in [8]: + _value = array("B", list(_value)).tobytes().decode() + elif data_type in [9]: + _value = _gguf_parse_value(_value, array_data_type) + return _value + + +def read_field(reader, field): + value = reader.fields[field] + return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] + + +def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): + """ + Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed + tokenizer and config attributes. + + Args: + gguf_checkpoint_path (`str`): + The path the to GGUF file to load + return_tensors (`bool`, defaults to `True`): + Whether to read the tensors from the file and return them. Not doing so is faster + and only loads the metadata in memory. + """ + + if is_gguf_available() and is_torch_available(): + from gguf import GGUFReader + else: + logger.error( + "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " + "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." 
+ ) + raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") + + reader = GGUFReader(gguf_checkpoint_path) + fields = reader.fields + reader_keys = list(fields.keys()) + + parsed_parameters = {} + qtypes = {} + for tensor in tqdm(reader.tensors): + name = tensor.name + weights = torch.from_numpy(tensor.data) + + parsed_parameters[name] = weights + qtypes[name] = str(tensor.tensor_type) + + if len(reader_keys) > 0: + logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") + + return {"state_dict": parsed_parameters, "gguf_metadata": qtypes} diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py new file mode 100644 index 000000000000..1be035fe478a --- /dev/null +++ b/src/diffusers/quantizers/gguf/utils.py @@ -0,0 +1,6 @@ +import torch + + +class GGUFParameter(torch.nn.Parameter): + def __init__(self, data): + super().__init__() diff --git a/src/diffusers/utils/constants.py b/src/diffusers/utils/constants.py index 553ac5d1bb27..93b0cd847d91 100644 --- a/src/diffusers/utils/constants.py +++ b/src/diffusers/utils/constants.py @@ -34,6 +34,7 @@ SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors" SAFE_WEIGHTS_INDEX_NAME = "diffusion_pytorch_model.safetensors.index.json" SAFETENSORS_FILE_EXTENSION = "safetensors" +GGUF_FILE_EXTENSION = "gguf" ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb" HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co") DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py index 2489604274b4..c1432fee5211 100644 --- a/tests/models/test_attention_processor.py +++ b/tests/models/test_attention_processor.py @@ -6,6 +6,7 @@ from diffusers import DiffusionPipeline from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor +import pytest class AttnAddedKVProcessorTests(unittest.TestCase): @@ -83,6 +84,7 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None ) + torch.cuda.synchronize() pre_conversion = pipe( "foo", @@ -95,6 +97,7 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", device_map="balanced", safety_checker=None ) + torch.cuda.synchronize() conversion = pipe( "foo", From 2e6d3405e3404b9f91b9de880e6854594cc25bf0 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Sun, 3 Nov 2024 09:40:38 +0100 Subject: [PATCH 07/43] update --- src/diffusers/loaders/single_file_model.py | 3 +- src/diffusers/models/model_loading_utils.py | 24 +++- .../quantizers/gguf/gguf_quantizer.py | 105 ++++++------------ src/diffusers/quantizers/gguf/utils.py | 55 ++++++++- .../quantizers/quantization_config.py | 14 ++- 5 files changed, 119 insertions(+), 82 deletions(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index dd00cd4c116e..2c04ba58bafa 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -222,11 +222,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = is_gguf = "gguf_metadata" in checkpoint gguf_metadata = checkpoint["gguf_metadata"] if is_gguf else None + # For GGUF models we nest the state_dict along with gguf_metadata while "state_dict" in checkpoint: checkpoint = checkpoint["state_dict"] 
if is_gguf: - quantization_config = GGUFConfig() + quantization_config = GGUFQuantizationConfig() hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=False) else: hf_quantizer = None diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 0a7522591382..c650f35076d5 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -60,6 +60,25 @@ } +_GGUF_FILE_TYPE_MAPPING = { + 0: "ALL_F32", + 1: "MOSTLY_F16", + 2: "MOSTLY_Q4_0", + 3: "MOSTLY_Q4_1", + 4: "MOSTLY_Q4_1_SOME_F16", + 8: "MOSTLY_Q5_0", + 9: "MOSTLY_Q5_1", + 10: "MOSTLY_Q2_K", + 11: "MOSTLY_Q3_K_S", + 12: "MOSTLY_Q3_K_M", + 13: "MOSTLY_Q3_K_L", + 14: "MOSTLY_Q4_K_S", + 15: "MOSTLY_Q4_K_M", + 16: "MOSTLY_Q5_K_S", + 17: "MOSTLY_Q5_K_M", + 18: "MOSTLY_Q6_K", +} + if is_accelerate_available(): from accelerate import infer_auto_device_map from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device @@ -460,15 +479,14 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - qtypes = {} + metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]]} for tensor in tqdm(reader.tensors): name = tensor.name weights = torch.from_numpy(tensor.data) parsed_parameters[name] = weights - qtypes[name] = str(tensor.tensor_type) if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - return {"state_dict": parsed_parameters, "gguf_metadata": qtypes} + return {"state_dict": parsed_parameters, "gguf_metadata": metadata} diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index bc6c87ca09ce..eef3f7e43ca4 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,8 +1,9 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + from ...utils import get_module_from_name from ..base import DiffusersQuantizer - +from .utils import GGUFLinear if TYPE_CHECKING: from ...models.modeling_utils import ModelMixin @@ -10,25 +11,28 @@ from ...utils import ( is_accelerate_available, is_accelerate_version, - is_bitsandbytes_available, - is_bitsandbytes_version, is_torch_available, logging, ) +if accelerate_is_available(): + from accelerate import init_empty_weights if is_torch_available(): import torch + import torch.nn as nn -logger = logging.get_logger(__name__) +logger = logging.get_logger(__name__) -class GGUFQuantizer(DiffusersQuantizer) - use_keep_in_fp32_modules = True +class GGUFQuantizer(DiffusersQuantizer): def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) + self.quant_type = quantization_config.quant_type + self.compute_dtype = quantization_config.compute_dtype + def check_quantized_param( self, model: "ModelMixin", @@ -36,8 +40,7 @@ def check_quantized_param( param_name: str, state_dict: Dict[str, Any], **kwargs, - ) -> bool: - + ) -> bool: return def create_quantized_param( @@ -49,70 +52,24 @@ def create_quantized_param( state_dict: Dict[str, Any], unexpected_keys: Optional[List[str]] = None, ): - import bitsandbytes as bnb - - module, tensor_name = get_module_from_name(model, param_name) - - if tensor_name not in module._parameters: - raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - - old_value = getattr(module, 
tensor_name) - - if tensor_name == "bias": - if param_value is None: - new_value = old_value.to(target_device) - else: - new_value = param_value.to(target_device) - - new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad) - module._parameters[tensor_name] = new_value - return - - if not isinstance(module._parameters[tensor_name], bnb.nn.Params4bit): - raise ValueError("this function only loads `Linear4bit components`") - if ( - old_value.device == torch.device("meta") - and target_device not in ["meta", torch.device("meta")] - and param_value is None - ): - raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") - - # construct `new_value` for the module._parameters[tensor_name]: - if self.pre_quantized: - # 4bit loading. Collecting components for restoring quantized weight - # This can be expanded to make a universal call for any quantized weight loading - - if not self.is_serializable: - raise ValueError( - "Detected int4 weights but the version of bitsandbytes is not compatible with int4 serialization. " - "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." - ) - - if (param_name + ".quant_state.bitsandbytes__fp4" not in state_dict) and ( - param_name + ".quant_state.bitsandbytes__nf4" not in state_dict - ): - raise ValueError( - f"Supplied state dict for {param_name} does not contain `bitsandbytes__*` and possibly other `quantized_stats` components." - ) - - quantized_stats = {} - for k, v in state_dict.items(): - # `startswith` to counter for edge cases where `param_name` - # substring can be present in multiple places in the `state_dict` - if param_name + "." in k and k.startswith(param_name): - quantized_stats[k] = v - if unexpected_keys is not None and k in unexpected_keys: - unexpected_keys.remove(k) - - new_value = bnb.nn.Params4bit.from_prequantized( - data=param_value, - quantized_stats=quantized_stats, - requires_grad=False, - device=target_device, - ) - else: - new_value = param_value.to("cpu") - kwargs = old_value.__dict__ - new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device) + return - module._parameters[tensor_name] = new_value + def _process_model_before_weight_loading( + self, + model: "ModelMixin", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + for name, module in model.named_children(): + if isinstance(module, nn.Linear) and name not in modules_to_not_convert: + with init_empty_weights(): + in_features = module.in_features + out_features = module.out_features + model._modules[name] = GGUFLinear( + in_features, + out_features, + module.bias is not None, + compute_dtype=self.compute_dtype, + quant_type=self.quant_type, + ) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 1be035fe478a..ac38fe63480c 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -1,6 +1,55 @@ import torch +import torch.nn as nn +import gguf -class GGUFParameter(torch.nn.Parameter): - def __init__(self, data): - super().__init__() +QK_K_BLOCKSIZE = 256 +K_SCALE_SIZE = 12 + + +def split_block_dims(blocks, *args): + n_max = blocks.shape[1] + dims = list(args) + [n_max - sum(args)] + return torch.split(blocks, dims, dim=1) + + +def dequantize_Q2_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + scales, qs, d, dmin = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K // 4, 2) + d = 
d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) + + # (n_blocks, 16, 1) + dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + + shift = torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) + + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 + qs = qs.reshape((n_blocks, QK_K_BLOCKSIZE // 16, 16)) + qs = dl * qs - ml + + return qs.reshape((n_blocks, -1)) + + +class GGUFLinear(nn.Linear): + def __init__( + self, + in_features, + out_features, + bias=False, + compute_dtype=None, + quant_type=None, + device=None, + ) -> None: + super().__init__(in_features, out_features, bias, device) + self._dequant_fn = gguf.quants.dequantize + self.compute_dtype = compute_dtype + self.quant_type = quant_type + + def forward(self, inputs): + weight = self._dequant_fn(self.weight, self.quant_type).to(self.compute_dtype) + bias = self._dequant_fn(self.bias, self.quant_type).to(self.compute_dtype) + + return torch.nn.functional.linear(inputs, weight, bias) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index f521c5d717d6..213c5ae57da6 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -32,7 +32,6 @@ from ..utils import is_torch_available, logging - if is_torch_available(): import torch @@ -389,3 +388,16 @@ def to_diff_dict(self) -> Dict[str, Any]: serializable_config_dict[key] = value return serializable_config_dict + + +class GGUFQuantizationConfig(QuantizationConfigMixin): + def __init__(self, quant_type: str, compute_dtype=None, quant_storage=None): + self.quant_type = quant_type + self.compute_dtype = compute_dtype + self.quant_storage = quant_storage + + if self.compute_dtype is None: + self.compute_dtype = torch.float32 + + if self.quant_storage is None: + self.quant_storage = torch.uint8 From b5f927c2cda3de767aa5028925fb636baa83405f Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 11 Nov 2024 21:37:33 +0530 Subject: [PATCH 08/43] update --- src/diffusers/loaders/single_file_model.py | 14 +++++++------- src/diffusers/quantizers/gguf/gguf_quantizer.py | 17 +++++++++++------ src/diffusers/quantizers/gguf/utils.py | 16 ++++++++-------- src/diffusers/utils/__init__.py | 1 + 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 2c04ba58bafa..f3e163126c43 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -22,7 +22,7 @@ from huggingface_hub.utils import validate_hf_hub_args from ..quantizers import DiffusersAutoQuantizer -from ..utils import deprecate, is_accelerate_available, logging +from ..utils import deprecate, is_accelerate_available, is_gguf_available, logging from .single_file_utils import ( SingleFileComponentError, convert_animatediff_checkpoint_to_diffusers, @@ -49,6 +49,9 @@ from ..models.modeling_utils import load_model_dict_into_meta +if is_gguf_available(): + from ..quantizers import GGUFQuantizationConfig + SINGLE_FILE_LOADABLE_CLASSES = { "StableCascadeUNet": { @@ -227,15 +230,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = checkpoint = checkpoint["state_dict"] if is_gguf: - quantization_config = GGUFQuantizationConfig() - hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=False) + 
quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) + # Only support loading pre_quantized gguf checkpoints + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) else: hf_quantizer = None - if hf_quantizer is not None: - hf_quantizer.validate_environment(torch_dtype=torch_dtype) - torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) - # Check if `_keep_in_fp32_modules` is not None use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index eef3f7e43ca4..bb4607094a8c 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -15,7 +15,7 @@ logging, ) -if accelerate_is_available(): +if is_accelerate_available(): from accelerate import init_empty_weights if is_torch_available(): @@ -52,6 +52,13 @@ def create_quantized_param( state_dict: Dict[str, Any], unexpected_keys: Optional[List[str]] = None, ): + module, tensor_name = get_module_from_name(model, param_name) + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + __import__("ipdb").set_trace() + + module._parameters[tensor_name] = param_value + return def _process_model_before_weight_loading( @@ -62,13 +69,11 @@ def _process_model_before_weight_loading( **kwargs, ): for name, module in model.named_children(): - if isinstance(module, nn.Linear) and name not in modules_to_not_convert: + if isinstance(module, nn.Linear) and name not in self.modules_to_not_convert: with init_empty_weights(): - in_features = module.in_features - out_features = module.out_features model._modules[name] = GGUFLinear( - in_features, - out_features, + module.in_features, + module.out_features, module.bias is not None, compute_dtype=self.compute_dtype, quant_type=self.quant_type, diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index ac38fe63480c..6923865ecb05 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,20 +13,20 @@ def split_block_dims(blocks, *args): return torch.split(blocks, dims, dim=1) -def dequantize_Q2_K(blocks, block_size, type_size, dtype=None): +def dequantize_Q2_K(blocks, dtype=None): n_blocks = blocks.shape[0] - scales, qs, d, dmin = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K // 4, 2) - d = d.view(torch.float16).to(dtype) - dmin = dmin.view(torch.float16).to(dtype) + scales, quantized_values, delta, delta_min = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K_BLOCKSIZE // 4, 2) + delta = delta.view(torch.float16).to(dtype) + delta_min = delta_min.view(torch.float16).to(dtype) # (n_blocks, 16, 1) - dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) - ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + dl = (delta * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + ml = (delta_min * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) - shift = torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) + shift = torch.tensor([0, 2, 4, 6], device=delta.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) - qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 + qs = (quantized_values.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 qs 
= qs.reshape((n_blocks, QK_K_BLOCKSIZE // 16, 16)) qs = dl * qs - ml diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index c8f64adf3e8a..da2cd55afa03 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -93,6 +93,7 @@ is_unidecode_available, is_wandb_available, is_xformers_available, + is_gguf_available, requires_backends, ) from .loading_utils import get_module_from_name, load_image, load_video From 6dc5d225a4545b86b19a0b8db0eebd9ade4a48b7 Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 13 Nov 2024 16:57:33 +0530 Subject: [PATCH 09/43] update --- src/diffusers/loaders/single_file_model.py | 43 +++++---- src/diffusers/loaders/single_file_utils.py | 2 +- src/diffusers/models/model_loading_utils.py | 51 ++++------- src/diffusers/quantizers/auto.py | 2 + src/diffusers/quantizers/gguf/__init__.py | 1 + .../quantizers/gguf/gguf_quantizer.py | 55 ++++++++---- src/diffusers/quantizers/gguf/utils.py | 88 +++++++++++++++++-- .../quantizers/quantization_config.py | 6 +- src/diffusers/utils/__init__.py | 2 +- 9 files changed, 169 insertions(+), 81 deletions(-) create mode 100644 src/diffusers/quantizers/gguf/__init__.py diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index f3e163126c43..7ce4460eb674 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -17,7 +17,6 @@ from contextlib import nullcontext from typing import Optional -from huggingface_hub import QuestionAnsweringInput import torch from huggingface_hub.utils import validate_hf_hub_args @@ -50,7 +49,7 @@ from ..models.modeling_utils import load_model_dict_into_meta if is_gguf_available(): - from ..quantizers import GGUFQuantizationConfig + from ..quantizers.quantization_config import GGUFQuantizationConfig SINGLE_FILE_LOADABLE_CLASSES = { @@ -229,25 +228,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = while "state_dict" in checkpoint: checkpoint = checkpoint["state_dict"] - if is_gguf: - quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) - # Only support loading pre_quantized gguf checkpoints - hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) - else: - hf_quantizer = None - - # Check if `_keep_in_fp32_modules` is not None - use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( - (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") - ) - if use_keep_in_fp32_modules: - keep_in_fp32_modules = cls._keep_in_fp32_modules - if not isinstance(keep_in_fp32_modules, list): - keep_in_fp32_modules = [keep_in_fp32_modules] - - else: - keep_in_fp32_modules = [] - mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name] checkpoint_mapping_fn = mapping_functions["checkpoint_mapping_fn"] @@ -327,8 +307,27 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = with ctx(): model = cls.from_config(diffusers_model_config) + if is_gguf: + quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) + # Only support loading pre_quantized gguf checkpoints + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) + else: + hf_quantizer = None + + # Check if `_keep_in_fp32_modules` is not None + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( + (torch_dtype == torch.float16) or hasattr(hf_quantizer, 
"use_keep_in_fp32_modules") + ) + if use_keep_in_fp32_modules: + keep_in_fp32_modules = cls._keep_in_fp32_modules + if not isinstance(keep_in_fp32_modules, list): + keep_in_fp32_modules = [keep_in_fp32_modules] + + else: + keep_in_fp32_modules = [] + if hf_quantizer is not None: - hf_quantizer.preprocess_model(model=model, keep_in_fp32_modules=keep_in_fp32_modules) + hf_quantizer.preprocess_model(model=model, device_map=None, keep_in_fp32_modules=keep_in_fp32_modules) if is_accelerate_available(): unexpected_keys = load_model_dict_into_meta( diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index c2c8496af5d9..a24317783e8e 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -357,7 +357,7 @@ def load_single_file_checkpoint( checkpoint = load_state_dict(pretrained_model_link_or_path) - if "gguf_qtypes" in checkpoint: + if "gguf_metadata" in checkpoint: return checkpoint else: # some checkpoints contain the model state dict under a "state_dict" key diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 3f7be9ff246a..0a6f24865c69 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -17,6 +17,7 @@ import importlib import inspect import os +from array import array from collections import OrderedDict from pathlib import Path from typing import List, Optional, Union @@ -24,17 +25,9 @@ import safetensors import torch from huggingface_hub.utils import EntryNotFoundError - -from diffusers.utils.constants import GGUF_FILE_EXTENSION -from array import array - -import torch from tqdm import tqdm -from ..utils import is_torch_available -from ..utils.import_utils import is_gguf_available -from ..utils.logging import get_logger - +from diffusers.utils.constants import GGUF_FILE_EXTENSION from ..quantizers.quantization_config import QuantizationMethod from ..utils import ( @@ -45,9 +38,11 @@ _get_model_file, deprecate, is_accelerate_available, + is_torch_available, is_torch_version, logging, ) +from ..utils.import_utils import is_gguf_available logger = logging.get_logger(__name__) @@ -60,25 +55,6 @@ } -_GGUF_FILE_TYPE_MAPPING = { - 0: "ALL_F32", - 1: "MOSTLY_F16", - 2: "MOSTLY_Q4_0", - 3: "MOSTLY_Q4_1", - 4: "MOSTLY_Q4_1_SOME_F16", - 8: "MOSTLY_Q5_0", - 9: "MOSTLY_Q5_1", - 10: "MOSTLY_Q2_K", - 11: "MOSTLY_Q3_K_S", - 12: "MOSTLY_Q3_K_M", - 13: "MOSTLY_Q3_K_L", - 14: "MOSTLY_Q4_K_S", - 15: "MOSTLY_Q4_K_M", - 16: "MOSTLY_Q5_K_S", - 17: "MOSTLY_Q5_K_M", - 18: "MOSTLY_Q6_K", -} - if is_accelerate_available(): from accelerate import infer_auto_device_map from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device @@ -245,12 +221,13 @@ def load_model_dict_into_meta( # bnb params are flattened. if empty_state_dict[param_name].shape != param.shape: if ( - is_quant_method_bnb + is_quantized and hf_quantizer.pre_quantized and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device) ): hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape) - elif not is_quant_method_bnb: + else: + __import__('ipdb').set_trace() model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" raise ValueError( f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. 
If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." @@ -473,7 +450,10 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): """ if is_gguf_available() and is_torch_available(): + import gguf from gguf import GGUFReader + + from ..quantizers.gguf.utils import _GGUF_FILE_TYPE_MAPPING, GGUFParameter else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " @@ -486,12 +466,17 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]]} + metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]], "qtypes": {}} + for tensor in tqdm(reader.tensors): name = tensor.name - weights = torch.from_numpy(tensor.data) + tensor_type = tensor.tensor_type - parsed_parameters[name] = weights + # if the tensor is a torch supported dtype do not use GGUFParameter + is_gguf_quant = tensor_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] + + weights = torch.from_numpy(tensor.data) + parsed_parameters[name] = GGUFParameter(weights, tensor_type=tensor_type) if is_gguf_quant else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 97cbcdc0e53f..02a8b4fe917c 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -19,12 +19,14 @@ from typing import Dict, Optional, Union from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer +from .gguf import GGUFQuantizer from .quantization_config import BitsAndBytesConfig, QuantizationConfigMixin, QuantizationMethod AUTO_QUANTIZER_MAPPING = { "bitsandbytes_4bit": BnB4BitDiffusersQuantizer, "bitsandbytes_8bit": BnB8BitDiffusersQuantizer, + "gguf": GGUFQuantizer } AUTO_QUANTIZATION_CONFIG_MAPPING = { diff --git a/src/diffusers/quantizers/gguf/__init__.py b/src/diffusers/quantizers/gguf/__init__.py new file mode 100644 index 000000000000..b3d9082ac803 --- /dev/null +++ b/src/diffusers/quantizers/gguf/__init__.py @@ -0,0 +1 @@ +from .gguf_quantizer import GGUFQuantizer diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index bb4607094a8c..02d01e179676 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,26 +1,25 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union - +from typing import TYPE_CHECKING, Any, Dict, List, Optional from ...utils import get_module_from_name from ..base import DiffusersQuantizer -from .utils import GGUFLinear +from .utils import _replace_with_gguf_linear + if TYPE_CHECKING: from ...models.modeling_utils import ModelMixin from ...utils import ( is_accelerate_available, - is_accelerate_version, is_torch_available, logging, ) + if is_accelerate_available(): - from accelerate import init_empty_weights + pass if is_torch_available(): import torch - import torch.nn as nn logger = logging.get_logger(__name__) @@ -32,6 +31,8 @@ def 
__init__(self, quantization_config, **kwargs): self.quant_type = quantization_config.quant_type self.compute_dtype = quantization_config.compute_dtype + self.qtypes = quantization_config.qtypes + self.pre_quantized = True def check_quantized_param( self, @@ -41,7 +42,20 @@ def check_quantized_param( state_dict: Dict[str, Any], **kwargs, ) -> bool: - return + return True + + def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): + return True + + def check_if_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + return True def create_quantized_param( self, @@ -55,7 +69,9 @@ def create_quantized_param( module, tensor_name = get_module_from_name(model, param_name) if tensor_name not in module._parameters: raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - __import__("ipdb").set_trace() + + if param_name == "transformer_blocks.0.attn.to_q.weight": + __import__("ipdb").set_trace() module._parameters[tensor_name] = param_value @@ -68,13 +84,16 @@ def _process_model_before_weight_loading( keep_in_fp32_modules: List[str] = [], **kwargs, ): - for name, module in model.named_children(): - if isinstance(module, nn.Linear) and name not in self.modules_to_not_convert: - with init_empty_weights(): - model._modules[name] = GGUFLinear( - module.in_features, - module.out_features, - module.bias is not None, - compute_dtype=self.compute_dtype, - quant_type=self.quant_type, - ) + model = _replace_with_gguf_linear(model, self.compute_dtype, self.quant_type) + + def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): + return model + + @property + def is_serializable(self): + return False + + @property + def is_trainable(self) -> bool: + # Because we're mandating `bitsandbytes` 0.43.3. 
+ return False diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 6923865ecb05..d3d71d00507f 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -1,7 +1,26 @@ import torch +from torch._prims_common import is_low_precision_dtype import torch.nn as nn import gguf +_GGUF_FILE_TYPE_MAPPING = { + 0: "ALL_F32", + 1: "MOSTLY_F16", + 2: "MOSTLY_Q4_0", + 3: "MOSTLY_Q4_1", + 4: "MOSTLY_Q4_1_SOME_F16", + 8: "MOSTLY_Q5_0", + 9: "MOSTLY_Q5_1", + 10: "MOSTLY_Q2_K", + 11: "MOSTLY_Q3_K_S", + 12: "MOSTLY_Q3_K_M", + 13: "MOSTLY_Q3_K_L", + 14: "MOSTLY_Q4_K_S", + 15: "MOSTLY_Q4_K_M", + 16: "MOSTLY_Q5_K_S", + 17: "MOSTLY_Q5_K_M", + 18: "MOSTLY_Q6_K", +} QK_K_BLOCKSIZE = 256 K_SCALE_SIZE = 12 @@ -33,6 +52,62 @@ def dequantize_Q2_K(blocks, dtype=None): return qs.reshape((n_blocks, -1)) +dequantize_fns = { + "MOSTLY_Q2_K": dequantize_Q2_K, +} + + +def _replace_with_gguf_linear(model, compute_dtype, quant_type, qtypes=None): + for name, module in model.named_children(): + if isinstance(module, nn.Linear): + model._modules[name] = GGUFLinear( + module.in_features, + module.out_features, + module.bias is not None, + compute_dtype=compute_dtype, + quant_type=quant_type, + ) + model._modules[name].source_cls = type(module) + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + + has_children = list(module.children()) + if has_children: + _replace_with_gguf_linear(module, compute_dtype, quant_type) + + return model + + +class GGUFParameter(torch.nn.Parameter): + def __new__(cls, data, requires_grad=False, tensor_type=None): + data = data if data is not None else torch.empty(0) + self = torch.Tensor._make_subclass(cls, data, requires_grad) + self.tensor_type = tensor_type + + return self + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + result = super().__torch_function__(func, types, args, kwargs) + + tensor_type = None + for arg in args: + if isinstance(arg, GGUFParameter): + tensor_type = arg.tensor_type + break + if isinstance(result, torch.Tensor): + return cls(result, tensor_type=tensor_type) + # Handle tuples and lists + elif isinstance(result, (tuple, list)): + # Preserve the original type (tuple or list) + wrapped = [cls(x, tensor_type=tensor_type) if isinstance(x, torch.Tensor) else x for x in result] + return type(result)(wrapped) + else: + return result + + class GGUFLinear(nn.Linear): def __init__( self, @@ -44,12 +119,15 @@ def __init__( device=None, ) -> None: super().__init__(in_features, out_features, bias, device) - self._dequant_fn = gguf.quants.dequantize self.compute_dtype = compute_dtype self.quant_type = quant_type + self._dequant_fn = dequantize_fns[self.quant_type] def forward(self, inputs): - weight = self._dequant_fn(self.weight, self.quant_type).to(self.compute_dtype) - bias = self._dequant_fn(self.bias, self.quant_type).to(self.compute_dtype) - - return torch.nn.functional.linear(inputs, weight, bias) + is_gguf_quant = hasattr(self.weight, "tensor_type") + if is_gguf_quant: + weight = self._dequant_fn(self.weight, torch.uint8).to(self.compute_dtype) + else: + weight = self.weight + __import__("ipdb").set_trace() + return torch.nn.functional.linear(inputs, weight, self.bias) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 213c5ae57da6..05cd997900eb 100644 --- a/src/diffusers/quantizers/quantization_config.py 
+++ b/src/diffusers/quantizers/quantization_config.py @@ -32,6 +32,7 @@ from ..utils import is_torch_available, logging + if is_torch_available(): import torch @@ -40,6 +41,7 @@ class QuantizationMethod(str, Enum): BITS_AND_BYTES = "bitsandbytes" + GGUF = "gguf" @dataclass @@ -391,10 +393,12 @@ def to_diff_dict(self) -> Dict[str, Any]: class GGUFQuantizationConfig(QuantizationConfigMixin): - def __init__(self, quant_type: str, compute_dtype=None, quant_storage=None): + def __init__(self, quant_type: str, qtypes=None, compute_dtype=None, quant_storage=None): + self.quant_method = QuantizationMethod.GGUF self.quant_type = quant_type self.compute_dtype = compute_dtype self.quant_storage = quant_storage + self.qtypes = qtypes if self.compute_dtype is None: self.compute_dtype = torch.float32 diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index da2cd55afa03..24e324ac4382 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -66,6 +66,7 @@ is_bs4_available, is_flax_available, is_ftfy_available, + is_gguf_available, is_google_colab, is_inflect_available, is_invisible_watermark_available, @@ -93,7 +94,6 @@ is_unidecode_available, is_wandb_available, is_xformers_available, - is_gguf_available, requires_backends, ) from .loading_utils import get_module_from_name, load_image, load_video From 428e44be60c633395363bfa37043ea45a8d13dcd Mon Sep 17 00:00:00 2001 From: DN6 Date: Fri, 15 Nov 2024 16:11:46 +0530 Subject: [PATCH 10/43] update --- src/diffusers/__init__.py | 2 +- src/diffusers/loaders/single_file_model.py | 22 +- src/diffusers/loaders/single_file_utils.py | 9 +- src/diffusers/models/model_loading_utils.py | 32 +- src/diffusers/quantizers/auto.py | 2 +- .../quantizers/gguf/gguf_quantizer.py | 30 +- src/diffusers/quantizers/gguf/utils.py | 350 +++++++++++++++--- .../quantizers/quantization_config.py | 4 +- src/diffusers/utils/__init__.py | 1 + 9 files changed, 339 insertions(+), 113 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 533aa5de1e87..a21f44982a61 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -31,7 +31,7 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], - "quantizers.quantization_config": ["BitsAndBytesConfig"], + "quantizers.quantization_config": ["BitsAndBytesConfig", "GGUFQuantizationConfig"], "schedulers": [], "utils": [ "OptionalDependencyNotAvailable", diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 7ce4460eb674..b27ce1c4c84d 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -21,7 +21,7 @@ from huggingface_hub.utils import validate_hf_hub_args from ..quantizers import DiffusersAutoQuantizer -from ..utils import deprecate, is_accelerate_available, is_gguf_available, logging +from ..utils import deprecate, is_accelerate_available, logging from .single_file_utils import ( SingleFileComponentError, convert_animatediff_checkpoint_to_diffusers, @@ -48,9 +48,6 @@ from ..models.modeling_utils import load_model_dict_into_meta -if is_gguf_available(): - from ..quantizers.quantization_config import GGUFQuantizationConfig - SINGLE_FILE_LOADABLE_CLASSES = { "StableCascadeUNet": { @@ -221,12 +218,11 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = local_files_only=local_files_only, revision=revision, ) - is_gguf = "gguf_metadata" in checkpoint - gguf_metadata = checkpoint["gguf_metadata"] if 
is_gguf else None + if quantization_config is not None: + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config) - # For GGUF models we nest the state_dict along with gguf_metadata - while "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] + else: + hf_quantizer = None mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name] @@ -307,13 +303,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = with ctx(): model = cls.from_config(diffusers_model_config) - if is_gguf: - quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) - # Only support loading pre_quantized gguf checkpoints - hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) - else: - hf_quantizer = None - # Check if `_keep_in_fp32_modules` is not None use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") @@ -352,7 +341,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = if hf_quantizer is not None: hf_quantizer.postprocess_model(model) - model.hf_quantizer = hf_quantizer if torch_dtype is not None: model.to(torch_dtype) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index a24317783e8e..d1bad8b5a7cd 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -357,12 +357,9 @@ def load_single_file_checkpoint( checkpoint = load_state_dict(pretrained_model_link_or_path) - if "gguf_metadata" in checkpoint: - return checkpoint - else: - # some checkpoints contain the model state dict under a "state_dict" key - while "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] + # some checkpoints contain the model state dict under a "state_dict" key + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] return checkpoint diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 0a6f24865c69..e2e7ec83ff2a 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -27,10 +27,8 @@ from huggingface_hub.utils import EntryNotFoundError from tqdm import tqdm -from diffusers.utils.constants import GGUF_FILE_EXTENSION - -from ..quantizers.quantization_config import QuantizationMethod from ..utils import ( + GGUF_FILE_EXTENSION, SAFE_WEIGHTS_INDEX_NAME, SAFETENSORS_FILE_EXTENSION, WEIGHTS_INDEX_NAME, @@ -188,7 +186,6 @@ def load_model_dict_into_meta( device = device or torch.device("cpu") dtype = dtype or torch.float32 is_quantized = hf_quantizer is not None - is_quant_method_bnb = getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys()) empty_state_dict = model.state_dict() @@ -219,6 +216,7 @@ def load_model_dict_into_meta( set_module_kwargs["dtype"] = dtype # bnb params are flattened. 
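
Aside on the shape check being relaxed here: GGUF tensors arrive as packed bytes, so their on-disk shape differs from the logical weight shape by a per-type (block_size, type_size) factor. The snippet below is a small sketch of that relationship using the `gguf` package's own size table; `quant_shape_from_byte_shape` is an illustrative helper mirroring the idea, not the exact function in the diff.

```python
import gguf


def quant_shape_from_byte_shape(shape, quant_type):
    block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
    return (*shape[:-1], shape[-1] // type_size * block_size)


# A (3072, 4096) weight stored as Q4_K uses 4096 // 256 * 144 = 2304 bytes per row,
# so the raw GGUF tensor arrives with shape (3072, 2304):
print(quant_shape_from_byte_shape((3072, 2304), gguf.GGMLQuantizationType.Q4_K))  # (3072, 4096)
```
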
+ # gguf quants have a different shape based on the type of quantization applied if empty_state_dict[param_name].shape != param.shape: if ( is_quantized @@ -227,7 +225,6 @@ def load_model_dict_into_meta( ): hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape) else: - __import__('ipdb').set_trace() model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" raise ValueError( f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." @@ -438,22 +435,22 @@ def read_field(reader, field): def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): """ - Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed - tokenizer and config attributes. + Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed tokenizer and config + attributes. Args: gguf_checkpoint_path (`str`): The path the to GGUF file to load return_tensors (`bool`, defaults to `True`): - Whether to read the tensors from the file and return them. Not doing so is faster - and only loads the metadata in memory. + Whether to read the tensors from the file and return them. Not doing so is faster and only loads the + metadata in memory. """ if is_gguf_available() and is_torch_available(): import gguf from gguf import GGUFReader - from ..quantizers.gguf.utils import _GGUF_FILE_TYPE_MAPPING, GGUFParameter + from ..quantizers.gguf.utils import GGUFParameter else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. 
Please see " @@ -466,19 +463,20 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]], "qtypes": {}} - for tensor in tqdm(reader.tensors): name = tensor.name - tensor_type = tensor.tensor_type + quant_type = tensor.tensor_type # if the tensor is a torch supported dtype do not use GGUFParameter - is_gguf_quant = tensor_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] - + is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] weights = torch.from_numpy(tensor.data) - parsed_parameters[name] = GGUFParameter(weights, tensor_type=tensor_type) if is_gguf_quant else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) + parsed_parameters[name] = ( + GGUFParameter(weights, quant_type=quant_type) + if is_gguf_quant + else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) + ) if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - return {"state_dict": parsed_parameters, "gguf_metadata": metadata} + return parsed_parameters diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 02a8b4fe917c..54000fb85330 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -26,7 +26,7 @@ AUTO_QUANTIZER_MAPPING = { "bitsandbytes_4bit": BnB4BitDiffusersQuantizer, "bitsandbytes_8bit": BnB8BitDiffusersQuantizer, - "gguf": GGUFQuantizer + "gguf": GGUFQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 02d01e179676..e7c5647f3dd9 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -2,7 +2,7 @@ from ...utils import get_module_from_name from ..base import DiffusersQuantizer -from .utils import _replace_with_gguf_linear +from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear if TYPE_CHECKING: @@ -29,23 +29,12 @@ class GGUFQuantizer(DiffusersQuantizer): def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) - self.quant_type = quantization_config.quant_type self.compute_dtype = quantization_config.compute_dtype - self.qtypes = quantization_config.qtypes self.pre_quantized = True - def check_quantized_param( - self, - model: "ModelMixin", - param_value: "torch.Tensor", - param_name: str, - state_dict: Dict[str, Any], - **kwargs, - ) -> bool: - return True - def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): - return True + if _quant_shape_from_byte_shape(loaded_param_shape) == current_param_shape: + return True def check_if_quantized_param( self, @@ -55,7 +44,11 @@ def check_if_quantized_param( state_dict: Dict[str, Any], **kwargs, ) -> bool: - return True + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module._parameters.get(tensor_name, None), GGUFParameter): + return True + + return False def create_quantized_param( self, @@ -70,13 +63,8 @@ def create_quantized_param( if tensor_name not in module._parameters: raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - if param_name == "transformer_blocks.0.attn.to_q.weight": - __import__("ipdb").set_trace() - module._parameters[tensor_name] = param_value 
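
Aside on why a tensor subclass is used for quantized weights: the loader needs the quantization type to travel with the data through splits, concatenations, and other torch ops performed during checkpoint conversion. The toy class below shows the `__torch_function__` mechanism that makes this possible; `TaggedTensor` and `tag` are illustrative names, not the class shipped by this patch.

```python
import torch


class TaggedTensor(torch.Tensor):
    """Toy analogue of a quant-aware tensor: it carries a `tag` through torch ops."""

    def __new__(cls, data, tag=None):
        self = torch.Tensor._make_subclass(cls, data, False)  # requires_grad=False
        self.tag = tag
        return self

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        result = super().__torch_function__(func, types, args, kwargs or {})
        tag = None
        for arg in args:  # find the first tagged input, also inside list args (e.g. torch.cat)
            if isinstance(arg, TaggedTensor):
                tag = arg.tag
                break
            if isinstance(arg, (list, tuple)) and arg and isinstance(arg[0], TaggedTensor):
                tag = arg[0].tag
                break
        if isinstance(result, TaggedTensor):
            result.tag = tag
        return result


x = TaggedTensor(torch.ones(4), tag="Q4_K")
y = torch.cat([x, x]) * 2.0
print(type(y).__name__, y.tag)  # TaggedTensor Q4_K
```
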
- return - def _process_model_before_weight_loading( self, model: "ModelMixin", @@ -84,7 +72,7 @@ def _process_model_before_weight_loading( keep_in_fp32_modules: List[str] = [], **kwargs, ): - model = _replace_with_gguf_linear(model, self.compute_dtype, self.quant_type) + _replace_with_gguf_linear(model, self.compute_dtype) def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): return model diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index d3d71d00507f..dfff6a11ec50 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -1,7 +1,21 @@ +# Copyright 2024 The HuggingFace Team and City96. All rights reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +import gguf import torch -from torch._prims_common import is_low_precision_dtype import torch.nn as nn -import gguf + _GGUF_FILE_TYPE_MAPPING = { 0: "ALL_F32", @@ -22,9 +36,39 @@ 18: "MOSTLY_Q6_K", } -QK_K_BLOCKSIZE = 256 + +def _replace_with_gguf_linear(model, compute_dtype): + for name, module in model.named_children(): + if isinstance(module, nn.Linear): + model._modules[name] = GGUFLinear( + module.in_features, + module.out_features, + module.bias is not None, + compute_dtype=compute_dtype, + ) + model._modules[name].source_cls = type(module) + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + + has_children = list(module.children()) + if has_children: + _replace_with_gguf_linear(module, compute_dtype) + + return model + + +QK_K = 256 K_SCALE_SIZE = 12 +# dequantize operations based on torch ports of GGUF dequantize_functions +# from City96 +# more info: https://github.com/city96/ComfyUI-GGUF/blob/main/dequant.py + + +def to_uint32(x): + x = x.view(torch.uint8).to(torch.int32) + return (x[:, 0] | x[:, 1] << 8 | x[:, 2] << 16 | x[:, 3] << 24).unsqueeze(1) + def split_block_dims(blocks, *args): n_max = blocks.shape[1] @@ -32,57 +76,270 @@ def split_block_dims(blocks, *args): return torch.split(blocks, dims, dim=1) -def dequantize_Q2_K(blocks, dtype=None): +def get_scale_min(scales): + n_blocks = scales.shape[0] + scales = scales.view(torch.uint8) + scales = scales.reshape((n_blocks, 3, 4)) + + d, m, m_d = torch.split(scales, scales.shape[-2] // 3, dim=-2) + + sc = torch.cat([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], dim=-1) + min = torch.cat([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], dim=-1) + + return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8))) + + +def dequantize_blocks_Q8_0(blocks, block_size, type_size, dtype=None): + d, x = split_block_dims(blocks, 2) + d = d.view(torch.float16).to(dtype) + x = x.view(torch.int8) + return d * x + + +def dequantize_blocks_Q5_1(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, m, qh, qs = split_block_dims(blocks, 2, 2, 4) + d = d.view(torch.float16).to(dtype) + m = m.view(torch.float16).to(dtype) + qh = to_uint32(qh) + + qh = qh.reshape((n_blocks, 1)) >> 
torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) + ql = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape(1, 1, 2, 1) + qh = (qh & 1).to(torch.uint8) + ql = (ql & 0x0F).reshape((n_blocks, -1)) + + qs = ql | (qh << 4) + return (d * qs) + m + + +def dequantize_blocks_Q5_0(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, qh, qs = split_block_dims(blocks, 2, 4) + d = d.view(torch.float16).to(dtype) + qh = to_uint32(qh) + + qh = qh.reshape(n_blocks, 1) >> torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) + ql = qs.reshape(n_blocks, -1, 1, block_size // 2) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape(1, 1, 2, 1) + + qh = (qh & 1).to(torch.uint8) + ql = (ql & 0x0F).reshape(n_blocks, -1) + + qs = (ql | (qh << 4)).to(torch.int8) - 16 + return d * qs + + +def dequantize_blocks_Q4_1(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, m, qs = split_block_dims(blocks, 2, 2) + d = d.view(torch.float16).to(dtype) + m = m.view(torch.float16).to(dtype) + + qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape(1, 1, 2, 1) + qs = (qs & 0x0F).reshape(n_blocks, -1) + + return (d * qs) + m + + +def dequantize_blocks_Q4_0(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, qs = split_block_dims(blocks, 2) + d = d.view(torch.float16).to(dtype) + + qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 + return d * qs + + +def dequantize_blocks_Q6_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + ( + ql, + qh, + scales, + d, + ) = split_block_dims(blocks, QK_K // 2, QK_K // 4, QK_K // 16) + + scales = scales.view(torch.int8).to(dtype) + d = d.view(torch.float16).to(dtype) + d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) + + ql = ql.reshape((n_blocks, -1, 1, 64)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 2, 1) + ) + ql = (ql & 0x0F).reshape((n_blocks, -1, 32)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 4, 1) + ) + qh = (qh & 0x03).reshape((n_blocks, -1, 32)) + q = (ql | (qh << 4)).to(torch.int8) - 32 + q = q.reshape((n_blocks, QK_K // 16, -1)) + + return (d * q).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q5_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, dmin, scales, qh, qs = split_block_dims(blocks, 2, 2, K_SCALE_SIZE, QK_K // 8) + + d = d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) + + sc, m = get_scale_min(scales) + + d = (d * sc).reshape((n_blocks, -1, 1)) + dm = (dmin * m).reshape((n_blocks, -1, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 2, 1) + ) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> torch.arange(0, 8, device=d.device, dtype=torch.uint8).reshape( + (1, 1, 8, 1) + ) + ql = (ql & 0x0F).reshape((n_blocks, -1, 32)) + qh = (qh & 0x01).reshape((n_blocks, -1, 32)) + q = ql | (qh << 4) + + return (d * q - dm).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q4_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, dmin, scales, qs = 
split_block_dims(blocks, 2, 2, K_SCALE_SIZE) + d = d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) + + sc, m = get_scale_min(scales) + + d = (d * sc).reshape((n_blocks, -1, 1)) + dm = (dmin * m).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 2, 1) + ) + qs = (qs & 0x0F).reshape((n_blocks, -1, 32)) + + return (d * qs - dm).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q3_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + hmask, qs, scales, d = split_block_dims(blocks, QK_K // 8, QK_K // 4, 12) + d = d.view(torch.float16).to(dtype) + + lscales, hscales = scales[:, :8], scales[:, 8:] + lscales = lscales.reshape((n_blocks, 1, 8)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 2, 1) + ) + lscales = lscales.reshape((n_blocks, 16)) + hscales = hscales.reshape((n_blocks, 1, 4)) >> torch.tensor( + [0, 2, 4, 6], device=d.device, dtype=torch.uint8 + ).reshape((1, 4, 1)) + hscales = hscales.reshape((n_blocks, 16)) + scales = (lscales & 0x0F) | ((hscales & 0x03) << 4) + scales = scales.to(torch.int8) - 32 + + dl = (d * scales).reshape((n_blocks, 16, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 4, 1) + ) + qh = hmask.reshape(n_blocks, -1, 1, 32) >> torch.arange(0, 8, device=d.device, dtype=torch.uint8).reshape( + (1, 1, 8, 1) + ) + ql = ql.reshape((n_blocks, 16, QK_K // 16)) & 3 + qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & 1) ^ 1 + q = ql.to(torch.int8) - (qh << 2).to(torch.int8) + + return (dl * q).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q2_K(blocks, block_size, type_size, dtype=None): n_blocks = blocks.shape[0] - scales, quantized_values, delta, delta_min = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K_BLOCKSIZE // 4, 2) - delta = delta.view(torch.float16).to(dtype) - delta_min = delta_min.view(torch.float16).to(dtype) + scales, qs, d, dmin = split_block_dims(blocks, QK_K // 16, QK_K // 4, 2) + d = d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) # (n_blocks, 16, 1) - dl = (delta * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) - ml = (delta_min * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K // 16, 1)) + ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K // 16, 1)) - shift = torch.tensor([0, 2, 4, 6], device=delta.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) + shift = torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) - qs = (quantized_values.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 - qs = qs.reshape((n_blocks, QK_K_BLOCKSIZE // 16, 16)) + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 + qs = qs.reshape((n_blocks, QK_K // 16, 16)) qs = dl * qs - ml return qs.reshape((n_blocks, -1)) -dequantize_fns = { - "MOSTLY_Q2_K": dequantize_Q2_K, +def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): + return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32) + + +dequantize_functions = { + gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, + gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, + gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, + gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, + gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, + gguf.GGMLQuantizationType.Q4_0: 
dequantize_blocks_Q4_0, + gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, + gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, + gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, + gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, + gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, } -def _replace_with_gguf_linear(model, compute_dtype, quant_type, qtypes=None): - for name, module in model.named_children(): - if isinstance(module, nn.Linear): - model._modules[name] = GGUFLinear( - module.in_features, - module.out_features, - module.bias is not None, - compute_dtype=compute_dtype, - quant_type=quant_type, - ) - model._modules[name].source_cls = type(module) - # Force requires grad to False to avoid unexpected errors - model._modules[name].requires_grad_(False) +def _quant_shape_from_byte_shape(shape, type_size, block_size): + return (*shape[:-1], shape[-1] // type_size * block_size) - has_children = list(module.children()) - if has_children: - _replace_with_gguf_linear(module, compute_dtype, quant_type) - return model +def dequantize_gguf_tensor(tensor, compute_dtype): + if not hasattr(tensor, "quant_type"): + return tensor + + quant_type = tensor.quant_type + dequant_fn = dequantize_functions[quant_type] + + block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + + tensor = torch.tensor(tensor) + tensor = tensor.view(torch.uint8) + shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size) + + n_blocks = tensor.numel() // type_size + blocks = tensor.reshape((n_blocks, type_size)) + dequant = dequant_fn(blocks, block_size, type_size) + dequant = dequant.reshape(shape) + dequant = dequant.to(compute_dtype) -class GGUFParameter(torch.nn.Parameter): - def __new__(cls, data, requires_grad=False, tensor_type=None): + return dequant + + +class GGUFParameter(torch.Tensor): + def __new__(cls, data, requires_grad=False, quant_type=None): data = data if data is not None else torch.empty(0) self = torch.Tensor._make_subclass(cls, data, requires_grad) - self.tensor_type = tensor_type + self.quant_type = quant_type return self @@ -90,19 +347,26 @@ def __new__(cls, data, requires_grad=False, tensor_type=None): def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: kwargs = {} + result = super().__torch_function__(func, types, args, kwargs) - tensor_type = None + # When converting from original format checkpoints we often use splits, cats etc on tensors + # this method ensures that the returned tensor type from those operations remains GGUFParameter + # so that we preserve quant_type information + quant_type = None for arg in args: + if isinstance(arg, list) and (arg[0], GGUFParameter): + quant_type = arg[0].quant_type + break if isinstance(arg, GGUFParameter): - tensor_type = arg.tensor_type + quant_type = arg.quant_type break if isinstance(result, torch.Tensor): - return cls(result, tensor_type=tensor_type) + return cls(result, quant_type=quant_type) # Handle tuples and lists elif isinstance(result, (tuple, list)): # Preserve the original type (tuple or list) - wrapped = [cls(x, tensor_type=tensor_type) if isinstance(x, torch.Tensor) else x for x in result] + wrapped = [cls(x, quant_type=quant_type) if isinstance(x, torch.Tensor) else x for x in result] return type(result)(wrapped) else: return result @@ -115,19 +379,11 @@ def __init__( out_features, bias=False, compute_dtype=None, - quant_type=None, device=None, ) -> None: super().__init__(in_features, out_features, bias, device) self.compute_dtype = compute_dtype - 
self.quant_type = quant_type - self._dequant_fn = dequantize_fns[self.quant_type] def forward(self, inputs): - is_gguf_quant = hasattr(self.weight, "tensor_type") - if is_gguf_quant: - weight = self._dequant_fn(self.weight, torch.uint8).to(self.compute_dtype) - else: - weight = self.weight - __import__("ipdb").set_trace() + weight = dequantize_gguf_tensor(self.weight, self.compute_dtype) return torch.nn.functional.linear(inputs, weight, self.bias) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 05cd997900eb..58e68b628d77 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -393,12 +393,10 @@ def to_diff_dict(self) -> Dict[str, Any]: class GGUFQuantizationConfig(QuantizationConfigMixin): - def __init__(self, quant_type: str, qtypes=None, compute_dtype=None, quant_storage=None): + def __init__(self, compute_dtype=None, quant_storage=None): self.quant_method = QuantizationMethod.GGUF - self.quant_type = quant_type self.compute_dtype = compute_dtype self.quant_storage = quant_storage - self.qtypes = qtypes if self.compute_dtype is None: self.compute_dtype = torch.float32 diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 24e324ac4382..a9ef3718e9e8 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -23,6 +23,7 @@ DEPRECATED_REVISION_ARGS, DIFFUSERS_DYNAMIC_MODULE_NAME, FLAX_WEIGHTS_NAME, + GGUF_FILE_EXTENSION, HF_MODULES_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, MIN_PEFT_VERSION, From d7f09f27d2b2017d8b3a06f26e207dfeb40e2f94 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 19 Nov 2024 18:46:40 +0530 Subject: [PATCH 11/43] update --- src/diffusers/models/model_loading_utils.py | 13 +++------ .../quantizers/bitsandbytes/bnb_quantizer.py | 5 +++- .../quantizers/gguf/gguf_quantizer.py | 27 ++++++++++++------- src/diffusers/quantizers/gguf/utils.py | 21 --------------- 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index e2e7ec83ff2a..b909217c53bd 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -182,8 +182,7 @@ def load_model_dict_into_meta( hf_quantizer=None, keep_in_fp32_modules=None, ) -> List[str]: - if hf_quantizer is None: - device = device or torch.device("cpu") + device = device or torch.device("cpu") dtype = dtype or torch.float32 is_quantized = hf_quantizer is not None @@ -223,7 +222,7 @@ def load_model_dict_into_meta( and hf_quantizer.pre_quantized and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device) ): - hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape) + hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name], param) else: model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" raise ValueError( @@ -469,12 +468,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # if the tensor is a torch supported dtype do not use GGUFParameter is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] - weights = torch.from_numpy(tensor.data) - parsed_parameters[name] = ( - GGUFParameter(weights, quant_type=quant_type) - if is_gguf_quant - else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) - ) + weights = 
torch.from_numpy(tensor.data.copy()) + parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") diff --git a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py index d5ac1611a571..f7780b66b12b 100644 --- a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py +++ b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py @@ -204,7 +204,10 @@ def create_quantized_param( module._parameters[tensor_name] = new_value - def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): + def check_quantized_param_shape(self, param_name, current_param, loaded_param): + current_param_shape = current_param.shape + loaded_param_shape = loaded_param.shape + n = current_param_shape.numel() inferred_shape = (n,) if "bias" in param_name else ((n + 1) // 2, 1) if loaded_param_shape != inferred_shape: diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index e7c5647f3dd9..4ef843a90c1e 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -9,18 +9,17 @@ from ...models.modeling_utils import ModelMixin from ...utils import ( - is_accelerate_available, + is_gguf_available, is_torch_available, logging, ) -if is_accelerate_available(): - pass - if is_torch_available(): import torch +if is_gguf_available(): + import gguf logger = logging.get_logger(__name__) @@ -32,9 +31,20 @@ def __init__(self, quantization_config, **kwargs): self.compute_dtype = quantization_config.compute_dtype self.pre_quantized = True - def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): - if _quant_shape_from_byte_shape(loaded_param_shape) == current_param_shape: - return True + def check_quantized_param_shape(self, param_name, current_param, loaded_param): + loaded_param_shape = loaded_param.shape + current_param_shape = current_param.shape + quant_type = loaded_param.quant_type + + block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + + inferred_shape = _quant_shape_from_byte_shape(loaded_param_shape, type_size, block_size) + if inferred_shape != current_param_shape: + raise ValueError( + f"{param_name} has an expected quantized shape of: {inferred_shape}, but receieved shape: {loaded_param_shape}" + ) + + return True def check_if_quantized_param( self, @@ -44,8 +54,7 @@ def check_if_quantized_param( state_dict: Dict[str, Any], **kwargs, ) -> bool: - module, tensor_name = get_module_from_name(model, param_name) - if isinstance(module._parameters.get(tensor_name, None), GGUFParameter): + if isinstance(param_value, GGUFParameter): return True return False diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index dfff6a11ec50..50fb3b84f994 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -17,26 +17,6 @@ import torch.nn as nn -_GGUF_FILE_TYPE_MAPPING = { - 0: "ALL_F32", - 1: "MOSTLY_F16", - 2: "MOSTLY_Q4_0", - 3: "MOSTLY_Q4_1", - 4: "MOSTLY_Q4_1_SOME_F16", - 8: "MOSTLY_Q5_0", - 9: "MOSTLY_Q5_1", - 10: "MOSTLY_Q2_K", - 11: "MOSTLY_Q3_K_S", - 12: "MOSTLY_Q3_K_M", - 13: "MOSTLY_Q3_K_L", - 14: "MOSTLY_Q4_K_S", - 15: "MOSTLY_Q4_K_M", - 16: "MOSTLY_Q5_K_S", - 17: "MOSTLY_Q5_K_M", - 18: "MOSTLY_Q6_K", -} - - def _replace_with_gguf_linear(model, compute_dtype): for 
name, module in model.named_children(): if isinstance(module, nn.Linear): @@ -321,7 +301,6 @@ def dequantize_gguf_tensor(tensor, compute_dtype): block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] - tensor = torch.tensor(tensor) tensor = tensor.view(torch.uint8) shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size) From 1649936c669f706bed43b977b06cf8d5dbe2e61c Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 19 Nov 2024 19:23:39 +0530 Subject: [PATCH 12/43] update --- src/diffusers/quantizers/gguf/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 50fb3b84f994..1382cd5f8b52 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -37,14 +37,15 @@ def _replace_with_gguf_linear(model, compute_dtype): return model -QK_K = 256 -K_SCALE_SIZE = 12 - # dequantize operations based on torch ports of GGUF dequantize_functions # from City96 # more info: https://github.com/city96/ComfyUI-GGUF/blob/main/dequant.py +QK_K = 256 +K_SCALE_SIZE = 12 + + def to_uint32(x): x = x.view(torch.uint8).to(torch.int32) return (x[:, 0] | x[:, 1] << 8 | x[:, 2] << 16 | x[:, 3] << 24).unsqueeze(1) From 28d3a64d6de20dd2071fed8b3a039f336dc9a2ce Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 19 Nov 2024 19:27:51 +0530 Subject: [PATCH 13/43] update --- tests/models/test_attention_processor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py index c1432fee5211..2489604274b4 100644 --- a/tests/models/test_attention_processor.py +++ b/tests/models/test_attention_processor.py @@ -6,7 +6,6 @@ from diffusers import DiffusionPipeline from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor -import pytest class AttnAddedKVProcessorTests(unittest.TestCase): @@ -84,7 +83,6 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None ) - torch.cuda.synchronize() pre_conversion = pipe( "foo", @@ -97,7 +95,6 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", device_map="balanced", safety_checker=None ) - torch.cuda.synchronize() conversion = pipe( "foo", From c34a4519e018f3f2b69c8c1faa941e4ba547985c Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 21 Nov 2024 09:09:41 +0100 Subject: [PATCH 14/43] update --- src/diffusers/loaders/single_file_model.py | 8 +- src/diffusers/models/model_loading_utils.py | 2 +- .../models/transformers/transformer_flux.py | 1 - .../quantizers/gguf/gguf_quantizer.py | 16 +++- src/diffusers/quantizers/gguf/utils.py | 73 ++++++++++++------- 5 files changed, 69 insertions(+), 31 deletions(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index b27ce1c4c84d..58a9b7e9d533 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -220,6 +220,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = ) if quantization_config is not None: hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config) + hf_quantizer.validate_environment() else: hf_quantizer = None @@ -316,7 +317,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = keep_in_fp32_modules = [] if 
hf_quantizer is not None: - hf_quantizer.preprocess_model(model=model, device_map=None, keep_in_fp32_modules=keep_in_fp32_modules) + hf_quantizer.preprocess_model( + model=model, + device_map=None, + state_dict=diffusers_format_checkpoint, + keep_in_fp32_modules=keep_in_fp32_modules, + ) if is_accelerate_available(): unexpected_keys = load_model_dict_into_meta( diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index b909217c53bd..52fea22d3eb9 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -462,7 +462,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - for tensor in tqdm(reader.tensors): + for tensor in tqdm(reader.tensors, desc="Loading GGUF Parameters: "): name = tensor.name quant_type = tensor.tensor_type diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py index 0ad3be866019..ce88b30bf9af 100644 --- a/src/diffusers/models/transformers/transformer_flux.py +++ b/src/diffusers/models/transformers/transformer_flux.py @@ -521,7 +521,6 @@ def custom_forward(*inputs): ) else: hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control] - hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) for index_block, block in enumerate(self.single_transformer_blocks): diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 4ef843a90c1e..5f214d0949c9 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -9,6 +9,8 @@ from ...models.modeling_utils import ModelMixin from ...utils import ( + is_accelerate_available, + is_accelerate_version, is_gguf_available, is_torch_available, logging, @@ -31,6 +33,16 @@ def __init__(self, quantization_config, **kwargs): self.compute_dtype = quantization_config.compute_dtype self.pre_quantized = True + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"): + raise ImportError( + "Loading GGUF Parameters requires `accelerate` installed in your enviroment: `pip install 'accelerate>=0.26.0'`" + ) + if not is_gguf_available(): + raise ImportError( + "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf`" + ) + def check_quantized_param_shape(self, param_name, current_param, loaded_param): loaded_param_shape = loaded_param.shape current_param_shape = current_param.shape @@ -81,7 +93,8 @@ def _process_model_before_weight_loading( keep_in_fp32_modules: List[str] = [], **kwargs, ): - _replace_with_gguf_linear(model, self.compute_dtype) + state_dict = kwargs.get("state_dict", None) + _replace_with_gguf_linear(model, self.compute_dtype, state_dict) def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): return model @@ -92,5 +105,4 @@ def is_serializable(self): @property def is_trainable(self) -> bool: - # Because we're mandating `bitsandbytes` 0.43.3. return False diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 1382cd5f8b52..a3f1bc97ac3a 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -12,14 +12,30 @@ # # See the License for the specific language governing permissions and # # limitations under the License. 
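
Aside on the dequantization path introduced in this file: at forward time the packed bytes are viewed as uint8, reshaped into `(n_blocks, type_size)`, dequantized per block, then reshaped back to the logical weight shape before a normal `F.linear`. The sketch below walks that driver for Q8_0, the simplest layout (per 32-value block: one float16 scale followed by 32 int8 values); the fabricated block and helper names are for illustration only.

```python
import torch

BLOCK_SIZE, TYPE_SIZE = 32, 34  # values and bytes per Q8_0 block (gguf.GGML_QUANT_SIZES)


def dequantize_q8_0(raw: torch.Tensor, logical_shape):
    blocks = raw.view(torch.uint8).reshape(-1, TYPE_SIZE)
    d = blocks[:, :2].contiguous().view(torch.float16).to(torch.float32)  # per-block scale
    qs = blocks[:, 2:].contiguous().view(torch.int8).to(torch.float32)    # quantized values
    return (d * qs).reshape(logical_shape)


# Fabricate one block: scale 0.5, values 0..31
scale = torch.tensor([0.5], dtype=torch.float16).view(torch.uint8)
values = torch.arange(32, dtype=torch.int8).view(torch.uint8)
raw = torch.cat([scale, values])
print(dequantize_q8_0(raw, (1, 32)))  # 0.0, 0.5, 1.0, ... 15.5
```
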
-import gguf import torch import torch.nn as nn +from ...utils import is_gguf_available + + +if is_gguf_available(): + import gguf + + +def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): + def _should_convert_to_gguf(module, state_dict, prefix): + weight_key = prefix + "weight" + return weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter) + + has_children = list(model.children()) + if not has_children: + return -def _replace_with_gguf_linear(model, compute_dtype): for name, module in model.named_children(): - if isinstance(module, nn.Linear): + module_prefix = prefix + name + "." + _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix) + + if isinstance(module, nn.Linear) and _should_convert_to_gguf(module, state_dict, module_prefix): model._modules[name] = GGUFLinear( module.in_features, module.out_features, @@ -30,10 +46,6 @@ def _replace_with_gguf_linear(model, compute_dtype): # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) - has_children = list(module.children()) - if has_children: - _replace_with_gguf_linear(module, compute_dtype) - return model @@ -274,33 +286,36 @@ def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32) -dequantize_functions = { - gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, - gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, - gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, - gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, - gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, - gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, - gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, - gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, - gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, - gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, - gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, -} +if is_gguf_available(): + GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES + + dequantize_functions = { + gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, + gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, + gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, + gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, + gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, + gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, + gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, + gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, + gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, + gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, + gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, + } def _quant_shape_from_byte_shape(shape, type_size, block_size): return (*shape[:-1], shape[-1] // type_size * block_size) -def dequantize_gguf_tensor(tensor, compute_dtype): +def dequantize_gguf_tensor(tensor): if not hasattr(tensor, "quant_type"): return tensor quant_type = tensor.quant_type dequant_fn = dequantize_functions[quant_type] - block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + block_size, type_size = GGML_QUANT_SIZES[quant_type] tensor = tensor.view(torch.uint8) shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size) @@ -310,9 +325,8 @@ def dequantize_gguf_tensor(tensor, compute_dtype): dequant = dequant_fn(blocks, block_size, type_size) dequant = dequant.reshape(shape) - dequant = dequant.to(compute_dtype) - return dequant + return 
dequant.as_tensor() class GGUFParameter(torch.Tensor): @@ -323,6 +337,9 @@ def __new__(cls, data, requires_grad=False, quant_type=None): return self + def as_tensor(self): + return torch.Tensor._make_subclass(torch.Tensor, self, self.requires_grad) + @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: @@ -365,5 +382,9 @@ def __init__( self.compute_dtype = compute_dtype def forward(self, inputs): - weight = dequantize_gguf_tensor(self.weight, self.compute_dtype) - return torch.nn.functional.linear(inputs, weight, self.bias) + weight = dequantize_gguf_tensor(self.weight) + weight = weight.to(self.compute_dtype) + bias = self.bias.to(self.compute_dtype) + + output = torch.nn.functional.linear(inputs, weight, bias) + return output From 84493dbec5f18968805efc763b8a5a578c0b1633 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 21 Nov 2024 09:21:44 +0100 Subject: [PATCH 15/43] update --- src/diffusers/quantizers/gguf/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index a3f1bc97ac3a..c60e813d7000 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -12,12 +12,14 @@ # # See the License for the specific language governing permissions and # # limitations under the License. -import torch -import torch.nn as nn -from ...utils import is_gguf_available +from ...utils import is_gguf_available, is_torch_available +if is_torch_available(): + import torch + import torch.nn as nn + if is_gguf_available(): import gguf From 50bd78431e31175d73e9fd72cbb159f98422e9cb Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 21 Nov 2024 16:52:35 +0100 Subject: [PATCH 16/43] update --- src/diffusers/__init__.py | 2 +- src/diffusers/quantizers/auto.py | 9 +++- .../quantizers/gguf/gguf_quantizer.py | 11 ++--- src/diffusers/quantizers/gguf/utils.py | 42 ++++++++----------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a21f44982a61..a79d7d3012d8 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -545,7 +545,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: from .configuration_utils import ConfigMixin - from .quantizers.quantization_config import BitsAndBytesConfig + from .quantizers.quantization_config import BitsAndBytesConfig, GGUFQuantizationConfig try: if not is_onnx_available(): diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 54000fb85330..f3ae0bd1b3b8 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -15,12 +15,18 @@ Adapted from https://github.com/huggingface/transformers/blob/c409cd81777fb27aadc043ed3d8339dbc020fb3b/src/transformers/quantizers/auto.py """ + import warnings from typing import Dict, Optional, Union from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer from .gguf import GGUFQuantizer -from .quantization_config import BitsAndBytesConfig, QuantizationConfigMixin, QuantizationMethod +from .quantization_config import ( + BitsAndBytesConfig, + GGUFQuantizationConfig, + QuantizationConfigMixin, + QuantizationMethod, +) AUTO_QUANTIZER_MAPPING = { @@ -32,6 +38,7 @@ AUTO_QUANTIZATION_CONFIG_MAPPING = { "bitsandbytes_4bit": BitsAndBytesConfig, "bitsandbytes_8bit": BitsAndBytesConfig, + "gguf": GGUFQuantizationConfig, } diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py 
b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 5f214d0949c9..033de678c81e 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,14 +1,14 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional -from ...utils import get_module_from_name from ..base import DiffusersQuantizer -from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear if TYPE_CHECKING: from ...models.modeling_utils import ModelMixin + from ...utils import ( + get_module_from_name, is_accelerate_available, is_accelerate_version, is_gguf_available, @@ -17,11 +17,12 @@ ) -if is_torch_available(): +if is_torch_available() and is_gguf_available(): + import gguf import torch -if is_gguf_available(): - import gguf + from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear + logger = logging.get_logger(__name__) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index c60e813d7000..1f4ec0a62bff 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,15 +13,9 @@ # # limitations under the License. -from ...utils import is_gguf_available, is_torch_available - - -if is_torch_available(): - import torch - import torch.nn as nn - -if is_gguf_available(): - import gguf +import gguf +import torch +import torch.nn as nn def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): @@ -288,22 +282,20 @@ def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32) -if is_gguf_available(): - GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES - - dequantize_functions = { - gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, - gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, - gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, - gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, - gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, - gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, - gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, - gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, - gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, - gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, - gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, - } +GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES +dequantize_functions = { + gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, + gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, + gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, + gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, + gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, + gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, + gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, + gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, + gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, + gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, + gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, +} def _quant_shape_from_byte_shape(shape, type_size, block_size): From afd5d7d7344b3732861c20698ea9a557a68607ba Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 10:36:05 +0100 Subject: [PATCH 17/43] update --- src/diffusers/__init__.py | 2 +- src/diffusers/loaders/single_file_model.py | 2 +- src/diffusers/models/model_loading_utils.py | 16 +++++--- .../quantizers/gguf/gguf_quantizer.py | 37 +++++++++++++++---- 
src/diffusers/quantizers/gguf/utils.py | 25 +++++++++---- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/import_utils.py | 15 ++++++++ 7 files changed, 76 insertions(+), 22 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 18a0c10f924c..bba405c8e666 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -338,8 +338,8 @@ "StableDiffusion3ControlNetPipeline", "StableDiffusion3Img2ImgPipeline", "StableDiffusion3InpaintPipeline", - "StableDiffusion3PAGPipeline", "StableDiffusion3PAGImg2ImgPipeline", + "StableDiffusion3PAGPipeline", "StableDiffusion3Pipeline", "StableDiffusionAdapterPipeline", "StableDiffusionAttendAndExcitePipeline", diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index be6ee935783b..b4edf48103a2 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -349,7 +349,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = if hf_quantizer is not None: hf_quantizer.postprocess_model(model) - if torch_dtype is not None: + if torch_dtype is not None and hf_quantizer is None: model.to(torch_dtype) model.eval() diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 52fea22d3eb9..2e5ed6a9e21d 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -449,7 +449,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): import gguf from gguf import GGUFReader - from ..quantizers.gguf.utils import GGUFParameter + from ..quantizers.gguf.utils import SUPPORTED_GGUF_QUANT_TYPES, GGUFParameter else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " @@ -458,8 +458,6 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") reader = GGUFReader(gguf_checkpoint_path) - fields = reader.fields - reader_keys = list(fields.keys()) parsed_parameters = {} for tensor in tqdm(reader.tensors, desc="Loading GGUF Parameters: "): @@ -468,10 +466,16 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # if the tensor is a torch supported dtype do not use GGUFParameter is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] + if is_gguf_quant and quant_type not in SUPPORTED_GGUF_QUANT_TYPES: + raise ValueError( + ( + f"{name} has a quantization type: {quant_type} which is unsupported." 
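
Aside on the unsupported-quant-type guard added here: before committing to a load, a checkpoint can be scanned for tensor types that have no dequantize kernel. The sketch below is a hypothetical pre-flight check along those lines; the `SUPPORTED` set is illustrative, whereas the patch derives its list from the keys of its dequantize-function table.

```python
import gguf
from gguf import GGUFReader

SUPPORTED = {
    gguf.GGMLQuantizationType.F32,
    gguf.GGMLQuantizationType.F16,
    gguf.GGMLQuantizationType.Q8_0,
    gguf.GGMLQuantizationType.Q4_K,
}


def unsupported_quant_types(path):
    reader = GGUFReader(path)
    return sorted({t.tensor_type for t in reader.tensors} - SUPPORTED)


# print(unsupported_quant_types("flux1-dev-Q4_K_S.gguf"))  # [] means everything is loadable
```
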
+ f" Currently the following quantization types are supported: {SUPPORTED_GGUF_QUANT_TYPES}" + "To request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers" + ) + ) + weights = torch.from_numpy(tensor.data.copy()) parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights - if len(reader_keys) > 0: - logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - return parsed_parameters diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 033de678c81e..053b39bc504a 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from ..base import DiffusersQuantizer @@ -12,6 +12,7 @@ is_accelerate_available, is_accelerate_version, is_gguf_available, + is_gguf_version, is_torch_available, logging, ) @@ -21,7 +22,11 @@ import gguf import torch - from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear + from .utils import ( + GGUFParameter, + _quant_shape_from_byte_shape, + _replace_with_gguf_linear, + ) logger = logging.get_logger(__name__) @@ -39,11 +44,26 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Loading GGUF Parameters requires `accelerate` installed in your enviroment: `pip install 'accelerate>=0.26.0'`" ) - if not is_gguf_available(): + if not is_gguf_available() or is_gguf_version("<", "0.10.0"): raise ImportError( - "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf`" + "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf>=0.10.0`" ) + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + # need more space for buffers that are created during quantization + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if target_dtype != torch.uint8: + logger.info(f"target_dtype {target_dtype} is replaced by `torch.uint8` for GGUF quantization") + return torch.uint8 + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = self.compute_dtype + return torch_dtype + def check_quantized_param_shape(self, param_name, current_param, loaded_param): loaded_param_shape = loaded_param.shape current_param_shape = current_param.shape @@ -62,7 +82,7 @@ def check_quantized_param_shape(self, param_name, current_param, loaded_param): def check_if_quantized_param( self, model: "ModelMixin", - param_value: "torch.Tensor", + param_value: Union["GGUFParameter", "torch.Tensor"], param_name: str, state_dict: Dict[str, Any], **kwargs, @@ -82,10 +102,13 @@ def create_quantized_param( unexpected_keys: Optional[List[str]] = None, ): module, tensor_name = get_module_from_name(model, param_name) - if tensor_name not in module._parameters: + if tensor_name not in module._parameters and tensor_name not in module._buffers: raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - module._parameters[tensor_name] = param_value + if tensor_name in module._parameters: + module._parameters[tensor_name] = param_value.to(target_device) + if tensor_name in 
module._buffers: + module._buffers[tensor_name] = param_value.to(target_device) def _process_model_before_weight_loading( self, diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 1f4ec0a62bff..b0428a067f43 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,10 +13,18 @@ # # limitations under the License. +from contextlib import nullcontext + import gguf import torch import torch.nn as nn +from ...utils import is_accelerate_available + + +if is_accelerate_available(): + from accelerate import init_empty_weights + def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): def _should_convert_to_gguf(module, state_dict, prefix): @@ -32,12 +40,14 @@ def _should_convert_to_gguf(module, state_dict, prefix): _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix) if isinstance(module, nn.Linear) and _should_convert_to_gguf(module, state_dict, module_prefix): - model._modules[name] = GGUFLinear( - module.in_features, - module.out_features, - module.bias is not None, - compute_dtype=compute_dtype, - ) + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + model._modules[name] = GGUFLinear( + module.in_features, + module.out_features, + module.bias is not None, + compute_dtype=compute_dtype, + ) model._modules[name].source_cls = type(module) # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) @@ -296,6 +306,7 @@ def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, } +SUPPORTED_GGUF_QUANT_TYPES = list(dequantize_functions.keys()) def _quant_shape_from_byte_shape(shape, type_size, block_size): @@ -323,7 +334,7 @@ def dequantize_gguf_tensor(tensor): return dequant.as_tensor() -class GGUFParameter(torch.Tensor): +class GGUFParameter(torch.nn.Parameter): def __new__(cls, data, requires_grad=False, quant_type=None): data = data if data is not None else torch.empty(0) self = torch.Tensor._make_subclass(cls, data, requires_grad) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index a9ef3718e9e8..c2f7d8fdd8ca 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -68,6 +68,7 @@ is_flax_available, is_ftfy_available, is_gguf_available, + is_gguf_version, is_google_colab, is_inflect_available, is_invisible_watermark_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f440bf67cb6c..2a338c630cd1 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -777,6 +777,21 @@ def is_bitsandbytes_version(operation: str, version: str): return compare_versions(parse(_bitsandbytes_version), operation, version) +def is_gguf_version(operation: str, version: str): + """ + Compares the current Accelerate version to a given reference with an operation. + + Args: + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _is_gguf_available: + return False + return compare_versions(parse(_gguf_version), operation, version) + + def is_k_diffusion_version(operation: str, version: str): """ Compares the current k-diffusion version to a given reference with an operation. 
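Taken together, the quantizer plumbing above (the GGUF-aware `create_quantized_param`, the `_replace_with_gguf_linear` module swap, and the `is_gguf_version` guard) is what lets a prequantized GGUF checkpoint be loaded through `from_single_file`. A minimal sketch of that usage, mirroring the docs and tests added later in this series and assuming the Flux Q2_K checkpoint those tests rely on:

```python
import torch

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

# Checkpoint used by the GGUF tests later in this series (assumed to be available on the Hub).
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"

# The quantized weights stay in their low-memory storage dtype (torch.uint8) and are
# dequantized on the fly to `compute_dtype` during each module's forward pass.
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
```

Note that when a quantizer is attached, `from_single_file` deliberately skips the final `model.to(torch_dtype)` cast (see the `torch_dtype is not None and hf_quantizer is None` check above), so the stored parameters keep their quantized dtype after loading.
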
From 0ed31bcabedc122b8e99a21d2dd57512b2fea180 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 14:02:40 +0100 Subject: [PATCH 18/43] update --- src/diffusers/models/modeling_utils.py | 8 +- tests/quantization/gguf/test_gguf.py | 122 +++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 4 deletions(-) create mode 100644 tests/quantization/gguf/test_gguf.py diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 76f6c5f6309d..c4472f51a6a4 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -1010,14 +1010,14 @@ def to(self, *args, **kwargs): dtype_present_in_args = True break - # Checks if the model has been loaded in 4-bit or 8-bit with BNB - if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if getattr(self, "is_quantized", False): if dtype_present_in_args: raise ValueError( - "You cannot cast a bitsandbytes model in a new `dtype`. Make sure to load the model using `from_pretrained` using the" - " desired `dtype` by passing the correct `torch_dtype` argument." + "Casting a quantized model to a new `dtype` is unsupported. To set the dtype of unquantized layers, please " + "use the `torch_dtype` argument when loading the model using `from_pretrained` or `from_single_file`" ) + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: if getattr(self, "is_loaded_in_8bit", False): raise ValueError( "`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the" diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py new file mode 100644 index 000000000000..871fb6e271b1 --- /dev/null +++ b/tests/quantization/gguf/test_gguf.py @@ -0,0 +1,122 @@ +import gc +import unittest + +import torch + +from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig +from diffusers.quantizers.gguf.utils import GGUFParameter +from diffusers.utils.testing_utils import ( + nightly, + require_big_gpu_with_torch_cuda, + torch_device, +) + + +@nightly +@require_big_gpu_with_torch_cuda +class GGUFSingleFileTests(unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" + torch_dtype = torch.bfloat16 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 768), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), + } + + def test_gguf_parameters(self): + quant_storage_type = torch.uint8 + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, 
quantization_config=quantization_config) + + for param_name, param in model.named_parameters(): + if isinstance(param, GGUFParameter): + assert hasattr(param, "quant_type") + assert param.dtype == quant_storage_type + + def test_gguf_linear_layers(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): + assert module.weight.dtype == torch.uint8 + + def test_gguf_memory(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + + model = FluxTransformer2DModel.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + model.to("cuda") + inputs = self.get_dummy_inputs() + + torch.cuda.reset_peak_memory_stats() + torch.cuda.empty_cache() + with torch.no_grad(): + model(**inputs) + max_memory = torch.cuda.max_memory_allocated() + assert (max_memory / 1024**3) < 5 + + def test_keep_modules_in_fp32(self): + r""" + A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32. + Also ensures if inference works. + """ + FluxTransformer2DModel._keep_in_fp32_modules = ["proj_out"] + + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + if name in model._keep_in_fp32_modules: + assert module.weight.dtype == torch.float32 + + def test_dtype_assignment(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + + with self.assertRaises(ValueError): + # Tries with a `dtype` + model.to(torch.float16) + + with self.assertRaises(ValueError): + # Tries with a `device` and `dtype` + model.to(device="cuda:0", dtype=torch.float16) + + with self.assertRaises(ValueError): + # Tries with a cast + model.float() + + with self.assertRaises(ValueError): + # Tries with a cast + model.half() + + # This should work + model.to("cuda") From af381ad57d8afed0134650b2bcf6c72b7b4b52f9 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 14:20:09 +0100 Subject: [PATCH 19/43] update --- src/diffusers/models/model_loading_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 2e5ed6a9e21d..1f0df6d6fd2d 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -467,11 +467,13 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # if the tensor is a torch supported dtype do not use GGUFParameter is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] if is_gguf_quant and quant_type not in SUPPORTED_GGUF_QUANT_TYPES: + _supported_quants_str = "\n".join([str(type) for type in SUPPORTED_GGUF_QUANT_TYPES]) raise ValueError( ( - f"{name} has a quantization type: {quant_type} which is unsupported." 
- f" Currently the following quantization types are supported: {SUPPORTED_GGUF_QUANT_TYPES}" - "To request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers" + f"{name} has a quantization type: {str(quant_type)} which is unsupported." + "\n\nCurrently the following quantization types are supported: \n\n" + f"{_supported_quants_str}" + "\n\nTo request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers" ) ) From 52a1bcb7105b6bddba1d471e4ee3e68b84082af5 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 16:34:25 +0100 Subject: [PATCH 20/43] update --- src/diffusers/quantizers/gguf/gguf_quantizer.py | 2 +- src/diffusers/quantizers/quantization_config.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 053b39bc504a..42280294ab71 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -37,7 +37,7 @@ def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) self.compute_dtype = quantization_config.compute_dtype - self.pre_quantized = True + self.pre_quantized = quantization_config.pre_quantized def validate_environment(self, *args, **kwargs): if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"): diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 58e68b628d77..07bf763520db 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -397,6 +397,7 @@ def __init__(self, compute_dtype=None, quant_storage=None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype self.quant_storage = quant_storage + self.pre_quantized = True if self.compute_dtype is None: self.compute_dtype = torch.float32 From 67f17000d255da1e922eb6b76e48c37f047c015e Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 16:54:56 +0100 Subject: [PATCH 21/43] update --- src/diffusers/quantizers/gguf/gguf_quantizer.py | 1 + src/diffusers/utils/testing_utils.py | 13 +++++++++++++ tests/quantization/gguf/test_gguf.py | 10 +++++++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 42280294ab71..f0f9aa359ac4 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -49,6 +49,7 @@ def validate_environment(self, *args, **kwargs): "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf>=0.10.0`" ) + # Copied from diffusers.quantizers.bitsandbytes.bnb_quantizer.BnB4BitDiffusersQuantizer.adjust_max_memory def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: # need more space for buffers that are created during quantization max_memory = {key: val * 0.90 for key, val in max_memory.items()} diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index b3e381f7d3fb..f1d929c85d1b 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -32,6 +32,7 @@ is_bitsandbytes_available, is_compel_available, is_flax_available, + is_gguf_available, is_note_seq_available, is_onnx_available, is_opencv_available, @@ 
-476,6 +477,18 @@ def decorator(test_case): return decorator +def require_gguf_version_greater_or_equal(gguf_version): + def decorator(test_case): + correct_gguf_version = is_gguf_available() and version.parse( + version.parse(importlib.metadata.version("gguf")).base_version + ) >= version.parse(gguf_version) + return unittest.skipUnless( + correct_gguf_version, f"Test requires gguf with the version greater than {gguf_version}." + )(test_case) + + return decorator + + def deprecate_after_peft_backend(test_case): """ Decorator marking a test that will be skipped after PEFT backend diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 871fb6e271b1..710e92df6acd 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -4,16 +4,24 @@ import torch from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig -from diffusers.quantizers.gguf.utils import GGUFParameter from diffusers.utils.testing_utils import ( + is_gguf_available, nightly, + require_accelerate, require_big_gpu_with_torch_cuda, + require_gguf_version_greater_or_equal, torch_device, ) +if is_gguf_available(): + from diffusers.quantizers.gguf.utils import GGUFParameter + + @nightly @require_big_gpu_with_torch_cuda +@require_accelerate +@require_gguf_version_greater_or_equal("0.10.0") class GGUFSingleFileTests(unittest.TestCase): ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" torch_dtype = torch.bfloat16 From 8abfa5559cb7cffd3d027a58841c8396f69efae2 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 06:44:01 +0100 Subject: [PATCH 22/43] update --- .../quantizers/gguf/gguf_quantizer.py | 20 +++++++++++++++---- src/diffusers/quantizers/gguf/utils.py | 10 +++++++--- .../quantizers/quantization_config.py | 3 ++- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index f0f9aa359ac4..bb61b1ddd7ac 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -19,10 +19,10 @@ if is_torch_available() and is_gguf_available(): - import gguf import torch from .utils import ( + GGML_QUANT_SIZES, GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear, @@ -33,11 +33,17 @@ class GGUFQuantizer(DiffusersQuantizer): + use_keep_in_fp32_modules = True + def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) self.compute_dtype = quantization_config.compute_dtype self.pre_quantized = quantization_config.pre_quantized + self.modules_to_not_convert = quantization_config.modules_to_not_convert + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] def validate_environment(self, *args, **kwargs): if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"): @@ -70,7 +76,7 @@ def check_quantized_param_shape(self, param_name, current_param, loaded_param): current_param_shape = current_param.shape quant_type = loaded_param.quant_type - block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + block_size, type_size = GGML_QUANT_SIZES[quant_type] inferred_shape = _quant_shape_from_byte_shape(loaded_param_shape, type_size, block_size) if inferred_shape != current_param_shape: @@ -96,7 +102,7 @@ def check_if_quantized_param( def create_quantized_param( self, model: "ModelMixin", - param_value: "torch.Tensor", + param_value: 
Union["GGUFParameter", "torch.Tensor"], param_name: str, target_device: "torch.device", state_dict: Dict[str, Any], @@ -119,7 +125,13 @@ def _process_model_before_weight_loading( **kwargs, ): state_dict = kwargs.get("state_dict", None) - _replace_with_gguf_linear(model, self.compute_dtype, state_dict) + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + self.modules_to_not_convert = [module for module in self.modules_to_not_convert if module is not None] + + _replace_with_gguf_linear( + model, self.compute_dtype, state_dict, modules_to_not_convert=self.modules_to_not_convert + ) def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): return model diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index b0428a067f43..c72a20712934 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -26,7 +26,7 @@ from accelerate import init_empty_weights -def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): +def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix="", modules_to_not_convert=[]): def _should_convert_to_gguf(module, state_dict, prefix): weight_key = prefix + "weight" return weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter) @@ -37,9 +37,13 @@ def _should_convert_to_gguf(module, state_dict, prefix): for name, module in model.named_children(): module_prefix = prefix + name + "." - _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix) + _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix, modules_to_not_convert) - if isinstance(module, nn.Linear) and _should_convert_to_gguf(module, state_dict, module_prefix): + if ( + isinstance(module, nn.Linear) + and _should_convert_to_gguf(module, state_dict, module_prefix) + and name not in modules_to_not_convert + ): ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): model._modules[name] = GGUFLinear( diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 07bf763520db..8fb9deadec21 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -393,11 +393,12 @@ def to_diff_dict(self) -> Dict[str, Any]: class GGUFQuantizationConfig(QuantizationConfigMixin): - def __init__(self, compute_dtype=None, quant_storage=None): + def __init__(self, compute_dtype=None, quant_storage=None, modules_to_not_convert=None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype self.quant_storage = quant_storage self.pre_quantized = True + self.modules_to_not_convert = modules_to_not_convert if self.compute_dtype is None: self.compute_dtype = torch.float32 From d4b88d787be0ed62e4daf5fbd0880217748b7360 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 06:45:12 +0100 Subject: [PATCH 23/43] update --- src/diffusers/quantizers/gguf/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/quantizers/gguf/__init__.py b/src/diffusers/quantizers/gguf/__init__.py index b3d9082ac803..53af2e180f48 100644 --- a/src/diffusers/quantizers/gguf/__init__.py +++ b/src/diffusers/quantizers/gguf/__init__.py @@ -1 +1,2 @@ from .gguf_quantizer import GGUFQuantizer +from .utils import GGUFLinear, GGUFParameter From 30f13ed310dfc7a427611c7b9f1a478fbb9d0463 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 09:12:42 +0100 Subject: [PATCH 24/43] update --- 
tests/quantization/gguf/test_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 710e92df6acd..94170aa97069 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -82,6 +82,7 @@ def test_gguf_memory(self): self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype ) model.to("cuda") + assert (model.get_memory_footprint() / 1024**3) < 5 inputs = self.get_dummy_inputs() torch.cuda.reset_peak_memory_stats() From 9310035f5545abc62e795526e53ca3c65548df02 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 09:37:36 +0100 Subject: [PATCH 25/43] update --- .github/workflows/nightly_tests.yml | 4 +++- tests/quantization/gguf/test_gguf.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index e2228fdacf30..f380b4311332 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -356,6 +356,8 @@ jobs: config: - backend: "bitsandbytes" test_location: "bnb" + - backend: "gguf" + test_location: "gguf" runs-on: group: aws-g6e-xlarge-plus container: @@ -519,4 +521,4 @@ jobs: # if: always() # run: | # pip install slack_sdk tabulate -# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file +# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 94170aa97069..c7c678947807 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -75,7 +75,7 @@ def test_gguf_linear_layers(self): if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): assert module.weight.dtype == torch.uint8 - def test_gguf_memory(self): + def test_gguf_memory_usage(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) model = FluxTransformer2DModel.from_single_file( From e9303a0198b3b8a1ceeeae9c2a75e03649940239 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 10:30:46 +0100 Subject: [PATCH 26/43] update --- src/diffusers/loaders/single_file_utils.py | 25 +++- tests/quantization/gguf/test_gguf.py | 155 +++++++++++++++------ 2 files changed, 132 insertions(+), 48 deletions(-) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 10742873ded1..7b36d3e710fb 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -81,8 +81,14 @@ "open_clip_sd3": "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight", "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight", "stable_cascade_stage_c": "clip_txt_mapper.weight", - "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias", - "sd35_large": "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight", + "sd3": [ + "joint_blocks.0.context_block.adaLN_modulation.1.bias", + "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias", + ], + "sd35_large": [ + "joint_blocks.37.x_block.mlp.fc1.weight", + "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight", + ], "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.pos_encoder.pe", "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias", "animatediff_sdxl_beta": 
"up_blocks.2.motion_modules.0.temporal_transformer.norm.weight", @@ -529,13 +535,20 @@ def infer_diffusers_model_type(checkpoint): ): model_type = "stable_cascade_stage_b" - elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216: - if checkpoint["model.diffusion_model.pos_embed"].shape[1] == 36864: + elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd3"]) and any( + checkpoint[key].shape[-1] == 9216 if key in checkpoint else False for key in CHECKPOINT_KEY_NAMES["sd3"] + ): + if "model.diffusion_model.pos_embed" in checkpoint: + key = "model.diffusion_model.pos_embed" + else: + key = "pos_embed" + + if checkpoint[key].shape[1] == 36864: model_type = "sd3" - elif checkpoint["model.diffusion_model.pos_embed"].shape[1] == 147456: + elif checkpoint[key].shape[1] == 147456: model_type = "sd35_medium" - elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint: + elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd35_large"]): model_type = "sd35_large" elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint: diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index c7c678947807..dfe1100fadc8 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -3,7 +3,7 @@ import torch -from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig +from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig, SD3Transformer2DModel from diffusers.utils.testing_utils import ( is_gguf_available, nightly, @@ -22,45 +22,16 @@ @require_big_gpu_with_torch_cuda @require_accelerate @require_gguf_version_greater_or_equal("0.10.0") -class GGUFSingleFileTests(unittest.TestCase): - ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" +class GGUFSingleFileTesterMixin: + ckpt_path = None + model_cls = None torch_dtype = torch.bfloat16 - - def setUp(self): - gc.collect() - torch.cuda.empty_cache() - - def tearDown(self): - gc.collect() - torch.cuda.empty_cache() - - def get_dummy_inputs(self): - return { - "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to( - torch_device, self.torch_dtype - ), - "encoder_hidden_states": torch.randn( - (1, 512, 4096), - generator=torch.Generator("cpu").manual_seed(0), - ).to(torch_device, self.torch_dtype), - "pooled_projections": torch.randn( - (1, 768), - generator=torch.Generator("cpu").manual_seed(0), - ).to(torch_device, self.torch_dtype), - "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), - "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to( - torch_device, self.torch_dtype - ), - "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to( - torch_device, self.torch_dtype - ), - "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), - } + expected_memory_use_in_gb = 5 def test_gguf_parameters(self): quant_storage_type = torch.uint8 quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) for param_name, param in model.named_parameters(): if isinstance(param, GGUFParameter): @@ -69,7 +40,7 @@ def test_gguf_parameters(self): def test_gguf_linear_layers(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - 
model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): @@ -78,11 +49,11 @@ def test_gguf_linear_layers(self): def test_gguf_memory_usage(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file( + model = self.model_cls.from_single_file( self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype ) model.to("cuda") - assert (model.get_memory_footprint() / 1024**3) < 5 + assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb inputs = self.get_dummy_inputs() torch.cuda.reset_peak_memory_stats() @@ -90,17 +61,17 @@ def test_gguf_memory_usage(self): with torch.no_grad(): model(**inputs) max_memory = torch.cuda.max_memory_allocated() - assert (max_memory / 1024**3) < 5 + assert (max_memory / 1024**3) < self.expected_memory_use_in_gb def test_keep_modules_in_fp32(self): r""" A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32. Also ensures if inference works. """ - FluxTransformer2DModel._keep_in_fp32_modules = ["proj_out"] + self.model_cls._keep_in_fp32_modules = ["proj_out"] quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear): @@ -109,7 +80,7 @@ def test_keep_modules_in_fp32(self): def test_dtype_assignment(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) with self.assertRaises(ValueError): # Tries with a `dtype` @@ -129,3 +100,103 @@ def test_dtype_assignment(self): # This should work model.to("cuda") + + +class FluxGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" + torch_dtype = torch.bfloat16 + model_cls = FluxTransformer2DModel + expected_memory_use_in_gb = 5 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 768), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "guidance": torch.tensor([3.5]).to(torch_device, 
self.torch_dtype), + } + + +class SD35LargeGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-large-gguf/blob/main/sd3.5_large-Q4_0.gguf" + torch_dtype = torch.bfloat16 + model_cls = SD3Transformer2DModel + expected_memory_use_in_gb = 5 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 16, 64, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 2048), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + } + + +class SD35MediumGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-medium-gguf/blob/main/sd3.5_medium-Q3_K_M.gguf" + torch_dtype = torch.bfloat16 + model_cls = SD3Transformer2DModel + expected_memory_use_in_gb = 2 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 16, 64, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 2048), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + } From e56c26647c4f13b95d724ca401c6cb7ab3c847f4 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 10:33:52 +0100 Subject: [PATCH 27/43] update --- src/diffusers/models/model_loading_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 1f0df6d6fd2d..d10df4a37992 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -36,11 +36,11 @@ _get_model_file, deprecate, is_accelerate_available, + is_gguf_available, is_torch_available, is_torch_version, logging, ) -from ..utils.import_utils import is_gguf_available logger = logging.get_logger(__name__) From 1209c3a256feefaada6416fc9cb559d99a6bed09 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 15:26:40 +0530 Subject: [PATCH 28/43] Update src/diffusers/quantizers/gguf/utils.py Co-authored-by: Sayak Paul --- src/diffusers/quantizers/gguf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index c72a20712934..9081575a5962 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -53,7 +53,7 @@ def _should_convert_to_gguf(module, state_dict, prefix): compute_dtype=compute_dtype, ) model._modules[name].source_cls = type(module) - # Force requires grad to False to avoid unexpected errors + # Force requires_grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) 
return model From db9b6f38dbb42e0aa0c765a643dfebc7c20433c3 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 12:44:22 +0100 Subject: [PATCH 29/43] update --- src/diffusers/models/model_loading_utils.py | 5 - .../quantizers/gguf/gguf_quantizer.py | 2 +- tests/quantization/gguf/test_gguf.py | 159 +++++++++++++++++- 3 files changed, 159 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index d10df4a37992..93ad22ce400b 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -427,11 +427,6 @@ def _gguf_parse_value(_value, data_type): return _value -def read_field(reader, field): - value = reader.fields[field] - return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] - - def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): """ Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed tokenizer and config diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index bb61b1ddd7ac..62c2063ac758 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -105,7 +105,7 @@ def create_quantized_param( param_value: Union["GGUFParameter", "torch.Tensor"], param_name: str, target_device: "torch.device", - state_dict: Dict[str, Any], + state_dict: Optional[Dict[str, Any]] = None, unexpected_keys: Optional[List[str]] = None, ): module, tensor_name = get_module_from_name(model, param_name) diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index dfe1100fadc8..265d35c16d1c 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -1,12 +1,20 @@ import gc import unittest +import numpy as np import torch -from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig, SD3Transformer2DModel +from diffusers import ( + FluxPipeline, + FluxTransformer2DModel, + GGUFQuantizationConfig, + SD3Transformer2DModel, + StableDiffusion3Pipeline, +) from diffusers.utils.testing_utils import ( is_gguf_available, nightly, + numpy_cosine_similarity_distance, require_accelerate, require_big_gpu_with_torch_cuda, require_gguf_version_greater_or_equal, @@ -68,6 +76,7 @@ def test_keep_modules_in_fp32(self): A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32. Also ensures if inference works. 
""" + _keep_in_fp32_modules = self.model_cls._keep_in_fp32_modules self.model_cls._keep_in_fp32_modules = ["proj_out"] quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) @@ -77,6 +86,7 @@ def test_keep_modules_in_fp32(self): if isinstance(module, torch.nn.Linear): if name in model._keep_in_fp32_modules: assert module.weight.dtype == torch.float32 + self.model_cls._keep_in_fp32_modules = _keep_in_fp32_modules def test_dtype_assignment(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) @@ -139,6 +149,55 @@ def get_dummy_inputs(self): "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), } + def test_pipeline_inference(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + transformer = self.model_cls.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=self.torch_dtype + ) + pipe.enable_model_cpu_offload() + + prompt = "a cat holding a sign that says hello" + output = pipe( + prompt=prompt, num_inference_steps=2, generator=torch.Generator("cpu").manual_seed(0), output_type="np" + ).images[0] + output_slice = output[:3, :3, :].flatten() + expected_slice = np.array( + [ + 0.47265625, + 0.43359375, + 0.359375, + 0.47070312, + 0.421875, + 0.34375, + 0.46875, + 0.421875, + 0.34765625, + 0.46484375, + 0.421875, + 0.34179688, + 0.47070312, + 0.42578125, + 0.34570312, + 0.46875, + 0.42578125, + 0.3515625, + 0.45507812, + 0.4140625, + 0.33984375, + 0.4609375, + 0.41796875, + 0.34375, + 0.45898438, + 0.41796875, + 0.34375, + ] + ) + max_diff = numpy_cosine_similarity_distance(expected_slice, output_slice) + assert max_diff < 1e-4 + class SD35LargeGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-large-gguf/blob/main/sd3.5_large-Q4_0.gguf" @@ -170,6 +229,55 @@ def get_dummy_inputs(self): "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), } + def test_pipeline_inference(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + transformer = self.model_cls.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + pipe = StableDiffusion3Pipeline.from_pretrained( + "stabilityai/stable-diffusion-3.5-large", transformer=transformer, torch_dtype=self.torch_dtype + ) + pipe.enable_model_cpu_offload() + + prompt = "a cat holding a sign that says hello" + output = pipe( + prompt=prompt, num_inference_steps=2, generator=torch.Generator("cpu").manual_seed(0), output_type="np" + ).images[0] + output_slice = output[:3, :3, :].flatten() + expected_slice = np.array( + [ + 0.17578125, + 0.27539062, + 0.27734375, + 0.11914062, + 0.26953125, + 0.25390625, + 0.109375, + 0.25390625, + 0.25, + 0.15039062, + 0.26171875, + 0.28515625, + 0.13671875, + 0.27734375, + 0.28515625, + 0.12109375, + 0.26757812, + 0.265625, + 0.16210938, + 0.29882812, + 0.28515625, + 0.15625, + 0.30664062, + 0.27734375, + 0.14648438, + 0.29296875, + 0.26953125, + ] + ) + max_diff = numpy_cosine_similarity_distance(expected_slice, output_slice) + assert max_diff < 1e-4 + class SD35MediumGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-medium-gguf/blob/main/sd3.5_medium-Q3_K_M.gguf" @@ -200,3 +308,52 @@ def 
get_dummy_inputs(self): ).to(torch_device, self.torch_dtype), "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), } + + def test_pipeline_inference(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + transformer = self.model_cls.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + pipe = StableDiffusion3Pipeline.from_pretrained( + "stabilityai/stable-diffusion-3.5-medium", transformer=transformer, torch_dtype=self.torch_dtype + ) + pipe.enable_model_cpu_offload() + + prompt = "a cat holding a sign that says hello" + output = pipe( + prompt=prompt, num_inference_steps=2, generator=torch.Generator("cpu").manual_seed(0), output_type="np" + ).images[0] + output_slice = output[:3, :3, :].flatten() + expected_slice = np.array( + [ + 0.625, + 0.6171875, + 0.609375, + 0.65625, + 0.65234375, + 0.640625, + 0.6484375, + 0.640625, + 0.625, + 0.6484375, + 0.63671875, + 0.6484375, + 0.66796875, + 0.65625, + 0.65234375, + 0.6640625, + 0.6484375, + 0.6328125, + 0.6640625, + 0.6484375, + 0.640625, + 0.67578125, + 0.66015625, + 0.62109375, + 0.671875, + 0.65625, + 0.62109375, + ] + ) + max_diff = numpy_cosine_similarity_distance(expected_slice, output_slice) + assert max_diff < 1e-4 From 78c78615a4c1c600a36dab4a0397f8ee7fc72692 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 14:28:56 +0100 Subject: [PATCH 30/43] update --- src/diffusers/loaders/single_file_model.py | 3 +++ src/diffusers/models/model_loading_utils.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index b4edf48103a2..9fd8c18dd738 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -205,6 +205,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = revision = kwargs.pop("revision", None) torch_dtype = kwargs.pop("torch_dtype", None) quantization_config = kwargs.pop("quantization_config", None) + device = kwargs.pop("device", None) if isinstance(pretrained_model_link_or_path_or_dict, dict): checkpoint = pretrained_model_link_or_path_or_dict @@ -326,10 +327,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = ) if is_accelerate_available(): + param_device = torch.device(device) if device else torch.device("cpu") unexpected_keys = load_model_dict_into_meta( model, diffusers_format_checkpoint, dtype=torch_dtype, + device=param_device, hf_quantizer=hf_quantizer, keep_in_fp32_modules=keep_in_fp32_modules, ) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 2e648c864e99..220a4abdf723 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -184,7 +184,8 @@ def load_model_dict_into_meta( ) -> List[str]: if device is not None and not isinstance(device, (str, torch.device)): raise ValueError(f"Expected device to have type `str` or `torch.device`, but got {type(device)=}.") - device = device or torch.device("cpu") + if hf_quantizer is None: + device = device or torch.device("cpu") dtype = dtype or torch.float32 is_quantized = hf_quantizer is not None From 33eb43142c60ef0478c548adb44ee3282a79ff7a Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:07:06 +0100 Subject: [PATCH 31/43] update --- src/diffusers/quantizers/gguf/__init__.py | 1 - src/diffusers/quantizers/gguf/utils.py | 4 
++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/quantizers/gguf/__init__.py b/src/diffusers/quantizers/gguf/__init__.py index 53af2e180f48..b3d9082ac803 100644 --- a/src/diffusers/quantizers/gguf/__init__.py +++ b/src/diffusers/quantizers/gguf/__init__.py @@ -1,2 +1 @@ from .gguf_quantizer import GGUFQuantizer -from .utils import GGUFLinear, GGUFParameter diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 9081575a5962..7284f75335c8 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -27,7 +27,7 @@ def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix="", modules_to_not_convert=[]): - def _should_convert_to_gguf(module, state_dict, prefix): + def _should_convert_to_gguf(state_dict, prefix): weight_key = prefix + "weight" return weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter) @@ -41,7 +41,7 @@ def _should_convert_to_gguf(module, state_dict, prefix): if ( isinstance(module, nn.Linear) - and _should_convert_to_gguf(module, state_dict, module_prefix) + and _should_convert_to_gguf(state_dict, module_prefix) and name not in modules_to_not_convert ): ctx = init_empty_weights if is_accelerate_available() else nullcontext From 9651ddc758cb8635bef38e8af83e76f26c255bdb Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:12:06 +0100 Subject: [PATCH 32/43] update --- tests/quantization/gguf/test_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 265d35c16d1c..eb05a2c1b9f3 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -53,6 +53,7 @@ def test_gguf_linear_layers(self): for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): assert module.weight.dtype == torch.uint8 + assert module.bias.dtype == torch.float32 def test_gguf_memory_usage(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) From 746fd2f7f6a0ef8739b75515ba98c07cc357a3ea Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:17:54 +0100 Subject: [PATCH 33/43] update --- src/diffusers/models/model_loading_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 220a4abdf723..af1a1a5250ff 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -25,7 +25,6 @@ import safetensors import torch from huggingface_hub.utils import EntryNotFoundError -from tqdm import tqdm from ..utils import ( GGUF_FILE_EXTENSION, @@ -458,7 +457,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader = GGUFReader(gguf_checkpoint_path) parsed_parameters = {} - for tensor in tqdm(reader.tensors, desc="Loading GGUF Parameters: "): + for tensor in reader.tensors: name = tensor.name quant_type = tensor.tensor_type From e027d46656c6c3e7e47e7b663d23e9901891dfa0 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:19:44 +0100 Subject: [PATCH 34/43] update --- docs/source/en/quantization/gguf.md | 59 +++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 docs/source/en/quantization/gguf.md diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md new file mode 100644 index 
000000000000..511091000ebb --- /dev/null +++ b/docs/source/en/quantization/gguf.md @@ -0,0 +1,59 @@ + + +# GGUF + +The GGUF file format is typically used to store models for inference with [GGML]() and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Support for loading GGUF checkpoint via Pipelines is currently not supported. The dequantizatation functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) + +The following example will load the [FLUX.1 DEV](https://huggingface.co/black-forest-labs/FLUX.1-dev) transformer model using the GGUF Q2_K quantization variant. + + +```python +import torch + +from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig + +ckpt_path = ( + "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" +) +transformer = FluxTransformer2DModel.from_single_file( + ckpt_path, + quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16), + torch_dtype=torch.bfloat16, +) +pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + transformer=transformer, + generator=torch.manual_seed(0), + torch_dtype=torch.bfloat16, +) +pipe.enable_model_cpu_offload() +prompt = "A cat holding a sign that says hello world" +image = pipe(prompt).images[0] +image.save("flux-gguf.png") +``` + +## Supported Quantization Types + +- BF16 +- Q4_0 +- Q4_1 +- Q5_0 +- Q5_1 +- Q8_0 +- Q2_K +- Q3_K +- Q4_K +- Q5_K +- Q6_K + From 9db239697f70a072200f60ebf54dea601887f505 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 07:33:43 +0100 Subject: [PATCH 35/43] update --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2faabfec30ce..458a611b2e51 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -157,6 +157,8 @@ title: Getting Started - local: quantization/bitsandbytes title: bitsandbytes + - local: quantization/gguf + title: gguf title: Quantization Methods - sections: - local: optimization/fp16 From 7ee89f4cc32b8b0f12e153d7cc58c731eff6eacc Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 08:00:25 +0100 Subject: [PATCH 36/43] update --- docs/source/en/api/quantization.md | 4 ++++ docs/source/en/quantization/overview.md | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/quantization.md b/docs/source/en/api/quantization.md index 2fbde9e707ea..79443b2f4583 100644 --- a/docs/source/en/api/quantization.md +++ b/docs/source/en/api/quantization.md @@ -28,6 +28,10 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui [[autodoc]] BitsAndBytesConfig +## GGUFQuantizationConfig + +[[autodoc]] GGUFQuantizationConfig + ## DiffusersQuantizer [[autodoc]] quantizers.base.DiffusersQuantizer diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index d8adbc85a259..28db7d891a6d 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -17,7 +17,7 @@ Quantization techniques focus on representing data with less information while a -Interested in adding a new quantization method to Transformers? 
Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method. +Interested in adding a new quantization method to Diffusers? Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method. @@ -32,4 +32,8 @@ If you are new to the quantization field, we recommend you to check out these be ## When to use what? -This section will be expanded once Diffusers has multiple quantization backends. Currently, we only support `bitsandbytes`. [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. \ No newline at end of file +Diffusers currently supports the following quantization methods. +- `bitsandbytes` +- `gguf` + +[This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. \ No newline at end of file From edf3e5431447db65649d82f848793c9262c0badf Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 08:15:31 +0100 Subject: [PATCH 37/43] update --- docs/source/en/quantization/gguf.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md index 511091000ebb..e6a72049601a 100644 --- a/docs/source/en/quantization/gguf.md +++ b/docs/source/en/quantization/gguf.md @@ -13,10 +13,19 @@ specific language governing permissions and limitations under the License. # GGUF -The GGUF file format is typically used to store models for inference with [GGML]() and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Support for loading GGUF checkpoint via Pipelines is currently not supported. The dequantizatation functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) +The GGUF file format is typically used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Loading GGUF checkpoints via Pipelines is currently not supported. The following example will load the [FLUX.1 DEV](https://huggingface.co/black-forest-labs/FLUX.1-dev) transformer model using the GGUF Q2_K quantization variant. +Before starting please install gguf in your environment + +```shell +pip install -U gguf +``` + +Since GGUF is a single file format, we will be using `from_single_file` to load the model and pass in the `GGUFQuantizationConfig` when loading the model. + +When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`, typically `torch.unint8` and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. 
The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) ```python import torch From d3eb54f0e9a5c2216c6ea1bb22363fb132e19bbd Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 10:02:31 +0100 Subject: [PATCH 38/43] update --- src/diffusers/loaders/single_file_model.py | 1 + .../quantizers/gguf/gguf_quantizer.py | 14 +++++ src/diffusers/quantizers/gguf/utils.py | 57 +++++++++++++++++++ tests/quantization/gguf/test_gguf.py | 21 ++++++- 4 files changed, 92 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 9fd8c18dd738..7f821955fac8 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -351,6 +351,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = if hf_quantizer is not None: hf_quantizer.postprocess_model(model) + model.hf_quantizer = hf_quantizer if torch_dtype is not None and hf_quantizer is None: model.to(torch_dtype) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 62c2063ac758..0c760e277ce4 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -24,6 +24,7 @@ from .utils import ( GGML_QUANT_SIZES, GGUFParameter, + _dequantize_gguf_and_restore_linear, _quant_shape_from_byte_shape, _replace_with_gguf_linear, ) @@ -143,3 +144,16 @@ def is_serializable(self): @property def is_trainable(self) -> bool: return False + + def _dequantize(self, model): + is_model_on_cpu = model.device.type == "cpu" + if is_model_on_cpu: + logger.info( + "Model was found to be on CPU (could happen as a result of `enable_model_cpu_offload()`). So, moving it to GPU. After dequantization, will move the model back to CPU again to preserve the previous device." + ) + model.to(torch.cuda.current_device()) + + model = _dequantize_gguf_and_restore_linear(model, self.modules_to_not_convert) + if is_model_on_cpu: + model.to("cpu") + return model diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 7284f75335c8..35e5743fbcf0 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,6 +13,7 @@ # # limitations under the License. +import inspect from contextlib import nullcontext import gguf @@ -23,7 +24,27 @@ if is_accelerate_available(): + import accelerate from accelerate import init_empty_weights + from accelerate.hooks import add_hook_to_module, remove_hook_from_module + + +# Copied from diffusers.quantizers.bitsandbytes.utils._create_accelerate_new_hook +def _create_accelerate_new_hook(old_hook): + r""" + Creates a new hook based on the old hook. Use it only if you know what you are doing ! 
This method is a copy of: + https://github.com/huggingface/peft/blob/748f7968f3a31ec06a1c2b0328993319ad9a150a/src/peft/utils/other.py#L245 with + some changes + """ + old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__) + old_hook_attr = old_hook.__dict__ + filtered_old_hook_attr = {} + old_hook_init_signature = inspect.signature(old_hook_cls.__init__) + for k in old_hook_attr.keys(): + if k in old_hook_init_signature.parameters: + filtered_old_hook_attr[k] = old_hook_attr[k] + new_hook = old_hook_cls(**filtered_old_hook_attr) + return new_hook def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix="", modules_to_not_convert=[]): @@ -59,6 +80,42 @@ def _should_convert_to_gguf(state_dict, prefix): return model +def _dequantize_gguf_and_restore_linear(model, modules_to_not_convert=[]): + for name, module in model.named_children(): + if isinstance(module, GGUFLinear) and name not in modules_to_not_convert: + device = module.weight.device + bias = getattr(module, "bias", None) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + new_module = nn.Linear( + module.in_features, + module.out_features, + module.bias is not None, + device=device, + ) + new_module.weight = nn.Parameter(dequantize_gguf_tensor(module.weight)) + if bias is not None: + new_module.bias = bias + + # Create a new hook and attach it in case we use accelerate + if hasattr(module, "_hf_hook"): + old_hook = module._hf_hook + new_hook = _create_accelerate_new_hook(old_hook) + + remove_hook_from_module(module) + add_hook_to_module(new_module, new_hook) + + new_module.to(device) + model._modules[name] = new_module + + has_children = list(module.children()) + if has_children: + _dequantize_gguf_and_restore_linear(module, modules_to_not_convert) + + return model + + # dequantize operations based on torch ports of GGUF dequantize_functions # from City96 # more info: https://github.com/city96/ComfyUI-GGUF/blob/main/dequant.py diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index eb05a2c1b9f3..8ac4c9915c27 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -3,6 +3,7 @@ import numpy as np import torch +import torch.nn as nn from diffusers import ( FluxPipeline, @@ -23,7 +24,7 @@ if is_gguf_available(): - from diffusers.quantizers.gguf.utils import GGUFParameter + from diffusers.quantizers.gguf.utils import GGUFLinear, GGUFParameter @nightly @@ -112,6 +113,24 @@ def test_dtype_assignment(self): # This should work model.to("cuda") + def test_dequantize_model(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model.dequantize() + + def _check_for_gguf_linear(model): + has_children = list(model.children()) + if not has_children: + return + + for name, module in model.named_children(): + if isinstance(module, nn.Linear): + assert not isinstance(module, GGUFLinear), f"{name} is still GGUFLinear" + assert not isinstance(module.weight, GGUFParameter), f"{name} weight is still GGUFParameter" + + for name, module in model.named_children(): + _check_for_gguf_linear(module) + class FluxGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" From 4f34f149369a2befdb8d6dd65af49cb704b91f4c Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 11 Dec 2024 08:44:09 
+0530 Subject: [PATCH 39/43] Update docs/source/en/quantization/gguf.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/gguf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md index e6a72049601a..8d38478ffc58 100644 --- a/docs/source/en/quantization/gguf.md +++ b/docs/source/en/quantization/gguf.md @@ -23,7 +23,7 @@ Before starting please install gguf in your environment pip install -U gguf ``` -Since GGUF is a single file format, we will be using `from_single_file` to load the model and pass in the `GGUFQuantizationConfig` when loading the model. +Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`] when loading the model. When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`, typically `torch.unint8` and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) From 090efdb899f0171a0f64bb3b50cec4b763e0a87d Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 11 Dec 2024 14:23:25 +0530 Subject: [PATCH 40/43] update --- docs/source/en/quantization/gguf.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md index 8d38478ffc58..dbcd1b1486b2 100644 --- a/docs/source/en/quantization/gguf.md +++ b/docs/source/en/quantization/gguf.md @@ -23,9 +23,11 @@ Before starting please install gguf in your environment pip install -U gguf ``` -Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`]. -When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`, typically `torch.unint8` and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) +Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`]. +When using GGUF checkpoints, the quantized weights remain in a low memory `dtype` (typically `torch.uint8`) and are dynamically dequantized and cast to the configured `compute_dtype` during each module's forward pass through the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype`. + +The functions used for dynamic dequantization are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF), who created the PyTorch ports of the original [`numpy`](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/quants.py) implementation by [compilade](https://github.com/compilade).
```python import torch From e67c25a4bda5b48ea4f97b1807039efa3a7186f2 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 17 Dec 2024 13:02:16 +0530 Subject: [PATCH 41/43] update --- src/diffusers/__init__.py | 14 +++++-------- .../quantizers/quantization_config.py | 20 ++++++++++++++----- src/diffusers/utils/import_utils.py | 5 ++++- src/diffusers/utils/testing_utils.py | 1 + 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 8fca380c8255..e2351a0c53b8 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -31,7 +31,7 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], - "quantizers.quantization_config": ["BitsAndBytesConfig", "TorchAoConfig", "GGUFQuantizationConfig"], + "quantizers.quantization_config": ["BitsAndBytesConfig", "GGUFQuantizationConfig", "TorchAoConfig"], "schedulers": [], "utils": [ "OptionalDependencyNotAvailable", @@ -428,8 +428,7 @@ if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [ name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_") @@ -442,8 +441,7 @@ if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_torch_and_transformers_and_sentencepiece_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_sentencepiece_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_sentencepiece_objects"] = [ name for name in dir(dummy_torch_and_transformers_and_sentencepiece_objects) if not name.startswith("_") @@ -456,8 +454,7 @@ if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_torch_and_transformers_and_onnx_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [ name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_") @@ -492,8 +489,7 @@ if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + from .utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [ name for name in dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_") diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 5d74eb7008cd..fc22e4e65a9a 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -395,19 +395,29 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict +@dataclass class GGUFQuantizationConfig(QuantizationConfigMixin): 
- def __init__(self, compute_dtype=None, quant_storage=None, modules_to_not_convert=None): + """This is a config class for GGUF Quantization techniques. + + Args: + compute_dtype: (`torch.dtype`, defaults to `torch.float32`): + This sets the computational type which might be different than the input type. For example, inputs might be + fp32, but computation can be set to bf16 for speedups. + + """ + + def __init__(self, compute_dtype: torch.dtype = None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype - self.quant_storage = quant_storage self.pre_quantized = True - self.modules_to_not_convert = modules_to_not_convert + + # TODO: (Dhruv) Add this as an init argument when we can support loading unquantized checkpoints. + self.modules_to_not_convert = [] if self.compute_dtype is None: self.compute_dtype = torch.float32 - if self.quant_storage is None: - self.quant_storage = torch.uint8 + @dataclass class TorchAoConfig(QuantizationConfigMixin): """This is a config class for torchao quantization/sparsity techniques. diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 40983fe8cae2..3014efebc82e 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -479,6 +479,8 @@ def is_imageio_available(): def is_gguf_available(): return _is_gguf_available + + def is_torchao_available(): return _is_torchao_available @@ -622,7 +624,8 @@ def is_torchao_available(): """ TORCHAO_IMPORT_ERROR = """ -{0} requires the torchao library but it was not found in your environment. You can install it with pip: `pip install torchao` +{0} requires the torchao library but it was not found in your environment. You can install it with pip: `pip install +torchao` """ BACKENDS_MAPPING = OrderedDict( diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 4753bc4785b5..e5eac05ac4cd 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -487,6 +487,7 @@ def decorator(test_case): correct_gguf_version, f"Test requires gguf with the version greater than {gguf_version}." )(test_case) + def require_torchao_version_greater(torchao_version): def decorator(test_case): correct_torchao_version = is_torchao_available() and version.parse( From e710bde37d8d6ca7a5f63ab6639096c7bfbe793b Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 17 Dec 2024 09:39:34 +0100 Subject: [PATCH 42/43] update --- src/diffusers/quantizers/quantization_config.py | 2 +- src/diffusers/utils/testing_utils.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index fc22e4e65a9a..504105b10d81 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -412,7 +412,7 @@ def __init__(self, compute_dtype: torch.dtype = None): self.pre_quantized = True # TODO: (Dhruv) Add this as an init argument when we can support loading unquantized checkpoints. 
- self.modules_to_not_convert = [] + self.modules_to_not_convert = None if self.compute_dtype is None: self.compute_dtype = torch.float32 diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index e5eac05ac4cd..3448b4d28d1f 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -487,6 +487,8 @@ def decorator(test_case): correct_gguf_version, f"Test requires gguf with the version greater than {gguf_version}." )(test_case) + return decorator + def require_torchao_version_greater(torchao_version): def decorator(test_case): From f59e07a6b225eeaf0dc73a76a55c41bffdd518b3 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 17 Dec 2024 10:14:25 +0100 Subject: [PATCH 43/43] update --- src/diffusers/quantizers/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 504105b10d81..3078be310719 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -406,7 +406,7 @@ class GGUFQuantizationConfig(QuantizationConfigMixin): """ - def __init__(self, compute_dtype: torch.dtype = None): + def __init__(self, compute_dtype: Optional["torch.dtype"] = None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype self.pre_quantized = True
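
As an end-to-end illustration of the API added across these patches, here is a minimal sketch that loads a prequantized GGUF checkpoint with `from_single_file` and a `GGUFQuantizationConfig`, then calls `dequantize()` to restore plain `nn.Linear` modules. It mirrors the documentation example and the check added in `tests/quantization/gguf/test_gguf.py`; the checkpoint URL and `compute_dtype` are illustrative, and running it assumes an environment with `torch` and `gguf` installed.

```python
import torch
import torch.nn as nn

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig
from diffusers.quantizers.gguf.utils import GGUFLinear, GGUFParameter

# Illustrative checkpoint from the docs example; weights stay in their quantized
# GGUF form and are dequantized on the fly to `compute_dtype` in each forward pass.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

# Restore ordinary nn.Linear modules with fully dequantized weights.
transformer.dequantize()
for name, module in transformer.named_modules():
    if isinstance(module, nn.Linear):
        assert not isinstance(module, GGUFLinear), f"{name} is still GGUFLinear"
        assert not isinstance(module.weight, GGUFParameter), f"{name} weight is still GGUFParameter"
```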