From b5eeaa4e479338747018b1408d5827e1c2f848c8 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 21 Oct 2024 11:37:39 +0200 Subject: [PATCH 01/43] update --- src/diffusers/loaders/gguf.py | 114 ++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 src/diffusers/loaders/gguf.py diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py new file mode 100644 index 000000000000..4d381b80766a --- /dev/null +++ b/src/diffusers/loaders/gguf.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991) +# https://github.com/99991/pygguf +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from array import array + +from tqdm import tqdm + +from ..utils import is_torch_available +from ..utils.logging import get_logger + + +TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} + +if is_torch_available(): + pass + +logger = get_logger(__name__) + + +GGUF_TO_DIFFUSERS_MAPPING = { + "ignore": { + "GGUF": { + "version": "version", + "tensor_count": "tensor_count", + "kv_count": "kv_count", + }, + "general": {"file_type": "file_type", "quantization_version": "quantization_version"}, + }, +} + + +def _gguf_parse_value(_value, data_type): + if not isinstance(data_type, list): + data_type = [data_type] + if len(data_type) == 1: + data_type = data_type[0] + array_data_type = None + else: + if data_type[0] != 9: + raise ValueError("Received multiple types, therefore expected the first type to indicate an array.") + data_type, array_data_type = data_type + + if data_type in [0, 1, 2, 3, 4, 5, 10, 11]: + _value = int(_value[0]) + elif data_type in [6, 12]: + _value = float(_value[0]) + elif data_type in [7]: + _value = bool(_value[0]) + elif data_type in [8]: + _value = array("B", list(_value)).tobytes().decode() + elif data_type in [9]: + _value = _gguf_parse_value(_value, array_data_type) + return _value + + +def read_field(reader, field): + value = reader.fields[field] + return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] + + +def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): + """ + Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed + tokenizer and config attributes. + + Args: + gguf_checkpoint_path (`str`): + The path the to GGUF file to load + return_tensors (`bool`, defaults to `True`): + Whether to read the tensors from the file and return them. Not doing so is faster + and only loads the metadata in memory. + """ + + """ + if is_gguf_available() and is_torch_available(): + from gguf import GGUFReader, dequantize + else: + logger.error( + "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " + "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." 
+ ) + raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") + """ + from gguf import GGUFReader, dequantize + + reader = GGUFReader(gguf_checkpoint_path) + fields = reader.fields + reader_keys = list(fields.keys()) + + parsed_parameters = {} + for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): + name = tensor.name + weights = dequantize(tensor.data, tensor.tensor_type) + + parsed_parameters[name] = weights + + if len(reader_keys) > 0: + logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") + + return parsed_parameters From 71897b1df13b0f4b35932181a6657d73f8156d77 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 21 Oct 2024 18:47:30 +0200 Subject: [PATCH 02/43] update --- src/diffusers/utils/import_utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f1323bf00ea4..f440bf67cb6c 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -339,6 +339,14 @@ def is_timm_available(): except importlib_metadata.PackageNotFoundError: _imageio_available = False +_is_gguf_available = importlib.util.find_spec("gguf") is not None +if _is_gguf_available: + try: + _gguf_version = importlib_metadata.version("gguf") + logger.debug(f"Successfully import gguf version {_gguf_version}") + except importlib_metadata.PackageNotFoundError: + _is_gguf_available = False + def is_torch_available(): return _torch_available @@ -460,6 +468,10 @@ def is_imageio_available(): return _imageio_available +def is_gguf_available(): + return _is_gguf_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -593,6 +605,11 @@ def is_imageio_available(): {0} requires the imageio library and ffmpeg but it was not found in your environment. You can install it with pip: `pip install imageio imageio-ffmpeg` """ +# docstyle-ignore +GGUF_IMPORT_ERROR = """ +{0} requires the gguf library but it was not found in your environment. You can install it with pip: `pip install gguf` +""" + BACKENDS_MAPPING = OrderedDict( [ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), @@ -618,6 +635,7 @@ def is_imageio_available(): ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ("imageio", (is_imageio_available, IMAGEIO_IMPORT_ERROR)), + ("gguf", (is_gguf_available, GGUF_IMPORT_ERROR)), ] ) From 89ea1eeb2a66ec55a22467bb8f54787c4c6932a1 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 24 Oct 2024 12:52:29 +0200 Subject: [PATCH 03/43] update --- src/diffusers/loaders/gguf.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py index 4d381b80766a..d85e2895f85d 100644 --- a/src/diffusers/loaders/gguf.py +++ b/src/diffusers/loaders/gguf.py @@ -14,14 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- +import torch from array import array - +import gguf from tqdm import tqdm from ..utils import is_torch_available from ..utils.logging import get_logger - +from ..utils.import_utils import is_gguf_available TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} @@ -43,6 +43,29 @@ } +class GGMLTensor(torch.Tensor): + def __init__(self, dtype, axis): + self._dtype = dtype + self._axis = axis + + @property + def axis(self): + return self._axis + + @property + def dtype(self): + return self._dtype + + def numpy(self): + return self.dequantize().cpu().numpy() + + def clone(self, *args, **kwargs): + return self + + def detach(self, *args, **kwargs): + return self + + def _gguf_parse_value(_value, data_type): if not isinstance(data_type, list): data_type = [data_type] @@ -95,7 +118,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): ) raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") """ - from gguf import GGUFReader, dequantize + if is_torch_available(): + from gguf import GGUFReader, dequantize reader = GGUFReader(gguf_checkpoint_path) fields = reader.fields @@ -104,7 +128,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): parsed_parameters = {} for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): name = tensor.name - weights = dequantize(tensor.data, tensor.tensor_type) + weights = torch.from_numpy(tensor.data) parsed_parameters[name] = weights From f0bcd94d43036610f63d84c5f65f902ceca216d8 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 24 Oct 2024 18:08:17 +0200 Subject: [PATCH 04/43] update --- .../quantizers/gguf/gguf_quantizer.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 src/diffusers/quantizers/gguf/gguf_quantizer.py diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py new file mode 100644 index 000000000000..bc6c87ca09ce --- /dev/null +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -0,0 +1,118 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from ...utils import get_module_from_name +from ..base import DiffusersQuantizer + + +if TYPE_CHECKING: + from ...models.modeling_utils import ModelMixin + +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + is_bitsandbytes_available, + is_bitsandbytes_version, + is_torch_available, + logging, +) + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class GGUFQuantizer(DiffusersQuantizer) + use_keep_in_fp32_modules = True + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + def check_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + + return + + def create_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + import bitsandbytes as bnb + + module, tensor_name = get_module_from_name(model, param_name) + + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + + old_value = getattr(module, tensor_name) + + if tensor_name == "bias": + if param_value is None: + new_value = 
old_value.to(target_device) + else: + new_value = param_value.to(target_device) + + new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad) + module._parameters[tensor_name] = new_value + return + + if not isinstance(module._parameters[tensor_name], bnb.nn.Params4bit): + raise ValueError("this function only loads `Linear4bit components`") + if ( + old_value.device == torch.device("meta") + and target_device not in ["meta", torch.device("meta")] + and param_value is None + ): + raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") + + # construct `new_value` for the module._parameters[tensor_name]: + if self.pre_quantized: + # 4bit loading. Collecting components for restoring quantized weight + # This can be expanded to make a universal call for any quantized weight loading + + if not self.is_serializable: + raise ValueError( + "Detected int4 weights but the version of bitsandbytes is not compatible with int4 serialization. " + "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." + ) + + if (param_name + ".quant_state.bitsandbytes__fp4" not in state_dict) and ( + param_name + ".quant_state.bitsandbytes__nf4" not in state_dict + ): + raise ValueError( + f"Supplied state dict for {param_name} does not contain `bitsandbytes__*` and possibly other `quantized_stats` components." + ) + + quantized_stats = {} + for k, v in state_dict.items(): + # `startswith` to counter for edge cases where `param_name` + # substring can be present in multiple places in the `state_dict` + if param_name + "." in k and k.startswith(param_name): + quantized_stats[k] = v + if unexpected_keys is not None and k in unexpected_keys: + unexpected_keys.remove(k) + + new_value = bnb.nn.Params4bit.from_prequantized( + data=param_value, + quantized_stats=quantized_stats, + requires_grad=False, + device=target_device, + ) + else: + new_value = param_value.to("cpu") + kwargs = old_value.__dict__ + new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device) + + module._parameters[tensor_name] = new_value From 60d1385876db9eb64e096855580f6cc139d811f5 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 29 Oct 2024 19:09:38 +0100 Subject: [PATCH 05/43] update --- src/diffusers/loaders/gguf.py | 12 +++--- src/diffusers/loaders/single_file_model.py | 48 +++++++++++++++++++++- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py index d85e2895f85d..f305cf8eac06 100644 --- a/src/diffusers/loaders/gguf.py +++ b/src/diffusers/loaders/gguf.py @@ -14,14 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from array import array + import gguf +import torch from tqdm import tqdm from ..utils import is_torch_available -from ..utils.logging import get_logger from ..utils.import_utils import is_gguf_available +from ..utils.logging import get_logger + TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} @@ -108,18 +110,14 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): and only loads the metadata in memory. """ - """ if is_gguf_available() and is_torch_available(): - from gguf import GGUFReader, dequantize + from gguf import GGUFReader else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. 
Please see " "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." ) raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") - """ - if is_torch_available(): - from gguf import GGUFReader, dequantize reader = GGUFReader(gguf_checkpoint_path) fields = reader.fields diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 3fe1abfbead5..fceef2cb0ca3 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -17,8 +17,10 @@ from contextlib import nullcontext from typing import Optional +import torch from huggingface_hub.utils import validate_hf_hub_args +from ..quantizers import DiffusersAutoQuantizer from ..utils import deprecate, is_accelerate_available, logging from .single_file_utils import ( SingleFileComponentError, @@ -202,6 +204,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = subfolder = kwargs.pop("subfolder", None) revision = kwargs.pop("revision", None) torch_dtype = kwargs.pop("torch_dtype", None) + quantization_config = kwargs.pop("quantization_config", None) if isinstance(pretrained_model_link_or_path_or_dict, dict): checkpoint = pretrained_model_link_or_path_or_dict @@ -216,6 +219,36 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = revision=revision, ) + pre_quantized = "quantization_config" in config and config["quantization_config"] is not None + if pre_quantized or quantization_config is not None: + if pre_quantized: + config["quantization_config"] = DiffusersAutoQuantizer.merge_quantization_configs( + config["quantization_config"], quantization_config + ) + else: + config["quantization_config"] = quantization_config + hf_quantizer = DiffusersAutoQuantizer.from_config( + config["quantization_config"], pre_quantized=pre_quantized + ) + else: + hf_quantizer = None + + if hf_quantizer is not None: + hf_quantizer.validate_environment(torch_dtype=torch_dtype) + torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) + + # Check if `_keep_in_fp32_modules` is not None + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( + (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") + ) + if use_keep_in_fp32_modules: + keep_in_fp32_modules = cls._keep_in_fp32_modules + if not isinstance(keep_in_fp32_modules, list): + keep_in_fp32_modules = [keep_in_fp32_modules] + + else: + keep_in_fp32_modules = [] + mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name] checkpoint_mapping_fn = mapping_functions["checkpoint_mapping_fn"] @@ -295,8 +328,17 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = with ctx(): model = cls.from_config(diffusers_model_config) + if hf_quantizer is not None: + hf_quantizer.preprocess_model(model=model, keep_in_fp32_modules=keep_in_fp32_modules) + if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype) + unexpected_keys = load_model_dict_into_meta( + model, + diffusers_format_checkpoint, + dtype=torch_dtype, + hf_quantizer=hf_quantizer, + keep_in_fp32_modules=keep_in_fp32_modules, + ) else: _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False) @@ -310,6 +352,10 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = f"Some weights of the model checkpoint were not used 
when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" ) + if hf_quantizer is not None: + hf_quantizer.postprocess_model(model) + model.hf_quantizer = hf_quantizer + if torch_dtype is not None: model.to(torch_dtype) From 22ed0b054a472d98e307e6760ab7fffdc1f67b9d Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 31 Oct 2024 12:23:13 +0100 Subject: [PATCH 06/43] update --- src/diffusers/loaders/gguf.py | 136 -------------------- src/diffusers/loaders/single_file_model.py | 20 ++- src/diffusers/loaders/single_file_utils.py | 9 +- src/diffusers/models/model_loading_utils.py | 83 ++++++++++++ src/diffusers/quantizers/gguf/utils.py | 6 + src/diffusers/utils/constants.py | 1 + tests/models/test_attention_processor.py | 3 + 7 files changed, 108 insertions(+), 150 deletions(-) delete mode 100644 src/diffusers/loaders/gguf.py create mode 100644 src/diffusers/quantizers/gguf/utils.py diff --git a/src/diffusers/loaders/gguf.py b/src/diffusers/loaders/gguf.py deleted file mode 100644 index f305cf8eac06..000000000000 --- a/src/diffusers/loaders/gguf.py +++ /dev/null @@ -1,136 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991) -# https://github.com/99991/pygguf -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from array import array - -import gguf -import torch -from tqdm import tqdm - -from ..utils import is_torch_available -from ..utils.import_utils import is_gguf_available -from ..utils.logging import get_logger - - -TORCH_COMPATIBLE_QTYPES = {None, gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16} - -if is_torch_available(): - pass - -logger = get_logger(__name__) - - -GGUF_TO_DIFFUSERS_MAPPING = { - "ignore": { - "GGUF": { - "version": "version", - "tensor_count": "tensor_count", - "kv_count": "kv_count", - }, - "general": {"file_type": "file_type", "quantization_version": "quantization_version"}, - }, -} - - -class GGMLTensor(torch.Tensor): - def __init__(self, dtype, axis): - self._dtype = dtype - self._axis = axis - - @property - def axis(self): - return self._axis - - @property - def dtype(self): - return self._dtype - - def numpy(self): - return self.dequantize().cpu().numpy() - - def clone(self, *args, **kwargs): - return self - - def detach(self, *args, **kwargs): - return self - - -def _gguf_parse_value(_value, data_type): - if not isinstance(data_type, list): - data_type = [data_type] - if len(data_type) == 1: - data_type = data_type[0] - array_data_type = None - else: - if data_type[0] != 9: - raise ValueError("Received multiple types, therefore expected the first type to indicate an array.") - data_type, array_data_type = data_type - - if data_type in [0, 1, 2, 3, 4, 5, 10, 11]: - _value = int(_value[0]) - elif data_type in [6, 12]: - _value = float(_value[0]) - elif data_type in [7]: - _value = bool(_value[0]) - elif data_type in [8]: - _value = array("B", list(_value)).tobytes().decode() - elif data_type in [9]: - _value = _gguf_parse_value(_value, array_data_type) - return _value - - -def read_field(reader, field): - value = reader.fields[field] - return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] - - -def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): - """ - Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed - tokenizer and config attributes. - - Args: - gguf_checkpoint_path (`str`): - The path the to GGUF file to load - return_tensors (`bool`, defaults to `True`): - Whether to read the tensors from the file and return them. Not doing so is faster - and only loads the metadata in memory. - """ - - if is_gguf_available() and is_torch_available(): - from gguf import GGUFReader - else: - logger.error( - "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " - "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." 
- ) - raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") - - reader = GGUFReader(gguf_checkpoint_path) - fields = reader.fields - reader_keys = list(fields.keys()) - - parsed_parameters = {} - for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): - name = tensor.name - weights = torch.from_numpy(tensor.data) - - parsed_parameters[name] = weights - - if len(reader_keys) > 0: - logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - - return parsed_parameters diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index fceef2cb0ca3..dd00cd4c116e 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -17,6 +17,7 @@ from contextlib import nullcontext from typing import Optional +from huggingface_hub import QuestionAnsweringInput import torch from huggingface_hub.utils import validate_hf_hub_args @@ -218,18 +219,15 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = local_files_only=local_files_only, revision=revision, ) + is_gguf = "gguf_metadata" in checkpoint + gguf_metadata = checkpoint["gguf_metadata"] if is_gguf else None - pre_quantized = "quantization_config" in config and config["quantization_config"] is not None - if pre_quantized or quantization_config is not None: - if pre_quantized: - config["quantization_config"] = DiffusersAutoQuantizer.merge_quantization_configs( - config["quantization_config"], quantization_config - ) - else: - config["quantization_config"] = quantization_config - hf_quantizer = DiffusersAutoQuantizer.from_config( - config["quantization_config"], pre_quantized=pre_quantized - ) + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + if is_gguf: + quantization_config = GGUFConfig() + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=False) else: hf_quantizer = None diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 236fbd0c2295..4c50e36dc4f8 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -353,9 +353,12 @@ def load_single_file_checkpoint( checkpoint = load_state_dict(pretrained_model_link_or_path) - # some checkpoints contain the model state dict under a "state_dict" key - while "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] + if "gguf_qtypes" in checkpoint: + return checkpoint + else: + # some checkpoints contain the model state dict under a "state_dict" key + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] return checkpoint diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 5277ad2f9389..0a7522591382 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -25,6 +25,17 @@ import torch from huggingface_hub.utils import EntryNotFoundError +from diffusers.utils.constants import GGUF_FILE_EXTENSION +from array import array + +import torch +from tqdm import tqdm + +from ..utils import is_torch_available +from ..utils.import_utils import is_gguf_available +from ..utils.logging import get_logger + + from ..quantizers.quantization_config import QuantizationMethod from ..utils import ( SAFE_WEIGHTS_INDEX_NAME, @@ -140,6 +151,8 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[ 
file_extension = os.path.basename(checkpoint_file).split(".")[-1] if file_extension == SAFETENSORS_FILE_EXTENSION: return safetensors.torch.load_file(checkpoint_file, device="cpu") + elif file_extension == GGUF_FILE_EXTENSION: + return load_gguf_checkpoint(checkpoint_file) else: weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {} return torch.load( @@ -389,3 +402,73 @@ def _fetch_index_file_legacy( index_file = None return index_file + + +def _gguf_parse_value(_value, data_type): + if not isinstance(data_type, list): + data_type = [data_type] + if len(data_type) == 1: + data_type = data_type[0] + array_data_type = None + else: + if data_type[0] != 9: + raise ValueError("Received multiple types, therefore expected the first type to indicate an array.") + data_type, array_data_type = data_type + + if data_type in [0, 1, 2, 3, 4, 5, 10, 11]: + _value = int(_value[0]) + elif data_type in [6, 12]: + _value = float(_value[0]) + elif data_type in [7]: + _value = bool(_value[0]) + elif data_type in [8]: + _value = array("B", list(_value)).tobytes().decode() + elif data_type in [9]: + _value = _gguf_parse_value(_value, array_data_type) + return _value + + +def read_field(reader, field): + value = reader.fields[field] + return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] + + +def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): + """ + Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed + tokenizer and config attributes. + + Args: + gguf_checkpoint_path (`str`): + The path the to GGUF file to load + return_tensors (`bool`, defaults to `True`): + Whether to read the tensors from the file and return them. Not doing so is faster + and only loads the metadata in memory. + """ + + if is_gguf_available() and is_torch_available(): + from gguf import GGUFReader + else: + logger.error( + "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " + "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." 
+ ) + raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") + + reader = GGUFReader(gguf_checkpoint_path) + fields = reader.fields + reader_keys = list(fields.keys()) + + parsed_parameters = {} + qtypes = {} + for tensor in tqdm(reader.tensors): + name = tensor.name + weights = torch.from_numpy(tensor.data) + + parsed_parameters[name] = weights + qtypes[name] = str(tensor.tensor_type) + + if len(reader_keys) > 0: + logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") + + return {"state_dict": parsed_parameters, "gguf_metadata": qtypes} diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py new file mode 100644 index 000000000000..1be035fe478a --- /dev/null +++ b/src/diffusers/quantizers/gguf/utils.py @@ -0,0 +1,6 @@ +import torch + + +class GGUFParameter(torch.nn.Parameter): + def __init__(self, data): + super().__init__() diff --git a/src/diffusers/utils/constants.py b/src/diffusers/utils/constants.py index 553ac5d1bb27..93b0cd847d91 100644 --- a/src/diffusers/utils/constants.py +++ b/src/diffusers/utils/constants.py @@ -34,6 +34,7 @@ SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors" SAFE_WEIGHTS_INDEX_NAME = "diffusion_pytorch_model.safetensors.index.json" SAFETENSORS_FILE_EXTENSION = "safetensors" +GGUF_FILE_EXTENSION = "gguf" ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb" HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co") DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py index 2489604274b4..c1432fee5211 100644 --- a/tests/models/test_attention_processor.py +++ b/tests/models/test_attention_processor.py @@ -6,6 +6,7 @@ from diffusers import DiffusionPipeline from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor +import pytest class AttnAddedKVProcessorTests(unittest.TestCase): @@ -83,6 +84,7 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None ) + torch.cuda.synchronize() pre_conversion = pipe( "foo", @@ -95,6 +97,7 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", device_map="balanced", safety_checker=None ) + torch.cuda.synchronize() conversion = pipe( "foo", From 2e6d3405e3404b9f91b9de880e6854594cc25bf0 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Sun, 3 Nov 2024 09:40:38 +0100 Subject: [PATCH 07/43] update --- src/diffusers/loaders/single_file_model.py | 3 +- src/diffusers/models/model_loading_utils.py | 24 +++- .../quantizers/gguf/gguf_quantizer.py | 105 ++++++------------ src/diffusers/quantizers/gguf/utils.py | 55 ++++++++- .../quantizers/quantization_config.py | 14 ++- 5 files changed, 119 insertions(+), 82 deletions(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index dd00cd4c116e..2c04ba58bafa 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -222,11 +222,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = is_gguf = "gguf_metadata" in checkpoint gguf_metadata = checkpoint["gguf_metadata"] if is_gguf else None + # For GGUF models we nest the state_dict along with gguf_metadata while "state_dict" in checkpoint: checkpoint = checkpoint["state_dict"] 
if is_gguf: - quantization_config = GGUFConfig() + quantization_config = GGUFQuantizationConfig() hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=False) else: hf_quantizer = None diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 0a7522591382..c650f35076d5 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -60,6 +60,25 @@ } +_GGUF_FILE_TYPE_MAPPING = { + 0: "ALL_F32", + 1: "MOSTLY_F16", + 2: "MOSTLY_Q4_0", + 3: "MOSTLY_Q4_1", + 4: "MOSTLY_Q4_1_SOME_F16", + 8: "MOSTLY_Q5_0", + 9: "MOSTLY_Q5_1", + 10: "MOSTLY_Q2_K", + 11: "MOSTLY_Q3_K_S", + 12: "MOSTLY_Q3_K_M", + 13: "MOSTLY_Q3_K_L", + 14: "MOSTLY_Q4_K_S", + 15: "MOSTLY_Q4_K_M", + 16: "MOSTLY_Q5_K_S", + 17: "MOSTLY_Q5_K_M", + 18: "MOSTLY_Q6_K", +} + if is_accelerate_available(): from accelerate import infer_auto_device_map from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device @@ -460,15 +479,14 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - qtypes = {} + metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]]} for tensor in tqdm(reader.tensors): name = tensor.name weights = torch.from_numpy(tensor.data) parsed_parameters[name] = weights - qtypes[name] = str(tensor.tensor_type) if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - return {"state_dict": parsed_parameters, "gguf_metadata": qtypes} + return {"state_dict": parsed_parameters, "gguf_metadata": metadata} diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index bc6c87ca09ce..eef3f7e43ca4 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,8 +1,9 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + from ...utils import get_module_from_name from ..base import DiffusersQuantizer - +from .utils import GGUFLinear if TYPE_CHECKING: from ...models.modeling_utils import ModelMixin @@ -10,25 +11,28 @@ from ...utils import ( is_accelerate_available, is_accelerate_version, - is_bitsandbytes_available, - is_bitsandbytes_version, is_torch_available, logging, ) +if accelerate_is_available(): + from accelerate import init_empty_weights if is_torch_available(): import torch + import torch.nn as nn -logger = logging.get_logger(__name__) +logger = logging.get_logger(__name__) -class GGUFQuantizer(DiffusersQuantizer) - use_keep_in_fp32_modules = True +class GGUFQuantizer(DiffusersQuantizer): def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) + self.quant_type = quantization_config.quant_type + self.compute_dtype = quantization_config.compute_dtype + def check_quantized_param( self, model: "ModelMixin", @@ -36,8 +40,7 @@ def check_quantized_param( param_name: str, state_dict: Dict[str, Any], **kwargs, - ) -> bool: - + ) -> bool: return def create_quantized_param( @@ -49,70 +52,24 @@ def create_quantized_param( state_dict: Dict[str, Any], unexpected_keys: Optional[List[str]] = None, ): - import bitsandbytes as bnb - - module, tensor_name = get_module_from_name(model, param_name) - - if tensor_name not in module._parameters: - raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - - old_value = getattr(module, 
tensor_name) - - if tensor_name == "bias": - if param_value is None: - new_value = old_value.to(target_device) - else: - new_value = param_value.to(target_device) - - new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad) - module._parameters[tensor_name] = new_value - return - - if not isinstance(module._parameters[tensor_name], bnb.nn.Params4bit): - raise ValueError("this function only loads `Linear4bit components`") - if ( - old_value.device == torch.device("meta") - and target_device not in ["meta", torch.device("meta")] - and param_value is None - ): - raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.") - - # construct `new_value` for the module._parameters[tensor_name]: - if self.pre_quantized: - # 4bit loading. Collecting components for restoring quantized weight - # This can be expanded to make a universal call for any quantized weight loading - - if not self.is_serializable: - raise ValueError( - "Detected int4 weights but the version of bitsandbytes is not compatible with int4 serialization. " - "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." - ) - - if (param_name + ".quant_state.bitsandbytes__fp4" not in state_dict) and ( - param_name + ".quant_state.bitsandbytes__nf4" not in state_dict - ): - raise ValueError( - f"Supplied state dict for {param_name} does not contain `bitsandbytes__*` and possibly other `quantized_stats` components." - ) - - quantized_stats = {} - for k, v in state_dict.items(): - # `startswith` to counter for edge cases where `param_name` - # substring can be present in multiple places in the `state_dict` - if param_name + "." in k and k.startswith(param_name): - quantized_stats[k] = v - if unexpected_keys is not None and k in unexpected_keys: - unexpected_keys.remove(k) - - new_value = bnb.nn.Params4bit.from_prequantized( - data=param_value, - quantized_stats=quantized_stats, - requires_grad=False, - device=target_device, - ) - else: - new_value = param_value.to("cpu") - kwargs = old_value.__dict__ - new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device) + return - module._parameters[tensor_name] = new_value + def _process_model_before_weight_loading( + self, + model: "ModelMixin", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + for name, module in model.named_children(): + if isinstance(module, nn.Linear) and name not in modules_to_not_convert: + with init_empty_weights(): + in_features = module.in_features + out_features = module.out_features + model._modules[name] = GGUFLinear( + in_features, + out_features, + module.bias is not None, + compute_dtype=self.compute_dtype, + quant_type=self.quant_type, + ) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 1be035fe478a..ac38fe63480c 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -1,6 +1,55 @@ import torch +import torch.nn as nn +import gguf -class GGUFParameter(torch.nn.Parameter): - def __init__(self, data): - super().__init__() +QK_K_BLOCKSIZE = 256 +K_SCALE_SIZE = 12 + + +def split_block_dims(blocks, *args): + n_max = blocks.shape[1] + dims = list(args) + [n_max - sum(args)] + return torch.split(blocks, dims, dim=1) + + +def dequantize_Q2_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + scales, qs, d, dmin = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K // 4, 2) + d = 
d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) + + # (n_blocks, 16, 1) + dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + + shift = torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) + + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 + qs = qs.reshape((n_blocks, QK_K_BLOCKSIZE // 16, 16)) + qs = dl * qs - ml + + return qs.reshape((n_blocks, -1)) + + +class GGUFLinear(nn.Linear): + def __init__( + self, + in_features, + out_features, + bias=False, + compute_dtype=None, + quant_type=None, + device=None, + ) -> None: + super().__init__(in_features, out_features, bias, device) + self._dequant_fn = gguf.quants.dequantize + self.compute_dtype = compute_dtype + self.quant_type = quant_type + + def forward(self, inputs): + weight = self._dequant_fn(self.weight, self.quant_type).to(self.compute_dtype) + bias = self._dequant_fn(self.bias, self.quant_type).to(self.compute_dtype) + + return torch.nn.functional.linear(inputs, weight, bias) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index f521c5d717d6..213c5ae57da6 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -32,7 +32,6 @@ from ..utils import is_torch_available, logging - if is_torch_available(): import torch @@ -389,3 +388,16 @@ def to_diff_dict(self) -> Dict[str, Any]: serializable_config_dict[key] = value return serializable_config_dict + + +class GGUFQuantizationConfig(QuantizationConfigMixin): + def __init__(self, quant_type: str, compute_dtype=None, quant_storage=None): + self.quant_type = quant_type + self.compute_dtype = compute_dtype + self.quant_storage = quant_storage + + if self.compute_dtype is None: + self.compute_dtype = torch.float32 + + if self.quant_storage is None: + self.quant_storage = torch.uint8 From b5f927c2cda3de767aa5028925fb636baa83405f Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 11 Nov 2024 21:37:33 +0530 Subject: [PATCH 08/43] update --- src/diffusers/loaders/single_file_model.py | 14 +++++++------- src/diffusers/quantizers/gguf/gguf_quantizer.py | 17 +++++++++++------ src/diffusers/quantizers/gguf/utils.py | 16 ++++++++-------- src/diffusers/utils/__init__.py | 1 + 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 2c04ba58bafa..f3e163126c43 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -22,7 +22,7 @@ from huggingface_hub.utils import validate_hf_hub_args from ..quantizers import DiffusersAutoQuantizer -from ..utils import deprecate, is_accelerate_available, logging +from ..utils import deprecate, is_accelerate_available, is_gguf_available, logging from .single_file_utils import ( SingleFileComponentError, convert_animatediff_checkpoint_to_diffusers, @@ -49,6 +49,9 @@ from ..models.modeling_utils import load_model_dict_into_meta +if is_gguf_available(): + from ..quantizers import GGUFQuantizationConfig + SINGLE_FILE_LOADABLE_CLASSES = { "StableCascadeUNet": { @@ -227,15 +230,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = checkpoint = checkpoint["state_dict"] if is_gguf: - quantization_config = GGUFQuantizationConfig() - hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=False) + 
quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) + # Only support loading pre_quantized gguf checkpoints + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) else: hf_quantizer = None - if hf_quantizer is not None: - hf_quantizer.validate_environment(torch_dtype=torch_dtype) - torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) - # Check if `_keep_in_fp32_modules` is not None use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index eef3f7e43ca4..bb4607094a8c 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -15,7 +15,7 @@ logging, ) -if accelerate_is_available(): +if is_accelerate_available(): from accelerate import init_empty_weights if is_torch_available(): @@ -52,6 +52,13 @@ def create_quantized_param( state_dict: Dict[str, Any], unexpected_keys: Optional[List[str]] = None, ): + module, tensor_name = get_module_from_name(model, param_name) + if tensor_name not in module._parameters: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + __import__("ipdb").set_trace() + + module._parameters[tensor_name] = param_value + return def _process_model_before_weight_loading( @@ -62,13 +69,11 @@ def _process_model_before_weight_loading( **kwargs, ): for name, module in model.named_children(): - if isinstance(module, nn.Linear) and name not in modules_to_not_convert: + if isinstance(module, nn.Linear) and name not in self.modules_to_not_convert: with init_empty_weights(): - in_features = module.in_features - out_features = module.out_features model._modules[name] = GGUFLinear( - in_features, - out_features, + module.in_features, + module.out_features, module.bias is not None, compute_dtype=self.compute_dtype, quant_type=self.quant_type, diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index ac38fe63480c..6923865ecb05 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,20 +13,20 @@ def split_block_dims(blocks, *args): return torch.split(blocks, dims, dim=1) -def dequantize_Q2_K(blocks, block_size, type_size, dtype=None): +def dequantize_Q2_K(blocks, dtype=None): n_blocks = blocks.shape[0] - scales, qs, d, dmin = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K // 4, 2) - d = d.view(torch.float16).to(dtype) - dmin = dmin.view(torch.float16).to(dtype) + scales, quantized_values, delta, delta_min = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K_BLOCKSIZE // 4, 2) + delta = delta.view(torch.float16).to(dtype) + delta_min = delta_min.view(torch.float16).to(dtype) # (n_blocks, 16, 1) - dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) - ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + dl = (delta * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + ml = (delta_min * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) - shift = torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) + shift = torch.tensor([0, 2, 4, 6], device=delta.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) - qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 + qs = (quantized_values.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 qs 
= qs.reshape((n_blocks, QK_K_BLOCKSIZE // 16, 16)) qs = dl * qs - ml diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index c8f64adf3e8a..da2cd55afa03 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -93,6 +93,7 @@ is_unidecode_available, is_wandb_available, is_xformers_available, + is_gguf_available, requires_backends, ) from .loading_utils import get_module_from_name, load_image, load_video From 6dc5d225a4545b86b19a0b8db0eebd9ade4a48b7 Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 13 Nov 2024 16:57:33 +0530 Subject: [PATCH 09/43] update --- src/diffusers/loaders/single_file_model.py | 43 +++++---- src/diffusers/loaders/single_file_utils.py | 2 +- src/diffusers/models/model_loading_utils.py | 51 ++++------- src/diffusers/quantizers/auto.py | 2 + src/diffusers/quantizers/gguf/__init__.py | 1 + .../quantizers/gguf/gguf_quantizer.py | 55 ++++++++---- src/diffusers/quantizers/gguf/utils.py | 88 +++++++++++++++++-- .../quantizers/quantization_config.py | 6 +- src/diffusers/utils/__init__.py | 2 +- 9 files changed, 169 insertions(+), 81 deletions(-) create mode 100644 src/diffusers/quantizers/gguf/__init__.py diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index f3e163126c43..7ce4460eb674 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -17,7 +17,6 @@ from contextlib import nullcontext from typing import Optional -from huggingface_hub import QuestionAnsweringInput import torch from huggingface_hub.utils import validate_hf_hub_args @@ -50,7 +49,7 @@ from ..models.modeling_utils import load_model_dict_into_meta if is_gguf_available(): - from ..quantizers import GGUFQuantizationConfig + from ..quantizers.quantization_config import GGUFQuantizationConfig SINGLE_FILE_LOADABLE_CLASSES = { @@ -229,25 +228,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = while "state_dict" in checkpoint: checkpoint = checkpoint["state_dict"] - if is_gguf: - quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) - # Only support loading pre_quantized gguf checkpoints - hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) - else: - hf_quantizer = None - - # Check if `_keep_in_fp32_modules` is not None - use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( - (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") - ) - if use_keep_in_fp32_modules: - keep_in_fp32_modules = cls._keep_in_fp32_modules - if not isinstance(keep_in_fp32_modules, list): - keep_in_fp32_modules = [keep_in_fp32_modules] - - else: - keep_in_fp32_modules = [] - mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name] checkpoint_mapping_fn = mapping_functions["checkpoint_mapping_fn"] @@ -327,8 +307,27 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = with ctx(): model = cls.from_config(diffusers_model_config) + if is_gguf: + quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) + # Only support loading pre_quantized gguf checkpoints + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) + else: + hf_quantizer = None + + # Check if `_keep_in_fp32_modules` is not None + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( + (torch_dtype == torch.float16) or hasattr(hf_quantizer, 
"use_keep_in_fp32_modules") + ) + if use_keep_in_fp32_modules: + keep_in_fp32_modules = cls._keep_in_fp32_modules + if not isinstance(keep_in_fp32_modules, list): + keep_in_fp32_modules = [keep_in_fp32_modules] + + else: + keep_in_fp32_modules = [] + if hf_quantizer is not None: - hf_quantizer.preprocess_model(model=model, keep_in_fp32_modules=keep_in_fp32_modules) + hf_quantizer.preprocess_model(model=model, device_map=None, keep_in_fp32_modules=keep_in_fp32_modules) if is_accelerate_available(): unexpected_keys = load_model_dict_into_meta( diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index c2c8496af5d9..a24317783e8e 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -357,7 +357,7 @@ def load_single_file_checkpoint( checkpoint = load_state_dict(pretrained_model_link_or_path) - if "gguf_qtypes" in checkpoint: + if "gguf_metadata" in checkpoint: return checkpoint else: # some checkpoints contain the model state dict under a "state_dict" key diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 3f7be9ff246a..0a6f24865c69 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -17,6 +17,7 @@ import importlib import inspect import os +from array import array from collections import OrderedDict from pathlib import Path from typing import List, Optional, Union @@ -24,17 +25,9 @@ import safetensors import torch from huggingface_hub.utils import EntryNotFoundError - -from diffusers.utils.constants import GGUF_FILE_EXTENSION -from array import array - -import torch from tqdm import tqdm -from ..utils import is_torch_available -from ..utils.import_utils import is_gguf_available -from ..utils.logging import get_logger - +from diffusers.utils.constants import GGUF_FILE_EXTENSION from ..quantizers.quantization_config import QuantizationMethod from ..utils import ( @@ -45,9 +38,11 @@ _get_model_file, deprecate, is_accelerate_available, + is_torch_available, is_torch_version, logging, ) +from ..utils.import_utils import is_gguf_available logger = logging.get_logger(__name__) @@ -60,25 +55,6 @@ } -_GGUF_FILE_TYPE_MAPPING = { - 0: "ALL_F32", - 1: "MOSTLY_F16", - 2: "MOSTLY_Q4_0", - 3: "MOSTLY_Q4_1", - 4: "MOSTLY_Q4_1_SOME_F16", - 8: "MOSTLY_Q5_0", - 9: "MOSTLY_Q5_1", - 10: "MOSTLY_Q2_K", - 11: "MOSTLY_Q3_K_S", - 12: "MOSTLY_Q3_K_M", - 13: "MOSTLY_Q3_K_L", - 14: "MOSTLY_Q4_K_S", - 15: "MOSTLY_Q4_K_M", - 16: "MOSTLY_Q5_K_S", - 17: "MOSTLY_Q5_K_M", - 18: "MOSTLY_Q6_K", -} - if is_accelerate_available(): from accelerate import infer_auto_device_map from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device @@ -245,12 +221,13 @@ def load_model_dict_into_meta( # bnb params are flattened. if empty_state_dict[param_name].shape != param.shape: if ( - is_quant_method_bnb + is_quantized and hf_quantizer.pre_quantized and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device) ): hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape) - elif not is_quant_method_bnb: + else: + __import__('ipdb').set_trace() model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" raise ValueError( f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. 
If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." @@ -473,7 +450,10 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): """ if is_gguf_available() and is_torch_available(): + import gguf from gguf import GGUFReader + + from ..quantizers.gguf.utils import _GGUF_FILE_TYPE_MAPPING, GGUFParameter else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " @@ -486,12 +466,17 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]]} + metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]], "qtypes": {}} + for tensor in tqdm(reader.tensors): name = tensor.name - weights = torch.from_numpy(tensor.data) + tensor_type = tensor.tensor_type - parsed_parameters[name] = weights + # if the tensor is a torch supported dtype do not use GGUFParameter + is_gguf_quant = tensor_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] + + weights = torch.from_numpy(tensor.data) + parsed_parameters[name] = GGUFParameter(weights, tensor_type=tensor_type) if is_gguf_quant else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 97cbcdc0e53f..02a8b4fe917c 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -19,12 +19,14 @@ from typing import Dict, Optional, Union from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer +from .gguf import GGUFQuantizer from .quantization_config import BitsAndBytesConfig, QuantizationConfigMixin, QuantizationMethod AUTO_QUANTIZER_MAPPING = { "bitsandbytes_4bit": BnB4BitDiffusersQuantizer, "bitsandbytes_8bit": BnB8BitDiffusersQuantizer, + "gguf": GGUFQuantizer } AUTO_QUANTIZATION_CONFIG_MAPPING = { diff --git a/src/diffusers/quantizers/gguf/__init__.py b/src/diffusers/quantizers/gguf/__init__.py new file mode 100644 index 000000000000..b3d9082ac803 --- /dev/null +++ b/src/diffusers/quantizers/gguf/__init__.py @@ -0,0 +1 @@ +from .gguf_quantizer import GGUFQuantizer diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index bb4607094a8c..02d01e179676 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,26 +1,25 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union - +from typing import TYPE_CHECKING, Any, Dict, List, Optional from ...utils import get_module_from_name from ..base import DiffusersQuantizer -from .utils import GGUFLinear +from .utils import _replace_with_gguf_linear + if TYPE_CHECKING: from ...models.modeling_utils import ModelMixin from ...utils import ( is_accelerate_available, - is_accelerate_version, is_torch_available, logging, ) + if is_accelerate_available(): - from accelerate import init_empty_weights + pass if is_torch_available(): import torch - import torch.nn as nn logger = logging.get_logger(__name__) @@ -32,6 +31,8 @@ def 
__init__(self, quantization_config, **kwargs): self.quant_type = quantization_config.quant_type self.compute_dtype = quantization_config.compute_dtype + self.qtypes = quantization_config.qtypes + self.pre_quantized = True def check_quantized_param( self, @@ -41,7 +42,20 @@ def check_quantized_param( state_dict: Dict[str, Any], **kwargs, ) -> bool: - return + return True + + def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): + return True + + def check_if_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ) -> bool: + return True def create_quantized_param( self, @@ -55,7 +69,9 @@ def create_quantized_param( module, tensor_name = get_module_from_name(model, param_name) if tensor_name not in module._parameters: raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - __import__("ipdb").set_trace() + + if param_name == "transformer_blocks.0.attn.to_q.weight": + __import__("ipdb").set_trace() module._parameters[tensor_name] = param_value @@ -68,13 +84,16 @@ def _process_model_before_weight_loading( keep_in_fp32_modules: List[str] = [], **kwargs, ): - for name, module in model.named_children(): - if isinstance(module, nn.Linear) and name not in self.modules_to_not_convert: - with init_empty_weights(): - model._modules[name] = GGUFLinear( - module.in_features, - module.out_features, - module.bias is not None, - compute_dtype=self.compute_dtype, - quant_type=self.quant_type, - ) + model = _replace_with_gguf_linear(model, self.compute_dtype, self.quant_type) + + def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): + return model + + @property + def is_serializable(self): + return False + + @property + def is_trainable(self) -> bool: + # Because we're mandating `bitsandbytes` 0.43.3. 
+ return False diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 6923865ecb05..d3d71d00507f 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -1,7 +1,26 @@ import torch +from torch._prims_common import is_low_precision_dtype import torch.nn as nn import gguf +_GGUF_FILE_TYPE_MAPPING = { + 0: "ALL_F32", + 1: "MOSTLY_F16", + 2: "MOSTLY_Q4_0", + 3: "MOSTLY_Q4_1", + 4: "MOSTLY_Q4_1_SOME_F16", + 8: "MOSTLY_Q5_0", + 9: "MOSTLY_Q5_1", + 10: "MOSTLY_Q2_K", + 11: "MOSTLY_Q3_K_S", + 12: "MOSTLY_Q3_K_M", + 13: "MOSTLY_Q3_K_L", + 14: "MOSTLY_Q4_K_S", + 15: "MOSTLY_Q4_K_M", + 16: "MOSTLY_Q5_K_S", + 17: "MOSTLY_Q5_K_M", + 18: "MOSTLY_Q6_K", +} QK_K_BLOCKSIZE = 256 K_SCALE_SIZE = 12 @@ -33,6 +52,62 @@ def dequantize_Q2_K(blocks, dtype=None): return qs.reshape((n_blocks, -1)) +dequantize_fns = { + "MOSTLY_Q2_K": dequantize_Q2_K, +} + + +def _replace_with_gguf_linear(model, compute_dtype, quant_type, qtypes=None): + for name, module in model.named_children(): + if isinstance(module, nn.Linear): + model._modules[name] = GGUFLinear( + module.in_features, + module.out_features, + module.bias is not None, + compute_dtype=compute_dtype, + quant_type=quant_type, + ) + model._modules[name].source_cls = type(module) + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + + has_children = list(module.children()) + if has_children: + _replace_with_gguf_linear(module, compute_dtype, quant_type) + + return model + + +class GGUFParameter(torch.nn.Parameter): + def __new__(cls, data, requires_grad=False, tensor_type=None): + data = data if data is not None else torch.empty(0) + self = torch.Tensor._make_subclass(cls, data, requires_grad) + self.tensor_type = tensor_type + + return self + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + result = super().__torch_function__(func, types, args, kwargs) + + tensor_type = None + for arg in args: + if isinstance(arg, GGUFParameter): + tensor_type = arg.tensor_type + break + if isinstance(result, torch.Tensor): + return cls(result, tensor_type=tensor_type) + # Handle tuples and lists + elif isinstance(result, (tuple, list)): + # Preserve the original type (tuple or list) + wrapped = [cls(x, tensor_type=tensor_type) if isinstance(x, torch.Tensor) else x for x in result] + return type(result)(wrapped) + else: + return result + + class GGUFLinear(nn.Linear): def __init__( self, @@ -44,12 +119,15 @@ def __init__( device=None, ) -> None: super().__init__(in_features, out_features, bias, device) - self._dequant_fn = gguf.quants.dequantize self.compute_dtype = compute_dtype self.quant_type = quant_type + self._dequant_fn = dequantize_fns[self.quant_type] def forward(self, inputs): - weight = self._dequant_fn(self.weight, self.quant_type).to(self.compute_dtype) - bias = self._dequant_fn(self.bias, self.quant_type).to(self.compute_dtype) - - return torch.nn.functional.linear(inputs, weight, bias) + is_gguf_quant = hasattr(self.weight, "tensor_type") + if is_gguf_quant: + weight = self._dequant_fn(self.weight, torch.uint8).to(self.compute_dtype) + else: + weight = self.weight + __import__("ipdb").set_trace() + return torch.nn.functional.linear(inputs, weight, self.bias) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 213c5ae57da6..05cd997900eb 100644 --- a/src/diffusers/quantizers/quantization_config.py 
+++ b/src/diffusers/quantizers/quantization_config.py @@ -32,6 +32,7 @@ from ..utils import is_torch_available, logging + if is_torch_available(): import torch @@ -40,6 +41,7 @@ class QuantizationMethod(str, Enum): BITS_AND_BYTES = "bitsandbytes" + GGUF = "gguf" @dataclass @@ -391,10 +393,12 @@ def to_diff_dict(self) -> Dict[str, Any]: class GGUFQuantizationConfig(QuantizationConfigMixin): - def __init__(self, quant_type: str, compute_dtype=None, quant_storage=None): + def __init__(self, quant_type: str, qtypes=None, compute_dtype=None, quant_storage=None): + self.quant_method = QuantizationMethod.GGUF self.quant_type = quant_type self.compute_dtype = compute_dtype self.quant_storage = quant_storage + self.qtypes = qtypes if self.compute_dtype is None: self.compute_dtype = torch.float32 diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index da2cd55afa03..24e324ac4382 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -66,6 +66,7 @@ is_bs4_available, is_flax_available, is_ftfy_available, + is_gguf_available, is_google_colab, is_inflect_available, is_invisible_watermark_available, @@ -93,7 +94,6 @@ is_unidecode_available, is_wandb_available, is_xformers_available, - is_gguf_available, requires_backends, ) from .loading_utils import get_module_from_name, load_image, load_video From 428e44be60c633395363bfa37043ea45a8d13dcd Mon Sep 17 00:00:00 2001 From: DN6 Date: Fri, 15 Nov 2024 16:11:46 +0530 Subject: [PATCH 10/43] update --- src/diffusers/__init__.py | 2 +- src/diffusers/loaders/single_file_model.py | 22 +- src/diffusers/loaders/single_file_utils.py | 9 +- src/diffusers/models/model_loading_utils.py | 32 +- src/diffusers/quantizers/auto.py | 2 +- .../quantizers/gguf/gguf_quantizer.py | 30 +- src/diffusers/quantizers/gguf/utils.py | 350 +++++++++++++++--- .../quantizers/quantization_config.py | 4 +- src/diffusers/utils/__init__.py | 1 + 9 files changed, 339 insertions(+), 113 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 533aa5de1e87..a21f44982a61 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -31,7 +31,7 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], - "quantizers.quantization_config": ["BitsAndBytesConfig"], + "quantizers.quantization_config": ["BitsAndBytesConfig", "GGUFQuantizationConfig"], "schedulers": [], "utils": [ "OptionalDependencyNotAvailable", diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 7ce4460eb674..b27ce1c4c84d 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -21,7 +21,7 @@ from huggingface_hub.utils import validate_hf_hub_args from ..quantizers import DiffusersAutoQuantizer -from ..utils import deprecate, is_accelerate_available, is_gguf_available, logging +from ..utils import deprecate, is_accelerate_available, logging from .single_file_utils import ( SingleFileComponentError, convert_animatediff_checkpoint_to_diffusers, @@ -48,9 +48,6 @@ from ..models.modeling_utils import load_model_dict_into_meta -if is_gguf_available(): - from ..quantizers.quantization_config import GGUFQuantizationConfig - SINGLE_FILE_LOADABLE_CLASSES = { "StableCascadeUNet": { @@ -221,12 +218,11 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = local_files_only=local_files_only, revision=revision, ) - is_gguf = "gguf_metadata" in checkpoint - gguf_metadata = checkpoint["gguf_metadata"] if 
is_gguf else None + if quantization_config is not None: + hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config) - # For GGUF models we nest the state_dict along with gguf_metadata - while "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] + else: + hf_quantizer = None mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name] @@ -307,13 +303,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = with ctx(): model = cls.from_config(diffusers_model_config) - if is_gguf: - quantization_config = GGUFQuantizationConfig(quant_type=gguf_metadata["gguf_file_type"]) - # Only support loading pre_quantized gguf checkpoints - hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config, pre_quantized=True) - else: - hf_quantizer = None - # Check if `_keep_in_fp32_modules` is not None use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") @@ -352,7 +341,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = if hf_quantizer is not None: hf_quantizer.postprocess_model(model) - model.hf_quantizer = hf_quantizer if torch_dtype is not None: model.to(torch_dtype) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index a24317783e8e..d1bad8b5a7cd 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -357,12 +357,9 @@ def load_single_file_checkpoint( checkpoint = load_state_dict(pretrained_model_link_or_path) - if "gguf_metadata" in checkpoint: - return checkpoint - else: - # some checkpoints contain the model state dict under a "state_dict" key - while "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] + # some checkpoints contain the model state dict under a "state_dict" key + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] return checkpoint diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 0a6f24865c69..e2e7ec83ff2a 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -27,10 +27,8 @@ from huggingface_hub.utils import EntryNotFoundError from tqdm import tqdm -from diffusers.utils.constants import GGUF_FILE_EXTENSION - -from ..quantizers.quantization_config import QuantizationMethod from ..utils import ( + GGUF_FILE_EXTENSION, SAFE_WEIGHTS_INDEX_NAME, SAFETENSORS_FILE_EXTENSION, WEIGHTS_INDEX_NAME, @@ -188,7 +186,6 @@ def load_model_dict_into_meta( device = device or torch.device("cpu") dtype = dtype or torch.float32 is_quantized = hf_quantizer is not None - is_quant_method_bnb = getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys()) empty_state_dict = model.state_dict() @@ -219,6 +216,7 @@ def load_model_dict_into_meta( set_module_kwargs["dtype"] = dtype # bnb params are flattened. 
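
Aside on the shape check being relaxed here: GGUF tensors arrive as packed bytes, so their on-disk shape differs from the logical weight shape by a per-type (block_size, type_size) factor. The snippet below is a small sketch of that relationship using the `gguf` package's own size table; `quant_shape_from_byte_shape` is an illustrative helper mirroring the idea, not the exact function in the diff.

```python
import gguf


def quant_shape_from_byte_shape(shape, quant_type):
    block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
    return (*shape[:-1], shape[-1] // type_size * block_size)


# A (3072, 4096) weight stored as Q4_K uses 4096 // 256 * 144 = 2304 bytes per row,
# so the raw GGUF tensor arrives with shape (3072, 2304):
print(quant_shape_from_byte_shape((3072, 2304), gguf.GGMLQuantizationType.Q4_K))  # (3072, 4096)
```
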
+ # gguf quants have a different shape based on the type of quantization applied if empty_state_dict[param_name].shape != param.shape: if ( is_quantized @@ -227,7 +225,6 @@ def load_model_dict_into_meta( ): hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape) else: - __import__('ipdb').set_trace() model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" raise ValueError( f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." @@ -438,22 +435,22 @@ def read_field(reader, field): def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): """ - Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed - tokenizer and config attributes. + Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed tokenizer and config + attributes. Args: gguf_checkpoint_path (`str`): The path the to GGUF file to load return_tensors (`bool`, defaults to `True`): - Whether to read the tensors from the file and return them. Not doing so is faster - and only loads the metadata in memory. + Whether to read the tensors from the file and return them. Not doing so is faster and only loads the + metadata in memory. """ if is_gguf_available() and is_torch_available(): import gguf from gguf import GGUFReader - from ..quantizers.gguf.utils import _GGUF_FILE_TYPE_MAPPING, GGUFParameter + from ..quantizers.gguf.utils import GGUFParameter else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. 
Please see " @@ -466,19 +463,20 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - metadata = {"gguf_file_type": _GGUF_FILE_TYPE_MAPPING[read_field(reader, "general.file_type")[0]], "qtypes": {}} - for tensor in tqdm(reader.tensors): name = tensor.name - tensor_type = tensor.tensor_type + quant_type = tensor.tensor_type # if the tensor is a torch supported dtype do not use GGUFParameter - is_gguf_quant = tensor_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] - + is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] weights = torch.from_numpy(tensor.data) - parsed_parameters[name] = GGUFParameter(weights, tensor_type=tensor_type) if is_gguf_quant else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) + parsed_parameters[name] = ( + GGUFParameter(weights, quant_type=quant_type) + if is_gguf_quant + else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) + ) if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - return {"state_dict": parsed_parameters, "gguf_metadata": metadata} + return parsed_parameters diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 02a8b4fe917c..54000fb85330 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -26,7 +26,7 @@ AUTO_QUANTIZER_MAPPING = { "bitsandbytes_4bit": BnB4BitDiffusersQuantizer, "bitsandbytes_8bit": BnB8BitDiffusersQuantizer, - "gguf": GGUFQuantizer + "gguf": GGUFQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 02d01e179676..e7c5647f3dd9 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -2,7 +2,7 @@ from ...utils import get_module_from_name from ..base import DiffusersQuantizer -from .utils import _replace_with_gguf_linear +from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear if TYPE_CHECKING: @@ -29,23 +29,12 @@ class GGUFQuantizer(DiffusersQuantizer): def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) - self.quant_type = quantization_config.quant_type self.compute_dtype = quantization_config.compute_dtype - self.qtypes = quantization_config.qtypes self.pre_quantized = True - def check_quantized_param( - self, - model: "ModelMixin", - param_value: "torch.Tensor", - param_name: str, - state_dict: Dict[str, Any], - **kwargs, - ) -> bool: - return True - def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): - return True + if _quant_shape_from_byte_shape(loaded_param_shape) == current_param_shape: + return True def check_if_quantized_param( self, @@ -55,7 +44,11 @@ def check_if_quantized_param( state_dict: Dict[str, Any], **kwargs, ) -> bool: - return True + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module._parameters.get(tensor_name, None), GGUFParameter): + return True + + return False def create_quantized_param( self, @@ -70,13 +63,8 @@ def create_quantized_param( if tensor_name not in module._parameters: raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - if param_name == "transformer_blocks.0.attn.to_q.weight": - __import__("ipdb").set_trace() - module._parameters[tensor_name] = param_value 
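
Aside on why a tensor subclass is used for quantized weights: the loader needs the quantization type to travel with the data through splits, concatenations, and other torch ops performed during checkpoint conversion. The toy class below shows the `__torch_function__` mechanism that makes this possible; `TaggedTensor` and `tag` are illustrative names, not the class shipped by this patch.

```python
import torch


class TaggedTensor(torch.Tensor):
    """Toy analogue of a quant-aware tensor: it carries a `tag` through torch ops."""

    def __new__(cls, data, tag=None):
        self = torch.Tensor._make_subclass(cls, data, False)  # requires_grad=False
        self.tag = tag
        return self

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        result = super().__torch_function__(func, types, args, kwargs or {})
        tag = None
        for arg in args:  # find the first tagged input, also inside list args (e.g. torch.cat)
            if isinstance(arg, TaggedTensor):
                tag = arg.tag
                break
            if isinstance(arg, (list, tuple)) and arg and isinstance(arg[0], TaggedTensor):
                tag = arg[0].tag
                break
        if isinstance(result, TaggedTensor):
            result.tag = tag
        return result


x = TaggedTensor(torch.ones(4), tag="Q4_K")
y = torch.cat([x, x]) * 2.0
print(type(y).__name__, y.tag)  # TaggedTensor Q4_K
```
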
- return - def _process_model_before_weight_loading( self, model: "ModelMixin", @@ -84,7 +72,7 @@ def _process_model_before_weight_loading( keep_in_fp32_modules: List[str] = [], **kwargs, ): - model = _replace_with_gguf_linear(model, self.compute_dtype, self.quant_type) + _replace_with_gguf_linear(model, self.compute_dtype) def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): return model diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index d3d71d00507f..dfff6a11ec50 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -1,7 +1,21 @@ +# Copyright 2024 The HuggingFace Team and City96. All rights reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +import gguf import torch -from torch._prims_common import is_low_precision_dtype import torch.nn as nn -import gguf + _GGUF_FILE_TYPE_MAPPING = { 0: "ALL_F32", @@ -22,9 +36,39 @@ 18: "MOSTLY_Q6_K", } -QK_K_BLOCKSIZE = 256 + +def _replace_with_gguf_linear(model, compute_dtype): + for name, module in model.named_children(): + if isinstance(module, nn.Linear): + model._modules[name] = GGUFLinear( + module.in_features, + module.out_features, + module.bias is not None, + compute_dtype=compute_dtype, + ) + model._modules[name].source_cls = type(module) + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + + has_children = list(module.children()) + if has_children: + _replace_with_gguf_linear(module, compute_dtype) + + return model + + +QK_K = 256 K_SCALE_SIZE = 12 +# dequantize operations based on torch ports of GGUF dequantize_functions +# from City96 +# more info: https://github.com/city96/ComfyUI-GGUF/blob/main/dequant.py + + +def to_uint32(x): + x = x.view(torch.uint8).to(torch.int32) + return (x[:, 0] | x[:, 1] << 8 | x[:, 2] << 16 | x[:, 3] << 24).unsqueeze(1) + def split_block_dims(blocks, *args): n_max = blocks.shape[1] @@ -32,57 +76,270 @@ def split_block_dims(blocks, *args): return torch.split(blocks, dims, dim=1) -def dequantize_Q2_K(blocks, dtype=None): +def get_scale_min(scales): + n_blocks = scales.shape[0] + scales = scales.view(torch.uint8) + scales = scales.reshape((n_blocks, 3, 4)) + + d, m, m_d = torch.split(scales, scales.shape[-2] // 3, dim=-2) + + sc = torch.cat([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], dim=-1) + min = torch.cat([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], dim=-1) + + return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8))) + + +def dequantize_blocks_Q8_0(blocks, block_size, type_size, dtype=None): + d, x = split_block_dims(blocks, 2) + d = d.view(torch.float16).to(dtype) + x = x.view(torch.int8) + return d * x + + +def dequantize_blocks_Q5_1(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, m, qh, qs = split_block_dims(blocks, 2, 2, 4) + d = d.view(torch.float16).to(dtype) + m = m.view(torch.float16).to(dtype) + qh = to_uint32(qh) + + qh = qh.reshape((n_blocks, 1)) >> 
torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) + ql = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape(1, 1, 2, 1) + qh = (qh & 1).to(torch.uint8) + ql = (ql & 0x0F).reshape((n_blocks, -1)) + + qs = ql | (qh << 4) + return (d * qs) + m + + +def dequantize_blocks_Q5_0(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, qh, qs = split_block_dims(blocks, 2, 4) + d = d.view(torch.float16).to(dtype) + qh = to_uint32(qh) + + qh = qh.reshape(n_blocks, 1) >> torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) + ql = qs.reshape(n_blocks, -1, 1, block_size // 2) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape(1, 1, 2, 1) + + qh = (qh & 1).to(torch.uint8) + ql = (ql & 0x0F).reshape(n_blocks, -1) + + qs = (ql | (qh << 4)).to(torch.int8) - 16 + return d * qs + + +def dequantize_blocks_Q4_1(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, m, qs = split_block_dims(blocks, 2, 2) + d = d.view(torch.float16).to(dtype) + m = m.view(torch.float16).to(dtype) + + qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape(1, 1, 2, 1) + qs = (qs & 0x0F).reshape(n_blocks, -1) + + return (d * qs) + m + + +def dequantize_blocks_Q4_0(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, qs = split_block_dims(blocks, 2) + d = d.view(torch.float16).to(dtype) + + qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor( + [0, 4], device=d.device, dtype=torch.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 + return d * qs + + +def dequantize_blocks_Q6_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + ( + ql, + qh, + scales, + d, + ) = split_block_dims(blocks, QK_K // 2, QK_K // 4, QK_K // 16) + + scales = scales.view(torch.int8).to(dtype) + d = d.view(torch.float16).to(dtype) + d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) + + ql = ql.reshape((n_blocks, -1, 1, 64)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 2, 1) + ) + ql = (ql & 0x0F).reshape((n_blocks, -1, 32)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 4, 1) + ) + qh = (qh & 0x03).reshape((n_blocks, -1, 32)) + q = (ql | (qh << 4)).to(torch.int8) - 32 + q = q.reshape((n_blocks, QK_K // 16, -1)) + + return (d * q).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q5_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, dmin, scales, qh, qs = split_block_dims(blocks, 2, 2, K_SCALE_SIZE, QK_K // 8) + + d = d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) + + sc, m = get_scale_min(scales) + + d = (d * sc).reshape((n_blocks, -1, 1)) + dm = (dmin * m).reshape((n_blocks, -1, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 2, 1) + ) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> torch.arange(0, 8, device=d.device, dtype=torch.uint8).reshape( + (1, 1, 8, 1) + ) + ql = (ql & 0x0F).reshape((n_blocks, -1, 32)) + qh = (qh & 0x01).reshape((n_blocks, -1, 32)) + q = ql | (qh << 4) + + return (d * q - dm).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q4_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + d, dmin, scales, qs = 
split_block_dims(blocks, 2, 2, K_SCALE_SIZE) + d = d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) + + sc, m = get_scale_min(scales) + + d = (d * sc).reshape((n_blocks, -1, 1)) + dm = (dmin * m).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 2, 1) + ) + qs = (qs & 0x0F).reshape((n_blocks, -1, 32)) + + return (d * qs - dm).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q3_K(blocks, block_size, type_size, dtype=None): + n_blocks = blocks.shape[0] + + hmask, qs, scales, d = split_block_dims(blocks, QK_K // 8, QK_K // 4, 12) + d = d.view(torch.float16).to(dtype) + + lscales, hscales = scales[:, :8], scales[:, 8:] + lscales = lscales.reshape((n_blocks, 1, 8)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape( + (1, 2, 1) + ) + lscales = lscales.reshape((n_blocks, 16)) + hscales = hscales.reshape((n_blocks, 1, 4)) >> torch.tensor( + [0, 2, 4, 6], device=d.device, dtype=torch.uint8 + ).reshape((1, 4, 1)) + hscales = hscales.reshape((n_blocks, 16)) + scales = (lscales & 0x0F) | ((hscales & 0x03) << 4) + scales = scales.to(torch.int8) - 32 + + dl = (d * scales).reshape((n_blocks, 16, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape( + (1, 1, 4, 1) + ) + qh = hmask.reshape(n_blocks, -1, 1, 32) >> torch.arange(0, 8, device=d.device, dtype=torch.uint8).reshape( + (1, 1, 8, 1) + ) + ql = ql.reshape((n_blocks, 16, QK_K // 16)) & 3 + qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & 1) ^ 1 + q = ql.to(torch.int8) - (qh << 2).to(torch.int8) + + return (dl * q).reshape((n_blocks, QK_K)) + + +def dequantize_blocks_Q2_K(blocks, block_size, type_size, dtype=None): n_blocks = blocks.shape[0] - scales, quantized_values, delta, delta_min = split_block_dims(blocks, QK_K_BLOCKSIZE // 16, QK_K_BLOCKSIZE // 4, 2) - delta = delta.view(torch.float16).to(dtype) - delta_min = delta_min.view(torch.float16).to(dtype) + scales, qs, d, dmin = split_block_dims(blocks, QK_K // 16, QK_K // 4, 2) + d = d.view(torch.float16).to(dtype) + dmin = dmin.view(torch.float16).to(dtype) # (n_blocks, 16, 1) - dl = (delta * (scales & 0xF)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) - ml = (delta_min * (scales >> 4)).reshape((n_blocks, QK_K_BLOCKSIZE // 16, 1)) + dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K // 16, 1)) + ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K // 16, 1)) - shift = torch.tensor([0, 2, 4, 6], device=delta.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) + shift = torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape((1, 1, 4, 1)) - qs = (quantized_values.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 - qs = qs.reshape((n_blocks, QK_K_BLOCKSIZE // 16, 16)) + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & 3 + qs = qs.reshape((n_blocks, QK_K // 16, 16)) qs = dl * qs - ml return qs.reshape((n_blocks, -1)) -dequantize_fns = { - "MOSTLY_Q2_K": dequantize_Q2_K, +def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): + return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32) + + +dequantize_functions = { + gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, + gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, + gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, + gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, + gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, + gguf.GGMLQuantizationType.Q4_0: 
dequantize_blocks_Q4_0, + gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, + gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, + gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, + gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, + gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, } -def _replace_with_gguf_linear(model, compute_dtype, quant_type, qtypes=None): - for name, module in model.named_children(): - if isinstance(module, nn.Linear): - model._modules[name] = GGUFLinear( - module.in_features, - module.out_features, - module.bias is not None, - compute_dtype=compute_dtype, - quant_type=quant_type, - ) - model._modules[name].source_cls = type(module) - # Force requires grad to False to avoid unexpected errors - model._modules[name].requires_grad_(False) +def _quant_shape_from_byte_shape(shape, type_size, block_size): + return (*shape[:-1], shape[-1] // type_size * block_size) - has_children = list(module.children()) - if has_children: - _replace_with_gguf_linear(module, compute_dtype, quant_type) - return model +def dequantize_gguf_tensor(tensor, compute_dtype): + if not hasattr(tensor, "quant_type"): + return tensor + + quant_type = tensor.quant_type + dequant_fn = dequantize_functions[quant_type] + + block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + + tensor = torch.tensor(tensor) + tensor = tensor.view(torch.uint8) + shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size) + + n_blocks = tensor.numel() // type_size + blocks = tensor.reshape((n_blocks, type_size)) + dequant = dequant_fn(blocks, block_size, type_size) + dequant = dequant.reshape(shape) + dequant = dequant.to(compute_dtype) -class GGUFParameter(torch.nn.Parameter): - def __new__(cls, data, requires_grad=False, tensor_type=None): + return dequant + + +class GGUFParameter(torch.Tensor): + def __new__(cls, data, requires_grad=False, quant_type=None): data = data if data is not None else torch.empty(0) self = torch.Tensor._make_subclass(cls, data, requires_grad) - self.tensor_type = tensor_type + self.quant_type = quant_type return self @@ -90,19 +347,26 @@ def __new__(cls, data, requires_grad=False, tensor_type=None): def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: kwargs = {} + result = super().__torch_function__(func, types, args, kwargs) - tensor_type = None + # When converting from original format checkpoints we often use splits, cats etc on tensors + # this method ensures that the returned tensor type from those operations remains GGUFParameter + # so that we preserve quant_type information + quant_type = None for arg in args: + if isinstance(arg, list) and (arg[0], GGUFParameter): + quant_type = arg[0].quant_type + break if isinstance(arg, GGUFParameter): - tensor_type = arg.tensor_type + quant_type = arg.quant_type break if isinstance(result, torch.Tensor): - return cls(result, tensor_type=tensor_type) + return cls(result, quant_type=quant_type) # Handle tuples and lists elif isinstance(result, (tuple, list)): # Preserve the original type (tuple or list) - wrapped = [cls(x, tensor_type=tensor_type) if isinstance(x, torch.Tensor) else x for x in result] + wrapped = [cls(x, quant_type=quant_type) if isinstance(x, torch.Tensor) else x for x in result] return type(result)(wrapped) else: return result @@ -115,19 +379,11 @@ def __init__( out_features, bias=False, compute_dtype=None, - quant_type=None, device=None, ) -> None: super().__init__(in_features, out_features, bias, device) self.compute_dtype = compute_dtype - 
self.quant_type = quant_type - self._dequant_fn = dequantize_fns[self.quant_type] def forward(self, inputs): - is_gguf_quant = hasattr(self.weight, "tensor_type") - if is_gguf_quant: - weight = self._dequant_fn(self.weight, torch.uint8).to(self.compute_dtype) - else: - weight = self.weight - __import__("ipdb").set_trace() + weight = dequantize_gguf_tensor(self.weight, self.compute_dtype) return torch.nn.functional.linear(inputs, weight, self.bias) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 05cd997900eb..58e68b628d77 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -393,12 +393,10 @@ def to_diff_dict(self) -> Dict[str, Any]: class GGUFQuantizationConfig(QuantizationConfigMixin): - def __init__(self, quant_type: str, qtypes=None, compute_dtype=None, quant_storage=None): + def __init__(self, compute_dtype=None, quant_storage=None): self.quant_method = QuantizationMethod.GGUF - self.quant_type = quant_type self.compute_dtype = compute_dtype self.quant_storage = quant_storage - self.qtypes = qtypes if self.compute_dtype is None: self.compute_dtype = torch.float32 diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 24e324ac4382..a9ef3718e9e8 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -23,6 +23,7 @@ DEPRECATED_REVISION_ARGS, DIFFUSERS_DYNAMIC_MODULE_NAME, FLAX_WEIGHTS_NAME, + GGUF_FILE_EXTENSION, HF_MODULES_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, MIN_PEFT_VERSION, From d7f09f27d2b2017d8b3a06f26e207dfeb40e2f94 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 19 Nov 2024 18:46:40 +0530 Subject: [PATCH 11/43] update --- src/diffusers/models/model_loading_utils.py | 13 +++------ .../quantizers/bitsandbytes/bnb_quantizer.py | 5 +++- .../quantizers/gguf/gguf_quantizer.py | 27 ++++++++++++------- src/diffusers/quantizers/gguf/utils.py | 21 --------------- 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index e2e7ec83ff2a..b909217c53bd 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -182,8 +182,7 @@ def load_model_dict_into_meta( hf_quantizer=None, keep_in_fp32_modules=None, ) -> List[str]: - if hf_quantizer is None: - device = device or torch.device("cpu") + device = device or torch.device("cpu") dtype = dtype or torch.float32 is_quantized = hf_quantizer is not None @@ -223,7 +222,7 @@ def load_model_dict_into_meta( and hf_quantizer.pre_quantized and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device) ): - hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape) + hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name], param) else: model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" raise ValueError( @@ -469,12 +468,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # if the tensor is a torch supported dtype do not use GGUFParameter is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] - weights = torch.from_numpy(tensor.data) - parsed_parameters[name] = ( - GGUFParameter(weights, quant_type=quant_type) - if is_gguf_quant - else weights.permute(*torch.arange(weights.ndim - 1, -1, -1)) - ) + weights = 
torch.from_numpy(tensor.data.copy()) + parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights if len(reader_keys) > 0: logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") diff --git a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py index d5ac1611a571..f7780b66b12b 100644 --- a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py +++ b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py @@ -204,7 +204,10 @@ def create_quantized_param( module._parameters[tensor_name] = new_value - def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): + def check_quantized_param_shape(self, param_name, current_param, loaded_param): + current_param_shape = current_param.shape + loaded_param_shape = loaded_param.shape + n = current_param_shape.numel() inferred_shape = (n,) if "bias" in param_name else ((n + 1) // 2, 1) if loaded_param_shape != inferred_shape: diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index e7c5647f3dd9..4ef843a90c1e 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -9,18 +9,17 @@ from ...models.modeling_utils import ModelMixin from ...utils import ( - is_accelerate_available, + is_gguf_available, is_torch_available, logging, ) -if is_accelerate_available(): - pass - if is_torch_available(): import torch +if is_gguf_available(): + import gguf logger = logging.get_logger(__name__) @@ -32,9 +31,20 @@ def __init__(self, quantization_config, **kwargs): self.compute_dtype = quantization_config.compute_dtype self.pre_quantized = True - def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape): - if _quant_shape_from_byte_shape(loaded_param_shape) == current_param_shape: - return True + def check_quantized_param_shape(self, param_name, current_param, loaded_param): + loaded_param_shape = loaded_param.shape + current_param_shape = current_param.shape + quant_type = loaded_param.quant_type + + block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + + inferred_shape = _quant_shape_from_byte_shape(loaded_param_shape, type_size, block_size) + if inferred_shape != current_param_shape: + raise ValueError( + f"{param_name} has an expected quantized shape of: {inferred_shape}, but receieved shape: {loaded_param_shape}" + ) + + return True def check_if_quantized_param( self, @@ -44,8 +54,7 @@ def check_if_quantized_param( state_dict: Dict[str, Any], **kwargs, ) -> bool: - module, tensor_name = get_module_from_name(model, param_name) - if isinstance(module._parameters.get(tensor_name, None), GGUFParameter): + if isinstance(param_value, GGUFParameter): return True return False diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index dfff6a11ec50..50fb3b84f994 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -17,26 +17,6 @@ import torch.nn as nn -_GGUF_FILE_TYPE_MAPPING = { - 0: "ALL_F32", - 1: "MOSTLY_F16", - 2: "MOSTLY_Q4_0", - 3: "MOSTLY_Q4_1", - 4: "MOSTLY_Q4_1_SOME_F16", - 8: "MOSTLY_Q5_0", - 9: "MOSTLY_Q5_1", - 10: "MOSTLY_Q2_K", - 11: "MOSTLY_Q3_K_S", - 12: "MOSTLY_Q3_K_M", - 13: "MOSTLY_Q3_K_L", - 14: "MOSTLY_Q4_K_S", - 15: "MOSTLY_Q4_K_M", - 16: "MOSTLY_Q5_K_S", - 17: "MOSTLY_Q5_K_M", - 18: "MOSTLY_Q6_K", -} - - def _replace_with_gguf_linear(model, compute_dtype): for 
name, module in model.named_children(): if isinstance(module, nn.Linear): @@ -321,7 +301,6 @@ def dequantize_gguf_tensor(tensor, compute_dtype): block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] - tensor = torch.tensor(tensor) tensor = tensor.view(torch.uint8) shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size) From 1649936c669f706bed43b977b06cf8d5dbe2e61c Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 19 Nov 2024 19:23:39 +0530 Subject: [PATCH 12/43] update --- src/diffusers/quantizers/gguf/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 50fb3b84f994..1382cd5f8b52 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -37,14 +37,15 @@ def _replace_with_gguf_linear(model, compute_dtype): return model -QK_K = 256 -K_SCALE_SIZE = 12 - # dequantize operations based on torch ports of GGUF dequantize_functions # from City96 # more info: https://github.com/city96/ComfyUI-GGUF/blob/main/dequant.py +QK_K = 256 +K_SCALE_SIZE = 12 + + def to_uint32(x): x = x.view(torch.uint8).to(torch.int32) return (x[:, 0] | x[:, 1] << 8 | x[:, 2] << 16 | x[:, 3] << 24).unsqueeze(1) From 28d3a64d6de20dd2071fed8b3a039f336dc9a2ce Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 19 Nov 2024 19:27:51 +0530 Subject: [PATCH 13/43] update --- tests/models/test_attention_processor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py index c1432fee5211..2489604274b4 100644 --- a/tests/models/test_attention_processor.py +++ b/tests/models/test_attention_processor.py @@ -6,7 +6,6 @@ from diffusers import DiffusionPipeline from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor -import pytest class AttnAddedKVProcessorTests(unittest.TestCase): @@ -84,7 +83,6 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None ) - torch.cuda.synchronize() pre_conversion = pipe( "foo", @@ -97,7 +95,6 @@ def test_conversion_when_using_device_map(self): pipe = DiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", device_map="balanced", safety_checker=None ) - torch.cuda.synchronize() conversion = pipe( "foo", From c34a4519e018f3f2b69c8c1faa941e4ba547985c Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 21 Nov 2024 09:09:41 +0100 Subject: [PATCH 14/43] update --- src/diffusers/loaders/single_file_model.py | 8 +- src/diffusers/models/model_loading_utils.py | 2 +- .../models/transformers/transformer_flux.py | 1 - .../quantizers/gguf/gguf_quantizer.py | 16 +++- src/diffusers/quantizers/gguf/utils.py | 73 ++++++++++++------- 5 files changed, 69 insertions(+), 31 deletions(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index b27ce1c4c84d..58a9b7e9d533 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -220,6 +220,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = ) if quantization_config is not None: hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config) + hf_quantizer.validate_environment() else: hf_quantizer = None @@ -316,7 +317,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = keep_in_fp32_modules = [] if 
hf_quantizer is not None: - hf_quantizer.preprocess_model(model=model, device_map=None, keep_in_fp32_modules=keep_in_fp32_modules) + hf_quantizer.preprocess_model( + model=model, + device_map=None, + state_dict=diffusers_format_checkpoint, + keep_in_fp32_modules=keep_in_fp32_modules, + ) if is_accelerate_available(): unexpected_keys = load_model_dict_into_meta( diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index b909217c53bd..52fea22d3eb9 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -462,7 +462,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader_keys = list(fields.keys()) parsed_parameters = {} - for tensor in tqdm(reader.tensors): + for tensor in tqdm(reader.tensors, desc="Loading GGUF Parameters: "): name = tensor.name quant_type = tensor.tensor_type diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py index 0ad3be866019..ce88b30bf9af 100644 --- a/src/diffusers/models/transformers/transformer_flux.py +++ b/src/diffusers/models/transformers/transformer_flux.py @@ -521,7 +521,6 @@ def custom_forward(*inputs): ) else: hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control] - hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) for index_block, block in enumerate(self.single_transformer_blocks): diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 4ef843a90c1e..5f214d0949c9 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -9,6 +9,8 @@ from ...models.modeling_utils import ModelMixin from ...utils import ( + is_accelerate_available, + is_accelerate_version, is_gguf_available, is_torch_available, logging, @@ -31,6 +33,16 @@ def __init__(self, quantization_config, **kwargs): self.compute_dtype = quantization_config.compute_dtype self.pre_quantized = True + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"): + raise ImportError( + "Loading GGUF Parameters requires `accelerate` installed in your enviroment: `pip install 'accelerate>=0.26.0'`" + ) + if not is_gguf_available(): + raise ImportError( + "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf`" + ) + def check_quantized_param_shape(self, param_name, current_param, loaded_param): loaded_param_shape = loaded_param.shape current_param_shape = current_param.shape @@ -81,7 +93,8 @@ def _process_model_before_weight_loading( keep_in_fp32_modules: List[str] = [], **kwargs, ): - _replace_with_gguf_linear(model, self.compute_dtype) + state_dict = kwargs.get("state_dict", None) + _replace_with_gguf_linear(model, self.compute_dtype, state_dict) def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): return model @@ -92,5 +105,4 @@ def is_serializable(self): @property def is_trainable(self) -> bool: - # Because we're mandating `bitsandbytes` 0.43.3. return False diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 1382cd5f8b52..a3f1bc97ac3a 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -12,14 +12,30 @@ # # See the License for the specific language governing permissions and # # limitations under the License. 
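
Aside on the dequantization path introduced in this file: at forward time the packed bytes are viewed as uint8, reshaped into `(n_blocks, type_size)`, dequantized per block, then reshaped back to the logical weight shape before a normal `F.linear`. The sketch below walks that driver for Q8_0, the simplest layout (per 32-value block: one float16 scale followed by 32 int8 values); the fabricated block and helper names are for illustration only.

```python
import torch

BLOCK_SIZE, TYPE_SIZE = 32, 34  # values and bytes per Q8_0 block (gguf.GGML_QUANT_SIZES)


def dequantize_q8_0(raw: torch.Tensor, logical_shape):
    blocks = raw.view(torch.uint8).reshape(-1, TYPE_SIZE)
    d = blocks[:, :2].contiguous().view(torch.float16).to(torch.float32)  # per-block scale
    qs = blocks[:, 2:].contiguous().view(torch.int8).to(torch.float32)    # quantized values
    return (d * qs).reshape(logical_shape)


# Fabricate one block: scale 0.5, values 0..31
scale = torch.tensor([0.5], dtype=torch.float16).view(torch.uint8)
values = torch.arange(32, dtype=torch.int8).view(torch.uint8)
raw = torch.cat([scale, values])
print(dequantize_q8_0(raw, (1, 32)))  # 0.0, 0.5, 1.0, ... 15.5
```
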
-import gguf import torch import torch.nn as nn +from ...utils import is_gguf_available + + +if is_gguf_available(): + import gguf + + +def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): + def _should_convert_to_gguf(module, state_dict, prefix): + weight_key = prefix + "weight" + return weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter) + + has_children = list(model.children()) + if not has_children: + return -def _replace_with_gguf_linear(model, compute_dtype): for name, module in model.named_children(): - if isinstance(module, nn.Linear): + module_prefix = prefix + name + "." + _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix) + + if isinstance(module, nn.Linear) and _should_convert_to_gguf(module, state_dict, module_prefix): model._modules[name] = GGUFLinear( module.in_features, module.out_features, @@ -30,10 +46,6 @@ def _replace_with_gguf_linear(model, compute_dtype): # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) - has_children = list(module.children()) - if has_children: - _replace_with_gguf_linear(module, compute_dtype) - return model @@ -274,33 +286,36 @@ def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32) -dequantize_functions = { - gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, - gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, - gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, - gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, - gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, - gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, - gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, - gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, - gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, - gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, - gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, -} +if is_gguf_available(): + GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES + + dequantize_functions = { + gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, + gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, + gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, + gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, + gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, + gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, + gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, + gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, + gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, + gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, + gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, + } def _quant_shape_from_byte_shape(shape, type_size, block_size): return (*shape[:-1], shape[-1] // type_size * block_size) -def dequantize_gguf_tensor(tensor, compute_dtype): +def dequantize_gguf_tensor(tensor): if not hasattr(tensor, "quant_type"): return tensor quant_type = tensor.quant_type dequant_fn = dequantize_functions[quant_type] - block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + block_size, type_size = GGML_QUANT_SIZES[quant_type] tensor = tensor.view(torch.uint8) shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size) @@ -310,9 +325,8 @@ def dequantize_gguf_tensor(tensor, compute_dtype): dequant = dequant_fn(blocks, block_size, type_size) dequant = dequant.reshape(shape) - dequant = dequant.to(compute_dtype) - return dequant + return 
dequant.as_tensor() class GGUFParameter(torch.Tensor): @@ -323,6 +337,9 @@ def __new__(cls, data, requires_grad=False, quant_type=None): return self + def as_tensor(self): + return torch.Tensor._make_subclass(torch.Tensor, self, self.requires_grad) + @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: @@ -365,5 +382,9 @@ def __init__( self.compute_dtype = compute_dtype def forward(self, inputs): - weight = dequantize_gguf_tensor(self.weight, self.compute_dtype) - return torch.nn.functional.linear(inputs, weight, self.bias) + weight = dequantize_gguf_tensor(self.weight) + weight = weight.to(self.compute_dtype) + bias = self.bias.to(self.compute_dtype) + + output = torch.nn.functional.linear(inputs, weight, bias) + return output From 84493dbec5f18968805efc763b8a5a578c0b1633 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 21 Nov 2024 09:21:44 +0100 Subject: [PATCH 15/43] update --- src/diffusers/quantizers/gguf/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index a3f1bc97ac3a..c60e813d7000 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -12,12 +12,14 @@ # # See the License for the specific language governing permissions and # # limitations under the License. -import torch -import torch.nn as nn -from ...utils import is_gguf_available +from ...utils import is_gguf_available, is_torch_available +if is_torch_available(): + import torch + import torch.nn as nn + if is_gguf_available(): import gguf From 50bd78431e31175d73e9fd72cbb159f98422e9cb Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 21 Nov 2024 16:52:35 +0100 Subject: [PATCH 16/43] update --- src/diffusers/__init__.py | 2 +- src/diffusers/quantizers/auto.py | 9 +++- .../quantizers/gguf/gguf_quantizer.py | 11 ++--- src/diffusers/quantizers/gguf/utils.py | 42 ++++++++----------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a21f44982a61..a79d7d3012d8 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -545,7 +545,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: from .configuration_utils import ConfigMixin - from .quantizers.quantization_config import BitsAndBytesConfig + from .quantizers.quantization_config import BitsAndBytesConfig, GGUFQuantizationConfig try: if not is_onnx_available(): diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 54000fb85330..f3ae0bd1b3b8 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -15,12 +15,18 @@ Adapted from https://github.com/huggingface/transformers/blob/c409cd81777fb27aadc043ed3d8339dbc020fb3b/src/transformers/quantizers/auto.py """ + import warnings from typing import Dict, Optional, Union from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer from .gguf import GGUFQuantizer -from .quantization_config import BitsAndBytesConfig, QuantizationConfigMixin, QuantizationMethod +from .quantization_config import ( + BitsAndBytesConfig, + GGUFQuantizationConfig, + QuantizationConfigMixin, + QuantizationMethod, +) AUTO_QUANTIZER_MAPPING = { @@ -32,6 +38,7 @@ AUTO_QUANTIZATION_CONFIG_MAPPING = { "bitsandbytes_4bit": BitsAndBytesConfig, "bitsandbytes_8bit": BitsAndBytesConfig, + "gguf": GGUFQuantizationConfig, } diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py 
b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 5f214d0949c9..033de678c81e 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,14 +1,14 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional -from ...utils import get_module_from_name from ..base import DiffusersQuantizer -from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear if TYPE_CHECKING: from ...models.modeling_utils import ModelMixin + from ...utils import ( + get_module_from_name, is_accelerate_available, is_accelerate_version, is_gguf_available, @@ -17,11 +17,12 @@ ) -if is_torch_available(): +if is_torch_available() and is_gguf_available(): + import gguf import torch -if is_gguf_available(): - import gguf + from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear + logger = logging.get_logger(__name__) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index c60e813d7000..1f4ec0a62bff 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,15 +13,9 @@ # # limitations under the License. -from ...utils import is_gguf_available, is_torch_available - - -if is_torch_available(): - import torch - import torch.nn as nn - -if is_gguf_available(): - import gguf +import gguf +import torch +import torch.nn as nn def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): @@ -288,22 +282,20 @@ def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32) -if is_gguf_available(): - GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES - - dequantize_functions = { - gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, - gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, - gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, - gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, - gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, - gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, - gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, - gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, - gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, - gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, - gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, - } +GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES +dequantize_functions = { + gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16, + gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0, + gguf.GGMLQuantizationType.Q5_1: dequantize_blocks_Q5_1, + gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0, + gguf.GGMLQuantizationType.Q4_1: dequantize_blocks_Q4_1, + gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0, + gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K, + gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K, + gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K, + gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, + gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, +} def _quant_shape_from_byte_shape(shape, type_size, block_size): From afd5d7d7344b3732861c20698ea9a557a68607ba Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 10:36:05 +0100 Subject: [PATCH 17/43] update --- src/diffusers/__init__.py | 2 +- src/diffusers/loaders/single_file_model.py | 2 +- src/diffusers/models/model_loading_utils.py | 16 +++++--- .../quantizers/gguf/gguf_quantizer.py | 37 +++++++++++++++---- 
src/diffusers/quantizers/gguf/utils.py | 25 +++++++++---- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/import_utils.py | 15 ++++++++ 7 files changed, 76 insertions(+), 22 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 18a0c10f924c..bba405c8e666 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -338,8 +338,8 @@ "StableDiffusion3ControlNetPipeline", "StableDiffusion3Img2ImgPipeline", "StableDiffusion3InpaintPipeline", - "StableDiffusion3PAGPipeline", "StableDiffusion3PAGImg2ImgPipeline", + "StableDiffusion3PAGPipeline", "StableDiffusion3Pipeline", "StableDiffusionAdapterPipeline", "StableDiffusionAttendAndExcitePipeline", diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index be6ee935783b..b4edf48103a2 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -349,7 +349,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = if hf_quantizer is not None: hf_quantizer.postprocess_model(model) - if torch_dtype is not None: + if torch_dtype is not None and hf_quantizer is None: model.to(torch_dtype) model.eval() diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 52fea22d3eb9..2e5ed6a9e21d 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -449,7 +449,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): import gguf from gguf import GGUFReader - from ..quantizers.gguf.utils import GGUFParameter + from ..quantizers.gguf.utils import SUPPORTED_GGUF_QUANT_TYPES, GGUFParameter else: logger.error( "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " @@ -458,8 +458,6 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") reader = GGUFReader(gguf_checkpoint_path) - fields = reader.fields - reader_keys = list(fields.keys()) parsed_parameters = {} for tensor in tqdm(reader.tensors, desc="Loading GGUF Parameters: "): @@ -468,10 +466,16 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # if the tensor is a torch supported dtype do not use GGUFParameter is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] + if is_gguf_quant and quant_type not in SUPPORTED_GGUF_QUANT_TYPES: + raise ValueError( + ( + f"{name} has a quantization type: {quant_type} which is unsupported." 
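
Aside on the unsupported-quant-type guard added here: before committing to a load, a checkpoint can be scanned for tensor types that have no dequantize kernel. The sketch below is a hypothetical pre-flight check along those lines; the `SUPPORTED` set is illustrative, whereas the patch derives its list from the keys of its dequantize-function table.

```python
import gguf
from gguf import GGUFReader

SUPPORTED = {
    gguf.GGMLQuantizationType.F32,
    gguf.GGMLQuantizationType.F16,
    gguf.GGMLQuantizationType.Q8_0,
    gguf.GGMLQuantizationType.Q4_K,
}


def unsupported_quant_types(path):
    reader = GGUFReader(path)
    return sorted({t.tensor_type for t in reader.tensors} - SUPPORTED)


# print(unsupported_quant_types("flux1-dev-Q4_K_S.gguf"))  # [] means everything is loadable
```
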
+ f" Currently the following quantization types are supported: {SUPPORTED_GGUF_QUANT_TYPES}" + "To request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers" + ) + ) + weights = torch.from_numpy(tensor.data.copy()) parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights - if len(reader_keys) > 0: - logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") - return parsed_parameters diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 033de678c81e..053b39bc504a 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from ..base import DiffusersQuantizer @@ -12,6 +12,7 @@ is_accelerate_available, is_accelerate_version, is_gguf_available, + is_gguf_version, is_torch_available, logging, ) @@ -21,7 +22,11 @@ import gguf import torch - from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear + from .utils import ( + GGUFParameter, + _quant_shape_from_byte_shape, + _replace_with_gguf_linear, + ) logger = logging.get_logger(__name__) @@ -39,11 +44,26 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Loading GGUF Parameters requires `accelerate` installed in your enviroment: `pip install 'accelerate>=0.26.0'`" ) - if not is_gguf_available(): + if not is_gguf_available() or is_gguf_version("<", "0.10.0"): raise ImportError( - "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf`" + "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf>=0.10.0`" ) + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + # need more space for buffers that are created during quantization + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if target_dtype != torch.uint8: + logger.info(f"target_dtype {target_dtype} is replaced by `torch.uint8` for GGUF quantization") + return torch.uint8 + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = self.compute_dtype + return torch_dtype + def check_quantized_param_shape(self, param_name, current_param, loaded_param): loaded_param_shape = loaded_param.shape current_param_shape = current_param.shape @@ -62,7 +82,7 @@ def check_quantized_param_shape(self, param_name, current_param, loaded_param): def check_if_quantized_param( self, model: "ModelMixin", - param_value: "torch.Tensor", + param_value: Union["GGUFParameter", "torch.Tensor"], param_name: str, state_dict: Dict[str, Any], **kwargs, @@ -82,10 +102,13 @@ def create_quantized_param( unexpected_keys: Optional[List[str]] = None, ): module, tensor_name = get_module_from_name(model, param_name) - if tensor_name not in module._parameters: + if tensor_name not in module._parameters and tensor_name not in module._buffers: raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") - module._parameters[tensor_name] = param_value + if tensor_name in module._parameters: + module._parameters[tensor_name] = param_value.to(target_device) + if tensor_name in 
module._buffers: + module._buffers[tensor_name] = param_value.to(target_device) def _process_model_before_weight_loading( self, diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 1f4ec0a62bff..b0428a067f43 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,10 +13,18 @@ # # limitations under the License. +from contextlib import nullcontext + import gguf import torch import torch.nn as nn +from ...utils import is_accelerate_available + + +if is_accelerate_available(): + from accelerate import init_empty_weights + def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): def _should_convert_to_gguf(module, state_dict, prefix): @@ -32,12 +40,14 @@ def _should_convert_to_gguf(module, state_dict, prefix): _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix) if isinstance(module, nn.Linear) and _should_convert_to_gguf(module, state_dict, module_prefix): - model._modules[name] = GGUFLinear( - module.in_features, - module.out_features, - module.bias is not None, - compute_dtype=compute_dtype, - ) + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + model._modules[name] = GGUFLinear( + module.in_features, + module.out_features, + module.bias is not None, + compute_dtype=compute_dtype, + ) model._modules[name].source_cls = type(module) # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) @@ -296,6 +306,7 @@ def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None): gguf.GGMLQuantizationType.Q3_K: dequantize_blocks_Q3_K, gguf.GGMLQuantizationType.Q2_K: dequantize_blocks_Q2_K, } +SUPPORTED_GGUF_QUANT_TYPES = list(dequantize_functions.keys()) def _quant_shape_from_byte_shape(shape, type_size, block_size): @@ -323,7 +334,7 @@ def dequantize_gguf_tensor(tensor): return dequant.as_tensor() -class GGUFParameter(torch.Tensor): +class GGUFParameter(torch.nn.Parameter): def __new__(cls, data, requires_grad=False, quant_type=None): data = data if data is not None else torch.empty(0) self = torch.Tensor._make_subclass(cls, data, requires_grad) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index a9ef3718e9e8..c2f7d8fdd8ca 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -68,6 +68,7 @@ is_flax_available, is_ftfy_available, is_gguf_available, + is_gguf_version, is_google_colab, is_inflect_available, is_invisible_watermark_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f440bf67cb6c..2a338c630cd1 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -777,6 +777,21 @@ def is_bitsandbytes_version(operation: str, version: str): return compare_versions(parse(_bitsandbytes_version), operation, version) +def is_gguf_version(operation: str, version: str): + """ + Compares the current Accelerate version to a given reference with an operation. + + Args: + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _is_gguf_available: + return False + return compare_versions(parse(_gguf_version), operation, version) + + def is_k_diffusion_version(operation: str, version: str): """ Compares the current k-diffusion version to a given reference with an operation. 
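Taken together, the quantizer plumbing above (the GGUF-aware `create_quantized_param`, the `_replace_with_gguf_linear` module swap, and the `is_gguf_version` guard) is what lets a prequantized GGUF checkpoint be loaded through `from_single_file`. A minimal sketch of that usage, mirroring the docs and tests added later in this series and assuming the Flux Q2_K checkpoint those tests rely on:

```python
import torch

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

# Checkpoint used by the GGUF tests later in this series (assumed to be available on the Hub).
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"

# The quantized weights stay in their low-memory storage dtype (torch.uint8) and are
# dequantized on the fly to `compute_dtype` during each module's forward pass.
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
```

Note that when a quantizer is attached, `from_single_file` deliberately skips the final `model.to(torch_dtype)` cast (see the `torch_dtype is not None and hf_quantizer is None` check above), so the stored parameters keep their quantized dtype after loading.
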
From 0ed31bcabedc122b8e99a21d2dd57512b2fea180 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 14:02:40 +0100 Subject: [PATCH 18/43] update --- src/diffusers/models/modeling_utils.py | 8 +- tests/quantization/gguf/test_gguf.py | 122 +++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 4 deletions(-) create mode 100644 tests/quantization/gguf/test_gguf.py diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 76f6c5f6309d..c4472f51a6a4 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -1010,14 +1010,14 @@ def to(self, *args, **kwargs): dtype_present_in_args = True break - # Checks if the model has been loaded in 4-bit or 8-bit with BNB - if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if getattr(self, "is_quantized", False): if dtype_present_in_args: raise ValueError( - "You cannot cast a bitsandbytes model in a new `dtype`. Make sure to load the model using `from_pretrained` using the" - " desired `dtype` by passing the correct `torch_dtype` argument." + "Casting a quantized model to a new `dtype` is unsupported. To set the dtype of unquantized layers, please " + "use the `torch_dtype` argument when loading the model using `from_pretrained` or `from_single_file`" ) + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: if getattr(self, "is_loaded_in_8bit", False): raise ValueError( "`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the" diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py new file mode 100644 index 000000000000..871fb6e271b1 --- /dev/null +++ b/tests/quantization/gguf/test_gguf.py @@ -0,0 +1,122 @@ +import gc +import unittest + +import torch + +from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig +from diffusers.quantizers.gguf.utils import GGUFParameter +from diffusers.utils.testing_utils import ( + nightly, + require_big_gpu_with_torch_cuda, + torch_device, +) + + +@nightly +@require_big_gpu_with_torch_cuda +class GGUFSingleFileTests(unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" + torch_dtype = torch.bfloat16 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 768), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), + } + + def test_gguf_parameters(self): + quant_storage_type = torch.uint8 + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, 
quantization_config=quantization_config) + + for param_name, param in model.named_parameters(): + if isinstance(param, GGUFParameter): + assert hasattr(param, "quant_type") + assert param.dtype == quant_storage_type + + def test_gguf_linear_layers(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): + assert module.weight.dtype == torch.uint8 + + def test_gguf_memory(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + + model = FluxTransformer2DModel.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + model.to("cuda") + inputs = self.get_dummy_inputs() + + torch.cuda.reset_peak_memory_stats() + torch.cuda.empty_cache() + with torch.no_grad(): + model(**inputs) + max_memory = torch.cuda.max_memory_allocated() + assert (max_memory / 1024**3) < 5 + + def test_keep_modules_in_fp32(self): + r""" + A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32. + Also ensures if inference works. + """ + FluxTransformer2DModel._keep_in_fp32_modules = ["proj_out"] + + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + if name in model._keep_in_fp32_modules: + assert module.weight.dtype == torch.float32 + + def test_dtype_assignment(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + + with self.assertRaises(ValueError): + # Tries with a `dtype` + model.to(torch.float16) + + with self.assertRaises(ValueError): + # Tries with a `device` and `dtype` + model.to(device="cuda:0", dtype=torch.float16) + + with self.assertRaises(ValueError): + # Tries with a cast + model.float() + + with self.assertRaises(ValueError): + # Tries with a cast + model.half() + + # This should work + model.to("cuda") From af381ad57d8afed0134650b2bcf6c72b7b4b52f9 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 14:20:09 +0100 Subject: [PATCH 19/43] update --- src/diffusers/models/model_loading_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 2e5ed6a9e21d..1f0df6d6fd2d 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -467,11 +467,13 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # if the tensor is a torch supported dtype do not use GGUFParameter is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16] if is_gguf_quant and quant_type not in SUPPORTED_GGUF_QUANT_TYPES: + _supported_quants_str = "\n".join([str(type) for type in SUPPORTED_GGUF_QUANT_TYPES]) raise ValueError( ( - f"{name} has a quantization type: {quant_type} which is unsupported." 
- f" Currently the following quantization types are supported: {SUPPORTED_GGUF_QUANT_TYPES}" - "To request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers" + f"{name} has a quantization type: {str(quant_type)} which is unsupported." + "\n\nCurrently the following quantization types are supported: \n\n" + f"{_supported_quants_str}" + "\n\nTo request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers" ) ) From 52a1bcb7105b6bddba1d471e4ee3e68b84082af5 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 16:34:25 +0100 Subject: [PATCH 20/43] update --- src/diffusers/quantizers/gguf/gguf_quantizer.py | 2 +- src/diffusers/quantizers/quantization_config.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 053b39bc504a..42280294ab71 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -37,7 +37,7 @@ def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) self.compute_dtype = quantization_config.compute_dtype - self.pre_quantized = True + self.pre_quantized = quantization_config.pre_quantized def validate_environment(self, *args, **kwargs): if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"): diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 58e68b628d77..07bf763520db 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -397,6 +397,7 @@ def __init__(self, compute_dtype=None, quant_storage=None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype self.quant_storage = quant_storage + self.pre_quantized = True if self.compute_dtype is None: self.compute_dtype = torch.float32 From 67f17000d255da1e922eb6b76e48c37f047c015e Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 4 Dec 2024 16:54:56 +0100 Subject: [PATCH 21/43] update --- src/diffusers/quantizers/gguf/gguf_quantizer.py | 1 + src/diffusers/utils/testing_utils.py | 13 +++++++++++++ tests/quantization/gguf/test_gguf.py | 10 +++++++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 42280294ab71..f0f9aa359ac4 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -49,6 +49,7 @@ def validate_environment(self, *args, **kwargs): "To load GGUF format files you must have `gguf` installed in your environment: `pip install gguf>=0.10.0`" ) + # Copied from diffusers.quantizers.bitsandbytes.bnb_quantizer.BnB4BitDiffusersQuantizer.adjust_max_memory def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: # need more space for buffers that are created during quantization max_memory = {key: val * 0.90 for key, val in max_memory.items()} diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index b3e381f7d3fb..f1d929c85d1b 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -32,6 +32,7 @@ is_bitsandbytes_available, is_compel_available, is_flax_available, + is_gguf_available, is_note_seq_available, is_onnx_available, is_opencv_available, @@ 
-476,6 +477,18 @@ def decorator(test_case): return decorator +def require_gguf_version_greater_or_equal(gguf_version): + def decorator(test_case): + correct_gguf_version = is_gguf_available() and version.parse( + version.parse(importlib.metadata.version("gguf")).base_version + ) >= version.parse(gguf_version) + return unittest.skipUnless( + correct_gguf_version, f"Test requires gguf with the version greater than {gguf_version}." + )(test_case) + + return decorator + + def deprecate_after_peft_backend(test_case): """ Decorator marking a test that will be skipped after PEFT backend diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 871fb6e271b1..710e92df6acd 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -4,16 +4,24 @@ import torch from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig -from diffusers.quantizers.gguf.utils import GGUFParameter from diffusers.utils.testing_utils import ( + is_gguf_available, nightly, + require_accelerate, require_big_gpu_with_torch_cuda, + require_gguf_version_greater_or_equal, torch_device, ) +if is_gguf_available(): + from diffusers.quantizers.gguf.utils import GGUFParameter + + @nightly @require_big_gpu_with_torch_cuda +@require_accelerate +@require_gguf_version_greater_or_equal("0.10.0") class GGUFSingleFileTests(unittest.TestCase): ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" torch_dtype = torch.bfloat16 From 8abfa5559cb7cffd3d027a58841c8396f69efae2 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 06:44:01 +0100 Subject: [PATCH 22/43] update --- .../quantizers/gguf/gguf_quantizer.py | 20 +++++++++++++++---- src/diffusers/quantizers/gguf/utils.py | 10 +++++++--- .../quantizers/quantization_config.py | 3 ++- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index f0f9aa359ac4..bb61b1ddd7ac 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -19,10 +19,10 @@ if is_torch_available() and is_gguf_available(): - import gguf import torch from .utils import ( + GGML_QUANT_SIZES, GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear, @@ -33,11 +33,17 @@ class GGUFQuantizer(DiffusersQuantizer): + use_keep_in_fp32_modules = True + def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) self.compute_dtype = quantization_config.compute_dtype self.pre_quantized = quantization_config.pre_quantized + self.modules_to_not_convert = quantization_config.modules_to_not_convert + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] def validate_environment(self, *args, **kwargs): if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"): @@ -70,7 +76,7 @@ def check_quantized_param_shape(self, param_name, current_param, loaded_param): current_param_shape = current_param.shape quant_type = loaded_param.quant_type - block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] + block_size, type_size = GGML_QUANT_SIZES[quant_type] inferred_shape = _quant_shape_from_byte_shape(loaded_param_shape, type_size, block_size) if inferred_shape != current_param_shape: @@ -96,7 +102,7 @@ def check_if_quantized_param( def create_quantized_param( self, model: "ModelMixin", - param_value: "torch.Tensor", + param_value: 
Union["GGUFParameter", "torch.Tensor"], param_name: str, target_device: "torch.device", state_dict: Dict[str, Any], @@ -119,7 +125,13 @@ def _process_model_before_weight_loading( **kwargs, ): state_dict = kwargs.get("state_dict", None) - _replace_with_gguf_linear(model, self.compute_dtype, state_dict) + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + self.modules_to_not_convert = [module for module in self.modules_to_not_convert if module is not None] + + _replace_with_gguf_linear( + model, self.compute_dtype, state_dict, modules_to_not_convert=self.modules_to_not_convert + ) def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs): return model diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index b0428a067f43..c72a20712934 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -26,7 +26,7 @@ from accelerate import init_empty_weights -def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""): +def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix="", modules_to_not_convert=[]): def _should_convert_to_gguf(module, state_dict, prefix): weight_key = prefix + "weight" return weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter) @@ -37,9 +37,13 @@ def _should_convert_to_gguf(module, state_dict, prefix): for name, module in model.named_children(): module_prefix = prefix + name + "." - _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix) + _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix, modules_to_not_convert) - if isinstance(module, nn.Linear) and _should_convert_to_gguf(module, state_dict, module_prefix): + if ( + isinstance(module, nn.Linear) + and _should_convert_to_gguf(module, state_dict, module_prefix) + and name not in modules_to_not_convert + ): ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): model._modules[name] = GGUFLinear( diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 07bf763520db..8fb9deadec21 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -393,11 +393,12 @@ def to_diff_dict(self) -> Dict[str, Any]: class GGUFQuantizationConfig(QuantizationConfigMixin): - def __init__(self, compute_dtype=None, quant_storage=None): + def __init__(self, compute_dtype=None, quant_storage=None, modules_to_not_convert=None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype self.quant_storage = quant_storage self.pre_quantized = True + self.modules_to_not_convert = modules_to_not_convert if self.compute_dtype is None: self.compute_dtype = torch.float32 From d4b88d787be0ed62e4daf5fbd0880217748b7360 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 06:45:12 +0100 Subject: [PATCH 23/43] update --- src/diffusers/quantizers/gguf/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/quantizers/gguf/__init__.py b/src/diffusers/quantizers/gguf/__init__.py index b3d9082ac803..53af2e180f48 100644 --- a/src/diffusers/quantizers/gguf/__init__.py +++ b/src/diffusers/quantizers/gguf/__init__.py @@ -1 +1,2 @@ from .gguf_quantizer import GGUFQuantizer +from .utils import GGUFLinear, GGUFParameter From 30f13ed310dfc7a427611c7b9f1a478fbb9d0463 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 09:12:42 +0100 Subject: [PATCH 24/43] update --- 
tests/quantization/gguf/test_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 710e92df6acd..94170aa97069 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -82,6 +82,7 @@ def test_gguf_memory(self): self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype ) model.to("cuda") + assert (model.get_memory_footprint() / 1024**3) < 5 inputs = self.get_dummy_inputs() torch.cuda.reset_peak_memory_stats() From 9310035f5545abc62e795526e53ca3c65548df02 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 09:37:36 +0100 Subject: [PATCH 25/43] update --- .github/workflows/nightly_tests.yml | 4 +++- tests/quantization/gguf/test_gguf.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index e2228fdacf30..f380b4311332 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -356,6 +356,8 @@ jobs: config: - backend: "bitsandbytes" test_location: "bnb" + - backend: "gguf" + test_location: "gguf" runs-on: group: aws-g6e-xlarge-plus container: @@ -519,4 +521,4 @@ jobs: # if: always() # run: | # pip install slack_sdk tabulate -# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file +# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 94170aa97069..c7c678947807 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -75,7 +75,7 @@ def test_gguf_linear_layers(self): if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): assert module.weight.dtype == torch.uint8 - def test_gguf_memory(self): + def test_gguf_memory_usage(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) model = FluxTransformer2DModel.from_single_file( From e9303a0198b3b8a1ceeeae9c2a75e03649940239 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 10:30:46 +0100 Subject: [PATCH 26/43] update --- src/diffusers/loaders/single_file_utils.py | 25 +++- tests/quantization/gguf/test_gguf.py | 155 +++++++++++++++------ 2 files changed, 132 insertions(+), 48 deletions(-) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 10742873ded1..7b36d3e710fb 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -81,8 +81,14 @@ "open_clip_sd3": "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight", "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight", "stable_cascade_stage_c": "clip_txt_mapper.weight", - "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias", - "sd35_large": "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight", + "sd3": [ + "joint_blocks.0.context_block.adaLN_modulation.1.bias", + "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias", + ], + "sd35_large": [ + "joint_blocks.37.x_block.mlp.fc1.weight", + "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight", + ], "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.pos_encoder.pe", "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias", "animatediff_sdxl_beta": 
"up_blocks.2.motion_modules.0.temporal_transformer.norm.weight", @@ -529,13 +535,20 @@ def infer_diffusers_model_type(checkpoint): ): model_type = "stable_cascade_stage_b" - elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216: - if checkpoint["model.diffusion_model.pos_embed"].shape[1] == 36864: + elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd3"]) and any( + checkpoint[key].shape[-1] == 9216 if key in checkpoint else False for key in CHECKPOINT_KEY_NAMES["sd3"] + ): + if "model.diffusion_model.pos_embed" in checkpoint: + key = "model.diffusion_model.pos_embed" + else: + key = "pos_embed" + + if checkpoint[key].shape[1] == 36864: model_type = "sd3" - elif checkpoint["model.diffusion_model.pos_embed"].shape[1] == 147456: + elif checkpoint[key].shape[1] == 147456: model_type = "sd35_medium" - elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint: + elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd35_large"]): model_type = "sd35_large" elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint: diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index c7c678947807..dfe1100fadc8 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -3,7 +3,7 @@ import torch -from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig +from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig, SD3Transformer2DModel from diffusers.utils.testing_utils import ( is_gguf_available, nightly, @@ -22,45 +22,16 @@ @require_big_gpu_with_torch_cuda @require_accelerate @require_gguf_version_greater_or_equal("0.10.0") -class GGUFSingleFileTests(unittest.TestCase): - ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" +class GGUFSingleFileTesterMixin: + ckpt_path = None + model_cls = None torch_dtype = torch.bfloat16 - - def setUp(self): - gc.collect() - torch.cuda.empty_cache() - - def tearDown(self): - gc.collect() - torch.cuda.empty_cache() - - def get_dummy_inputs(self): - return { - "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to( - torch_device, self.torch_dtype - ), - "encoder_hidden_states": torch.randn( - (1, 512, 4096), - generator=torch.Generator("cpu").manual_seed(0), - ).to(torch_device, self.torch_dtype), - "pooled_projections": torch.randn( - (1, 768), - generator=torch.Generator("cpu").manual_seed(0), - ).to(torch_device, self.torch_dtype), - "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), - "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to( - torch_device, self.torch_dtype - ), - "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to( - torch_device, self.torch_dtype - ), - "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), - } + expected_memory_use_in_gb = 5 def test_gguf_parameters(self): quant_storage_type = torch.uint8 quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) for param_name, param in model.named_parameters(): if isinstance(param, GGUFParameter): @@ -69,7 +40,7 @@ def test_gguf_parameters(self): def test_gguf_linear_layers(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - 
model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): @@ -78,11 +49,11 @@ def test_gguf_linear_layers(self): def test_gguf_memory_usage(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file( + model = self.model_cls.from_single_file( self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype ) model.to("cuda") - assert (model.get_memory_footprint() / 1024**3) < 5 + assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb inputs = self.get_dummy_inputs() torch.cuda.reset_peak_memory_stats() @@ -90,17 +61,17 @@ def test_gguf_memory_usage(self): with torch.no_grad(): model(**inputs) max_memory = torch.cuda.max_memory_allocated() - assert (max_memory / 1024**3) < 5 + assert (max_memory / 1024**3) < self.expected_memory_use_in_gb def test_keep_modules_in_fp32(self): r""" A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32. Also ensures if inference works. """ - FluxTransformer2DModel._keep_in_fp32_modules = ["proj_out"] + self.model_cls._keep_in_fp32_modules = ["proj_out"] quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear): @@ -109,7 +80,7 @@ def test_keep_modules_in_fp32(self): def test_dtype_assignment(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) - model = FluxTransformer2DModel.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) with self.assertRaises(ValueError): # Tries with a `dtype` @@ -129,3 +100,103 @@ def test_dtype_assignment(self): # This should work model.to("cuda") + + +class FluxGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" + torch_dtype = torch.bfloat16 + model_cls = FluxTransformer2DModel + expected_memory_use_in_gb = 5 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 768), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "guidance": torch.tensor([3.5]).to(torch_device, 
self.torch_dtype), + } + + +class SD35LargeGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-large-gguf/blob/main/sd3.5_large-Q4_0.gguf" + torch_dtype = torch.bfloat16 + model_cls = SD3Transformer2DModel + expected_memory_use_in_gb = 5 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 16, 64, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 2048), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + } + + +class SD35MediumGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): + ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-medium-gguf/blob/main/sd3.5_medium-Q3_K_M.gguf" + torch_dtype = torch.bfloat16 + model_cls = SD3Transformer2DModel + expected_memory_use_in_gb = 2 + + def setUp(self): + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 16, 64, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 2048), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + } From e56c26647c4f13b95d724ca401c6cb7ab3c847f4 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 10:33:52 +0100 Subject: [PATCH 27/43] update --- src/diffusers/models/model_loading_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 1f0df6d6fd2d..d10df4a37992 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -36,11 +36,11 @@ _get_model_file, deprecate, is_accelerate_available, + is_gguf_available, is_torch_available, is_torch_version, logging, ) -from ..utils.import_utils import is_gguf_available logger = logging.get_logger(__name__) From 1209c3a256feefaada6416fc9cb559d99a6bed09 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 15:26:40 +0530 Subject: [PATCH 28/43] Update src/diffusers/quantizers/gguf/utils.py Co-authored-by: Sayak Paul --- src/diffusers/quantizers/gguf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index c72a20712934..9081575a5962 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -53,7 +53,7 @@ def _should_convert_to_gguf(module, state_dict, prefix): compute_dtype=compute_dtype, ) model._modules[name].source_cls = type(module) - # Force requires grad to False to avoid unexpected errors + # Force requires_grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) 
return model From db9b6f38dbb42e0aa0c765a643dfebc7c20433c3 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 12:44:22 +0100 Subject: [PATCH 29/43] update --- src/diffusers/models/model_loading_utils.py | 5 - .../quantizers/gguf/gguf_quantizer.py | 2 +- tests/quantization/gguf/test_gguf.py | 159 +++++++++++++++++- 3 files changed, 159 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index d10df4a37992..93ad22ce400b 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -427,11 +427,6 @@ def _gguf_parse_value(_value, data_type): return _value -def read_field(reader, field): - value = reader.fields[field] - return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] - - def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): """ Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed tokenizer and config diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index bb61b1ddd7ac..62c2063ac758 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -105,7 +105,7 @@ def create_quantized_param( param_value: Union["GGUFParameter", "torch.Tensor"], param_name: str, target_device: "torch.device", - state_dict: Dict[str, Any], + state_dict: Optional[Dict[str, Any]] = None, unexpected_keys: Optional[List[str]] = None, ): module, tensor_name = get_module_from_name(model, param_name) diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index dfe1100fadc8..265d35c16d1c 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -1,12 +1,20 @@ import gc import unittest +import numpy as np import torch -from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig, SD3Transformer2DModel +from diffusers import ( + FluxPipeline, + FluxTransformer2DModel, + GGUFQuantizationConfig, + SD3Transformer2DModel, + StableDiffusion3Pipeline, +) from diffusers.utils.testing_utils import ( is_gguf_available, nightly, + numpy_cosine_similarity_distance, require_accelerate, require_big_gpu_with_torch_cuda, require_gguf_version_greater_or_equal, @@ -68,6 +76,7 @@ def test_keep_modules_in_fp32(self): A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32. Also ensures if inference works. 
""" + _keep_in_fp32_modules = self.model_cls._keep_in_fp32_modules self.model_cls._keep_in_fp32_modules = ["proj_out"] quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) @@ -77,6 +86,7 @@ def test_keep_modules_in_fp32(self): if isinstance(module, torch.nn.Linear): if name in model._keep_in_fp32_modules: assert module.weight.dtype == torch.float32 + self.model_cls._keep_in_fp32_modules = _keep_in_fp32_modules def test_dtype_assignment(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) @@ -139,6 +149,55 @@ def get_dummy_inputs(self): "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), } + def test_pipeline_inference(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + transformer = self.model_cls.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=self.torch_dtype + ) + pipe.enable_model_cpu_offload() + + prompt = "a cat holding a sign that says hello" + output = pipe( + prompt=prompt, num_inference_steps=2, generator=torch.Generator("cpu").manual_seed(0), output_type="np" + ).images[0] + output_slice = output[:3, :3, :].flatten() + expected_slice = np.array( + [ + 0.47265625, + 0.43359375, + 0.359375, + 0.47070312, + 0.421875, + 0.34375, + 0.46875, + 0.421875, + 0.34765625, + 0.46484375, + 0.421875, + 0.34179688, + 0.47070312, + 0.42578125, + 0.34570312, + 0.46875, + 0.42578125, + 0.3515625, + 0.45507812, + 0.4140625, + 0.33984375, + 0.4609375, + 0.41796875, + 0.34375, + 0.45898438, + 0.41796875, + 0.34375, + ] + ) + max_diff = numpy_cosine_similarity_distance(expected_slice, output_slice) + assert max_diff < 1e-4 + class SD35LargeGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-large-gguf/blob/main/sd3.5_large-Q4_0.gguf" @@ -170,6 +229,55 @@ def get_dummy_inputs(self): "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), } + def test_pipeline_inference(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + transformer = self.model_cls.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + pipe = StableDiffusion3Pipeline.from_pretrained( + "stabilityai/stable-diffusion-3.5-large", transformer=transformer, torch_dtype=self.torch_dtype + ) + pipe.enable_model_cpu_offload() + + prompt = "a cat holding a sign that says hello" + output = pipe( + prompt=prompt, num_inference_steps=2, generator=torch.Generator("cpu").manual_seed(0), output_type="np" + ).images[0] + output_slice = output[:3, :3, :].flatten() + expected_slice = np.array( + [ + 0.17578125, + 0.27539062, + 0.27734375, + 0.11914062, + 0.26953125, + 0.25390625, + 0.109375, + 0.25390625, + 0.25, + 0.15039062, + 0.26171875, + 0.28515625, + 0.13671875, + 0.27734375, + 0.28515625, + 0.12109375, + 0.26757812, + 0.265625, + 0.16210938, + 0.29882812, + 0.28515625, + 0.15625, + 0.30664062, + 0.27734375, + 0.14648438, + 0.29296875, + 0.26953125, + ] + ) + max_diff = numpy_cosine_similarity_distance(expected_slice, output_slice) + assert max_diff < 1e-4 + class SD35MediumGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): ckpt_path = "https://huggingface.co/city96/stable-diffusion-3.5-medium-gguf/blob/main/sd3.5_medium-Q3_K_M.gguf" @@ -200,3 +308,52 @@ def 
get_dummy_inputs(self): ).to(torch_device, self.torch_dtype), "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), } + + def test_pipeline_inference(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + transformer = self.model_cls.from_single_file( + self.ckpt_path, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + pipe = StableDiffusion3Pipeline.from_pretrained( + "stabilityai/stable-diffusion-3.5-medium", transformer=transformer, torch_dtype=self.torch_dtype + ) + pipe.enable_model_cpu_offload() + + prompt = "a cat holding a sign that says hello" + output = pipe( + prompt=prompt, num_inference_steps=2, generator=torch.Generator("cpu").manual_seed(0), output_type="np" + ).images[0] + output_slice = output[:3, :3, :].flatten() + expected_slice = np.array( + [ + 0.625, + 0.6171875, + 0.609375, + 0.65625, + 0.65234375, + 0.640625, + 0.6484375, + 0.640625, + 0.625, + 0.6484375, + 0.63671875, + 0.6484375, + 0.66796875, + 0.65625, + 0.65234375, + 0.6640625, + 0.6484375, + 0.6328125, + 0.6640625, + 0.6484375, + 0.640625, + 0.67578125, + 0.66015625, + 0.62109375, + 0.671875, + 0.65625, + 0.62109375, + ] + ) + max_diff = numpy_cosine_similarity_distance(expected_slice, output_slice) + assert max_diff < 1e-4 From 78c78615a4c1c600a36dab4a0397f8ee7fc72692 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 14:28:56 +0100 Subject: [PATCH 30/43] update --- src/diffusers/loaders/single_file_model.py | 3 +++ src/diffusers/models/model_loading_utils.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index b4edf48103a2..9fd8c18dd738 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -205,6 +205,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = revision = kwargs.pop("revision", None) torch_dtype = kwargs.pop("torch_dtype", None) quantization_config = kwargs.pop("quantization_config", None) + device = kwargs.pop("device", None) if isinstance(pretrained_model_link_or_path_or_dict, dict): checkpoint = pretrained_model_link_or_path_or_dict @@ -326,10 +327,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = ) if is_accelerate_available(): + param_device = torch.device(device) if device else torch.device("cpu") unexpected_keys = load_model_dict_into_meta( model, diffusers_format_checkpoint, dtype=torch_dtype, + device=param_device, hf_quantizer=hf_quantizer, keep_in_fp32_modules=keep_in_fp32_modules, ) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 2e648c864e99..220a4abdf723 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -184,7 +184,8 @@ def load_model_dict_into_meta( ) -> List[str]: if device is not None and not isinstance(device, (str, torch.device)): raise ValueError(f"Expected device to have type `str` or `torch.device`, but got {type(device)=}.") - device = device or torch.device("cpu") + if hf_quantizer is None: + device = device or torch.device("cpu") dtype = dtype or torch.float32 is_quantized = hf_quantizer is not None From 33eb43142c60ef0478c548adb44ee3282a79ff7a Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:07:06 +0100 Subject: [PATCH 31/43] update --- src/diffusers/quantizers/gguf/__init__.py | 1 - src/diffusers/quantizers/gguf/utils.py | 4 
++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/quantizers/gguf/__init__.py b/src/diffusers/quantizers/gguf/__init__.py index 53af2e180f48..b3d9082ac803 100644 --- a/src/diffusers/quantizers/gguf/__init__.py +++ b/src/diffusers/quantizers/gguf/__init__.py @@ -1,2 +1 @@ from .gguf_quantizer import GGUFQuantizer -from .utils import GGUFLinear, GGUFParameter diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 9081575a5962..7284f75335c8 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -27,7 +27,7 @@ def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix="", modules_to_not_convert=[]): - def _should_convert_to_gguf(module, state_dict, prefix): + def _should_convert_to_gguf(state_dict, prefix): weight_key = prefix + "weight" return weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter) @@ -41,7 +41,7 @@ def _should_convert_to_gguf(module, state_dict, prefix): if ( isinstance(module, nn.Linear) - and _should_convert_to_gguf(module, state_dict, module_prefix) + and _should_convert_to_gguf(state_dict, module_prefix) and name not in modules_to_not_convert ): ctx = init_empty_weights if is_accelerate_available() else nullcontext From 9651ddc758cb8635bef38e8af83e76f26c255bdb Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:12:06 +0100 Subject: [PATCH 32/43] update --- tests/quantization/gguf/test_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index 265d35c16d1c..eb05a2c1b9f3 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -53,6 +53,7 @@ def test_gguf_linear_layers(self): for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear) and hasattr(module.weight, "quant_type"): assert module.weight.dtype == torch.uint8 + assert module.bias.dtype == torch.float32 def test_gguf_memory_usage(self): quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) From 746fd2f7f6a0ef8739b75515ba98c07cc357a3ea Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:17:54 +0100 Subject: [PATCH 33/43] update --- src/diffusers/models/model_loading_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 220a4abdf723..af1a1a5250ff 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -25,7 +25,6 @@ import safetensors import torch from huggingface_hub.utils import EntryNotFoundError -from tqdm import tqdm from ..utils import ( GGUF_FILE_EXTENSION, @@ -458,7 +457,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): reader = GGUFReader(gguf_checkpoint_path) parsed_parameters = {} - for tensor in tqdm(reader.tensors, desc="Loading GGUF Parameters: "): + for tensor in reader.tensors: name = tensor.name quant_type = tensor.tensor_type From e027d46656c6c3e7e47e7b663d23e9901891dfa0 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 5 Dec 2024 17:19:44 +0100 Subject: [PATCH 34/43] update --- docs/source/en/quantization/gguf.md | 59 +++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 docs/source/en/quantization/gguf.md diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md new file mode 100644 index 
000000000000..511091000ebb --- /dev/null +++ b/docs/source/en/quantization/gguf.md @@ -0,0 +1,59 @@ + + +# GGUF + +The GGUF file format is typically used to store models for inference with [GGML]() and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Support for loading GGUF checkpoint via Pipelines is currently not supported. The dequantizatation functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) + +The following example will load the [FLUX.1 DEV](https://huggingface.co/black-forest-labs/FLUX.1-dev) transformer model using the GGUF Q2_K quantization variant. + + +```python +import torch + +from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig + +ckpt_path = ( + "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" +) +transformer = FluxTransformer2DModel.from_single_file( + ckpt_path, + quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16), + torch_dtype=torch.bfloat16, +) +pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + transformer=transformer, + generator=torch.manual_seed(0), + torch_dtype=torch.bfloat16, +) +pipe.enable_model_cpu_offload() +prompt = "A cat holding a sign that says hello world" +image = pipe(prompt).images[0] +image.save("flux-gguf.png") +``` + +## Supported Quantization Types + +- BF16 +- Q4_0 +- Q4_1 +- Q5_0 +- Q5_1 +- Q8_0 +- Q2_K +- Q3_K +- Q4_K +- Q5_K +- Q6_K + From 9db239697f70a072200f60ebf54dea601887f505 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 07:33:43 +0100 Subject: [PATCH 35/43] update --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2faabfec30ce..458a611b2e51 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -157,6 +157,8 @@ title: Getting Started - local: quantization/bitsandbytes title: bitsandbytes + - local: quantization/gguf + title: gguf title: Quantization Methods - sections: - local: optimization/fp16 From 7ee89f4cc32b8b0f12e153d7cc58c731eff6eacc Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 08:00:25 +0100 Subject: [PATCH 36/43] update --- docs/source/en/api/quantization.md | 4 ++++ docs/source/en/quantization/overview.md | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/quantization.md b/docs/source/en/api/quantization.md index 2fbde9e707ea..79443b2f4583 100644 --- a/docs/source/en/api/quantization.md +++ b/docs/source/en/api/quantization.md @@ -28,6 +28,10 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui [[autodoc]] BitsAndBytesConfig +## GGUFQuantizationConfig + +[[autodoc]] GGUFQuantizationConfig + ## DiffusersQuantizer [[autodoc]] quantizers.base.DiffusersQuantizer diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index d8adbc85a259..28db7d891a6d 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -17,7 +17,7 @@ Quantization techniques focus on representing data with less information while a -Interested in adding a new quantization method to Transformers? 
Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method. +Interested in adding a new quantization method to Diffusers? Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method. @@ -32,4 +32,8 @@ If you are new to the quantization field, we recommend you to check out these be ## When to use what? -This section will be expanded once Diffusers has multiple quantization backends. Currently, we only support `bitsandbytes`. [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. \ No newline at end of file +Diffusers currently supports the following quantization methods. +- `bitsandbytes` +- `gguf` + +[This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. \ No newline at end of file From edf3e5431447db65649d82f848793c9262c0badf Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 08:15:31 +0100 Subject: [PATCH 37/43] update --- docs/source/en/quantization/gguf.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md index 511091000ebb..e6a72049601a 100644 --- a/docs/source/en/quantization/gguf.md +++ b/docs/source/en/quantization/gguf.md @@ -13,10 +13,19 @@ specific language governing permissions and limitations under the License. # GGUF -The GGUF file format is typically used to store models for inference with [GGML]() and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Support for loading GGUF checkpoint via Pipelines is currently not supported. The dequantizatation functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) +The GGUF file format is typically used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Loading GGUF checkpoints via Pipelines is currently not supported. The following example will load the [FLUX.1 DEV](https://huggingface.co/black-forest-labs/FLUX.1-dev) transformer model using the GGUF Q2_K quantization variant. +Before starting please install gguf in your environment + +```shell +pip install -U gguf +``` + +Since GGUF is a single file format, we will be using `from_single_file` to load the model and pass in the `GGUFQuantizationConfig` when loading the model. + +When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`, typically `torch.unint8` and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. 
The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) ```python import torch From d3eb54f0e9a5c2216c6ea1bb22363fb132e19bbd Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 6 Dec 2024 10:02:31 +0100 Subject: [PATCH 38/43] update --- src/diffusers/loaders/single_file_model.py | 1 + .../quantizers/gguf/gguf_quantizer.py | 14 +++++ src/diffusers/quantizers/gguf/utils.py | 57 +++++++++++++++++++ tests/quantization/gguf/test_gguf.py | 21 ++++++- 4 files changed, 92 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 9fd8c18dd738..7f821955fac8 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -351,6 +351,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = if hf_quantizer is not None: hf_quantizer.postprocess_model(model) + model.hf_quantizer = hf_quantizer if torch_dtype is not None and hf_quantizer is None: model.to(torch_dtype) diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py index 62c2063ac758..0c760e277ce4 100644 --- a/src/diffusers/quantizers/gguf/gguf_quantizer.py +++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py @@ -24,6 +24,7 @@ from .utils import ( GGML_QUANT_SIZES, GGUFParameter, + _dequantize_gguf_and_restore_linear, _quant_shape_from_byte_shape, _replace_with_gguf_linear, ) @@ -143,3 +144,16 @@ def is_serializable(self): @property def is_trainable(self) -> bool: return False + + def _dequantize(self, model): + is_model_on_cpu = model.device.type == "cpu" + if is_model_on_cpu: + logger.info( + "Model was found to be on CPU (could happen as a result of `enable_model_cpu_offload()`). So, moving it to GPU. After dequantization, will move the model back to CPU again to preserve the previous device." + ) + model.to(torch.cuda.current_device()) + + model = _dequantize_gguf_and_restore_linear(model, self.modules_to_not_convert) + if is_model_on_cpu: + model.to("cpu") + return model diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 7284f75335c8..35e5743fbcf0 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -13,6 +13,7 @@ # # limitations under the License. +import inspect from contextlib import nullcontext import gguf @@ -23,7 +24,27 @@ if is_accelerate_available(): + import accelerate from accelerate import init_empty_weights + from accelerate.hooks import add_hook_to_module, remove_hook_from_module + + +# Copied from diffusers.quantizers.bitsandbytes.utils._create_accelerate_new_hook +def _create_accelerate_new_hook(old_hook): + r""" + Creates a new hook based on the old hook. Use it only if you know what you are doing ! 
This method is a copy of: + https://github.com/huggingface/peft/blob/748f7968f3a31ec06a1c2b0328993319ad9a150a/src/peft/utils/other.py#L245 with + some changes + """ + old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__) + old_hook_attr = old_hook.__dict__ + filtered_old_hook_attr = {} + old_hook_init_signature = inspect.signature(old_hook_cls.__init__) + for k in old_hook_attr.keys(): + if k in old_hook_init_signature.parameters: + filtered_old_hook_attr[k] = old_hook_attr[k] + new_hook = old_hook_cls(**filtered_old_hook_attr) + return new_hook def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix="", modules_to_not_convert=[]): @@ -59,6 +80,42 @@ def _should_convert_to_gguf(state_dict, prefix): return model +def _dequantize_gguf_and_restore_linear(model, modules_to_not_convert=[]): + for name, module in model.named_children(): + if isinstance(module, GGUFLinear) and name not in modules_to_not_convert: + device = module.weight.device + bias = getattr(module, "bias", None) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + new_module = nn.Linear( + module.in_features, + module.out_features, + module.bias is not None, + device=device, + ) + new_module.weight = nn.Parameter(dequantize_gguf_tensor(module.weight)) + if bias is not None: + new_module.bias = bias + + # Create a new hook and attach it in case we use accelerate + if hasattr(module, "_hf_hook"): + old_hook = module._hf_hook + new_hook = _create_accelerate_new_hook(old_hook) + + remove_hook_from_module(module) + add_hook_to_module(new_module, new_hook) + + new_module.to(device) + model._modules[name] = new_module + + has_children = list(module.children()) + if has_children: + _dequantize_gguf_and_restore_linear(module, modules_to_not_convert) + + return model + + # dequantize operations based on torch ports of GGUF dequantize_functions # from City96 # more info: https://github.com/city96/ComfyUI-GGUF/blob/main/dequant.py diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py index eb05a2c1b9f3..8ac4c9915c27 100644 --- a/tests/quantization/gguf/test_gguf.py +++ b/tests/quantization/gguf/test_gguf.py @@ -3,6 +3,7 @@ import numpy as np import torch +import torch.nn as nn from diffusers import ( FluxPipeline, @@ -23,7 +24,7 @@ if is_gguf_available(): - from diffusers.quantizers.gguf.utils import GGUFParameter + from diffusers.quantizers.gguf.utils import GGUFLinear, GGUFParameter @nightly @@ -112,6 +113,24 @@ def test_dtype_assignment(self): # This should work model.to("cuda") + def test_dequantize_model(self): + quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype) + model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config) + model.dequantize() + + def _check_for_gguf_linear(model): + has_children = list(model.children()) + if not has_children: + return + + for name, module in model.named_children(): + if isinstance(module, nn.Linear): + assert not isinstance(module, GGUFLinear), f"{name} is still GGUFLinear" + assert not isinstance(module.weight, GGUFParameter), f"{name} weight is still GGUFParameter" + + for name, module in model.named_children(): + _check_for_gguf_linear(module) + class FluxGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase): ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf" From 4f34f149369a2befdb8d6dd65af49cb704b91f4c Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 11 Dec 2024 08:44:09 
+0530 Subject: [PATCH 39/43] Update docs/source/en/quantization/gguf.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/gguf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md index e6a72049601a..8d38478ffc58 100644 --- a/docs/source/en/quantization/gguf.md +++ b/docs/source/en/quantization/gguf.md @@ -23,7 +23,7 @@ Before starting please install gguf in your environment pip install -U gguf ``` -Since GGUF is a single file format, we will be using `from_single_file` to load the model and pass in the `GGUFQuantizationConfig` when loading the model. +Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`] when loading the model. When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`, typically `torch.unint8` and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) From 090efdb899f0171a0f64bb3b50cec4b763e0a87d Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 11 Dec 2024 14:23:25 +0530 Subject: [PATCH 40/43] update --- docs/source/en/quantization/gguf.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quantization/gguf.md b/docs/source/en/quantization/gguf.md index 8d38478ffc58..dbcd1b1486b2 100644 --- a/docs/source/en/quantization/gguf.md +++ b/docs/source/en/quantization/gguf.md @@ -23,9 +23,11 @@ Before starting please install gguf in your environment pip install -U gguf ``` -Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`]. -When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`, typically `torch.unint8` and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF) +Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`]. +When using GGUF checkpoints, the quantized weights remain in a low memory `dtype` (typically `torch.uint8`) and are dynamically dequantized and cast to the configured `compute_dtype` during each module's forward pass through the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype`. + +The functions used for dynamic dequantization are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF), who created the PyTorch ports of the original [`numpy`](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/quants.py) implementation by [compilade](https://github.com/compilade).
```python import torch From e67c25a4bda5b48ea4f97b1807039efa3a7186f2 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 17 Dec 2024 13:02:16 +0530 Subject: [PATCH 41/43] update --- src/diffusers/__init__.py | 14 +++++-------- .../quantizers/quantization_config.py | 20 ++++++++++++++----- src/diffusers/utils/import_utils.py | 5 ++++- src/diffusers/utils/testing_utils.py | 1 + 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 8fca380c8255..e2351a0c53b8 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -31,7 +31,7 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], - "quantizers.quantization_config": ["BitsAndBytesConfig", "TorchAoConfig", "GGUFQuantizationConfig"], + "quantizers.quantization_config": ["BitsAndBytesConfig", "GGUFQuantizationConfig", "TorchAoConfig"], "schedulers": [], "utils": [ "OptionalDependencyNotAvailable", @@ -428,8 +428,7 @@ if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [ name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_") @@ -442,8 +441,7 @@ if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_torch_and_transformers_and_sentencepiece_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_sentencepiece_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_sentencepiece_objects"] = [ name for name in dir(dummy_torch_and_transformers_and_sentencepiece_objects) if not name.startswith("_") @@ -456,8 +454,7 @@ if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_torch_and_transformers_and_onnx_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [ name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_") @@ -492,8 +489,7 @@ if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import \ - dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + from .utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [ name for name in dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_") diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 5d74eb7008cd..fc22e4e65a9a 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -395,19 +395,29 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict +@dataclass class GGUFQuantizationConfig(QuantizationConfigMixin): 
- def __init__(self, compute_dtype=None, quant_storage=None, modules_to_not_convert=None): + """This is a config class for GGUF Quantization techniques. + + Args: + compute_dtype: (`torch.dtype`, defaults to `torch.float32`): + This sets the computational type which might be different than the input type. For example, inputs might be + fp32, but computation can be set to bf16 for speedups. + + """ + + def __init__(self, compute_dtype: torch.dtype = None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype - self.quant_storage = quant_storage self.pre_quantized = True - self.modules_to_not_convert = modules_to_not_convert + + # TODO: (Dhruv) Add this as an init argument when we can support loading unquantized checkpoints. + self.modules_to_not_convert = [] if self.compute_dtype is None: self.compute_dtype = torch.float32 - if self.quant_storage is None: - self.quant_storage = torch.uint8 + @dataclass class TorchAoConfig(QuantizationConfigMixin): """This is a config class for torchao quantization/sparsity techniques. diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 40983fe8cae2..3014efebc82e 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -479,6 +479,8 @@ def is_imageio_available(): def is_gguf_available(): return _is_gguf_available + + def is_torchao_available(): return _is_torchao_available @@ -622,7 +624,8 @@ def is_torchao_available(): """ TORCHAO_IMPORT_ERROR = """ -{0} requires the torchao library but it was not found in your environment. You can install it with pip: `pip install torchao` +{0} requires the torchao library but it was not found in your environment. You can install it with pip: `pip install +torchao` """ BACKENDS_MAPPING = OrderedDict( diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 4753bc4785b5..e5eac05ac4cd 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -487,6 +487,7 @@ def decorator(test_case): correct_gguf_version, f"Test requires gguf with the version greater than {gguf_version}." )(test_case) + def require_torchao_version_greater(torchao_version): def decorator(test_case): correct_torchao_version = is_torchao_available() and version.parse( From e710bde37d8d6ca7a5f63ab6639096c7bfbe793b Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 17 Dec 2024 09:39:34 +0100 Subject: [PATCH 42/43] update --- src/diffusers/quantizers/quantization_config.py | 2 +- src/diffusers/utils/testing_utils.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index fc22e4e65a9a..504105b10d81 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -412,7 +412,7 @@ def __init__(self, compute_dtype: torch.dtype = None): self.pre_quantized = True # TODO: (Dhruv) Add this as an init argument when we can support loading unquantized checkpoints. 
- self.modules_to_not_convert = [] + self.modules_to_not_convert = None if self.compute_dtype is None: self.compute_dtype = torch.float32 diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index e5eac05ac4cd..3448b4d28d1f 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -487,6 +487,8 @@ def decorator(test_case): correct_gguf_version, f"Test requires gguf with the version greater than {gguf_version}." )(test_case) + return decorator + def require_torchao_version_greater(torchao_version): def decorator(test_case): From f59e07a6b225eeaf0dc73a76a55c41bffdd518b3 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 17 Dec 2024 10:14:25 +0100 Subject: [PATCH 43/43] update --- src/diffusers/quantizers/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 504105b10d81..3078be310719 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -406,7 +406,7 @@ class GGUFQuantizationConfig(QuantizationConfigMixin): """ - def __init__(self, compute_dtype: torch.dtype = None): + def __init__(self, compute_dtype: Optional["torch.dtype"] = None): self.quant_method = QuantizationMethod.GGUF self.compute_dtype = compute_dtype self.pre_quantized = True
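
As an end-to-end illustration of the API added across these patches, here is a minimal sketch that loads a prequantized GGUF checkpoint with `from_single_file` and a `GGUFQuantizationConfig`, then calls `dequantize()` to restore plain `nn.Linear` modules. It mirrors the documentation example and the check added in `tests/quantization/gguf/test_gguf.py`; the checkpoint URL and `compute_dtype` are illustrative, and running it assumes an environment with `torch` and `gguf` installed.

```python
import torch
import torch.nn as nn

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig
from diffusers.quantizers.gguf.utils import GGUFLinear, GGUFParameter

# Illustrative checkpoint from the docs example; weights stay in their quantized
# GGUF form and are dequantized on the fly to `compute_dtype` in each forward pass.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

# Restore ordinary nn.Linear modules with fully dequantized weights.
transformer.dequantize()
for name, module in transformer.named_modules():
    if isinstance(module, nn.Linear):
        assert not isinstance(module, GGUFLinear), f"{name} is still GGUFLinear"
        assert not isinstance(module.weight, GGUFParameter), f"{name} weight is still GGUFParameter"
```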