From 8d05e0dc5a683bcd4a6bf801bfd8458a2f501152 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 29 May 2023 10:27:21 +0000 Subject: [PATCH 01/10] add README --- README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 00000000000000..e69de29bb2d1d6 From 6c20653d3a73d17977e0a8dd8ab12f01ac2bcfc4 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 27 Jun 2023 12:07:41 +0000 Subject: [PATCH 02/10] add code base --- VERSION | 1 + paddlevlp/__init__.py | 19 + paddlevlp/datasets/__init__.py | 17 + paddlevlp/datasets/caption_dataset.py | 98 + paddlevlp/datasets/coco_caption.py | 17 + paddlevlp/datasets/dataset.py | 844 ++++++++ paddlevlp/examples/blip2/__init__.py | 13 + paddlevlp/examples/blip2/run_predict.py | 98 + .../examples/blip2/run_pretrain_stage2.py | 271 +++ paddlevlp/models/__init__.py | 16 + paddlevlp/models/blip2/__init__.py | 13 + paddlevlp/models/blip2/configuration.py | 400 ++++ paddlevlp/models/blip2/modeling.py | 1925 +++++++++++++++++ paddlevlp/optimization.py | 106 + paddlevlp/processors/__init__.py | 16 + paddlevlp/processors/blip_processing.py | 661 ++++++ .../processors/image_processing_utils.py | 553 +++++ paddlevlp/processors/image_transform_utils.py | 795 +++++++ paddlevlp/processors/image_utils.py | 305 +++ paddlevlp/processors/processing_utils.py | 538 +++++ paddlevlp/processors/utils.py | 27 + paddlevlp/trainer/__init__.py | 15 + paddlevlp/trainer/trainer.py | 15 + paddlevlp/utils/__init__.py | 13 + paddlevlp/utils/downloader.py | 492 +++++ paddlevlp/utils/env.py | 84 + paddlevlp/utils/log.py | 123 ++ requirements.txt | 2 + setup.py | 73 + 29 files changed, 7550 insertions(+) create mode 100644 VERSION create mode 100644 paddlevlp/__init__.py create mode 100644 paddlevlp/datasets/__init__.py create mode 100644 paddlevlp/datasets/caption_dataset.py create mode 100644 paddlevlp/datasets/coco_caption.py create mode 100644 paddlevlp/datasets/dataset.py create mode 100644 paddlevlp/examples/blip2/__init__.py create mode 100644 paddlevlp/examples/blip2/run_predict.py create mode 100644 paddlevlp/examples/blip2/run_pretrain_stage2.py create mode 100644 paddlevlp/models/__init__.py create mode 100644 paddlevlp/models/blip2/__init__.py create mode 100644 paddlevlp/models/blip2/configuration.py create mode 100644 paddlevlp/models/blip2/modeling.py create mode 100644 paddlevlp/optimization.py create mode 100644 paddlevlp/processors/__init__.py create mode 100644 paddlevlp/processors/blip_processing.py create mode 100644 paddlevlp/processors/image_processing_utils.py create mode 100644 paddlevlp/processors/image_transform_utils.py create mode 100644 paddlevlp/processors/image_utils.py create mode 100644 paddlevlp/processors/processing_utils.py create mode 100644 paddlevlp/processors/utils.py create mode 100644 paddlevlp/trainer/__init__.py create mode 100644 paddlevlp/trainer/trainer.py create mode 100644 paddlevlp/utils/__init__.py create mode 100644 paddlevlp/utils/downloader.py create mode 100644 paddlevlp/utils/env.py create mode 100644 paddlevlp/utils/log.py create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/VERSION b/VERSION new file mode 100644 index 00000000000000..6e8bf73aa550d4 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/paddlevlp/__init__.py b/paddlevlp/__init__.py new file mode 100644 index 00000000000000..058cfa738ac127 --- /dev/null +++ b/paddlevlp/__init__.py @@ -0,0 +1,19 @@ +# copyright (c) 2023 paddlepaddle authors. 
all rights reserved. +# copyright 2023 the salesforce team authors and the huggingface team. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from .datasets import * +from .models import * +from .optimization import * +from .processors import * diff --git a/paddlevlp/datasets/__init__.py b/paddlevlp/datasets/__init__.py new file mode 100644 index 00000000000000..22899151a584d9 --- /dev/null +++ b/paddlevlp/datasets/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .caption_dataset import * +from .coco_caption import * +from .dataset import * diff --git a/paddlevlp/datasets/caption_dataset.py b/paddlevlp/datasets/caption_dataset.py new file mode 100644 index 00000000000000..5ef67626f95029 --- /dev/null +++ b/paddlevlp/datasets/caption_dataset.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddlevlp.utils.env import DATA_HOME +from paddlevlp.utils.log import logger + +from .dataset import DatasetBuilder + +# from paddle.dataset.common import md5file +# from paddle.utils.download import get_path_from_url + + +__all__ = ["CaptionDataset"] + + +class CaptionDataset(DatasetBuilder): + """ + Caption dataset. 
+ """ + + URL = "https://bj.bcebos.com/paddlemix/datasets/coco.tar.gz" + META_INFO = collections.namedtuple( + "META_INFO", ("images", "annotations", "images_md5", "annotations_md5") + ) + MD5 = "" + SPLITS = { + "train": META_INFO( + os.path.join("coco", "images"), + os.path.join("coco", "annotations/coco_karpathy_train_debug.json"), + "", + "aa31ac474cf6250ebb81d18348a07ed8", + ), + "val": META_INFO( + os.path.join("coco", "images"), + os.path.join("coco", "annotations/coco_karpathy_val.json"), + "", + "b273847456ef5580e33713b1f7de52a0", + ), + "test": META_INFO( + os.path.join("coco", "images"), + os.path.join("coco", "annotations/coco_karpathy_test.json"), + "", + "3ff34b0ef2db02d01c37399f6a2a6cd1", + ), + } + + def _get_data(self, mode, **kwargs): + # default_root = '/paddle/wangguanzhong/blip-jinman/PaddleNLP/blip2' + logger.info("default dataset root is {}".format(DATA_HOME)) + images, annotations, image_hash, anno_hash = self.SPLITS[mode] + image_fullname = os.path.join(DATA_HOME, images) + anno_fullname = os.path.join(DATA_HOME, annotations) + # if ( + # (not os.path.exists(src_fullname) or (src_data_hash and not md5file(src_fullname) == src_data_hash)) + # or (not os.path.exists(tgt_fullname) or (tgt_data_hash and not md5file(tgt_fullname) == tgt_data_hash)) + # or (not os.path.exists(vocab_fullname) or (vocab_hash and not md5file(vocab_fullname) == vocab_hash)) + # ): + # get_path_from_url(self.URL, default_root, self.MD5) + + return image_fullname, anno_fullname, mode + + def _gen_image_id(self, anno): + img_ids = {} + n = 0 + for ann in anno: + img_id = ann["image_id"] + if img_id not in img_ids.keys(): + img_ids[img_id] = n + n += 1 + return img_ids + + def _read(self, filename, *args): + image_root, anno_path, mode = filename + annotations = json.load(open(anno_path, "r")) + image_ids = self._gen_image_id(annotations) + + for ann in annotations: + image_path = os.path.join(image_root, ann["image"]) + yield_data = {"image": image_path, "image_id": image_ids[ann["image_id"]]} + if mode == "train": + # only train mode has text input + yield_data["text_input"] = ann["caption"] + yield yield_data diff --git a/paddlevlp/datasets/coco_caption.py b/paddlevlp/datasets/coco_caption.py new file mode 100644 index 00000000000000..bee4ae6c15e79e --- /dev/null +++ b/paddlevlp/datasets/coco_caption.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlevlp.datasets.caption_dataset import CaptionDataset + +COCOCaption = CaptionDataset diff --git a/paddlevlp/datasets/dataset.py b/paddlevlp/datasets/dataset.py new file mode 100644 index 00000000000000..d62d8ed0b85ffa --- /dev/null +++ b/paddlevlp/datasets/dataset.py @@ -0,0 +1,844 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import atexit +import inspect +import os +import time +import warnings +from collections import namedtuple + +import datasets +from multiprocess import Pool, RLock + +import paddlevlp + +try: + import paddle.distributed as dist +except Exception: + warnings.warn("paddle.distributed is not contains in you paddle!") + +import importlib +from functools import partial + +from paddle.io import Dataset, IterableDataset +from paddle.utils.download import _get_unique_endpoints + +from paddlevlp.utils.env import DATA_HOME + +__all__ = ["MapDataset", "DatasetBuilder", "IterDataset", "load_dataset"] + +DATASETS_MODULE_PATH = "paddlevlp.datasets." + +# Patch for intranet +from datasets import load_dataset as origin_load_dataset # noqa: E402 + + +def load_from_ppvlp(path, *args, **kwargs): + ppvlp_path = paddlevlp.datasets.__path__[0] + new_path = os.path.split(path)[-1] + new_path = os.path.join(ppvlp_path, "hf_datasets", new_path + ".py") + if os.path.exists(new_path): + return origin_load_dataset(new_path, *args, **kwargs) + else: + return origin_load_dataset(path, *args, **kwargs) + + +datasets.load_dataset = load_from_ppvlp + + +class DatasetTuple: + def __init__(self, splits): + self.identifier_map, identifiers = self._gen_identifier_map(splits) + self.tuple_cls = namedtuple("datasets", identifiers) + self.tuple = self.tuple_cls(*[None for _ in splits]) + + def __getitem__(self, key): + if isinstance(key, (int, slice)): + return self.tuple[key] + if isinstance(key, str): + return getattr(self.tuple, self.identifier_map[key]) + + def __setitem__(self, key, value): + self.tuple = self.tuple._replace(**{self.identifier_map[key]: value}) + + def _gen_identifier_map(self, splits): + identifier_map = {} + identifiers = [] + for i in range(len(splits)): + identifiers.append("splits_" + str(i)) + identifier_map[splits[i]] = "splits_" + str(i) + return identifier_map, identifiers + + def __len__(self): + return len(self.tuple) + + +def import_main_class(module_path): + """ + Import a module at module_path and return its DatasetBuilder class. 
+ + """ + module_path = DATASETS_MODULE_PATH + module_path + module = importlib.import_module(module_path) + main_cls_type = DatasetBuilder + + # Find the main class in our imported module + module_main_cls = None + for name, obj in module.__dict__.items(): + if isinstance(obj, type) and issubclass(obj, main_cls_type): + if name == "DatasetBuilder": + continue + module_main_cls = obj + break + + return module_main_cls + + +def load_from_hf(path, name=None, splits=None, **kwargs): + from datasets import DatasetDict + from datasets import load_dataset as load_hf_dataset + from datasets.features import ClassLabel + + try: + hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) + except FileNotFoundError: + raise FileNotFoundError( + "Couldn't find the dataset script for '" + + path + + "' on PaddleNLP or HuggingFace" + ) + else: + label_list = [] + if isinstance(hf_datasets, DatasetDict): + datasets = DatasetTuple(list(hf_datasets.keys())) + for split, ds in hf_datasets.items(): + for feature in ds.features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets[split] = MapDataset(ds, label_list=label_list) + elif isinstance(hf_datasets, list): + datasets = DatasetTuple(splits) + for i, split in enumerate(splits): + for feature in hf_datasets[i].features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets[split] = MapDataset(hf_datasets[i], label_list=label_list) + else: + for feature in hf_datasets.features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets = MapDataset(hf_datasets, label_list=label_list) + return datasets + + +def load_dataset( + path_or_read_func, name=None, data_files=None, splits=None, lazy=None, **kwargs +): + """ + This method will load a dataset, either form PaddleNLP library or from a + self-defined data loading script, by calling functions in `DatasetBuilder`. + + For all the names of datasets in PaddleNLP library, see here: `dataset_list + `__. + + Either `splits` or `data_files` must be specified. + + Args: + path_or_read_func (str|callable): Name of the dataset processing script + in PaddleNLP library or a custom data reading function. + name (str, optional): Additional name to select a more specific dataset. + Defaults to None. + data_files (str|list|tuple|dict, optional): Defining the path of dataset + files. If None. `splits` must be specified. Defaults to None. + splits (str|list|tuple, optional): Which split of the data to load. If None. + `data_files` must be specified. Defaults to None. + lazy (bool, optional): Weather to return `MapDataset` or an `IterDataset`. + True for `IterDataset`. False for `MapDataset`. If None, return the + default type of this dataset. Defaults to None. + kwargs (dict): Other keyword arguments to be passed to the `DatasetBuilder`. + + Returns: + A `MapDataset` or `IterDataset` or a tuple of those. + + For how to use this function, please see `dataset_load + `__ + and `dataset_self_defined + `__ + + """ + if inspect.isfunction(path_or_read_func): + assert lazy is not None, "lazy can not be None in custom mode." 
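As a quick, hedged illustration of the API documented above: loading the COCO caption data registered by this patch. The download call in `CaptionDataset._get_data` is commented out, so the `coco/` images and annotation files are assumed to already sit under `DATA_HOME`.

```python
from paddlevlp.datasets import load_dataset

# Resolves paddlevlp.datasets.coco_caption and its DatasetBuilder subclass.
train_ds = load_dataset("coco_caption", splits="train")      # a single MapDataset
print(len(train_ds), train_ds[0].keys())                      # e.g. dict_keys(['image', 'image_id', 'text_input'])

# Requesting several splits returns a tuple-like object indexable by split name.
ds = load_dataset("coco_caption", splits=["train", "val"])
val_ds = ds["val"]                                            # 'text_input' is only present in the train split
```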
+ kwargs["name"] = name + kwargs["data_files"] = data_files + kwargs["splits"] = splits + custom_kwargs = {} + for name in inspect.signature(path_or_read_func).parameters.keys(): + if name in kwargs.keys(): + custom_kwargs[name] = kwargs[name] + + reader_instance = SimpleBuilder(lazy=lazy, read_func=path_or_read_func) + return reader_instance.read(**custom_kwargs) + else: + try: + reader_cls = import_main_class(path_or_read_func) + except ModuleNotFoundError: + datasets = load_from_hf( + path_or_read_func, name=name, splits=splits, **kwargs + ) + else: + reader_instance = reader_cls(lazy=lazy, name=name, **kwargs) + + # Check if selected name and split is valid in this DatasetBuilder + if hasattr(reader_instance, "BUILDER_CONFIGS"): + if name in reader_cls.BUILDER_CONFIGS.keys(): + split_names = reader_cls.BUILDER_CONFIGS[name]["splits"].keys() + else: + raise ValueError( + 'Invalid name "{}". Should be one of {}.'.format( + name, list(reader_cls.BUILDER_CONFIGS.keys()) + ) + ) + elif hasattr(reader_instance, "SPLITS"): + split_names = reader_instance.SPLITS.keys() + else: + raise AttributeError( + "Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder." + ) + + selected_splits = [] + if isinstance(splits, list) or isinstance(splits, tuple): + selected_splits.extend(splits) + else: + selected_splits += [splits] + + for split_name in selected_splits: + if split_name not in split_names and split_name is not None: + raise ValueError( + 'Invalid split "{}". Should be one of {}.'.format( + split_name, list(split_names) + ) + ) + + datasets = reader_instance.read_datasets( + data_files=data_files, splits=splits + ) + return datasets + + +class MapDataset(Dataset): + """ + Wraps a map-style dataset-like object as an instance of `MapDataset`, and equips it + with `map` and other utility methods. All non-magic methods of the raw object + are also accessible. + + Args: + data (list|Dataset): An object with `__getitem__` and `__len__` methods. It could + be a list or a subclass of `paddle.io.Dataset`. + kwargs (dict, optional): Other information to be passed to the dataset. + + For examples of this class, please see `dataset_self_defined + `__. + + """ + + def __init__(self, data, **kwargs): + self.data = data + self._transform_pipline = [] + self.new_data = self.data + self.info = kwargs + self.label_list = self.info.pop("label_list", None) + self.vocab_info = self.info.pop("vocab_info", None) + + def _transform(self, data): + for fn in self._transform_pipline: + data = fn(data) + return data + + def __getitem__(self, idx): + """ + Basic function of `MapDataset` to get sample from dataset with a given + index. + """ + return ( + self._transform(self.new_data[idx]) + if self._transform_pipline + else self.new_data[idx] + ) + + def __len__(self): + """ + Returns the number of samples in dataset. + """ + return len(self.new_data) + + def filter(self, fn, num_workers=0): + """ + Filters samples by the filter function and uses the filtered data to + update this dataset. + + Args: + fn (callable): A filter function that takes a sample as input and + returns a boolean. Samples that return False would be discarded. + num_workers(int, optional): Number of processes for multiprocessing. If + set to 0, it doesn't use multiprocessing. Defaults to `0`. 
+ """ + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 1: + shards = [ + self._shard(num_shards=num_workers, index=index, contiguous=True) + for index in range(num_workers) + ] + kwds_per_shard = [ + dict(self=shards[rank], fn=fn) for rank in range(num_workers) + ] + pool = Pool(num_workers, initargs=(RLock(),)) + + results = [ + pool.apply_async(self.__class__._filter, kwds=kwds) + for kwds in kwds_per_shard + ] + transformed_shards = [r.get() for r in results] + + pool.close() + pool.join() + self.new_data = [] + for i in range(num_workers): + self.new_data += transformed_shards[i].new_data + return self + else: + return self._filter(fn) + + def _filter(self, fn): + self.new_data = [ + self.new_data[idx] + for idx in range(len(self.new_data)) + if fn(self.new_data[idx]) + ] + return self + + def shard(self, num_shards=None, index=None, contiguous=False): + self.new_data = self._shard( + num_shards=num_shards, index=index, contiguous=contiguous + ).data + return self + + def _shard(self, num_shards=None, index=None, contiguous=False): + """ + Split the dataset into `num_shards` pieces. Note that the size of each + shard might be different because the original dataset may not be evenly + divisible. + + Args: + num_shards (int, optional): An integer representing the number of + data shards. If None, `num_shards` would be number of trainers. + Defaults to `None`. + index (int, optional): An integer representing the index of the + current shard. If None, `index` would be the current trainer rank + id. Defaults to `None`. + contiguous: (bool, optional): If true, contiguous chunks of data + will be select for sharding. And total number of examples will + be the same. Otherwise each shard will contain all examples of + dataset whose index mod `num_shards` = `index`. Defaults to `False`. + """ + if num_shards is None: + num_shards = dist.get_world_size() + if index is None: + index = dist.get_rank() + + if contiguous: + div = len(self) // num_shards + mod = len(self) % num_shards + start = div * index + min(index, mod) + end = start + div + (1 if index < mod else 0) + new_data = [self.new_data[idx] for idx in range(start, end)] + else: + new_data = [ + self.new_data[idx] + for idx in range(len(self.new_data)) + if idx % num_shards == index + ] + + return MapDataset(new_data) + + def map(self, fn, lazy=True, batched=False, num_workers=0): + """ + Performs specific function on the dataset to transform and update every sample. + + Args: + fn (callable): Transformations to be performed. It receives single + sample as argument if batched is False. Else it receives all examples. + lazy (bool, optional): If True, transformations would be delayed and + performed on demand. Otherwise, transforms all samples at once. Note that + if `fn` is stochastic, `lazy` should be True or you will get the same + result on all epochs. Defaults to False. + batched(bool, optional): If True, transformations would take all examples as + input and return a collection of transformed examples. Note that if set + True, `lazy` option would be ignored. Defaults to False. + num_workers(int, optional): Number of processes for multiprocessing. If + set to 0, it doesn't use multiprocessing. Note that if set to positive + value, `lazy` option would be ignored. Defaults to 0. 
+ """ + + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 1: + shards = [ + self._shard(num_shards=num_workers, index=index, contiguous=True) + for index in range(num_workers) + ] + kwds_per_shard = [ + dict(self=shards[rank], fn=fn, lazy=False, batched=batched) + for rank in range(num_workers) + ] + pool = Pool(num_workers, initargs=(RLock(),)) + results = [ + pool.apply_async(self.__class__._map, kwds=kwds) + for kwds in kwds_per_shard + ] + transformed_shards = [r.get() for r in results] + pool.close() + pool.join() + self.new_data = [] + for i in range(num_workers): + self.new_data += transformed_shards[i].new_data + return self + else: + return self._map(fn, lazy=lazy, batched=batched) + + def _map(self, fn, lazy=True, batched=False): + if batched: + self.new_data = fn(self.new_data) + elif lazy: + self._transform_pipline.append(fn) + else: + self.new_data = [ + fn(self.new_data[idx]) for idx in range(len(self.new_data)) + ] + return self + + +class IterDataset(IterableDataset): + """ + Wraps a dataset-like object as an instance of `IterDataset`, and equips it with + `map` and other utility methods. All non-magic methods of the raw object + also accessible. + + Args: + data (Iterable): An object with `__iter__` function. It can be a Iterable or a + subclass of `paddle.io.IterableDataset`. + kwargs (dict, optional): Other information to be passed to the dataset. + + For examples of this class, please see `dataset_self_defined + `__. + """ + + def __init__(self, data, **kwargs): + self.data = data + self._transform_pipline = [] + self._filter_pipline = [] + + self.label_list = kwargs.pop("label_list", None) + self.vocab_info = kwargs.pop("vocab_info", None) + + def _transform(self, data): + for fn in self._transform_pipline: + data = fn(data) + return data + + def _shard_filter(self, num_samples): + return True + + def _filter(self, data): + for fn in self._filter_pipline: + if not fn(data): + return False + return True + + def __iter__(self): + """ + yields sample sequentially. + """ + num_samples = 0 + if inspect.isfunction(self.data): + for example in self.data(): + if ( + not self._filter_pipline or self._filter(self._filter_pipline) + ) and self._shard_filter(num_samples=num_samples): + yield self._transform( + example + ) if self._transform_pipline else example + num_samples += 1 + else: + if inspect.isgenerator(self.data): + warnings.warn( + "Reciving generator as data source, data can only be iterated once" + ) + for example in self.data: + if ( + not self._filter_pipline or self._filter(self._filter_pipline) + ) and self._shard_filter(num_samples=num_samples): + yield self._transform( + example + ) if self._transform_pipline else example + num_samples += 1 + + def filter(self, fn): + """ + Filters samples by the filter function and uses the filtered data to + update this dataset. + + Args: + fn (callable): A filter function that takes a sample as input and + returns a boolean. Samples that return False are discarded. + """ + + self._filter_pipline.append(fn) + + return self + + def shard(self, num_shards=None, index=None): + """ + Split the dataset into `num_shards` pieces. + + Args: + num_shards (int, optional): An integer representing the number of + data shards. If None, `num_shards` would be number of trainers. + Defaults to None. + index (int, optional): An integer representing the index of the + current shard. If None, `index` would be the current trainer rank + id. Defaults to None. 
+ """ + if num_shards is None: + num_shards = dist.get_world_size() + if index is None: + index = dist.get_rank() + + def sharder(num_shards, index, num_samples): + if num_samples % num_shards == index: + return True + else: + return False + + fn = partial(sharder, num_shards=num_shards, index=index) + self._shard_filter = fn + return self + + def map(self, fn): + """ + Performs specific function on the dataset to transform and update every sample. + + Args: + fn (callable): Transformations to be performed. It receives single + sample as argument. + """ + + self._transform_pipline.append(fn) + + return self + + +class DatasetBuilder: + """ + A base class for all DatasetBuilder. It provides a `read()` function to turn + a data file into a MapDataset or IterDataset. + + `_get_data()` function and `_read()` function should be implemented to download + data file and read data file into a `Iterable` of the examples. + + For how to define a custom `DatasetBuilder`, please see `contribute_dataset + `__. + """ + + lazy = False + + def __init__(self, lazy=None, name=None, **config): + if lazy is not None: + self.lazy = lazy + self.name = name + self.config = config + + def read_datasets(self, splits=None, data_files=None): + def remove_if_exit(filepath): + if isinstance(filepath, (list, tuple)): + for file in filepath: + try: + os.remove(file) + except OSError: + pass + else: + try: + os.remove(filepath) + except OSError: + pass + + if data_files is None: + if splits is None: + splits = ( + list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) + if hasattr(self, "BUILDER_CONFIGS") + else list(self.SPLITS.keys()) + ) + + assert ( + isinstance(splits, str) + or (isinstance(splits, list) and isinstance(splits[0], str)) + or (isinstance(splits, tuple) and isinstance(splits[0], str)) + ), "`splits` should be a string or list of string or a tuple of string." + + if isinstance(splits, str): + splits = [splits] + datasets = DatasetTuple(splits) + parallel_env = dist.ParallelEnv() + unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:]) + # move register hook to first and register togather + lock_files = [] + for split in splits: + lock_file = os.path.join(DATA_HOME, self.__class__.__name__) + if self.name is not None: + lock_file = lock_file + "." + self.name + lock_file += "." + split + ".done" + "." + str(os.getppid()) + lock_files.append(lock_file) + # Must register to all procs to make the lock file can be removed + # when any proc breaks. Otherwise, the single registered proc may + # not receive proper singal send by the parent proc to exit. + atexit.register(lambda: remove_if_exit(lock_files)) + for split in splits: + filename = self._get_data(split) + lock_file = os.path.join(DATA_HOME, self.__class__.__name__) + if self.name is not None: + lock_file = lock_file + "." + self.name + lock_file += "." + split + ".done" + "." + str(os.getppid()) + # `lock_file` indicates the finished status of`_get_data`. + # `_get_data` only works in the `unique_endpoints` specified + # proc since `get_path_from_url` only work for it. The other + # procs wait `_get_data` to be finished. + if parallel_env.current_endpoint in unique_endpoints: + f = open(lock_file, "w") + f.close() + else: + while not os.path.exists(lock_file): + time.sleep(1) + datasets[split] = self.read(filename=filename, split=split) + else: + assert ( + isinstance(data_files, str) + or isinstance(data_files, tuple) + or isinstance(data_files, list) + ), "`data_files` should be a string or tuple or list of strings." 
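For context, a small self-contained sketch of the `MapDataset` transformation utilities documented earlier in this file; the samples are made up purely for illustration.

```python
from paddlevlp.datasets import MapDataset

ds = MapDataset([{"text_input": "a cat", "image_id": i} for i in range(10)])

# Lazy per-sample transform: applied on access, so a stochastic fn stays stochastic across epochs.
ds.map(lambda ex: {**ex, "text_input": ex["text_input"].upper()}, lazy=True)

# filter() rewrites the underlying data in place and returns the same dataset object.
ds.filter(lambda ex: ex["image_id"] % 2 == 0)

# Keep a contiguous half of the remaining samples (shard 0 of 2).
ds.shard(num_shards=2, index=0, contiguous=True)
print(len(ds), ds[0]["text_input"])   # -> 3 A CAT
```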
+ if isinstance(data_files, str): + data_files = [data_files] + default_split = "train" + if splits: + if isinstance(splits, str): + splits = [splits] + datasets = DatasetTuple(splits) + assert len(splits) == len( + data_files + ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of loacl data file." + for i in range(len(data_files)): + datasets[splits[i]] = self.read( + filename=data_files[i], split=splits[i] + ) + else: + datasets = DatasetTuple( + ["split" + str(i) for i in range(len(data_files))] + ) + for i in range(len(data_files)): + datasets["split" + str(i)] = self.read( + filename=data_files[i], split=default_split + ) + + return datasets if len(datasets) > 1 else datasets[0] + + def read(self, filename, split="train"): + """ + Returns a dataset containing all the examples that can be read from the file path. + + If `self.lazy` is False, this eagerly reads all instances from `self._read()` + and returns a `MapDataset`. + + If `self.lazy` is True, this returns an `IterDataset`, which internally + relies on the generator created from `self._read()` to lazily produce examples. + In this case your implementation of `_read()` must also be lazy + (that is, not load all examples into memory at once). + + Args: + filename (str): Path of data file to read, usually provided by `_get_data` + function. + split (str, optional): The split name of selected dataset. This only makes + a different when data files of different splits have different structures. + + Returns: + A `MapDataset|IterDataset`. + """ + + label_list = self.get_labels() + vocab_info = self.get_vocab() + + def _create_dict(labels): + # For multiple labels in the form of list. + if isinstance(labels[0], list) or isinstance(labels[0], tuple): + label_dict = [] + for sub_labels in labels: + sub_dict = {} + for i, label in enumerate(sub_labels): + sub_dict[label] = i + label_dict.append(sub_dict) + else: + label_dict = {} + for i, label in enumerate(labels): + label_dict[label] = i + return label_dict + + def _convert_label_to_id(labels, label_dict): + if isinstance(labels, list) or isinstance(labels, tuple): + for label_idx in range(len(labels)): + labels[label_idx] = label_dict[labels[label_idx]] + else: + labels = label_dict[labels] + return labels + + if self.lazy: + + def generate_examples(): + generator = ( + self._read(filename, split) + if self._read.__code__.co_argcount > 2 + else self._read(filename) + ) + for example in generator: + # We need to check if the example contains label column and confirm its name. + # For now we only allow `label` or `labels` to be the name of label column. + if "labels" in example.keys(): + label_col = "labels" + elif "label" in example.keys(): + label_col = "label" + else: + label_col = None + + # Convert class label to label ids. + if label_list is not None and example.get(label_col, None): + label_dict = _create_dict(label_list) + # For multiple labels in the form of list. + if isinstance(label_dict, list): + for idx, sub_dict in enumerate(label_dict): + example[label_col][idx] = _convert_label_to_id( + example[label_col][idx], sub_dict + ) + else: + example[label_col] = _convert_label_to_id( + example[label_col], label_dict + ) + + yield example + else: + yield example + + return IterDataset( + generate_examples(), label_list=label_list, vocab_info=vocab_info + ) + else: + examples = ( + self._read(filename, split) + if self._read.__code__.co_argcount > 2 + else self._read(filename) + ) + + # Then some validation. 
+ if not isinstance(examples, list): + examples = list(examples) + + if not examples: + raise ValueError( + "No instances were read from the given filepath {}. " + "Is the path correct?".format(filename) + ) + + # We need to check if the example contains label column and confirm its name. + # For now we only allow `label` or `labels` to be the name of label column. + if "labels" in examples[0].keys(): + label_col = "labels" + elif "label" in examples[0].keys(): + label_col = "label" + else: + label_col = None + + # Convert class label to label ids. + if label_list is not None and examples[0].get(label_col, None): + label_dict = _create_dict(label_list) + for idx in range(len(examples)): + # For multiple labels in the form of list. + if isinstance(label_dict, list): + for i, sub_dict in enumerate(label_dict): + examples[idx][label_col][i] = _convert_label_to_id( + examples[idx][label_col][i], sub_dict + ) + else: + examples[idx][label_col] = _convert_label_to_id( + examples[idx][label_col], label_dict + ) + + return MapDataset(examples, label_list=label_list, vocab_info=vocab_info) + + def _read(self, filename: str, *args): + """ + Reads examples from the given file_path and returns them as an + `Iterable` (which could be a list or a generator). + + This method must be implemented in self-defined `DatasetBuilder`. + """ + raise NotImplementedError + + def _get_data(self, mode: str): + """ + Downloads examples from the given URL and customized split + informations and returns a filepath. + + This method must be implemented in self-defined `DatasetBuilder`. + """ + raise NotImplementedError + + def get_labels(self): + """ + Returns list of class labels of the dataset if specified. + """ + return None + + def get_vocab(self): + """ + Returns vocab file path of the dataset if specified. + """ + return None + + +class SimpleBuilder(DatasetBuilder): + def __init__(self, lazy, read_func): + self._read = read_func + self.lazy = lazy + + def read(self, **kwargs): + if self.lazy: + + def generate_examples(): + generator = self._read(**kwargs) + for example in generator: + yield example + + return IterDataset(generate_examples) + else: + examples = self._read(**kwargs) + if hasattr(examples, "__len__") and hasattr(examples, "__getitem__"): + return MapDataset(examples) + else: + return MapDataset(list(examples)) diff --git a/paddlevlp/examples/blip2/__init__.py b/paddlevlp/examples/blip2/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/examples/blip2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/examples/blip2/run_predict.py b/paddlevlp/examples/blip2/run_predict.py new file mode 100644 index 00000000000000..6c90dd87df984e --- /dev/null +++ b/paddlevlp/examples/blip2/run_predict.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
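Before moving on to the BLIP-2 example scripts: a hedged sketch of the custom-reader path handled by `load_dataset` and `SimpleBuilder` in `dataset.py` above, where a plain function is passed instead of a builder name. The TSV file and its layout here are hypothetical.

```python
from paddlevlp.datasets import load_dataset

def read_pairs(data_files):
    # One "image_path<TAB>caption" pair per line; keys mirror CaptionDataset's output.
    with open(data_files, encoding="utf-8") as f:
        for line in f:
            image_path, caption = line.rstrip("\n").split("\t")
            yield {"image": image_path, "text_input": caption}

# `lazy` must be given in custom mode; only keyword arguments that match the
# reader's signature (here just `data_files`) are forwarded to it.
ds = load_dataset(read_pairs, data_files="my_captions.tsv", lazy=False)   # MapDataset
```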
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +import paddle +import requests +from paddlenlp.trainer import PdArgumentParser +from PIL import Image + +from paddlevlp.models.blip2.modeling import Blip2ForConditionalGeneration +from paddlevlp.processors.blip_processing import Blip2Processor +from paddlevlp.utils.log import logger + + +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + input_image: str = field( + metadata={"help": "The name of input image."} + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + prompt: str = field( + default=None, metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="Salesforce/blip2-opt-2.7b", + metadata={"help": "Path to pretrained model or model identifier"}, + ) + pretrained_model_path: str = field( + default=None, + metadata={ + "help": "The path to pre-trained model that we will use for inference." + }, + ) + + +def main(): + parser = PdArgumentParser((ModelArguments, DataArguments)) + model_args, data_args = parser.parse_args_into_dataclasses() + url = ( + data_args.input_image + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + prompt = data_args.prompt + processor = Blip2Processor.from_pretrained( + model_args.model_name_or_path + ) # "Salesforce/blip2-opt-2.7b" + inputs = processor( + images=image, + text=prompt, + return_tensors="pd", + return_attention_mask=True, + mode="test", + ) + model = Blip2ForConditionalGeneration.from_pretrained(model_args.model_name_or_path) + + # load checkpoint + if model_args.pretrained_model_path: + weight = paddle.load(model_args.pretrained_model_path) + model.set_state_dict(weight) + + model.eval() + model.to("gpu") # doctest: +IGNORE_RESULT + generated_ids, scores = model.generate(**inputs) + generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[ + 0 + ].strip() + logger.info("Generate text: {}".format(generated_text)) + + +if __name__ == "__main__": + main() diff --git a/paddlevlp/examples/blip2/run_pretrain_stage2.py b/paddlevlp/examples/blip2/run_pretrain_stage2.py new file mode 100644 index 00000000000000..de5cf419326c8c --- /dev/null +++ b/paddlevlp/examples/blip2/run_pretrain_stage2.py @@ -0,0 +1,271 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
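The prediction script above is driven entirely by `PdArgumentParser`, so the dataclass field names double as command-line flags. A rough sketch of the equivalent programmatic invocation; passing an explicit argv list to `parse_args_into_dataclasses` is assumed to work as in its Hugging Face counterpart, and the URL and prompt are the ones quoted in the script's comments.

```python
# Roughly equivalent to:
#   python paddlevlp/examples/blip2/run_predict.py \
#       --input_image http://images.cocodataset.org/val2017/000000039769.jpg \
#       --prompt "Question: how many cats are there? Answer:"
from paddlenlp.trainer import PdArgumentParser

# ModelArguments / DataArguments are the dataclasses defined in run_predict.py above.
parser = PdArgumentParser((ModelArguments, DataArguments))
model_args, data_args = parser.parse_args_into_dataclasses(
    [
        "--input_image", "http://images.cocodataset.org/val2017/000000039769.jpg",
        "--prompt", "Question: how many cats are there? Answer:",
    ]
)
```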
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from dataclasses import dataclass, field + +import paddle +from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, + get_last_checkpoint) +from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config + +import paddlevlp +from paddlevlp.datasets import load_dataset +from paddlevlp.models.blip2.configuration import (Blip2Config, + Blip2QFormerConfig, + Blip2VisionConfig) +from paddlevlp.models.blip2.modeling import Blip2ForConditionalGeneration +from paddlevlp.optimization import FilterParamsName +from paddlevlp.processors.blip_processing import Blip2Processor +from paddlevlp.trainer import Trainer +from paddlevlp.utils.log import logger + + +class BlipCollator: + """ + Data collator that will dynamically pad the inputs to the longest sequence in the batch. + + Args: + processor (`paddlevlp.processors.ProcessorMixin`): + The processor used for pre-process the data. + """ + + def __init__(self, processor): + self.processor = processor + + def __call__(self, data_list): + images = [sample["image"] for sample in data_list] + text = [sample["text_input"] for sample in data_list] + batch = self.processor( + images=images, + text=text, + max_length=32, + return_tensors="pd", + return_attention_mask=True, + mode="train", + ) + return batch + + +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: str = field( + default="coco_caption", + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="Salesforce/blip2-opt-2.7b", + metadata={"help": "Path to pretrained model or model identifier"}, + ) + + text_model_name_or_path: str = field( + default="facebook/opt-2.7b", + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + + +@dataclass +class PreTrainingArguments(TrainingArguments): + """ + Arguments pertaining to what training options we are going to use during pretraining. + """ + + pretrained_model_path: str = field( + default=None, + metadata={ + "help": "The path to pre-trained model that we will use for pretraining." 
+ }, + ) + weight_decay: float = field( + default=0.5, metadata={"help": "Weight decay if we apply some."} + ) + learning_rate: float = field( + default=1e-4, metadata={"help": "The initial learning rate."} + ) + num_train_epochs: float = field( + default=10.0, metadata={"help": "Total number of training epochs to perform."} + ) + warmup_start_lr: float = field( + default=1e-6, metadata={"help": "Initial learning rate of warm up."} + ) + eta_min: float = field( + default=1e-5, metadata={"help": "The minimum value of learning rate."} + ) + warmup_steps: int = field( + default=2000, metadata={"help": "Number of warmup steps."} + ) + lr_scheduler_name: str = field( + default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."} + ) + + +def create_scheduler(dataset_len, config): + lr_sched_func = getattr(paddlevlp.optimization, config.lr_scheduler_name) + lr_sched = lr_sched_func( + learning_rate=config.learning_rate, + epochs=config.num_train_epochs, + eta_min=config.eta_min, + warmup_steps=config.warmup_steps, + warmup_start_lr=config.warmup_start_lr, + step_each_epoch=dataset_len, + ) + return lr_sched + + +def create_optimizer_and_scheduler(model, dataset_len, config): + lr_sched = create_scheduler(dataset_len, config) + param_filter = FilterParamsName() + p_wd, p_non_wd = param_filter(model) + optimizer = paddle.optimizer.AdamW( + parameters=p_wd + p_non_wd, + learning_rate=lr_sched, + weight_decay=float(config.weight_decay), + beta1=config.adam_beta1, + beta2=config.adam_beta2, + apply_decay_param_fun=param_filter.apply_decay_param_fun, + ) + return optimizer, lr_sched + + +def get_text_config(text_model_name_or_path): + if "t5" in text_model_name_or_path: + text_config = T5Config.from_pretrained(text_model_name_or_path) + elif "opt" in text_model_name_or_path: + text_config = OPTConfig.from_pretrained(text_model_name_or_path) + else: + text_config = AutoConfig.from_pretrained(text_model_name_or_path) + return text_config + + +def create_model(config): + # blip2_config = Blip2ForConditionalGeneration(onfig.model_name_or_path) + vision_config = Blip2VisionConfig.from_pretrained(config.model_name_or_path) + qformer_config = Blip2QFormerConfig.from_pretrained(config.model_name_or_path) + text_config = get_text_config(config.text_model_name_or_path) + blip2_config = Blip2Config.from_vision_qformer_text_configs( + vision_config, qformer_config, text_config + ) + + model = Blip2ForConditionalGeneration(blip2_config) + return model + + +def load_pretrained_model(model, pretrained_model_path): + if pretrained_model_path is None: + return + + if not os.path.exists(pretrained_model_path): + ValueError( + "Cannot find pretrained model path: {}".format(pretrained_model_path) + ) + + state_dict = paddle.load(pretrained_model_path) + for key in model.state_dict().keys(): + if key in state_dict.keys(): + if state_dict[key].shape != model.state_dict()[key].shape: + del state_dict[key] + + model.set_state_dict(state_dict) + + +def main(): + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + # Log model and data config + training_args.print_config(model_args, "Model") + training_args.print_config(data_args, "Data") + + paddle.set_device(training_args.device) + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + + f"distributed training: 
{bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + ) + + # Detecting last checkpoint + last_checkpoint = None + if ( + os.path.isdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + # if last_checkpoint is None and len( + # os.listdir(training_args.output_dir)) > 1: + # raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome.") + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # create dataset + processor = Blip2Processor.from_pretrained(model_args.model_name_or_path) + blip_collator = BlipCollator(processor) + train_dataset = load_dataset(data_args.task_name, splits="train") + dataset_len = len(train_dataset) + + # create model + model = create_model(model_args) + load_pretrained_model(model, training_args.pretrained_model_path) + + # load model for debug + # weight = paddle.load('/paddle/wangguanzhong/blip-jinman/PaddleNLP/blip2/blip2_checkout_4_output.pdparams') + # model.set_state_dict(weight) + + # create optimizer + optimizer, lr_sched = create_optimizer_and_scheduler( + model, dataset_len, training_args + ) + + # create trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=blip_collator, + optimizers=(optimizer, lr_sched), + ) + + # Training + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + if training_args.do_train: + trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() + trainer.save_state() + + +if __name__ == "__main__": + main() diff --git a/paddlevlp/models/__init__.py b/paddlevlp/models/__init__.py new file mode 100644 index 00000000000000..904dfbb7a6d3d2 --- /dev/null +++ b/paddlevlp/models/__init__.py @@ -0,0 +1,16 @@ +# copyright (c) 2023 paddlepaddle authors. all rights reserved. +# copyright 2023 the salesforce team authors and the huggingface team. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from .blip2.modeling import * diff --git a/paddlevlp/models/blip2/__init__.py b/paddlevlp/models/blip2/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/models/blip2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/models/blip2/configuration.py b/paddlevlp/models/blip2/configuration.py new file mode 100644 index 00000000000000..d05ade37c2dcb1 --- /dev/null +++ b/paddlevlp/models/blip2/configuration.py @@ -0,0 +1,400 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" BLIP-2 model configuration""" +import copy +import os +from typing import Union + +from paddlenlp.transformers import AutoConfig +from paddlenlp.transformers.auto.modeling import \ + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.opt.configuration import OPTConfig +from paddlenlp.transformers.t5.configuration import T5Config +from paddlenlp.utils.log import logger + +__all__ = [ + "Blip2VisionConfig", + "Blip2QFormerConfig", + "Blip2Config", +] + + +class Blip2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a + BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. 
layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import Blip2VisionConfig, Blip2VisionModel + >>> # Initializing a Blip2VisionConfig with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2VisionConfig() + >>> # Initializing a Blip2VisionModel (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2VisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_2_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=1e-6, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["vision_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class Blip2QFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2QFormerModel`]. It is used to instantiate a + BLIP-2 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. Configuration objects + inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`Blip2QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. 
+ Examples: + ```python + >>> from paddlenlp.transformers import Blip2QFormerConfig, Blip2QFormerModel + >>> # Initializing a BLIP-2 Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2QFormerConfig() + >>> # Initializing a model (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "blip_2_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + # get the qformer config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["qformer_config"] + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class Blip2Config(PretrainedConfig): + r""" + [`Blip2Config`] is the configuration class to store the configuration of a [`Blip2ForConditionalGeneration`]. It is + used to instantiate a BLIP-2 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. 
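To see which Q-Former layers actually cross-attend to the image features, the short sketch below (illustrative only, again assuming the `paddlevlp` import path from this diff) applies the same `layer_idx % cross_attention_frequency == 0` rule that `Blip2QFormerLayer` uses later in this file.

```python
# Illustrative only; mirrors the cross-attention placement rule used by Blip2QFormerLayer.
from paddlevlp.models.blip2.configuration import Blip2QFormerConfig

config = Blip2QFormerConfig()  # defaults: num_hidden_layers=12, cross_attention_frequency=2

cross_attention_layers = [
    layer_idx
    for layer_idx in range(config.num_hidden_layers)
    if layer_idx % config.cross_attention_frequency == 0
]
print(cross_attention_layers)  # [0, 2, 4, 6, 8, 10] -> every other layer gets a cross-attention block
```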
+ text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... Blip2VisionConfig, + ... Blip2QFormerConfig, + ... OPTConfig, + ... Blip2Config, + ... Blip2ForConditionalGeneration, + ... ) + >>> # Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2Config() + >>> # Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2ForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig + >>> # Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations + >>> vision_config = Blip2VisionConfig() + >>> qformer_config = Blip2QFormerConfig() + >>> text_config = OPTConfig() + >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "blip-2" + is_composition = True + + def __init__( + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info( + "vision_config is None. initializing the Blip2VisionConfig with default values." + ) + + if qformer_config is None: + qformer_config = {} + logger.info( + "qformer_config is None. Initializing the Blip2QFormerConfig with default values." + ) + + if text_config is None: + text_config = {} + logger.info( + "text_config is None. Initializing the text config with default values (`OPTConfig`)." + ) + self.vision_config = Blip2VisionConfig(**vision_config) + self.qformer_config = Blip2QFormerConfig(**qformer_config) + text_model_type = ( + text_config["model_type"] if "model_type" in text_config else "opt" + ) + # self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + if text_model_type == "t5": + self.text_config = T5Config(**text_config) + elif text_model_type == "opt": + self.text_config = OPTConfig(**text_config) + else: + self.text_config = AutoConfig(**text_config) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = ( + self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ) + # CONFIGURATION_MODEL_MAPPING = get_init_configurations() + # self.use_decoder_only_language_model = self.text_config.model_type in CONFIGURATION_MODEL_MAPPING + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + self.freeze_vit = True + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: Blip2VisionConfig, + qformer_config: Blip2QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model + configurations. 
+ Returns: + [`Blip2Config`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/paddlevlp/models/blip2/modeling.py b/paddlevlp/models/blip2/modeling.py new file mode 100644 index 00000000000000..7cda33b843ee53 --- /dev/null +++ b/paddlevlp/models/blip2/modeling.py @@ -0,0 +1,1925 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Paddle BLIP2 model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss +from paddlenlp.transformers.activations import ACT2FN +from paddlenlp.transformers.model_outputs import ( + BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput) +from paddlenlp.transformers.model_utils import ( + PretrainedModel, apply_chunking_to_forward, + find_pruneable_heads_and_indices, prune_linear_layer) +from paddlenlp.transformers.opt.configuration import OPTConfig +from paddlenlp.transformers.opt.modeling import OPTForCausalLM +from paddlenlp.transformers.t5.configuration import T5Config +from paddlenlp.transformers.t5.modeling import T5ForConditionalGeneration +from paddlenlp.utils.initializer import normal_, ones_, zeros_ +from paddlenlp.utils.log import logger + +from .configuration import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig + +BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Salesforce/blip2-flan-t5-xl", + "Salesforce/blip2-opt-2.7b", +] + +__all__ = [ + "Blip2QFormerModel", + "Blip2Model", + "Blip2PretrainedModel", + "Blip2VisionModel", + "Blip2ForConditionalGeneration", +] + + +def Parameter(tensor): + return paddle.create_parameter( + tensor.shape, + dtype=tensor.dtype, + default_initializer=nn.initializer.Assign(tensor), + ) + + +@dataclass +class Blip2ForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`Blip2ForConditionalGeneration`]. 
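As a quick usage sketch of the composite config defined above (illustrative only; it assumes `paddlevlp` is importable and that `OPTConfig()` builds from its paddlenlp defaults), the snippet below composes a `Blip2Config` with the `from_vision_qformer_text_configs` helper and round-trips it through `to_dict`.

```python
# Illustrative only; not part of the patch.
from paddlenlp.transformers.opt.configuration import OPTConfig

from paddlevlp.models.blip2.configuration import (
    Blip2Config,
    Blip2QFormerConfig,
    Blip2VisionConfig,
)

vision_config = Blip2VisionConfig()
qformer_config = Blip2QFormerConfig()
text_config = OPTConfig()  # decoder-only language-model branch, as in blip2-opt-2.7b

config = Blip2Config.from_vision_qformer_text_configs(
    vision_config, qformer_config, text_config
)

# __init__ ties the Q-Former's cross-attention width to the vision tower.
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size

as_dict = config.to_dict()
print(as_dict["model_type"])                      # "blip-2"
print("vision_config" in as_dict, "qformer_config" in as_dict, "text_config" in as_dict)
```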
+ Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from paddlenlp.transformers.blip.modeling.BlipVisionEmbeddings with Blip->Blip2 +class Blip2VisionEmbeddings(nn.Layer): + def __init__(self, config: Blip2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = Parameter( + paddle.randn([1, 1, self.embed_dim], dtype=paddle.get_default_dtype()), + ) + + self.patch_embedding = nn.Conv2D( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = Parameter( + paddle.randn( + [1, self.num_positions, self.embed_dim], + dtype=paddle.get_default_dtype(), + ) + ) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding( + pixel_values + ) # shape = [*, width, grid, grid] + # print('DEBUG!!!!! pixel_values: ', np.abs(pixel_values.numpy()).mean()) + # print('DEBUG!!!!! patch_embedding weight: ', np.abs(self.patch_embedding.weight.numpy()).mean(), self.patch_embedding.weight.shape) + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast( + target_dtype + ) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[ + :, : embeddings.shape[1], : + ].cast(target_dtype) + # rint('DEBUG!!!!embeddings: ', np.abs(embeddings.numpy()).mean()) + return embeddings + + +class Blip2Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter( + paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype()) + ) + v_bias = Parameter( + paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype()) + ) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape( + [bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads] + ).transpose([2, 0, 3, 1, 4]) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose( + [0, 2, 1, 3] + ) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +# Copied from paddlenlp.transformers.blip.modeling.BlipMLP +class Blip2MLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.blip.modeling.BlipEncoderLayer with Blip->Blip2 +class Blip2EncoderLayer(nn.Layer): + def __init__(self, config: Blip2Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Blip2Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = Blip2MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + # print('DEBUG!! vit input: ', np.abs(hidden_states.numpy()).mean()) + + hidden_states = self.layer_norm1(hidden_states) + + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + # print('DEBUG!! vit layer mlp: ', np.abs(hidden_states.numpy()).mean()) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Blip2PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
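The fused-QKV bookkeeping in `Blip2Attention.forward` above is mostly reshapes and transposes; the toy-sized sketch below (illustrative, plain Paddle only, no model weights) traces the same shape flow end to end.

```python
# Illustrative only; reproduces the shape bookkeeping of Blip2Attention.forward with toy sizes.
import paddle
import paddle.nn.functional as F

bsz, tgt_len, embed_dim, num_heads = 2, 257, 1408, 16
head_dim = embed_dim // num_heads  # 88

mixed_qkv = paddle.randn([bsz, tgt_len, 3 * embed_dim])  # stand-in for the output of the fused `qkv` Linear

# [bsz, tgt_len, 3, num_heads, head_dim] -> [3, bsz, num_heads, tgt_len, head_dim]
mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, num_heads, head_dim]).transpose([2, 0, 3, 1, 4])
query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

scale = head_dim**-0.5
attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) * scale
attention_probs = F.softmax(attention_scores, axis=-1)

context = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3])
context = context.reshape([bsz, tgt_len, embed_dim])  # ready for the output projection

print(query_states.shape)    # [2, 16, 257, 88]
print(attention_probs.shape) # [2, 16, 257, 257]
print(context.shape)         # [2, 257, 1408]
```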
+ """ + + config_class = Blip2Config + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + r"language_model.encoder.embed_tokens.weight", + r"language_model.decoder.embed_tokens.weight", + ] + _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"] + _keep_in_fp32_modules = ["wo"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if ( + isinstance(module, nn.Conv2D) + or isinstance(module, nn.Embedding) + or isinstance(module, nn.Linear) + ): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, Blip2VisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, Blip2Encoder): + module.gradient_checkpointing = value + + +class Blip2Encoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Blip2EncoderLayer`]. + Args: + config (`Blip2Config`): + The corresponding vision configuration for the `Blip2Encoder`. + """ + + def __init__(self, config: Blip2Config): + super().__init__() + self.config = config + self.layers = nn.LayerList( + [Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +class Blip2VisionModel(Blip2PretrainedModel): + main_input_name = "pixel_values" + config_class = Blip2VisionConfig + + def __init__(self, config: Blip2VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = Blip2VisionEmbeddings(config) + self.encoder = Blip2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class Blip2QFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() 
+ self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.cast( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = paddle.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
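The relative-position branch above is easiest to see with a tiny sequence. The sketch below (illustrative only, not part of the patch) builds the signed-offset matrix and the `bhld,lrd->bhlr` einsum on toy shapes.

```python
# Illustrative only; toy-sized version of the relative_key scoring used above.
import paddle
import paddle.nn as nn

seq_length, num_heads, head_dim, max_position_embeddings = 4, 2, 8, 512

position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1])
position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1])
distance = position_ids_l - position_ids_r  # signed offsets in [-(seq_length-1), seq_length-1]

distance_embedding = nn.Embedding(2 * max_position_embeddings - 1, head_dim)
positional_embedding = distance_embedding(distance + max_position_embeddings - 1)  # [seq, seq, head_dim]

query_layer = paddle.randn([1, num_heads, seq_length, head_dim])
relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)

print(distance.numpy())                 # Toeplitz matrix of signed offsets
print(relative_position_scores.shape)   # [1, 2, 4, 4] -> added to the raw attention scores
```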
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from paddlenlp.transformers.bert.modeling.BertSelfOutput with Bert->Blip2QFormer +class Blip2QFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor + ) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.output = Blip2QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.attention.num_attention_heads, + self.attention.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len( + heads + ) + self.attention.all_head_size = ( + self.attention.attention_head_size * self.attention.num_attention_heads + ) + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +# Copied from paddlenlp.transformers.bert.modeling.BertIntermediate with Bert->Blip2QFormer +class Blip2QFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + 
self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.bert.modeling.BertOutput with Bert->Blip2QFormer +class Blip2QFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor + ) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError( + "encoder_hidden_states must be given for cross-attention layers" + ) + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def 
feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [ + Blip2QFormerLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module( + *inputs, past_key_value, output_attentions, query_length + ) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Blip2QFormerModel(Blip2PretrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. 
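`apply_chunking_to_forward` (imported from paddlenlp earlier in this file) relies on the feed-forward being applied token-wise, so running it over sequence chunks and concatenating reproduces the full pass. The sketch below demonstrates that equivalence with a throwaway MLP rather than the patch's own layers; it is illustrative only.

```python
# Illustrative only; shows the equivalence that chunked feed-forward relies on.
import paddle
import paddle.nn as nn

hidden_size, intermediate_size, chunk_size = 16, 64, 8
feed_forward = nn.Sequential(
    nn.Linear(hidden_size, intermediate_size),
    nn.GELU(),
    nn.Linear(intermediate_size, hidden_size),
)

hidden_states = paddle.randn([2, 32, hidden_size])  # [batch, seq_len, hidden]

full = feed_forward(hidden_states)
chunked = paddle.concat(
    [feed_forward(chunk) for chunk in paddle.split(hidden_states, 32 // chunk_size, axis=1)],
    axis=1,
)
print(paddle.allclose(full, chunked).item())  # True (up to float tolerance)
```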
+ """ + + def __init__(self, config: Blip2QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask( + self, encoder_attention_mask: paddle.Tensor + ) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, + head_mask: Optional[paddle.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool = False, + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast( + dtype=self.config.dtype + ) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. 
If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length + if past_key_values is not None + else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm( + query_embeds.cast(self.layernorm.weight.dtype) + ) + embedding_output = self.dropout(embedding_output) + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones( + ((batch_size, seq_length + past_key_values_length)) + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0 + ].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 
0, :] + # print('DEBUG!!!sequence_output', sequence_output.shape, np.abs(sequence_output.numpy()).mean()) + # print('DEBUG!!!pooled_output', pooled_output.shape, np.abs(pooled_output.numpy()).mean()) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class Blip2Model(Blip2PretrainedModel): + config_class = Blip2Config + main_input_name = "pixel_values" + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros( + [1, config.num_query_tokens, config.qformer_config.hidden_size] + ) + ) + self.qformer = Blip2QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, config.text_config.hidden_size + ) + if config.use_decoder_only_language_model: + if isinstance(config.text_config, OPTConfig): + language_model = OPTForCausalLM(config.text_config) + else: + raise NotImplementedError + else: + if isinstance(config.text_config, T5Config): + language_model = T5ForConditionalGeneration(config.text_config) + else: + raise NotImplementedError + self.language_model = language_model + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. 
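The additive-mask convention used by `get_extended_attention_mask` and `invert_attention_mask` in the Q-Former forward above is worth seeing on concrete numbers; the illustrative snippet below reproduces it for a toy padding mask.

```python
# Illustrative only; mirrors the (1 - mask) * -10000 additive-mask trick used above.
import paddle
import paddle.nn.functional as F

# 1 = attend, 0 = padding, shape [batch_size, seq_length]
attention_mask = paddle.to_tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype="float32")

# Broadcastable to [batch_size, num_heads, query_len, key_len]
extended_attention_mask = attention_mask[:, None, None, :]
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

scores = paddle.randn([2, 1, 4, 4])  # raw attention scores for a single head
probs = F.softmax(scores + extended_attention_mask, axis=-1)

print(probs[0, 0, 0].numpy())  # last position gets ~0 probability
print(probs[1, 0, 0].numpy())  # last two positions get ~0 probability
```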
+ Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import AutoTokenizer, Blip2Model + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt").to(device) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if self.config.use_decoder_only_language_model: + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + + text_outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import AutoProcessor, Blip2Model + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if not return_dict: + last_hidden_state = vision_outputs[0] + pooled_output = vision_outputs[1] + else: + last_hidden_state = vision_outputs.last_hidden_state + pooled_output = vision_outputs.pooler_output + + if not return_dict: + return (last_hidden_state, pooled_output) + vision_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def get_qformer_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples:
+ ```python
+ >>> import paddle
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import Blip2Processor, Blip2Model
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor(images=image, return_tensors="pd")
+ >>> qformer_outputs = model.get_qformer_features(**inputs)
+ ```"""
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[0]
+
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+ image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+ query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ return query_outputs
+
+ def forward(
+ self,
+ pixel_values: paddle.Tensor,
+ input_ids: paddle.Tensor,
+ attention_mask: Optional[paddle.Tensor] = None,
+ decoder_input_ids: Optional[paddle.Tensor] = None,
+ decoder_attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ labels: Optional[paddle.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]:
+ r"""
+ Returns:
+ Examples:
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import Blip2Processor, Blip2Model
+ >>> import paddle
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> prompt = "Question: how many cats are there?
Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pd") + >>> outputs = model(pixel_values=inputs["pixel_values"],input_ids=inputs["input_ids"]) + ```""" + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones( + language_model_inputs.shape[:-1], dtype="int64" + ) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + + attention_mask = paddle.concat( + [language_model_attention_mask, attention_mask], axis=1 + ) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct( + shift_logits.reshape([-1, self.config.text_config.vocab_size]), + shift_labels.reshape([-1]), + ) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class Blip2ForConditionalGeneration(Blip2PretrainedModel): + config_class = Blip2Config + main_input_name = "pixel_values" + + def __init__(self, config: Blip2Config): + super().__init__(config) + 
self.vision_model = Blip2VisionModel(config.vision_config) + # self.post_layernorm = nn.LayerNorm(config.vision_config.hidden_size, epsilon=config.vision_config.layer_norm_eps) + self.freeze_vit = config.freeze_vit + if self.freeze_vit: + # freeze vit except the post layer norm layer. + for name, param in self.vision_model.named_parameters(): + if "post_layernorm" not in name: + param.stop_gradient = True + self.vision_model.eval() + logger.info("freeze vision encoder") + self.query_tokens = Parameter( + paddle.zeros( + [1, config.num_query_tokens, config.qformer_config.hidden_size] + ) + ) + self.qformer = Blip2QFormerModel(config.qformer_config) + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, config.text_config.hidden_size + ) + if config.use_decoder_only_language_model: + # language_model = AutoModelForCausalLM.from_config(config.text_config) + if isinstance(config.text_config, OPTConfig): + language_model = OPTForCausalLM(config.text_config) + else: + raise NotImplementedError + else: + # language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + if isinstance(config.text_config, T5Config): + language_model = T5ForConditionalGeneration(config.text_config) + else: + raise NotImplementedError + self.language_model = language_model + for name, param in self.language_model.named_parameters(): + param.stop_gradient = True + self.pad_token_id = config.text_config.pad_token_id + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + Image captioning (without providing a text prompt): + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2ForConditionalGeneration + >>> import paddle + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-flan-t5-xl" + ... ) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> generated_ids, scores = model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two cats laying on a couch + ``` + Visual question answering (prompt = question): + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2ForConditionalGeneration + >>> import paddle + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-flan-t5-xl" + ... ) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> prompt = "Question: how many cats are there? 
Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two + ```""" + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + # print('DEBUG!! Blip2ForCond query_tokens: ', query_tokens.shape, np.abs(query_tokens.numpy()).mean()) + # print('DEBUG!! Blip2ForCond image_embeds: ', image_embeds.shape, np.abs(image_embeds.numpy()).mean()) + # print('DEBUG!! Blip2ForCond image_attention_mask: ', image_attention_mask.shape, np.abs(image_attention_mask.numpy()).mean()) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # step 3: use the language model, conditioned on the query outputs and the prompt + # print('DEBUG!!! Blip2ForCond query_output: ', query_output.shape, query_output) + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones( + language_model_inputs.shape[:-1], dtype="int64" + ) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + + attention_mask = paddle.concat( + [language_model_attention_mask, attention_mask], axis=1 + ) + + targets = input_ids * ( + 1 - (input_ids == self.pad_token_id).astype(input_ids.dtype) + ) + (input_ids == self.pad_token_id).astype(input_ids.dtype) * (-100) + + empty_targets = paddle.ones( + language_model_attention_mask.shape, dtype="int64" + ).fill_(-100) + labels = paddle.concat([empty_targets, targets], axis=1) + labels.stop_gradient = True + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs + loss = None + + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="none") + shift_logits = shift_logits.reshape( + [-1, self.config.text_config.vocab_size] + ) + shift_labels = shift_labels.reshape([-1]) + loss = ( + loss_fct(shift_logits, shift_labels).sum() + / (shift_labels > 0).sum() + ) + else: 
+ outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + # print('DEBUG!!! Blip2ForCond loss: ', loss.shape, loss) + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + batch_size = pixel_values.shape[0] + image_embeds = self.vision_model( + pixel_values, return_dict=True + ).last_hidden_state + # print('DEBUG!!! image_embeds: ', image_embeds.shape, ' ', np.abs(image_embeds.numpy()).mean(), image_embeds) + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + language_model_inputs = self.language_projection(query_output) + language_attention_mask = paddle.ones( + language_model_inputs.shape[:-1], dtype="int64" + ) + if input_ids is None: + input_ids = paddle.to_tensor([[self.config.text_config.bos_token_id]]).tile( + [batch_size, 1] + ) + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + attention_mask = paddle.concat( + [language_attention_mask, attention_mask], axis=1 + ) + # concatenate query embeddings with prompt embeddings + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + # print('DEBUG!! input_ids: ', input_ids.shape,'', np.abs(input_ids.numpy()).mean(), input_ids) + # print('DEBUG!! inputs_embeds: ', inputs_embeds.shape,'', np.abs(inputs_embeds.numpy()).mean(), inputs_embeds) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + # print('DEBUG!! 
inputs_embeds concat: ', inputs_embeds.shape,'', np.abs(inputs_embeds.numpy()).mean(), inputs_embeds) + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decode_strategy="beam_search", + length_penalty=1.0, + num_beams=5, + max_length=30, + **generate_kwargs, + ) + return outputs diff --git a/paddlevlp/optimization.py b/paddlevlp/optimization.py new file mode 100644 index 00000000000000..9715228f9f4a77 --- /dev/null +++ b/paddlevlp/optimization.py @@ -0,0 +1,106 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from paddle.optimizer.lr import LRScheduler + +from paddlevlp.utils.log import logger + +__all__ = [ + "CosineDecayWithWarmup", + "FilterParamsName", +] + + +class CosineDecayWithWarmup(LRScheduler): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + eta_min(float): Minimum learning rate. Default: 0.0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__( + self, + learning_rate, + epochs, + eta_min=0.0, + warmup_steps=0, + warmup_start_lr=0.0, + last_epoch=-1, + step_each_epoch=1, + **kwargs + ): + self.start_lr = learning_rate + self.T_max = epochs + self.eta_min = eta_min + self.last_epoch = last_epoch + self.warmup_steps = warmup_steps + self.warmup_start_lr = warmup_start_lr + self.last_lr = self.start_lr + self.cur_step = 0 + self.last_epoch = last_epoch + self.step_each_epoch = step_each_epoch + if self.warmup_steps > 0: + self.last_lr = self.warmup_start_lr + super().__init__(learning_rate=self.last_lr, last_epoch=self.last_epoch) + + def step(self): + self.cur_step += 1 + cur_step_in_epoch = (self.cur_step - 2) % self.step_each_epoch + if self.cur_step < self.warmup_steps and self.last_epoch == 0: + self.last_lr = self.warmup_start_lr + ( + self.start_lr - self.warmup_start_lr + ) * cur_step_in_epoch / max(self.warmup_steps, 1) + else: + self.last_lr = (self.start_lr - self.eta_min) * 0.5 * ( + 1.0 + math.cos(math.pi * self.last_epoch / self.T_max) + ) + self.eta_min + self.last_epoch += 1 + + def get_lr(self): + return self.last_lr + + +class FilterParamsName(object): + """ + FilterParamsName is a utility class to filter out some params from optimizer. 
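+
+ A minimal usage sketch (the model, learning-rate schedule and weight-decay value below
+ are illustrative assumptions, not defaults of this module):
+
+ ```python
+ import paddle
+
+ from paddlevlp.optimization import CosineDecayWithWarmup, FilterParamsName
+
+ model = paddle.nn.Linear(4, 4)  # stand-in for the real model
+ param_filter = FilterParamsName()
+ p_wd, p_non_wd = param_filter(model)
+ lr_scheduler = CosineDecayWithWarmup(
+     learning_rate=1e-4, epochs=10, warmup_steps=2000, step_each_epoch=1000
+ )
+ optimizer = paddle.optimizer.AdamW(
+     learning_rate=lr_scheduler,
+     parameters=p_wd + p_non_wd,
+     weight_decay=0.05,
+     apply_decay_param_fun=param_filter.apply_decay_param_fun,
+ )
+ ```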
+ """ + + def __init__(self): + self.p_non_wd_name = [] + + def __call__(self, model): + num_parameters = 0 + p_wd, p_non_wd = [], [] + for n, p in model.named_parameters(): + if p.stop_gradient: + continue # frozen weights + if p.ndim < 2 or "bias" in n or "norm" in n.lower(): + p_non_wd.append(p) + self.p_non_wd_name.append(n) + else: + p_wd.append(p) + num_parameters += p.numel() + logger.info("number of trainable parameters: %d" % num_parameters) + return p_wd, p_non_wd + + def apply_decay_param_fun(self, name): + return name not in self.p_non_wd_name diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py new file mode 100644 index 00000000000000..4738e3272555e6 --- /dev/null +++ b/paddlevlp/processors/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .blip_processing import * diff --git a/paddlevlp/processors/blip_processing.py b/paddlevlp/processors/blip_processing.py new file mode 100644 index 00000000000000..916be3b2493ed7 --- /dev/null +++ b/paddlevlp/processors/blip_processing.py @@ -0,0 +1,661 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for BLIP-2. +""" + +import re +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, + PreTokenizedInput, + TensorType, TextInput) + +from .base_processing import ProcessorMixin +from .image_transform_utils import (convert_to_rgb, normalize, + random_horizontal_flip, + random_resized_crop, rescale, resize, + to_channel_dimension_format) +from .image_utils import (IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, + ChannelDimension, ImageInput, PILImageResampling, + load_image, to_numpy_array, valid_images) +from .processing_utils import (BaseImageProcessor, BaseTextProcessor, + get_size_dict) + +__all__ = [ + "Blip2Processor", + "BlipImageProcessor", + "BlipTextProcessor", +] + + +class Blip2Processor(ProcessorMixin): + r""" + Constructs a BLIP-2 processor which wraps a BLIP2 image processor and an OPT/T5 tokenizer into a single processor. + [`Blip2Processor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring + of [`~Blip2Processor.__call__`] and [`~Blip2Processor.decode`] for more information. 
+ Args:
+ image_processor (`BlipImageProcessor`):
+ An instance of [`BlipImageProcessor`]. The image processor is a required input.
+ text_processor (`BlipTextProcessor`):
+ An instance of [`BlipTextProcessor`]. The text processor is a required input.
+ tokenizer (`AutoTokenizer`):
+ An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+ """
+ attributes = ["image_processor", "text_processor", "tokenizer"]
+ image_processor_class = "BlipImageProcessor"
+ text_processor_class = "BlipTextProcessor"
+ tokenizer_class = "AutoTokenizer"
+
+ def __init__(self, image_processor, text_processor, tokenizer):
+ super().__init__(image_processor, text_processor, tokenizer)
+
+ def __call__(
+ self,
+ images=None,
+ text: Union[
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+ ] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ max_length=32,
+ mode="train",
+ **kwargs,
+ ) -> BatchEncoding:
+ """
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+ (after `BlipTextProcessor` pre-processing) and `kwargs` arguments to the tokenizer's
+ [`~PreTrainedTokenizer.__call__`] if `text` is not `None` to encode the text. To prepare the image(s), this
+ method forwards the `images` and `kwargs` arguments to BlipImageProcessor's [`~BlipImageProcessor.__call__`]
+ if `images` is not `None`. Please refer to the docstring of the above two methods for more information.
+
+ Args:
+
+ images (`PIL.Image.Image`, `np.ndarray`, `paddle.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[paddle.Tensor]`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or Paddle
+ tensor. In case of a NumPy array/Paddle tensor, each image should be of shape (C, H, W), where C is a
+ number of channels, H and W are image height and width.
+ text (`str`, `List[str]`, `List[List[str]]`):
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
+ If set, will return tensors of a particular framework. Acceptable values are:
+ - `'pd'`: Return Paddle `paddle.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ max_length (`int`, *optional*):
+ If set to a number, will limit the total length of the returned sequences.
+ mode (`str`, *optional*):
+ The processing mode, one of "train", "val" or "test".
+
+ Returns:
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+ `None`).
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
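+
+ Example (a sketch only; it assumes the `Salesforce/blip2-flan-t5-xl` processor assets used
+ elsewhere in this repository are available):
+
+ ```python
+ >>> import requests
+ >>> from PIL import Image
+ >>> from paddlevlp.processors import Blip2Processor
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")  # doctest: +SKIP
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor(
+ ...     images=image, text="a photo of", return_tensors="pd", mode="test"
+ ... )  # doctest: +SKIP
+ ```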
+ """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + # Get only text + if images is None: + text_encoding = self.text_processor(text, mode=mode) + text_encoding = self.tokenizer( + text=text_encoding, + return_tensors=return_tensors, + return_token_type_ids=False, + max_length=32, + padding=True, + **kwargs, + ) + return text_encoding + + # add pixel_values + encoding_image_processor = self.image_processor( + images, return_tensors=return_tensors, mode=mode + ) + + if text is not None: + text_encoding = self.text_processor(text, mode=mode) + text_encoding = self.tokenizer( + text=text_encoding, + return_tensors=return_tensors, + return_token_type_ids=False, + max_length=max_length, + padding=True, + **kwargs, + ) + else: + text_encoding = None + # eos_token_id = None + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + + return encoding_image_processor + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +class BlipTextProcessor(BaseTextProcessor): + r""" + Constructs a BLIP text processor. + + Args: + prompt(`str`, *optional*, defaults to `""`): + The prompt (used for generating prompts) that will be prepended to each generated text. + do_caption (`bool`, *optional*, defaults to `False`): + Whether to do the caption task. + do_question(`bool`, *optional*, defaults to `False`): + Whether to do the question task. + max_words (`int`, *optional*, defaults to `50`): + The maximum number of words to keep in the span of text. + + """ + + def __init__( + self, + prompt: str = "", + do_caption: bool = False, + do_question: bool = False, + max_words: int = 50, + **kwargs, + ): + super().__init__(**kwargs) + if do_question and do_caption: + raise ValueError( + "do_caption and do_question cannot be set at the same time." + ) + if not do_caption and not do_question: + raise ValueError("Either do_caption or do_question must be set to True.") + self.prompt = prompt + self.do_caption = do_caption + self.do_question = do_question + self.max_words = max_words + + def __call__( + self, + text, + do_caption: Optional[bool] = None, + do_question: Optional[bool] = None, + mode: str = "train", + **kwargs, + ): + """ + Preprocess the text before tokenization. + + Args: + text (`str`): + Text to preprocess. + do_caption (`bool`, *optional*, defaults to `False`): + Whether to do the caption task. + do_question(`bool`, *optional*, defaults to `False`): + Whether to do the question task. 
+ mode(`str`, *optional*, defaults to `train`): + The mode of ("train", "val", "test") + + """ + do_caption = do_caption if do_caption is not None else self.do_caption + do_question = do_question if do_question is not None else self.do_question + if do_caption and do_question: + raise ValueError( + "do_caption and do_question cannot be set at the same time." + ) + if not do_caption and not do_question: + raise ValueError("Either do_caption or do_question must be set to True.") + + if not isinstance(text, (list, tuple)): + text = [text] + # import pdb; pdb.set_trace() + if do_caption: + results = [self.prompt + self.pre_caption(t) for t in text] + if do_question: + results = [self.pre_question(t) for t in text] + if mode == "train": + results = [res + "\n" for res in results] + return results + + def pre_caption(self, caption: str) -> str: + """ + Preprocess the text before tokenization. + """ + caption = re.sub( + r"([.!\"()*#:;~])", + " ", + caption.lower(), + ) + caption = re.sub( + r"\s{2,}", + " ", + caption, + ) + caption = caption.rstrip("\n") + caption = caption.strip(" ") + + # truncate caption + caption_words = caption.split(" ") + if len(caption_words) > self.max_words: + caption = " ".join(caption_words[: self.max_words]) + + return caption + + def pre_question(self, question: str) -> str: + """ + Preprocess the text before tokenization. + """ + question = re.sub( + r"([.!\"()*#:;~])", + "", + question.lower(), + ) + question = question.rstrip(" ") + + # truncate question + question_words = question.split(" ") + if len(question_words) > self.max_words: + question = " ".join(question_words[: self.max_words]) + + return question + + +class BlipImageProcessor(BaseImageProcessor): + r""" + Constructs a BLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. 
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_rand_resize_crop (`bool`, *optional*, defaults to `False`): + Whether to *randomly crop* the image at random in the height and width dimensions. + rand_resize_crop_prob (`float`, *optional*, defaults to `0.5`): + Probability of applying a random crop to the image. + scale (`list|tuple`, *optional*, defaults to `(0.08, 1.0)`): + Scale range of the cropped image before resizing, relatively to the origin image. + mode (`str`, *optional*): + The mode of ("train", "val", "test") + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + do_flip: bool = False, + flip_prob: float = 0.5, + do_rand_resize_crop: bool = False, + scale: Optional[Union[List[float], Tuple[float]]] = (0.08, 1.0), + do_collate: bool = False, + mode: str = "train", + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = ( + image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + ) + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_convert_rgb = do_convert_rgb + self.do_flip = do_flip + self.flip_prob = flip_prob + self.do_rand_resize_crop = do_rand_resize_crop + self.scale = scale + self.do_collate = do_collate + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
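+
+ For example, a minimal sketch (the input is assumed to be a channel-last NumPy array):
+
+ ```python
+ >>> import numpy as np
+ >>> processor = BlipImageProcessor()
+ >>> image = np.zeros((480, 640, 3), dtype=np.uint8)
+ >>> resized = processor.resize(image, size={"height": 384, "width": 384})
+ ```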
+ """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + **kwargs, + ) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def random_resized_crop( + self, + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> np.ndarray: + """ + Crop the input data to random size and aspect ratio. + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. + After applying crop transfrom, the input data will be resized to given size. + + Args: + image (`np.ndarray`): + Image to resize to and crop. + size (Union[int, List, Tuple]): + Size of cropped image. + scale (`float`): + Scale to apply to the image. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + """ + size = list(size.values()) + return random_resized_crop( + image, size=size, scale=scale, resample=resample, **kwargs + ) + + def random_horizontal_flip( + self, image: np.ndarray, flip_prob: float, **kwargs + ) -> np.ndarray: + """ + Horizontally flip the input data randomly with a given probability. + + Args: + image (`np.ndarray`): + Image to flip. + flip_prob (`float`): + Probability of flipping the image. 
+ """ + return random_horizontal_flip(image, flip_prob=flip_prob, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + do_flip: bool = None, + flip_prob: float = None, + do_rand_resize_crop: bool = None, + scale: Optional[Union[List[float], Tuple[float]]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + mode: str = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + do_rand_resize_crop (`bool`, *optional*, defaults to `False`): + Whether to *randomly crop* the image at random in the height and width dimensions. + scale (`list|tuple`, *optional*, defaults to `(0.08, 1.0)`): + Scale range of the cropped image before resizing, relatively to the origin image. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
+ mode (`str`, *optional*): + The mode of ("train", "val", "test") + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = ( + rescale_factor if rescale_factor is not None else self.rescale_factor + ) + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = ( + do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + ) + do_flip = do_flip if do_flip is not None else self.do_flip + flip_prob = flip_prob if flip_prob is not None else self.flip_prob + scale = scale if scale is not None else self.scale + do_rand_resize_crop = ( + do_rand_resize_crop + if do_rand_resize_crop is not None + else self.do_rand_resize_crop + ) + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not isinstance(images, (list, tuple)): + images = [images] + + if isinstance(images[0], str): + images = [load_image(image) for image in images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "paddle.Tensor." + ) + + if do_resize and size is None or resample is None: + raise ValueError( + "Size and resample must be specified if do_resize is True." + ) + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError( + "Image mean and std must be specified if do_normalize is True." + ) + + if do_flip and flip_prob is None: + raise ValueError("Flip probability must be specified if do_flip is True.") + + if do_rand_resize_crop and scale is None: + raise ValueError( + "Random resize crop probability must be specified if do_rand_resize_crop is True." + ) + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + if do_rand_resize_crop and mode == "train": + images = [ + self.random_resized_crop( + image=image, size=size, scale=scale, resample=resample + ) + for image in images + ] + elif do_resize and mode != "train": + images = [ + self.resize(image=image, size=size, resample=resample) + for image in images + ] + + if do_flip and mode == "train": + images = [ + self.random_horizontal_flip(image=image, flip_prob=flip_prob) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor) for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std) + for image in images + ] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchEncoding(data=data, tensor_type=return_tensors) diff --git a/paddlevlp/processors/image_processing_utils.py b/paddlevlp/processors/image_processing_utils.py new file mode 100644 index 00000000000000..e476b8549c8f38 --- /dev/null +++ b/paddlevlp/processors/image_processing_utils.py @@ -0,0 +1,553 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import tempfile +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +import numpy as np +from huggingface_hub import (create_repo, get_hf_file_metadata, + hf_hub_download, hf_hub_url, + repo_type_and_id_from_hf_id, upload_folder) +from huggingface_hub.utils import EntryNotFoundError +from paddlenlp import __version__ +from paddlenlp.transformers.feature_extraction_utils import \ + BatchFeature as BaseBatchFeature + +from paddlevlp.utils.downloader import (COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir) +from paddlevlp.utils.log import logger + +IMAGE_PROCESSOR_NAME = "image_preprocessor_config.json" +TEXT_PROCESSOR_NAME = "text_processor_config.json" + + +class BatchFeature(BaseBatchFeature): + r""" + Holds the output of the image processor specific `__call__` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in Paddle/Numpy Tensors at + initialization. + """ + + +class ImageProcessingMixin(object): + """ + This is an image processor mixin used to provide saving/loading functionality for sequential and image feature + extractors. + """ + + _auto_class = None + + def __init__(self, **kwargs): + """Set elements of `kwargs` as attributes.""" + # Pop "processor_class" as it should be saved as private attribute + self._processor_class = kwargs.pop("processor_class", None) + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ): + r""" + Instantiate a type of [`~processing_utils.ImageProcessingMixin`] from an image processor. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained image_processor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a image processor file saved using the + [`~processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved image processor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. 
+ cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model image processor should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final image processor object. If `True`, then this + functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of + `kwargs` which has not been used to update `image_processor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are image processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + Returns: + A image processor of type [`~processing_utils.ImageProcessingMixin`]. + + Examples: + + ```python + # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a + # derived class: *CLIPImageProcessor* + image_processor = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32" + ) # Download image_processing_config from huggingface.co and cache. + image_processor = CLIPImageProcessor.from_pretrained( + "./test/saved_model/" + ) # E.g. 
image processor (or model) was saved using *save_pretrained('./test/saved_model/')* + image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json") + image_processor = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32", do_normalize=False, foo=False + ) + assert image_processor.do_normalize is False + image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True + ) + assert image_processor.do_normalize is False + assert unused_kwargs == {"foo": False} + ```""" + image_processor_dict, kwargs = cls.get_image_processor_dict( + pretrained_model_name_or_path, **kwargs + ) + + return cls.from_dict(image_processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the + [`~processing_utils.ImageProcessingMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the image processor JSON file will be saved (will be created if it does not exist). + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError( + f"Provided path ({save_directory}) should be a directory, not a file" + ) + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_pretrained` + output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME) + + self.to_json_file(output_image_processor_file) + logger.info(f"Image processor saved in {output_image_processor_file}") + + return [output_image_processor_file] + + def save_to_hf_hub( + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all elements of this processor to a new HuggingFace Hub repository. + Args: + repo_id (str): Repository name for your processor in the Hub. + private (bool, optional): Whether theprocessor is set to private + subfolder (str, optional): Push to a subfolder of the repo instead of the root + commit_message (str, optional) — The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub" + revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. + create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. + If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. + If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. + + Returns: The url of the commit of your model in the given repository. 
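+
+ Example (a sketch; the repo id is a placeholder and the call requires a prior
+ `huggingface-cli login`):
+
+ ```python
+ >>> processor = BaseImageProcessor(do_resize=True)
+ >>> processor.save_to_hf_hub(
+ ...     repo_id="your-username/blip2-image-processor",
+ ...     commit_message="Upload image processor",
+ ... )  # doctest: +SKIP
+ ```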
+ """ + repo_url = create_repo(repo_id, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata( + hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision) + ) + has_readme = True + except EntryNotFoundError: + has_readme = False + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + # Add readme if does not exist + logger.info("README.md not found, adding the default README.md") + if not has_readme: + with open(os.path.join(root_dir, "README.md"), "w") as f: + f.write(f"---\nlibrary_name: paddlenlp\n---\n# {repo_id}") + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + return upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=root_dir, + commit_message=commit_message, + revision=revision, + create_pr=create_pr, + ) + + @classmethod + def get_image_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + from_hf_hub (bool, optional): whether to load from Huggingface Hub + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. + """ + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + subfolder = kwargs.pop("subfolder", None) + cache_dir = resolve_cache_dir( + pretrained_model_name_or_path, from_hf_hub, cache_dir + ) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + resolved_image_processor_file = os.path.join( + pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME + ) + elif os.path.isfile(pretrained_model_name_or_path): + resolved_image_processor_file = pretrained_model_name_or_path + is_local = True + elif from_hf_hub: + image_processor_file = IMAGE_PROCESSOR_NAME + resolved_image_processor_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=image_processor_file, + cache_dir=cache_dir, + subfolder=subfolder, + library_name="PaddleNLP", + library_version=__version__, + ) + else: + # Assuming from community-contributed pretrained models + image_processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + IMAGE_PROCESSOR_NAME, + ] + ) + try: + # Load from local folder or from cache or download from model Hub and cache + resolved_image_processor_file = get_path_from_url_with_filelock( + image_processor_file, cache_dir + ) + except EnvironmentError: + # Raise any environment error raise by `cached_file`. 
It will have a helpful error message adapted to + # the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load" + " it from 'BOS', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a {IMAGE_PROCESSOR_NAME} file" + ) + + try: + # Load image_processor dict + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file." + ) + + if is_local: + logger.info(f"loading configuration file {resolved_image_processor_file}") + else: + logger.info( + f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}" + ) + + return image_processor_dict, kwargs + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters. + + Args: + image_processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the image processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~processing_utils.ImageProcessingMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the image processor object. + + Returns: + [`~processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those + parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + image_processor = cls(**image_processor_dict) + + # Update image_processor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(image_processor, key): + setattr(image_processor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Image processor {image_processor}") + if return_unused_kwargs: + return image_processor, kwargs + else: + return image_processor + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. + """ + output = copy.deepcopy(self.__dict__) + output["image_processor_type"] = self.__class__.__name__ + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]): + """ + Instantiates a image processor of type [`~processing_utils.ImageProcessingMixin`] from the path to a JSON + file of parameters. + + Args: + json_file (`str` or `os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + A image processor of type [`~processing_utils.ImageProcessingMixin`]: The image_processor object + instantiated from that JSON file. + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + return cls(**image_processor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this feature_extractor instance in JSON format. 
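+
+        Example (a minimal sketch):
+
+            config_str = image_processor.to_json_string()   # pretty-printed JSON, keys sorted
+            image_processor.to_json_file("preprocessor_config.json")  # writes the same representation to disk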
+ """ + dictionary = self.to_dict() + + for key, value in dictionary.items(): + if isinstance(value, np.ndarray): + dictionary[key] = value.tolist() + + # make sure private name "_processor_class" is correctly + # saved as "processor_class" + _processor_class = dictionary.pop("_processor_class", None) + if _processor_class is not None: + dictionary["processor_class"] = _processor_class + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this image_processor instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + +class BaseImageProcessor(ImageProcessingMixin): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, images, **kwargs) -> BatchFeature: + """Preprocess an image or a batch of images.""" + return self.preprocess(images, **kwargs) + + def preprocess(self, images, **kwargs) -> BatchFeature: + raise NotImplementedError( + "Each image processor must implement its own preprocess method" + ) + + +VALID_SIZE_DICT_KEYS = ( + {"height", "width"}, + {"shortest_edge"}, + {"shortest_edge", "longest_edge"}, +) + + +def is_valid_size_dict(size_dict): + if not isinstance(size_dict, dict): + return False + + size_dict_keys = set(size_dict.keys()) + for allowed_keys in VALID_SIZE_DICT_KEYS: + if size_dict_keys == allowed_keys: + return True + return False + + +def convert_to_size_dict( + size, + max_size: Optional[int] = None, + default_to_square: bool = True, + height_width_order: bool = True, +): + # By default, if size is an int we assume it represents a tuple of (size, size). + if isinstance(size, int) and default_to_square: + if max_size is not None: + raise ValueError( + "Cannot specify both size as an int, with default_to_square=True and max_size" + ) + return {"height": size, "width": size} + # In other configs, if size is an int and default_to_square is False, size represents the length of + # the shortest edge after resizing. + elif isinstance(size, int) and not default_to_square: + size_dict = {"shortest_edge": size} + if max_size is not None: + size_dict["longest_edge"] = max_size + return size_dict + # Otherwise, if size is a tuple it's either (height, width) or (width, height) + elif isinstance(size, (tuple, list)) and height_width_order: + return {"height": size[0], "width": size[1]} + elif isinstance(size, (tuple, list)) and not height_width_order: + return {"height": size[1], "width": size[0]} + + raise ValueError(f"Could not convert size input to size dict: {size}") + + +def get_size_dict( + size: Union[int, Iterable[int], Dict[str, int]] = None, + max_size: Optional[int] = None, + height_width_order: bool = True, + default_to_square: bool = True, + param_name="size", +) -> dict: + """ + Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards + compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height, + width) or (width, height) format. + + - If `size` is tuple, it is converted to `{"height": size[0], "width": size[1]}` or `{"height": size[1], "width": + size[0]}` if `height_width_order` is `False`. 
+ - If `size` is an int, and `default_to_square` is `True`, it is converted to `{"height": size, "width": size}`. + - If `size` is an int and `default_to_square` is False, it is converted to `{"shortest_edge": size}`. If `max_size` + is set, it is added to the dict as `{"longest_edge": max_size}`. + + Args: + size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*): + The `size` parameter to be cast into a size dictionary. + max_size (`Optional[int]`, *optional*): + The `max_size` parameter to be cast into a size dictionary. + height_width_order (`bool`, *optional*, defaults to `True`): + If `size` is a tuple, whether it's in (height, width) or (width, height) order. + default_to_square (`bool`, *optional*, defaults to `True`): + If `size` is an int, whether to default to a square image or not. + """ + if not isinstance(size, dict): + size_dict = convert_to_size_dict( + size, max_size, default_to_square, height_width_order + ) + logger.info( + f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." + f" Converted to {size_dict}.", + ) + else: + size_dict = size + + if not is_valid_size_dict(size_dict): + raise ValueError( + f"{param_name} must have one of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size_dict.keys()}" + ) + return size_dict diff --git a/paddlevlp/processors/image_transform_utils.py b/paddlevlp/processors/image_transform_utils.py new file mode 100644 index 00000000000000..d5221d6707f930 --- /dev/null +++ b/paddlevlp/processors/image_transform_utils.py @@ -0,0 +1,795 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +import warnings +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np +import paddle +import PIL +from paddle.vision.transforms import functional as F +from PIL import Image + +from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, + TensorType, get_channel_dimension_axis, + get_image_size, infer_channel_dimension_format, + to_numpy_array) +from .utils import ExplicitEnum + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_channel_dimension_format( + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, +) -> np.ndarray: + """ + Converts `image` to the channel dimension format specified by `channel_dim`. + + Args: + image (`numpy.ndarray`): + The image to have its channel dimension set. + channel_dim (`ChannelDimension`): + The channel dimension format to use. + + Returns: + `np.ndarray`: The image with the channel dimension set to `channel_dim`. 
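+
+    Example (a minimal sketch with a dummy array):
+
+        img = np.zeros((224, 224, 3))                                   # channels last
+        chw = to_channel_dimension_format(img, ChannelDimension.FIRST)  # shape (3, 224, 224)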
+ """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if input_channel_dim is None: + input_channel_dim = infer_channel_dimension_format(image) + + target_channel_dim = ChannelDimension(channel_dim) + if input_channel_dim == target_channel_dim: + return image + + if target_channel_dim == ChannelDimension.FIRST: + image = image.transpose((2, 0, 1)) + elif target_channel_dim == ChannelDimension.LAST: + image = image.transpose((1, 2, 0)) + else: + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) + + return image + + +def rescale( + image: np.ndarray, + scale: float, + data_format: Optional[ChannelDimension] = None, + dtype=np.float32, +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. + scale (`float`): + The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. + + Returns: + `np.ndarray`: The rescaled image. + """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) + return rescaled_image + + +def to_pil_image( + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": + """ + Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if + needed. + + Args: + image (`PIL.Image.Image` or `numpy.ndarray` or `paddle.Tensor`): + The image to convert to the `PIL.Image` format. + do_rescale (`bool`, *optional*): + Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default + to `True` if the image type is a floating type, `False` otherwise. + + Returns: + `PIL.Image.Image`: The converted image. + """ + if isinstance(image, PIL.Image.Image): + return image + + # Convert all tensors to numpy arrays before converting to PIL image + if is_paddle_tensor(image): + image = image.numpy() + elif not isinstance(image, np.ndarray): + raise ValueError("Input image type not supported: {}".format(type(image))) + + # If the channel as been moved to first dim, we put it back at the end. + image = to_channel_dimension_format(image, ChannelDimension.LAST) + + # If there is a single channel, we squeeze it, as otherwise PIL can't handle it. + image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image + + # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. 
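+    # e.g. a float array with values in [0, 1] is multiplied by 255 before the uint8 cast below,
+    # while an integer array is assumed to already be in the 0-255 range and is left as is.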
+ do_rescale = ( + isinstance(image.flat[0], (float, np.float32, np.float64)) + if do_rescale is None + else do_rescale + ) + if do_rescale: + image = rescale(image, 255) + image = image.astype(np.uint8) + return PIL.Image.fromarray(image) + + +# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + + Returns: + `tuple`: The target (height, width) dimension of the output image after resizing. + """ + if isinstance(size, (tuple, list)): + if len(size) == 2: + return tuple(size) + elif len(size) == 1: + # Perform same logic as if size was an int + size = size[0] + else: + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") + + if default_to_square: + return (size, size) + + height, width = get_image_size(input_image) + short, long = (width, height) if width <= height else (height, width) + requested_new_short = size + + new_short, new_long = requested_new_short, int(requested_new_short * long / short) + + if max_size is not None: + if max_size <= requested_new_short: + raise ValueError( + f"max_size = {max_size} must be strictly greater than the requested " + f"size for the smaller edge size = {size}" + ) + if new_long > max_size: + new_short, new_long = int(max_size * new_short / new_long), max_size + + return (new_long, new_short) if width <= height else (new_short, new_long) + + +def resize( + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: + """ + Resizes `image` to `(height, width)` specified by `size` using the PIL library. 
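+    For example, ``resize(img, size=(224, 224))`` returns an array of shape ``(224, 224, C)`` when
+    ``img`` is a channels-last numpy array (the inferred input format is preserved by default).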
+ + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to resize. + size (`Tuple[int, int]`): + The size to use for resizing the image. + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + reducing_gap (`int`, *optional*): + Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result to + the fair resampling. See corresponding Pillow documentation for more details. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + return_numpy (`bool`, *optional*, defaults to `True`): + Whether or not to return the resized image as a numpy array. If False a `PIL.Image.Image` object is + returned. + + Returns: + `np.ndarray`: The resized image. + """ + resample = resample if resample is not None else PILImageResampling.BILINEAR + + if not len(size) == 2: + raise ValueError("size must have 2 elements") + + # For all transformations, we want to keep the same data format as the input image unless otherwise specified. + # The resized image from PIL will always have channels last, so find the input format first. + data_format = ( + infer_channel_dimension_format(image) if data_format is None else data_format + ) + + # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use + # the pillow library to resize the image and then convert back to numpy + if not isinstance(image, PIL.Image.Image): + image = to_pil_image(image) + height, width = size + # PIL images are in the format (width, height) + resized_image = image.resize( + (width, height), resample=resample, reducing_gap=reducing_gap + ) + + if return_numpy: + resized_image = np.array(resized_image) + # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image + # so we need to add it back if necessary. + resized_image = ( + np.expand_dims(resized_image, axis=-1) + if resized_image.ndim == 2 + else resized_image + ) + # The image is always in channels last format after converting from a PIL image + resized_image = to_channel_dimension_format( + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) + return resized_image + + +def normalize( + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: + """ + Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. + + image = (image - mean) / std + + Args: + image (`np.ndarray`): + The image to normalize. + mean (`float` or `Iterable[float]`): + The mean to use for normalization. + std (`float` or `Iterable[float]`): + The standard deviation to use for normalization. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - + # casting to numpy array and dividing by 255. 
+ image = to_numpy_array(image) + image = rescale(image, scale=1 / 255) + + if not isinstance(image, np.ndarray): + raise ValueError("image must be a numpy array") + + input_data_format = infer_channel_dimension_format(image) + channel_axis = get_channel_dimension_axis(image) + num_channels = image.shape[channel_axis] + + if isinstance(mean, Iterable): + if len(mean) != num_channels: + raise ValueError( + f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}" + ) + else: + mean = [mean] * num_channels + mean = np.array(mean, dtype=image.dtype) + + if isinstance(std, Iterable): + if len(std) != num_channels: + raise ValueError( + f"std must have {num_channels} elements if it is an iterable, got {len(std)}" + ) + else: + std = [std] * num_channels + std = np.array(std, dtype=image.dtype) + + if input_data_format == ChannelDimension.LAST: + image = (image - mean) / std + else: + image = ((image.T - mean) / std).T + + image = ( + to_channel_dimension_format(image, data_format) + if data_format is not None + else image + ) + return image + + +def center_crop( + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: + """ + Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to + the size given, it will be padded (so the returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + The image to crop. + size (`Tuple[int, int]`): + The target size for the cropped image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + return_numpy (`bool`, *optional*): + Whether or not to return the cropped image as a numpy array. Used for backwards compatibility with the + previous ImageFeatureExtractionMixin method. + - Unset: will return the same type as the input image. + - `True`: will return a numpy array. + - `False`: will return a `PIL.Image.Image` object. + Returns: + `np.ndarray`: The cropped image. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + image = to_numpy_array(image) + return_numpy = False if return_numpy is None else return_numpy + else: + return_numpy = True if return_numpy is None else return_numpy + + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if not isinstance(size, Iterable) or len(size) != 2: + raise ValueError( + "size must have 2 elements representing the height and width of the output image" + ) + + input_data_format = infer_channel_dimension_format(image) + output_data_format = data_format if data_format is not None else input_data_format + + # We perform the crop in (C, H, W) format and then convert to the output format + image = to_channel_dimension_format(image, ChannelDimension.FIRST) + + orig_height, orig_width = get_image_size(image) + crop_height, crop_width = size + crop_height, crop_width = int(crop_height), int(crop_width) + + # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result. 
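+    # e.g. orig_height=5, crop_height=3 gives top=1, bottom=4, i.e. rows 1..3 are kept.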
+ top = (orig_height - crop_height) // 2 + bottom = top + crop_height + # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result. + left = (orig_width - crop_width) // 2 + right = left + crop_width + + # Check if cropped area is within image boundaries + if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width: + image = image[..., top:bottom, left:right] + image = to_channel_dimension_format(image, output_data_format) + return image + + # Otherwise, we may need to pad if the image is too small. Oh joy... + new_height = max(crop_height, orig_height) + new_width = max(crop_width, orig_width) + new_shape = image.shape[:-2] + (new_height, new_width) + new_image = np.zeros_like(image, shape=new_shape) + + # If the image is too small, pad it with zeros + top_pad = (new_height - orig_height) // 2 + bottom_pad = top_pad + orig_height + left_pad = (new_width - orig_width) // 2 + right_pad = left_pad + orig_width + new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image + + top += top_pad + bottom += top_pad + left += left_pad + right += left_pad + + new_image = new_image[ + ..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right) + ] + new_image = to_channel_dimension_format(new_image, output_data_format) + + if not return_numpy: + new_image = to_pil_image(new_image) + + return new_image + + +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": + center_x, center_y, width, height = bboxes_center.unbind(-1) + bbox_corners = paddle.stack( + # top left x, top left y, bottom right x, bottom right y + [ + (center_x - 0.5 * width), + (center_y - 0.5 * height), + (center_x + 0.5 * width), + (center_y + 0.5 * height), + ], + axis=-1, + ) + return bbox_corners + + +def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: + center_x, center_y, width, height = bboxes_center.T + bboxes_corners = np.stack( + # top left x, top left y, bottom right x, bottom right y + [ + center_x - 0.5 * width, + center_y - 0.5 * height, + center_x + 0.5 * width, + center_y + 0.5 * height, + ], + axis=-1, + ) + return bboxes_corners + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(bboxes_center: TensorType) -> TensorType: + """ + Converts bounding boxes from center format to corners format. 
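+    For example, a box given as (center_x, center_y, width, height) = (2.0, 2.0, 2.0, 2.0) becomes
+    (top_left_x, top_left_y, bottom_right_x, bottom_right_y) = (1.0, 1.0, 3.0, 3.0).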
+ + center format: contains the coordinate for the center of the box and its width, height dimensions + (center_x, center_y, width, height) + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + """ + # Function is used during model forward pass, so we use the input framework if possible, without + # converting to numpy + if is_paddle_tensor(bboxes_center): + return _center_to_corners_format_paddle(bboxes_center) + elif isinstance(bboxes_center, np.ndarray): + return _center_to_corners_format_numpy(bboxes_center) + + raise ValueError(f"Unsupported input type {type(bboxes_center)}") + + +def _corners_to_center_format_paddle( + bboxes_corners: "paddle.Tensor", +) -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) + b = [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ] + return paddle.stack(b, axis=-1) + + +def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.T + bboxes_center = np.stack( + [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ], + axis=-1, + ) + return bboxes_center + + +def corners_to_center_format(bboxes_corners: TensorType) -> TensorType: + """ + Converts bounding boxes from corners format to center format. + + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + center format: contains the coordinate for the center of the box and its the width, height dimensions + (center_x, center_y, width, height) + """ + # Inverse function accepts different input types so implemented here too + if is_paddle_tensor(bboxes_corners): + return _corners_to_center_format_paddle(bboxes_corners) + elif isinstance(bboxes_corners, np.ndarray): + return _corners_to_center_format_numpy(bboxes_corners) + + raise ValueError(f"Unsupported input type {type(bboxes_corners)}") + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. + """ + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + """ + Converts unique ID to RGB color. + """ + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class PaddingMode(ExplicitEnum): + """ + Enum class for the different padding modes to use when padding images. 
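+
+    e.g. ``pad(image, padding=2, mode=PaddingMode.REFLECT)`` reflects the rows/columns nearest each
+    border (two on every side); note that ``PaddingMode.REPLICATE`` maps onto ``np.pad``'s "edge" mode.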
+ """ + + CONSTANT = "constant" + REFLECT = "reflect" + REPLICATE = "replicate" + SYMMETRIC = "symmetric" + + +def pad( + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> np.ndarray: + """ + Pads the `image` with the specified (height, width) `padding` and `mode`. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + def _expand_for_data_format(values): + """ + Convert values to be in the format expected by np.pad based on the data format. 
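+
+        e.g. for a channels-first image, ``2`` becomes ``((0, 0), (2, 2), (2, 2))`` and ``(1, 2)``
+        becomes ``((0, 0), (1, 2), (1, 2))``.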
+ """ + if isinstance(values, (int, float)): + values = ((values, values), (values, values)) + elif isinstance(values, tuple) and len(values) == 1: + values = ((values[0], values[0]), (values[0], values[0])) + elif ( + isinstance(values, tuple) + and len(values) == 2 + and isinstance(values[0], int) + ): + values = (values, values) + elif ( + isinstance(values, tuple) + and len(values) == 2 + and isinstance(values[0], tuple) + ): + values = values + else: + raise ValueError(f"Unsupported format: {values}") + + # add 0 for channel dimension + values = ( + ((0, 0), *values) + if input_data_format == ChannelDimension.FIRST + else (*values, (0, 0)) + ) + + # Add additional padding if there's a batch dimension + values = (0, *values) if image.ndim == 4 else values + return values + + padding = _expand_for_data_format(padding) + + if mode == PaddingMode.CONSTANT: + constant_values = _expand_for_data_format(constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) + elif mode == PaddingMode.REFLECT: + image = np.pad(image, padding, mode="reflect") + elif mode == PaddingMode.REPLICATE: + image = np.pad(image, padding, mode="edge") + elif mode == PaddingMode.SYMMETRIC: + image = np.pad(image, padding, mode="symmetric") + else: + raise ValueError(f"Invalid padding mode: {mode}") + + image = ( + to_channel_dimension_format(image, data_format) + if data_format is not None + else image + ) + return image + + +def convert_to_rgb(image: ImageInput) -> ImageInput: + """ + Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image + as is. + + Args: + image (Image): + The image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + + image = image.convert("RGB") + return image + + +def decode_image(image_path: str) -> ImageInput: + """ + Loads an image from a file. + + Args: + image path(str): Path to the image. + """ + image = Image.open(image_path) + return image + + +def random_horizontal_flip( + image: np.ndarray, + flip_prob: float, +) -> np.ndarray: + """ + Randomly flips the image horizontally. + + Args: + image (np.ndarray): Image to be flipped. + flip_prob (float): Probability that the image will be flipped. + """ + if random.random() < flip_prob: + return F.hflip(image) + return image + + +def get_crop_param(image, scale, ratio, attempts=10): + height, width = get_image_size(image) + area = height * width + np.random.seed(0) + random.seed(0) + for _ in range(attempts): + target_area = np.random.uniform(*scale) * area + log_ratio = tuple(math.log(x) for x in ratio) + aspect_ratio = math.exp(np.random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if 0 < w <= width and 0 < h <= height: + i = random.randint(0, height - h) + j = random.randint(0, width - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = float(width) / float(height) + if in_ratio < min(ratio): + w = width + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = height + w = int(round(h * max(ratio))) + else: + # return whole image + w = width + h = height + i = (height - h) // 2 + j = (width - w) // 2 + return i, j, h, w + + +def random_resized_crop( + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float = (0.08, 1.0), + ratio: float = (3.0 / 4, 4.0 / 3), + resample: "PILImageResampling" = None, +) -> np.ndarray: + """ + Crop the input data to random size and aspect ratio. 
+ A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. + After applying crop transfrom, the input data will be resized to given size. + + Args: + image (np.ndarray): Image to be cropped. + size (Union[int, List, Tuple]): Size of cropped image. + scale (float): Random scale factor. + aspect (float): Random aspect ratio. + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + """ + i, j, h, w = get_crop_param(image, scale, ratio) + cropped_img = F.crop(image, i, j, h, w) + return resize(cropped_img, size, resample) diff --git a/paddlevlp/processors/image_utils.py b/paddlevlp/processors/image_utils.py new file mode 100644 index 00000000000000..7fb5f606914f4a --- /dev/null +++ b/paddlevlp/processors/image_utils.py @@ -0,0 +1,305 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from collections import UserDict +from typing import Dict, Iterable, List, Tuple, Union + +import numpy as np +import paddle +import PIL.Image +import PIL.ImageOps +import requests +from packaging import version + +from .utils import ExplicitEnum + +IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] +IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] +IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] +IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_numpy(obj): + """ + Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a Numpy array. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_numpy(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return np.array(obj) + elif is_paddle_tensor(obj): + return obj.detach().cpu().numpy() + else: + return obj + + +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): + PILImageResampling = PIL.Image.Resampling +else: + PILImageResampling = PIL.Image + + +ImageInput = Union[ + "PIL.Image.Image", + np.ndarray, + "paddle.Tensor", + List["PIL.Image.Image"], + List[np.ndarray], + List["paddle.Tensor"], +] # noqa + + +class TensorType(ExplicitEnum): + """ + Possible values for the `return_tensors` argument in [`PretrainedTokenizerBase.__call__`]. Useful for + tab-completion in an IDE. 
+ """ + + PADDLE = "pd" + NUMPY = "np" + + +class ChannelDimension(ExplicitEnum): + FIRST = "channels_first" + LAST = "channels_last" + + +def is_valid_image(img): + return ( + isinstance(img, PIL.Image.Image) + or isinstance(img, np.ndarray) + or is_paddle_tensor(img) + ) + + +def valid_images(imgs): + # If we have an list of images, make sure every image is valid + if isinstance(imgs, (list, tuple)): + for img in imgs: + if not valid_images(img): + return False + # If not a list of tuple, we have been given a single image or batched tensor of images + elif not is_valid_image(imgs): + return False + return True + + +def is_batched(img): + if isinstance(img, (list, tuple)): + return is_valid_image(img[0]) + return False + + +def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: + """ + Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1. + If the input is a batch of images, it is converted to a list of images. + Args: + images (`ImageInput`): + Image of images to turn into a list of images. + expected_ndims (`int`, *optional*, defaults to 3): + Expected number of dimensions for a single input image. If the input image has a different number of + dimensions, an error is raised. + """ + if is_batched(images): + return images + + # Either the input is a single image, in which case we create a list of length 1 + if isinstance(images, PIL.Image.Image): + # PIL images are never batched + return [images] + + if is_valid_image(images): + if images.ndim == expected_ndims + 1: + # Batch of images + images = list(images) + elif images.ndim == expected_ndims: + # Single image + images = [images] + else: + raise ValueError( + f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got" + f" {images.ndim} dimensions." + ) + return images + raise ValueError( + "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, paddle.Tensor " + f"but got {type(images)}." + ) + + +def to_numpy_array(img) -> np.ndarray: + if not is_valid_image(img): + raise ValueError(f"Invalid image type: {type(img)}") + + if isinstance(img, PIL.Image.Image): + return np.array(img) + return to_numpy(img) + + +def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: + """ + Infers the channel dimension format of `image`. + + Args: + image (`np.ndarray`): + The image to infer the channel dimension of. + + Returns: + The channel dimension of the image. + """ + if image.ndim == 3: + first_dim, last_dim = 0, 2 + elif image.ndim == 4: + first_dim, last_dim = 1, 3 + else: + raise ValueError(f"Unsupported number of image dimensions: {image.ndim}") + + if image.shape[first_dim] in (1, 3): + return ChannelDimension.FIRST + elif image.shape[last_dim] in (1, 3): + return ChannelDimension.LAST + raise ValueError("Unable to infer channel dimension format") + + +def get_channel_dimension_axis(image: np.ndarray) -> int: + """ + Returns the channel dimension axis of the image. + + Args: + image (`np.ndarray`): + The image to get the channel dimension axis of. + + Returns: + The channel dimension axis of the image. 
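+
+    Example (a minimal sketch with dummy arrays):
+
+        get_channel_dimension_axis(np.zeros((3, 224, 224)))   # 0, channels first
+        get_channel_dimension_axis(np.zeros((224, 224, 3)))   # 2, channels last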
+ """ + channel_dim = infer_channel_dimension_format(image) + if channel_dim == ChannelDimension.FIRST: + return image.ndim - 3 + elif channel_dim == ChannelDimension.LAST: + return image.ndim - 1 + raise ValueError(f"Unsupported data format: {channel_dim}") + + +def get_image_size( + image: np.ndarray, channel_dim: ChannelDimension = None +) -> Tuple[int, int]: + """ + Returns the (height, width) dimensions of the image. + + Args: + image (`np.ndarray`): + The image to get the dimensions of. + channel_dim (`ChannelDimension`, *optional*): + Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image. + + Returns: + A tuple of the image's height and width. + """ + if channel_dim is None: + channel_dim = infer_channel_dimension_format(image) + + if channel_dim == ChannelDimension.FIRST: + return image.shape[-2], image.shape[-1] + elif channel_dim == ChannelDimension.LAST: + return image.shape[-3], image.shape[-2] + else: + raise ValueError(f"Unsupported data format: {channel_dim}") + + +def is_valid_annotation_coco_detection( + annotation: Dict[str, Union[List, Tuple]] +) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "annotations" in annotation + and isinstance(annotation["annotations"], (list, tuple)) + and ( + # an image can have no annotations + len(annotation["annotations"]) == 0 + or isinstance(annotation["annotations"][0], dict) + ) + ): + return True + return False + + +def is_valid_annotation_coco_panoptic( + annotation: Dict[str, Union[List, Tuple]] +) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "segments_info" in annotation + and "file_name" in annotation + and isinstance(annotation["segments_info"], (list, tuple)) + and ( + # an image can have no segments + len(annotation["segments_info"]) == 0 + or isinstance(annotation["segments_info"][0], dict) + ) + ): + return True + return False + + +def valid_coco_detection_annotations( + annotations: Iterable[Dict[str, Union[List, Tuple]]] +) -> bool: + return all(is_valid_annotation_coco_detection(ann) for ann in annotations) + + +def valid_coco_panoptic_annotations( + annotations: Iterable[Dict[str, Union[List, Tuple]]] +) -> bool: + return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations) + + +def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": + """ + Loads `image` to a PIL Image. + + Args: + image (`str` or `PIL.Image.Image`): + The image to convert to the PIL Image format. + + Returns: + `PIL.Image.Image`: A PIL Image. + """ + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + image = PIL.Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + image = PIL.Image.open(image) + else: + raise ValueError( + f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" + ) + elif isinstance(image, PIL.Image.Image): + image = image + else: + raise ValueError( + "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." 
+ ) + image = PIL.ImageOps.exif_transpose(image) + image = image.convert("RGB") + return image diff --git a/paddlevlp/processors/processing_utils.py b/paddlevlp/processors/processing_utils.py new file mode 100644 index 00000000000000..b1bd5072598189 --- /dev/null +++ b/paddlevlp/processors/processing_utils.py @@ -0,0 +1,538 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import tempfile +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +import numpy as np +from huggingface_hub import (create_repo, get_hf_file_metadata, + hf_hub_download, hf_hub_url, + repo_type_and_id_from_hf_id, upload_folder) +from huggingface_hub.utils import EntryNotFoundError +from paddlenlp import __version__ +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding + +from paddlevlp.utils.downloader import (COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir) +from paddlevlp.utils.log import logger + +PROCESSOR_CONFIG_MAPPING = { + "image": "image_preprocessor_config.json", + "text": "text_preprocessor_config.json", +} + + +class BaseProcessingMixin(object): + """ + This is an base processor mixin used to provide saving/loading functionality for sequential and feature + extractors. + """ + + _auto_class = None + input_type = None + + def __init__(self, **kwargs): + """Set elements of `kwargs` as attributes.""" + # Pop "processor_class" as it should be saved as private attribute + self._processor_class = kwargs.pop("processor_class", None) + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ): + r""" + Instantiate a type of [`~processing_utils.BaseProcessingMixin`] from an processor. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained processor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a processor file saved using the + [`~processing_utils.BaseProcessingMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved processor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model processor should be cached if the + standard cache should not be used. 
+ force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the processor files and override the cached versions if + they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final processor object. If `True`, then this + functions returns a `Tuple(processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not processor attributes: i.e., the part of + `kwargs` which has not been used to update `processor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + Returns: + A processor of type [`~processing_utils.BaseProcessingMixin`]. + ```""" + processor_dict, kwargs = cls.get_processor_dict( + pretrained_model_name_or_path, **kwargs + ) + + return cls.from_dict(processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save an processor object to the directory `save_directory`, so that it can be re-loaded using the + [`~processing_utils.BaseProcessingMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the processor JSON file will be saved (will be created if it does not exist). + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError( + f"Provided path ({save_directory}) should be a directory, not a file" + ) + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_pretrained` + output_processor_file = os.path.join( + save_directory, PROCESSOR_CONFIG_MAPPING[self.input_type] + ) + + self.to_json_file(output_processor_file) + logger.info(f"processor saved in {output_processor_file}") + + return [output_processor_file] + + def save_to_hf_hub( + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all elements of this processor to a new HuggingFace Hub repository. 
+ Args: + repo_id (str): Repository name for your processor in the Hub. + private (bool, optional): Whether theprocessor is set to private + subfolder (str, optional): Push to a subfolder of the repo instead of the root + commit_message (str, optional) — The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub" + revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. + create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. + If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. + If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. + + Returns: The url of the commit of your model in the given repository. + """ + repo_url = create_repo(repo_id, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata( + hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision) + ) + has_readme = True + except EntryNotFoundError: + has_readme = False + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + # Add readme if does not exist + logger.info("README.md not found, adding the default README.md") + if not has_readme: + with open(os.path.join(root_dir, "README.md"), "w") as f: + f.write(f"---\nlibrary_name: paddlenlp\n---\n# {repo_id}") + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + return upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=root_dir, + commit_message=commit_message, + revision=revision, + create_pr=create_pr, + ) + + @classmethod + def get_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + processor of type [`~processor_utils.BaseProcessingMixin`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + from_hf_hub (bool, optional): whether to load from Huggingface Hub + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object. 
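+
+        Example (a minimal sketch; ``MyImageProcessor`` and the checkpoint name are placeholders):
+
+            processor_dict, unused_kwargs = MyImageProcessor.get_processor_dict(
+                "org/model-name", image_mean=[0.5, 0.5, 0.5]
+            )
+            # kwargs that are not consumed by the lookup are passed through in ``unused_kwargs``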
+ """ + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + subfolder = kwargs.pop("subfolder", None) + cache_dir = resolve_cache_dir( + pretrained_model_name_or_path, from_hf_hub, cache_dir + ) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + resolved_processor_file = os.path.join( + pretrained_model_name_or_path, PROCESSOR_CONFIG_MAPPING[cls.input_type] + ) + elif os.path.isfile(pretrained_model_name_or_path): + resolved_processor_file = pretrained_model_name_or_path + is_local = True + elif from_hf_hub: + processor_file = PROCESSOR_CONFIG_MAPPING[cls.input_type] + resolved_processor_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=processor_file, + cache_dir=cache_dir, + subfolder=subfolder, + library_name="PaddleNLP", + library_version=__version__, + ) + else: + # Assuming from community-contributed pretrained models + processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + PROCESSOR_CONFIG_MAPPING[cls.input_type], + ] + ) + try: + # Load from local folder or from cache or download from model Hub and cache + resolved_processor_file = get_path_from_url_with_filelock( + processor_file, cache_dir + ) + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to + # the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load" + " it from 'BOS', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a {PROCESSOR_CONFIG_MAPPING[cls.input_type]} file" + ) + + try: + # Load processor dict + with open(resolved_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file." + ) + + if is_local: + logger.info(f"loading configuration file {resolved_processor_file}") + else: + logger.info( + f"loading configuration file {processor_file} from cache at {resolved_processor_file}" + ) + + return processor_dict, kwargs + + @classmethod + def from_dict(cls, processor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~processing_utils.BaseProcessingMixin`] from a Python dictionary of parameters. + + Args: + processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~processing_utils.BaseProcessingMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the processor object. + + Returns: + [`~processing_utils.BaseProcessingMixin`]: The processor object instantiated from those + parameters. 
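+
+        Example (a minimal sketch; ``MyTextProcessor`` is a placeholder subclass):
+
+            config = processor.to_dict()
+            new_processor, unused = MyTextProcessor.from_dict(
+                config, prompt="a photo of", return_unused_kwargs=True
+            )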
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + processor = cls(**processor_dict) + + # Update processor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(processor, key): + setattr(processor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Processor {processor}") + if return_unused_kwargs: + return processor, kwargs + else: + return processor + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. + """ + output = copy.deepcopy(self.__dict__) + output["processor_type"] = self.__class__.__name__ + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]): + """ + Instantiates a processor of type [`~processing_utils.BaseProcessingMixin`] from the path to a JSON + file of parameters. + + Args: + json_file (`str` or `os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + A processor of type [`~processing_utils.BaseProcessingMixin`]: The processor object + instantiated from that JSON file. + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + processor_dict = json.loads(text) + return cls(**processor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this feature_extractor instance in JSON format. + """ + dictionary = self.to_dict() + + for key, value in dictionary.items(): + if isinstance(value, np.ndarray): + dictionary[key] = value.tolist() + + # make sure private name "_processor_class" is correctly + # saved as "processor_class" + _processor_class = dictionary.pop("_processor_class", None) + if _processor_class is not None: + dictionary["processor_class"] = _processor_class + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this processor instance's parameters will be saved. 
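+
+        Example (an illustrative sketch; `MyImageProcessor` is a hypothetical subclass of
+        `BaseImageProcessor` and the file name is a placeholder):
+
+        ```python
+        >>> processor = MyImageProcessor(do_flip=False)
+        >>> processor.to_json_file("my_processor_config.json")
+        >>> # the saved parameters can later be restored with `from_json_file`
+        >>> restored = MyImageProcessor.from_json_file("my_processor_config.json")
+        ```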
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string())
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+
+
+class BaseImageProcessor(BaseProcessingMixin):
+    input_type = "image"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, images, **kwargs) -> BatchEncoding:
+        """Preprocess an image or a batch of images."""
+        return self.preprocess(images, **kwargs)
+
+    def preprocess(self, images, **kwargs) -> BatchEncoding:
+        raise NotImplementedError(
+            "Each image processor must implement its own preprocess method"
+        )
+
+
+class BaseTextProcessor(BaseProcessingMixin):
+    input_type = "text"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, text, **kwargs) -> BatchEncoding:
+        """Preprocess text or a batch of texts."""
+        return self.preprocess(text, **kwargs)
+
+    def preprocess(self, text, **kwargs) -> BatchEncoding:
+        raise NotImplementedError(
+            "Each text processor must implement its own preprocess method"
+        )
+
+
+VALID_SIZE_DICT_KEYS = (
+    {"height", "width"},
+    {"shortest_edge"},
+    {"shortest_edge", "longest_edge"},
+)
+
+
+def is_valid_size_dict(size_dict):
+    if not isinstance(size_dict, dict):
+        return False
+
+    size_dict_keys = set(size_dict.keys())
+    for allowed_keys in VALID_SIZE_DICT_KEYS:
+        if size_dict_keys == allowed_keys:
+            return True
+    return False
+
+
+def convert_to_size_dict(
+    size,
+    max_size: Optional[int] = None,
+    default_to_square: bool = True,
+    height_width_order: bool = True,
+):
+    # By default, if size is an int we assume it represents a tuple of (size, size).
+    if isinstance(size, int) and default_to_square:
+        if max_size is not None:
+            raise ValueError(
+                "Cannot specify both size as an int with default_to_square=True and max_size"
+            )
+        return {"height": size, "width": size}
+    # In other configs, if size is an int and default_to_square is False, size represents the length of
+    # the shortest edge after resizing.
+    elif isinstance(size, int) and not default_to_square:
+        size_dict = {"shortest_edge": size}
+        if max_size is not None:
+            size_dict["longest_edge"] = max_size
+        return size_dict
+    # Otherwise, if size is a tuple it's either (height, width) or (width, height)
+    elif isinstance(size, (tuple, list)) and height_width_order:
+        return {"height": size[0], "width": size[1]}
+    elif isinstance(size, (tuple, list)) and not height_width_order:
+        return {"height": size[1], "width": size[0]}
+
+    raise ValueError(f"Could not convert size input to size dict: {size}")
+
+
+def get_size_dict(
+    size: Union[int, Iterable[int], Dict[str, int]] = None,
+    max_size: Optional[int] = None,
+    height_width_order: bool = True,
+    default_to_square: bool = True,
+    param_name="size",
+) -> dict:
+    """
+    Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards
+    compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height,
+    width) or (width, height) format.
+
+    - If `size` is a tuple, it is converted to `{"height": size[0], "width": size[1]}` or `{"height": size[1], "width":
+      size[0]}` if `height_width_order` is `False`.
+    - If `size` is an int, and `default_to_square` is `True`, it is converted to `{"height": size, "width": size}`.
+    - If `size` is an int and `default_to_square` is False, it is converted to `{"shortest_edge": size}`. If `max_size`
+      is set, it is added to the dict as `{"longest_edge": max_size}`.
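+
+    For example (illustrative values):
+
+    ```python
+    >>> get_size_dict(224)
+    {'height': 224, 'width': 224}
+    >>> get_size_dict(224, max_size=256, default_to_square=False)
+    {'shortest_edge': 224, 'longest_edge': 256}
+    >>> get_size_dict((480, 640))
+    {'height': 480, 'width': 640}
+    ```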
+ + Args: + size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*): + The `size` parameter to be cast into a size dictionary. + max_size (`Optional[int]`, *optional*): + The `max_size` parameter to be cast into a size dictionary. + height_width_order (`bool`, *optional*, defaults to `True`): + If `size` is a tuple, whether it's in (height, width) or (width, height) order. + default_to_square (`bool`, *optional*, defaults to `True`): + If `size` is an int, whether to default to a square image or not. + """ + if not isinstance(size, dict): + size_dict = convert_to_size_dict( + size, max_size, default_to_square, height_width_order + ) + logger.info( + f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." + f" Converted to {size_dict}.", + ) + else: + size_dict = size + + if not is_valid_size_dict(size_dict): + raise ValueError( + f"{param_name} must have one of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size_dict.keys()}" + ) + return size_dict diff --git a/paddlevlp/processors/utils.py b/paddlevlp/processors/utils.py new file mode 100644 index 00000000000000..34dd36fe33fea3 --- /dev/null +++ b/paddlevlp/processors/utils.py @@ -0,0 +1,27 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) diff --git a/paddlevlp/trainer/__init__.py b/paddlevlp/trainer/__init__.py new file mode 100644 index 00000000000000..bcdf4663fb4b70 --- /dev/null +++ b/paddlevlp/trainer/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .trainer import * diff --git a/paddlevlp/trainer/trainer.py b/paddlevlp/trainer/trainer.py new file mode 100644 index 00000000000000..ea566a6e5b12ef --- /dev/null +++ b/paddlevlp/trainer/trainer.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlenlp.trainer.trainer import Trainer diff --git a/paddlevlp/utils/__init__.py b/paddlevlp/utils/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/utils/downloader.py b/paddlevlp/utils/downloader.py new file mode 100644 index 00000000000000..3944b318ba7332 --- /dev/null +++ b/paddlevlp/utils/downloader.py @@ -0,0 +1,492 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import json +import os +import os.path as osp +import shutil +import tarfile +import threading +import time +import uuid +import zipfile +from typing import Optional, Union + +import requests +from filelock import FileLock +from huggingface_hub import get_hf_file_metadata, hf_hub_url +from huggingface_hub.utils import EntryNotFoundError +from tqdm.auto import tqdm + +from .env import (DOWNLOAD_SERVER, FAILED_STATUS, HF_CACHE_HOME, MODEL_HOME, + SUCCESS_STATUS) +from .log import logger + +__all__ = ["get_weights_path_from_url", "resolve_cache_dir"] + + +COMMUNITY_MODEL_PREFIX = os.getenv( + "COMMUNITY_MODEL_PREFIX", "https://bj.bcebos.com/paddlenlp/models/community" +) +WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") +DOWNLOAD_RETRY_LIMIT = 3 +DOWNLOAD_CHECK = False + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith("http://") or path.startswith("https://") + + +def get_weights_path_from_url(url, md5sum=None): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. + Args: + url (str): download url + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded weights. + Examples: + .. 
code-block:: python + from paddle.utils.download import get_weights_path_from_url + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): + """Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + fullpath = _download(url, root_dir, md5sum) + + if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath): + fullpath = _decompress(fullpath) + + # model tokenizer config, [file-lock] + return fullpath + + +def get_path_from_url_with_filelock( + url: str, + root_dir: str, + md5sum: Optional[str] = None, + check_exist: bool = True, + timeout: float = -1, +) -> str: + """construct `get_path_from_url` for `model_utils` to enable downloading multiprocess-safe + + Args: + url (str): the url of resource file + root_dir (str): the local download path + md5sum (str, optional): md5sum string for file. Defaults to None. + check_exist (bool, optional): whether check the file is exist. Defaults to True. + timeout (int, optional): the timeout for downloading. Defaults to -1. + + Returns: + str: the path of downloaded file + """ + + os.makedirs(root_dir, exist_ok=True) + + # create lock file, which is empty, under the `LOCK_FILE_HOME` directory. + lock_file_name = hashlib.md5((url + root_dir).encode("utf-8")).hexdigest() + + # create `.lock` private directory in the cache dir + lock_file_path = os.path.join(root_dir, ".lock", lock_file_name) + + os.makedirs(os.path.dirname(lock_file_path), exist_ok=True) + + with FileLock(lock_file_path, timeout=timeout): + result = get_path_from_url( + url=url, root_dir=root_dir, md5sum=md5sum, check_exist=check_exist + ) + return result + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + url (str): download url + path (str): download to given path + """ + os.makedirs(path, exist_ok=True) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError( + "Download from {} failed. 
" "Retry limit reached".format(url) + ) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError( + "Downloading from {} failed with code " + "{}!".format(url, req.status_code) + ) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get("content-length") + with open(tmp_fullname, "wb") as f: + if total_size: + with tqdm( + total=int(total_size), unit="B", unit_scale=True, unit_divisor=1024 + ) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(len(chunk)) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info( + "File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum) + ) + return False + return True + + +def _md5(text): + """ + Calculate the md5 value of the input text. + """ + + md5code = hashlib.md5(text.encode()) + return md5code.hexdigest() + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. 
+ + if tarfile.is_tarfile(fname): + uncompressed_path = _uncompress_file_tar(fname) + elif zipfile.is_zipfile(fname): + uncompressed_path = _uncompress_file_zip(fname) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + return uncompressed_path + + +def _uncompress_file_zip(filepath): + files = zipfile.ZipFile(filepath, "r") + file_list = files.namelist() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _uncompress_file_tar(filepath, mode="r:*"): + files = tarfile.open(filepath, mode) + file_list = files.getnames() + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + files.extractall(file_dir, files.getmembers()) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + files.extractall(file_dir, files.getmembers()) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + + files.extractall(os.path.join(file_dir, rootpath), files.getmembers()) + + files.close() + + return uncompressed_path + + +def _is_a_single_file(file_list): + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + return False + + +def _is_a_single_dir(file_list): + new_file_list = [] + for file_path in file_list: + if "/" in file_path: + file_path = file_path.replace("/", os.sep) + elif "\\" in file_path: + file_path = file_path.replace("\\", os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True + + +class DownloaderCheck(threading.Thread): + """ + Check the resource applicability when downloading the models. 
+ """ + + def __init__(self, task, command="taskflow", addition=None): + threading.Thread.__init__(self) + self.command = command + self.task = task + self.addition = addition + self._initialize() + + def uri_path(self, server_url, api): + srv = server_url + if server_url.endswith("/"): + srv = server_url[:-1] + if api.startswith("/"): + srv += api + else: + api = "/" + api + srv += api + return srv + + def _initialize(self): + etime = str(int(time.time())) + self.full_hash_flag = _md5(str(uuid.uuid1())[-12:]) + self.hash_flag = _md5(str(uuid.uuid1())[9:18]) + "-" + etime + + def request_check(self, task, command, addition): + if task is None: + return SUCCESS_STATUS + payload = {"word": self.task} + api_url = self.uri_path(DOWNLOAD_SERVER, "stat") + cache_path = os.path.join("~") + if os.path.exists(cache_path): + extra = { + "command": self.command, + "mtime": os.stat(cache_path).st_mtime, + "hub_name": self.hash_flag, + "cache_info": self.full_hash_flag, + } + else: + extra = { + "command": self.command, + "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), + "hub_name": self.hash_flag, + "cache_info": self.full_hash_flag, + } + if addition is not None: + extra.update({"addition": addition}) + try: + import paddle + import paddlenlp + + payload["hub_version"] = " " + payload["ppnlp_version"] = paddlenlp.__version__ + payload["paddle_version"] = paddle.__version__.split("-")[0] + payload["from"] = "ppnlp" + payload["extra"] = json.dumps(extra) + r = requests.get(api_url, payload, timeout=1).json() + if r.get("update_cache", 0) == 1: + return SUCCESS_STATUS + else: + return FAILED_STATUS + except Exception: + return FAILED_STATUS + + def run(self): + self.request_check(self.task, self.command, self.addition) + + +def download_check(model_id, model_class, addition=None): + logger.disable() + global DOWNLOAD_CHECK + if not DOWNLOAD_CHECK: + DOWNLOAD_CHECK = True + checker = DownloaderCheck(model_id, model_class, addition) + checker.start() + checker.join() + logger.enable() + + +def url_file_exists(url: str) -> bool: + """check whether the url file exists + + refer to: https://stackoverflow.com/questions/2486145/python-check-if-url-to-jpg-exists + + Args: + url (str): the url of target file + + Returns: + bool: whether the url file exists + """ + if not is_url(url): + return False + + result = requests.head(url) + return result.status_code == requests.codes.ok + + +def hf_file_exists( + repo_id: str, + filename: str, + token: Union[bool, str, None] = None, + subfolder: Optional[str] = None, +) -> bool: + """Check whether the HF file exists + + Args: + repo_id (`str`): A namespace (user or an organization) name and a repo name separated by a `/`. + filename (`str`): The name of the file in the repo. + token (`str` or `bool`, *optional*): A token to be used for the download. + - If `True`, the token is read from the HuggingFace config folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + subfolder (str, optional) An optional value corresponding to a folder inside the repo. 
+ Returns: + bool: whether the HF file exists + """ + + url = hf_hub_url(repo_id=repo_id, filename=filename, subfolder=subfolder) + try: + _ = get_hf_file_metadata( + url=url, + token=token, + ) + return True + except EntryNotFoundError: + return False + + +def resolve_cache_dir( + pretrained_model_name_or_path: str, + from_hf_hub: bool, + cache_dir: Optional[str] = None, +) -> str: + """resolve cache dir for PretrainedModel and PretrainedConfig + + Args: + pretrained_model_name_or_path (str): the name or path of pretrained model + from_hf_hub (bool): if load from huggingface hub + cache_dir (str): cache_dir for models + """ + if os.path.isdir(pretrained_model_name_or_path): + return pretrained_model_name_or_path + + # hf hub library takes care of appending the model name so we don't append the model name + if from_hf_hub: + if cache_dir is not None: + return cache_dir + else: + return HF_CACHE_HOME + else: + if cache_dir is not None: + # since model_clas.from_pretrained calls config_clas.from_pretrained, the model_name may get appended twice + if cache_dir.endswith(pretrained_model_name_or_path): + return cache_dir + else: + return os.path.join(cache_dir, pretrained_model_name_or_path) + return os.path.join(MODEL_HOME, pretrained_model_name_or_path) diff --git a/paddlevlp/utils/env.py b/paddlevlp/utils/env.py new file mode 100644 index 00000000000000..e2ecda491afbe3 --- /dev/null +++ b/paddlevlp/utils/env.py @@ -0,0 +1,84 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module is used to store environmental variables in PaddleMIX. +PPMIX_HOME --> the root directory for storing PaddleMIX related data. Default to ~/.paddlemix. Users can change the +├ default value through the PPMIX_HOME environment variable. +├─ MODEL_HOME --> Store model files. +└─ DATA_HOME --> Store automatically downloaded datasets. 
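+
+For example (an illustrative sketch; the path is hypothetical, and PPMIX_HOME must be set
+before this module is first imported):
+
+    >>> import os
+    >>> os.environ["PPMIX_HOME"] = "/data/paddlemix_cache"
+    >>> from paddlevlp.utils.env import DATA_HOME, MODEL_HOME
+    >>> MODEL_HOME
+    '/data/paddlemix_cache/models'
+    >>> DATA_HOME
+    '/data/paddlemix_cache/datasets'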
+""" +import os + + +def _get_user_home(): + return os.path.expanduser("~") + + +def _get_ppmix_home(): + if "PPMIX_HOME" in os.environ: + home_path = os.environ["PPMIX_HOME"] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError( + "The environment variable PPMIX_HOME {} is not a directory.".format( + home_path + ) + ) + else: + return home_path + return os.path.join(_get_user_home(), ".paddlemix") + + +def _get_sub_home(directory, parent_home=_get_ppmix_home()): + home = os.path.join(parent_home, directory) + if not os.path.exists(home): + os.makedirs(home, exist_ok=True) + return home + + +def _get_bool_env(env_key: str, default_value: str) -> bool: + """get boolean environment variable, which can be "true", "True", "1" + + Args: + env_key (str): key of env variable + """ + value = os.getenv(env_key, default_value).lower() + return value in ["true", "1"] + + +USER_HOME = _get_user_home() +PPMIX_HOME = _get_ppmix_home() +MODEL_HOME = _get_sub_home("models") +HF_CACHE_HOME = os.environ.get("HUGGINGFACE_HUB_CACHE", MODEL_HOME) +DATA_HOME = _get_sub_home("datasets") +PACKAGE_HOME = _get_sub_home("packages") +DOWNLOAD_SERVER = "http://paddlepaddle.org.cn/paddlehub" +FAILED_STATUS = -1 +SUCCESS_STATUS = 0 + +LEGACY_CONFIG_NAME = "model_config.json" +CONFIG_NAME = "config.json" +TOKENIZER_CONFIG_NAME = "tokenizer_config.json" +PYTORCH_WEIGHT_FILE_NAME = "pytorch_model.bin" +PADDLE_WEIGHT_FILE_NAME = "model_state.pdparams" +LORA_CONFIG_NAME = "lora_config.json" +PREFIX_CONFIG_NAME = "prefix_config.json" +LORA_WEIGHT_FILE_NAME = "lora_model_state.pdparams" +PREFIX_WEIGHT_FILE_NAME = "prefix_model_state.pdparams" +PAST_KEY_VALUES_FILE_NAME = "pre_caches.npy" + +# for conversion +ENABLE_TORCH_CHECKPOINT = _get_bool_env("ENABLE_TORCH_CHECKPOINT", "true") diff --git a/paddlevlp/utils/log.py b/paddlevlp/utils/log.py new file mode 100644 index 00000000000000..78d2d824b99a14 --- /dev/null +++ b/paddlevlp/utils/log.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
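+
+# A brief usage sketch for the `logger` singleton defined at the bottom of this file
+# (illustrative only; `load_weights` is a placeholder for any slow call):
+#
+#     from paddlevlp.utils.log import logger
+#
+#     logger.set_level("INFO")
+#     logger.info("loading pretrained weights")
+#     logger.train("epoch 1, step 10, loss 2.31")  # TRAIN is a custom level registered below
+#
+#     with logger.processing("Converting weights"):  # prints a rotating spinner while running
+#         load_weights()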
+ +import contextlib +import functools +import logging +import threading +import time + +import colorlog + +loggers = {} + +log_config = { + "DEBUG": {"level": 10, "color": "purple"}, + "INFO": {"level": 20, "color": "green"}, + "TRAIN": {"level": 21, "color": "cyan"}, + "EVAL": {"level": 22, "color": "blue"}, + "WARNING": {"level": 30, "color": "yellow"}, + "ERROR": {"level": 40, "color": "red"}, + "CRITICAL": {"level": 50, "color": "bold_red"}, +} + + +class Logger(object): + """ + Deafult logger in PaddleNLP + + Args: + name(str) : Logger name, default is 'PaddleNLP' + """ + + def __init__(self, name: str = None): + name = "PaddleNLP" if not name else name + self.logger = logging.getLogger(name) + + for key, conf in log_config.items(): + logging.addLevelName(conf["level"], key) + self.__dict__[key] = functools.partial(self.__call__, conf["level"]) + self.__dict__[key.lower()] = functools.partial(self.__call__, conf["level"]) + + self.format = colorlog.ColoredFormatter( + "%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s", + log_colors={key: conf["color"] for key, conf in log_config.items()}, + ) + + self.handler = logging.StreamHandler() + self.handler.setFormatter(self.format) + + self.logger.addHandler(self.handler) + self.logLevel = "DEBUG" + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + self._is_enable = True + + def disable(self): + self._is_enable = False + + def enable(self): + self._is_enable = True + + def set_level(self, log_level: str): + assert ( + log_level in log_config + ), f"Invalid log level. Choose among {log_config.keys()}" + self.logger.setLevel(log_level) + + @property + def is_enable(self) -> bool: + return self._is_enable + + def __call__(self, log_level: str, msg: str): + if not self.is_enable: + return + + self.logger.log(log_level, msg) + + @contextlib.contextmanager + def use_terminator(self, terminator: str): + old_terminator = self.handler.terminator + self.handler.terminator = terminator + yield + self.handler.terminator = old_terminator + + @contextlib.contextmanager + def processing(self, msg: str, interval: float = 0.1): + """ + Continuously print a progress bar with rotating special effects. + + Args: + msg(str): Message to be printed. + interval(float): Rotation interval. Default to 0.1. + """ + end = False + + def _printer(): + index = 0 + flags = ["\\", "|", "/", "-"] + while not end: + flag = flags[index % len(flags)] + with self.use_terminator("\r"): + self.info("{}: {}".format(msg, flag)) + time.sleep(interval) + index += 1 + + t = threading.Thread(target=_printer) + t.start() + yield + end = True + + +logger = Logger() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000000000..789aec7d1ae15e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy +paddlenlp \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000000000..ceef21be8dcbb2 --- /dev/null +++ b/setup.py @@ -0,0 +1,73 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from setuptools import find_packages, setup + +description = "PaddleMIX" + +with open("requirements.txt") as fin: + REQUIRED_PACKAGES = fin.read() + + +def read(file: str): + current_dir = os.path.dirname(__file__) + path = os.path.join(current_dir, file) + with open(path, "r", encoding="utf-8") as f: + content = f.read().strip() + return content + + +def read_version(): + """read version of paddlemix""" + return read("VERSION") + + +def read_readme(): + return read("README.md") + + +def read_requirements(): + content = read("requirements.txt") + packages = content.split("\n") + return packages + + +setup( + name="paddlemix", + packages=find_packages(), + version=read_version(), + author="PaddleMIX Team", + author_email="paddlemix@baidu.com", + description=description, + long_description=read_readme(), + long_description_content_type="text/markdown", + url="", + keywords=["paddle", "paddlemix"], + install_requires=REQUIRED_PACKAGES, + python_requires=">=3.6", + entry_points={ + "console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"] + }, + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + license="Apache 2.0", +) From 46176c431fa61a3f23e24c22b8e17cb38922a41e Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Thu, 29 Jun 2023 05:37:37 +0000 Subject: [PATCH 03/10] add MiniGPT4 --- paddlevlp/models/minigpt4/__init__.py | 13 + paddlevlp/models/minigpt4/configuration.py | 348 ++++ paddlevlp/models/minigpt4/modeling.py | 1775 +++++++++++++++++ paddlevlp/processors/__init__.py | 2 + .../processors/minigpt4_image_processing.py | 284 +++ paddlevlp/processors/minigpt4_processing.py | 245 +++ 6 files changed, 2667 insertions(+) create mode 100644 paddlevlp/models/minigpt4/__init__.py create mode 100644 paddlevlp/models/minigpt4/configuration.py create mode 100644 paddlevlp/models/minigpt4/modeling.py create mode 100644 paddlevlp/processors/minigpt4_image_processing.py create mode 100644 paddlevlp/processors/minigpt4_processing.py diff --git a/paddlevlp/models/minigpt4/__init__.py b/paddlevlp/models/minigpt4/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/models/minigpt4/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/models/minigpt4/configuration.py b/paddlevlp/models/minigpt4/configuration.py new file mode 100644 index 00000000000000..4f9a5ec08b782f --- /dev/null +++ b/paddlevlp/models/minigpt4/configuration.py @@ -0,0 +1,348 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" MiniGPT4 model configuration """ +import copy +import os +from typing import Union + +from paddlenlp.utils.log import logger +from paddlenlp.transformers.auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.llama.configuration import LlamaConfig + +__all__ = ["MiniGPT4VisionConfig", "MiniGPT4QFormerConfig", "MiniGPT4Config"] + + +class MiniGPT4VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniGPT4VisionModel`]. It is used to instantiate a + MiniGPT4 vision encoder according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import MiniGPT4VisionConfig, MiniGPT4VisionModel + >>> # Initializing a MiniGPT4VisionConfig + >>> configuration = MiniGPT4VisionConfig() + >>> # Initializing a MiniGPT4VisionModel (with random weights) from the configuration above. 
+ >>> model = MiniGPT4VisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mimigpt4_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + # get the vision config dict if we are loading from MiniGPT4Config + if config_dict.get("model_type") == "minigpt4": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MiniGPT4QFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniGPT4QFormerModel`]. It is used to instantiate a + MiniGPT4 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`MiniGPT4QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. + Examples: + ```python + >>> from paddlenlp.transformers import MiniGPT4QFormerConfig, MiniGPT4QFormerModel + >>> # Initializing a MiniGPT4 configuration + >>> configuration = MiniGPT4QFormerConfig() + >>> # Initializing a model (with random weights) from the configuration above + >>> model = MiniGPT4QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "minigpt4_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = 
cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from MiniGPT4Config + if config_dict.get("model_type") == "minigpt4": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MiniGPT4Config(PretrainedConfig): + r""" + [`MiniGPT4Config`] is the configuration class to store the configuration of a [`MiniGPT4ForConditionalGeneration`]. It is + used to instantiate a MiniGPT4 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MiniGPT4VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MiniGPT4QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... MiniGPT4VisionConfig, + ... MiniGPT4QFormerConfig, + ... LlamaConfig, + ... MiniGPT4Config, + ... MiniGPT4ForConditionalGeneration, + ... ) + >>> # Initializing a MiniGPT4Config configuration + >>> configuration = MiniGPT4Config() + >>> # Initializing a MiniGPT4ForConditionalGeneration (with random weights) from the configuration above + >>> model = MiniGPT4ForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a MiniGPT4Config from a MiniGPT4VisionConfig, MiniGPT4QFormerConfig and any PretrainedConfig + >>> # Initializing MiniGPT4 vision, MiniGPT4 Q-Former and language model configurations + >>> vision_config = MiniGPT4VisionConfig() + >>> qformer_config = MiniGPT4QFormerConfig() + >>> text_config = LlamaConfig() + >>> config = MiniGPT4Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "minigpt4" + is_composition = True + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the MiniGPT4VisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the MiniGPT4QFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`LlamaConfig`).") + self.vision_config = MiniGPT4VisionConfig(**vision_config) + self.qformer_config = MiniGPT4QFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "llama" + + if text_model_type == "llama": + self.text_config = LlamaConfig(**text_config) + else: + raise ValueError("Only llama accepted for model_type, but accepted {}.".format(text_model_type)) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: MiniGPT4VisionConfig, + qformer_config: MiniGPT4QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`MiniGPT4Config`] (or a derived class) from a vision model, Q-Former and language model + configurations. + Returns: + [`MiniGPT4`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/paddlevlp/models/minigpt4/modeling.py b/paddlevlp/models/minigpt4/modeling.py new file mode 100644 index 00000000000000..4239675bb7aaab --- /dev/null +++ b/paddlevlp/models/minigpt4/modeling.py @@ -0,0 +1,1775 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
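+
+# A minimal sketch of how the configuration classes from `configuration.py` compose
+# (illustrative only; the values noted are the documented defaults):
+#
+#     from paddlenlp.transformers.llama.configuration import LlamaConfig
+#     from paddlevlp.models.minigpt4.configuration import (
+#         MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig)
+#
+#     vision_config = MiniGPT4VisionConfig()    # hidden_size=1408, 39 layers, patch_size=14
+#     qformer_config = MiniGPT4QFormerConfig()  # hidden_size=768, 12 layers
+#     text_config = LlamaConfig()
+#     config = MiniGPT4Config.from_vision_qformer_text_configs(
+#         vision_config, qformer_config, text_config, num_query_tokens=32)
+#     # MiniGPT4Config ties qformer_config.encoder_hidden_size to vision_config.hidden_size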
+ +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.ops import transfer_param +from paddlenlp.utils.log import logger + +from paddlenlp.utils.initializer import normal_, ones_, zeros_ +from paddlenlp.transformers.activations import ACT2FN +from paddlenlp.transformers.llama.modeling import LlamaForCausalLM +from paddlenlp.transformers.model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from paddlenlp.transformers.model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) + +MiniGPT4_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +from .configuration import MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig + +__all__ = [ + "MiniGPT4Model", + "MiniGPT4PretrainedModel", + "MiniGPT4QFormerModel", + "MiniGPT4VisionModel", + "MiniGPT4ForConditionalGeneration", +] + + +def Parameter(tensor): + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +def convert_weights_to_dtype(model, dtype: str): + # trying to convert model dtype if necessary + if dtype not in ["float16", "float32", "float64"]: + raise ValueError("Not supported dtype: {}., only [float16, float32, float64] supported.".format(dtype)) + dtype_mapping = { + "float16": paddle.float16, + "float32": paddle.float32, + "float64": paddle.float64, + } + + def convert_for_vit(layer): + if isinstance(layer, (nn.Linear, nn.Conv1D, nn.Conv2D)): + if layer.weight.dtype != dtype_mapping[dtype]: + layer.weight = transfer_param(layer.weight, restore_data=True, dtype=dtype) + if layer.bias is not None and layer.bias.dtype != dtype_mapping[dtype]: + layer.bias = transfer_param(layer.bias, restore_data=True, dtype=dtype) + + if isinstance(model, MiniGPT4VisionModel): + model.apply(convert_for_vit) + elif isinstance(model, (MiniGPT4QFormerModel, LlamaForCausalLM)): + model.to(dtype=dtype) + else: + raise TypeError("Not support model type: {}.".format(type(model))) + + +@dataclass +class MiniGPT4ForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`MiniGPT4ForConditionalGeneration`]. + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. 
+ """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class MiniGPT4PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MiniGPT4Config + base_model_prefix = "minigpt4" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, MiniGPT4VisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MiniGPT4Encoder): + module.gradient_checkpointing = value + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path, from_hf_hub: bool = False, subfolder: str = None, *args, **kwargs + ): + vit_dtype = kwargs.pop("vit_dtype", "float16") + qformer_dtype = kwargs.pop("qformer_dtype", "float32") + llama_dtype = kwargs.pop("llama_dtype", "float16") + + model = super().from_pretrained( + pretrained_model_name_or_path, from_hf_hub=from_hf_hub, subfolder=subfolder, *args, **kwargs + ) + + logger.info("Trying to convert dtype for MiniGPT4 model, it may take a while.") + if isinstance(model, (MiniGPT4Model, MiniGPT4ForConditionalGeneration)): + convert_weights_to_dtype(model.vision_model, dtype=vit_dtype) + convert_weights_to_dtype(model.qformer, dtype=qformer_dtype) + convert_weights_to_dtype(model.language_model, dtype=llama_dtype) + elif isinstance(model, MiniGPT4VisionModel): + convert_weights_to_dtype(model, dtype=vit_dtype) + elif isinstance(model, MiniGPT4QFormerModel): + convert_weights_to_dtype(model, dtype=qformer_dtype) + elif isinstance(model, LlamaForCausalLM): + convert_weights_to_dtype(model, dtype=llama_dtype) + else: + raise TypeError("Not supported model type: {}.".format(type(model))) + + return model + + +class MiniGPT4VisionEmbeddings(nn.Layer): + def __init__(self, config: MiniGPT4VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = Parameter(paddle.randn([1, 1, self.embed_dim])) + + self.patch_embedding = nn.Conv2D( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + 
self.num_positions = self.num_patches + 1 + + self.position_embedding = Parameter(paddle.randn([1, self.num_positions, self.embed_dim])) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds_shape = paddle.shape(patch_embeds) + patch_embeds = paddle.reshape( + patch_embeds, shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1] + ).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +class MiniGPT4Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter(paddle.zeros([self.embed_dim])) + v_bias = Parameter(paddle.zeros([self.embed_dim])) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
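+        # Shape walk-through (descriptive note): with batch size B, sequence length T, heads H and head_dim D,
+        # query/key/value_states are [B, H, T, D] and attention_scores/attention_probs are [B, H, T, T]; the
+        # context below is transposed back to [B, T, H, D] and merged into [B, T, H * D] before the projection.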
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +class MiniGPT4MLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class MiniGPT4EncoderLayer(nn.Layer): + def __init__(self, config: MiniGPT4Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = MiniGPT4Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = MiniGPT4MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class MiniGPT4Encoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`MiniGPT4EncoderLayer`]. + Args: + config (`MiniGPT4Config`): + The corresponding vision configuration for the `MiniGPT4Encoder`. 
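+
+    Example (an illustrative sketch; `config` is assumed to be the vision configuration used by
+    `MiniGPT4VisionModel`, with `hidden_size` and `num_hidden_layers` populated):
+
+    ```python
+    >>> encoder = MiniGPT4Encoder(config)
+    >>> inputs_embeds = paddle.randn([1, 257, config.hidden_size])  # e.g. 256 patches + 1 class token
+    >>> hidden = encoder(inputs_embeds=inputs_embeds, return_dict=True).last_hidden_state
+    ```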
+ """ + + def __init__(self, config: MiniGPT4Config): + super().__init__() + self.config = config + self.layers = nn.LayerList([MiniGPT4EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class MiniGPT4VisionModel(MiniGPT4PretrainedModel): + main_input_name = "pixel_values" + config_class = MiniGPT4VisionConfig + + def __init__(self, config: MiniGPT4VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = MiniGPT4VisionEmbeddings(config) + self.encoder = MiniGPT4Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) 
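+
+    # Minimal usage sketch (illustrative; assumes a populated `MiniGPT4VisionConfig` named `vision_config`):
+    #   vision_model = MiniGPT4VisionModel(vision_config)
+    #   pixel_values = paddle.randn([1, 3, vision_config.image_size, vision_config.image_size])
+    #   outputs = vision_model(pixel_values=pixel_values, return_dict=True)
+    #   patch_states = outputs.last_hidden_state  # [1, num_patches + 1, hidden_size]
+    #   pooled = outputs.pooler_output            # [1, hidden_size], taken from the class-token position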
+ + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class MiniGPT4QFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + 
encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
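+        # Caching note (descriptive): when `past_key_value` is provided, `key_layer` and `value_layer` above already
+        # include the cached states concatenated along the sequence axis, so `attention_probs` has shape
+        # [batch, num_heads, query_len, cached_len + query_len].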
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class MiniGPT4QFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MiniGPT4QFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = MiniGPT4QFormerMultiHeadAttention(config, is_cross_attention) + self.output = MiniGPT4QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class MiniGPT4QFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class MiniGPT4QFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MiniGPT4QFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = MiniGPT4QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = MiniGPT4QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = MiniGPT4QFormerIntermediate(config) + self.output_query = MiniGPT4QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = 
self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class MiniGPT4QFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [MiniGPT4QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class MiniGPT4QFormerModel(MiniGPT4PretrainedModel): + """ + Querying Transformer (Q-Former), used in MiniGPT4. + """ + + def __init__(self, config: MiniGPT4QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = MiniGPT4QFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.layernorm.weight.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.layernorm.weight.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. 
+ is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
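+
+        Example (an illustrative sketch; `qformer`, `query_tokens` and `image_embeds` are assumed to already
+        exist, e.g. as they are built inside `MiniGPT4Model`):
+        ```python
+        >>> image_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+        >>> out = qformer(
+        ...     query_embeds=query_tokens.expand([image_embeds.shape[0], -1, -1]),
+        ...     encoder_hidden_states=image_embeds,
+        ...     encoder_attention_mask=image_mask,
+        ...     return_dict=True,
+        ... )
+        >>> query_states = out.last_hidden_state  # [batch, num_query_tokens, qformer_hidden_size]
+        ```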
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm(query_embeds.cast(self.layernorm.weight.dtype)) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class MiniGPT4Model(MiniGPT4PretrainedModel): + config_class = MiniGPT4Config + 
main_input_name = "pixel_values" + + def __init__(self, config: MiniGPT4Config): + super().__init__(config) + + self.vision_model = MiniGPT4VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = MiniGPT4QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = LlamaForCausalLM(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import LlamaTokenizer, MiniGPT4Model + >>> tokenizer = LlamaTokenizer.from_pretrained("model_name") + >>> tokenizer.pad_token = tokenizer.eos_token + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> model.eval() + >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, MiniGPT4Model + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return vision_outputs + + def get_qformer_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, MiniGPT4Model + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> qformer_outputs = model.get_qformer_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + 
encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + return query_outputs + + def forward( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4Model + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + 
return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return MiniGPT4ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class MiniGPT4ForConditionalGeneration(MiniGPT4PretrainedModel): + config_class = MiniGPT4Config + main_input_name = "pixel_values" + + def __init__(self, config: MiniGPT4Config): + super().__init__(config) + self.config = config + self.vision_model = MiniGPT4VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = MiniGPT4QFormerModel(config.qformer_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = LlamaForCausalLM(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + r""" + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, 
self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return MiniGPT4ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + first_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The first input prompt before the tag ``, it's embeddings will concat with image embeddings and the embeddings of the second_input_ids for the generation. + second_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The second input prompt after the tag ``, it's embeddings will concat with image embeddings and the embeddings of the first_input_ids for the generation. 
+            first_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The attention mask corresponding to the first_input_ids, used to avoid performing attention on padding token indices.
+            second_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The attention mask corresponding to the second_input_ids, used to avoid performing attention on padding token indices.
+        Returns:
+            outputs (`tuple`): The generation outputs of the underlying language model, typically the generated
+                token ids together with their scores, which can be decoded with the processor.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> import paddle
+        >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")
+        >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "describe this image"
+        >>> prompt = "###Human: ###Assistant:"
+        >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd")
+        >>> generated_ids, scores = model.generate(**inputs)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        ```
+        """
+        # step 1: forward the images through the vision encoder,
+        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+        vision_outputs = self.vision_model(pixel_values, return_dict=True)
+        image_embeds = vision_outputs.last_hidden_state
+        image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+        query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype)
+        image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype)
+        query_outputs = self.qformer(
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs.last_hidden_state
+
+        # step 3: use the language model, conditioned on the text and image
+        language_model_inputs = self.language_projection(query_output)
+        language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64")
+
+        first_embeds = self.language_model.llama.embed_tokens(first_input_ids)
+        second_embeds = self.language_model.llama.embed_tokens(second_input_ids)
+        language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype)
+        inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1)
+
+        if first_attention_mask is None:
+            first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64")
+        if second_attention_mask is None:
+            second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64")
+        attention_mask = paddle.concat(
+            [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1
+        )
+
+        outputs = self.language_model.generate(
+            inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generate_kwargs
+        )
+
+        return outputs
+
+    @paddle.no_grad()
+    def encode_images(
+        self,
+        pixel_values: paddle.Tensor,  # processed image
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """
+        Encodes input images into features the language model can consume: the vision encoder and the Q-Former
+        extract image features, which are then projected into the language model's embedding space.
+        Args:
+            pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)):
+                Input images to be processed.
+        Returns:
+            language_model_inputs (`paddle.Tensor`): The projected image features of shape
+                (batch_size, num_query_tokens, text_hidden_size), ready to be concatenated with text embeddings.
+            language_model_attention_mask (`paddle.Tensor`): The attention mask corresponding to the returned features.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> import paddle
+        >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")
+        >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> image = processor.process_images(images=image, return_tensors="pd")
+        >>> image_features, image_attention_mask = model.encode_images(**image)
+        ```
+        """
+        # step 1: forward the images through the vision encoder,
+        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+        vision_outputs = self.vision_model(pixel_values, return_dict=True)
+        image_embeds = vision_outputs.last_hidden_state
+        image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+        query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype)
+        image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype)
+        query_outputs = self.qformer(
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs.last_hidden_state
+
+        # step 3: project the query features into the language model's embedding space
+        language_model_inputs = self.language_projection(query_output)
+        language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64")
+
+        return language_model_inputs, language_model_attention_mask
+
+    @paddle.no_grad()
+    def generate_with_image_features(
+        self,
+        image_features: paddle.Tensor,
+        first_input_ids: paddle.Tensor,
+        second_input_ids: paddle.Tensor,
+        image_attention_mask: Optional[paddle.Tensor] = None,
+        first_attention_mask: Optional[paddle.Tensor] = None,
+        second_attention_mask: Optional[paddle.Tensor] = None,
+        **generate_kwargs,
+    ) -> paddle.Tensor:
+        """
+        Overrides the `generate` function so that the model can be used as a conditional generator with
+        precomputed image features (see `encode_images`).
+        Args:
+            image_features (`paddle.Tensor` of shape (batch_size, image_sequence_length, hidden_size)):
+                Image features extracted with the ViT and Q-Former, specifically the features returned by the `encode_images` method.
+            first_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The first input prompt before the tag ``; its embeddings will be concatenated with the image features and the embeddings of the second_input_ids for generation.
+            second_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*):
+                The second input prompt after the tag ``; its embeddings will be concatenated with the image features and the embeddings of the first_input_ids for generation.
+            image_attention_mask (`paddle.Tensor` of shape (batch_size, image_sequence_length), *optional*):
+                The attention mask for the image_features.
+ first_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding to the first_input_ids. + second_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding to the second_input_ids. + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/dog.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> processed_image = processor.process_images(images=image, return_tensors="pd") + >>> image_features, image_attention_mask = model.encode_images(**processed_image) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(text=text, prompt=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate_with_image_features(image_features, image_attention_mask=image_attention_mask, **inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + """ + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + image_features = paddle.cast(image_features, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, image_features, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + if image_attention_mask is None: + image_attention_mask = paddle.ones(image_features.shape[:-1], dtype="int64") + + attention_mask = paddle.concat([first_attention_mask, image_attention_mask, second_attention_mask], axis=1) + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generate_kwargs + ) + + return outputs diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py index 4738e3272555e6..04006999f0b629 100644 --- a/paddlevlp/processors/__init__.py +++ b/paddlevlp/processors/__init__.py @@ -14,3 +14,5 @@ # limitations under the License. from .blip_processing import * +from .minigpt4_processing import * +from .minigpt4_image_processing import * diff --git a/paddlevlp/processors/minigpt4_image_processing.py b/paddlevlp/processors/minigpt4_image_processing.py new file mode 100644 index 00000000000000..3a0b3302e9c799 --- /dev/null +++ b/paddlevlp/processors/minigpt4_image_processing.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for MiniGPT4.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from paddlenlp.transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from paddlenlp.transformers.image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from paddlenlp.transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from paddlenlp.transformers.tokenizer_utils_base import TensorType + +__all__ = [ + "MiniGPT4ImageProcessor", +] + + +class MiniGPT4ImageProcessor(BaseImageProcessor): + r""" + Constructs a MiniGPT4 image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. 
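+
+    Examples (a minimal usage sketch; the processor is built from its in-code defaults rather than a checkpoint,
+    and the blank PIL image merely stands in for a real photo):
+
+    ```python
+    >>> from PIL import Image
+    >>> from paddlevlp.processors import MiniGPT4ImageProcessor
+    >>> image_processor = MiniGPT4ImageProcessor()  # size defaults to {"height": 224, "width": 224}
+    >>> image = Image.new("RGB", (400, 300), color="white")
+    >>> batch = image_processor(image, return_tensors="pd")
+    >>> batch["pixel_values"].shape  # [1, 3, 224, 224]
+    ```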
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + default_image_mean = [0.48145466, 0.4578275, 0.40821073] + default_image_std = [0.26862954, 0.26130258, 0.27577711] + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else default_image_mean + self.image_std = image_std if image_std is not None else default_image_std + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
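+
+        A small numeric sketch of the formula (illustrative values only, not tied to any checkpoint):
+
+        ```python
+        >>> import numpy as np
+        >>> image = np.array([0.2, 0.5, 0.8])      # one pixel, three channels, already rescaled to [0, 1]
+        >>> mean = np.array([0.5, 0.5, 0.5])
+        >>> std = np.array([0.25, 0.25, 0.25])
+        >>> (image - mean) / std                   # -> approximately [-1.2, 0.0, 1.2]
+        ```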
+ """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` while preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
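+
+        Example (a sketch of overriding the stored defaults for a single call; the blank PIL image is a stand-in
+        for a real input):
+
+        ```python
+        >>> from PIL import Image
+        >>> from paddlevlp.processors import MiniGPT4ImageProcessor
+        >>> image_processor = MiniGPT4ImageProcessor()
+        >>> image = Image.new("RGB", (640, 480), color="white")
+        >>> outputs = image_processor.preprocess(image, do_normalize=False, return_tensors="np")
+        >>> outputs["pixel_values"][0].shape  # (3, 224, 224)
+        ```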
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlevlp/processors/minigpt4_processing.py b/paddlevlp/processors/minigpt4_processing.py new file mode 100644 index 00000000000000..f71acc7e4298e9 --- /dev/null +++ b/paddlevlp/processors/minigpt4_processing.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for MiniGPT4. 
+""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +from PIL import Image + +from paddlenlp.transformers.image_processing_utils import BatchFeature +from paddlenlp.transformers.image_utils import ImageInput +from paddlenlp.transformers.processing_utils import ProcessorMixin +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding, TensorType, TextInput + +__all__ = [ + "MiniGPT4Processor", +] + + +class MiniGPT4Processor(ProcessorMixin): + r""" + Constructs a MiniGPT4 processor which wraps a MiniGPT4 image processor and an llama tokenizer into a single processor. + [`MiniGPT4Processor`] offers all the functionalities of [`MiniGPT4ImageProcessor`] and [`LlamaTokenizer`]. See the docstring + of [`~MiniGPT4ImageProcessor.__call__`] and [`~LlamaTokenizer.decode`] for more information. + + Args: + image_processor (`MiniGPT4ImageProcessor`): + An instance of [`MiniGPT4ImageProcessor`]. The image processor is a required input. + tokenizer (`LlamaTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor + + >>> # load processor + >>> minigpt4_13b_path = "model_name" + >>> processor = MiniGPT4Processor.from_pretrained(minigpt4_13b_path) + >>> print("load processor and model done!") + + >>> # prepare model inputs for MiniGPT4 + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:" + >>> res = processor([image], text, prompt) + ```""" + attributes = ["image_processor", "tokenizer"] + image_processor_class = "MiniGPT4ImageProcessor" + tokenizer_class = "LlamaTokenizer" + + def __init__(self, image_processor, tokenizer): + tokenizer.return_token_type_ids = False + tokenizer.model_input_names = ["input_ids", "attention_mask"] + tokenizer.padding_side = "right" + tokenizer.pad_token = tokenizer.eos_token + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self.default_prompt = "###Human: ###Assistant: " + self.image_tag = "" + self.text_tag = "" + + def process_images( + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model. + Please refer to the docstring of the method for more information. 
+ """ + if not images: + raise ValueError("You have to input correct images.") + + if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): + images = [images] + + # processing with image processor + processed_images = self.image_processor(images, return_tensors=return_tensors) + + return processed_images + + def process_texts( + self, + texts: Union[TextInput, List[TextInput]], + prompts: Union[TextInput, List[TextInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ): + prompts = prompts if prompts is not None else [self.default_prompt] + + if (not isinstance(texts, TextInput)) and (not isinstance(texts, list)): + raise TypeError("Unsupported type for texts: {}, only str and list type supported.".format(type(texts))) + if prompts is not None and (not isinstance(prompts, TextInput)) and (not isinstance(prompts, list)): + raise TypeError( + "Unsupported type for prompts: {}, only str and list type supported.".format(type(prompts)) + ) + + if isinstance(prompts, list): + if isinstance(texts, list) and len(prompts) != len(texts): + raise ValueError( + "The length of prompts not is equal to texts' length: {} != {}".format(len(prompts), len(texts)) + ) + elif isinstance(texts, TextInput): + texts = [texts] * len(prompts) + else: + if isinstance(texts, TextInput): + texts = [texts] + prompts = [prompts] + else: + prompts = [prompts] * len(texts) + + assemble_texts = [] + for text, prompt in zip(texts, prompts): + if self.image_tag not in text: + if self.image_tag not in prompt: + raise ValueError( + "A prompt should contain a image tag `{}` to insert image embeddings. if you don't want to use prompt function, you have to input a text with the image tag `{}`.".format( + self.image_tag, self.image_tag + ) + ) + if self.text_tag not in prompt: + raise ValueError( + "A prompt should contain a text tag `{}` to insert text information.".format(self.text_tag) + ) + assemble_texts.append(prompt.replace(self.text_tag, text)) + else: + assemble_texts.append(text) + + # processing with text tokenizer + first_texts, second_texts = zip(*[assemble_text.split(self.image_tag) for assemble_text in assemble_texts]) + first_text_encoding = self.tokenizer( + text=first_texts, return_tensors=return_tensors, add_special_tokens=True, **kwargs + ) + second_text_encoding = self.tokenizer( + text=second_texts, return_tensors=return_tensors, add_special_tokens=False, **kwargs + ) + + encoded_texts = BatchEncoding( + { + "first_input_ids": first_text_encoding["input_ids"], + "first_attention_mask": first_text_encoding["attention_mask"], + "second_input_ids": second_text_encoding["input_ids"], + "second_attention_mask": second_text_encoding["attention_mask"], + } + ) + return encoded_texts + + def __call__( + self, + images: ImageInput = None, + text: str = None, + prompt: str = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model, and + [`LlamaTokenizer.__call__`] to prepare text for the model. + Please refer to the docstring of the above two methods for more information. 
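+
+        Examples (a minimal sketch; `"model_name"` is a placeholder checkpoint directory, and the prompt is built
+        from `processor.image_tag` / `processor.text_tag` so the image embeddings and the text land at the expected
+        positions):
+
+        ```python
+        >>> from PIL import Image
+        >>> from paddlevlp.processors import MiniGPT4Processor
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")
+        >>> image = Image.new("RGB", (224, 224), color="white")
+        >>> text = "describe this image"
+        >>> prompt = "###Human: {} {}###Assistant:".format(processor.image_tag, processor.text_tag)
+        >>> inputs = processor(images=image, text=text, prompt=prompt)
+        >>> list(inputs.keys())  # pixel_values, first_input_ids, first_attention_mask, second_input_ids, second_attention_mask
+        ```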
+ """ + prompt = prompt if prompt is not None else self.default_prompt + + if images is None and text is None: + raise ValueError("Images and text are None, you have to specify either images or texts.") + if images is not None and not isinstance(images, (Image.Image, np.ndarray, paddle.Tensor, list)): + raise TypeError( + "A type in [Image.Image, np.ndarray, paddle.Tensor, list] for images is expected, but received {}.".format( + type(images) + ) + ) + if text is not None and not isinstance(text, str): + raise TypeError("A str type of text is expected, but received {}.".format(type(text))) + if prompt is not None and not isinstance(prompt, str): + raise TypeError("A str type of prompt is expected, but received {}.".format(type(prompt))) + + if images is not None and not isinstance(images, list): + images = [images] + if text is not None and images is not None: + texts = [text] * len(images) + prompts = [prompt] * len(images) + elif text is not None and images is None: + texts = [text] + prompts = [prompt] + + # image-only mode + if text is None: + # processing with image processor + processed_features = self.process_images(images, return_tensors=return_tensors, **kwargs) + return processed_features + + # text-only mode + if images is None: + # processing with text tokenizer + encoded_texts = self.process_texts(texts, prompts, **kwargs) + return encoded_texts + + # text-image mode + processed_features = self.image_processor(images, return_tensors=return_tensors) + encoded_texts = self.process_texts(texts, prompts, **kwargs) + processed_features.update(encoded_texts) + + return processed_features + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) From 68459f6eb69e5714631e8bc926db6ea60f296aee Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Thu, 29 Jun 2023 05:59:13 +0000 Subject: [PATCH 04/10] add examples for minigpt4 --- paddlevlp/examples/minigpt4/README.md | 47 +++++++ paddlevlp/examples/minigpt4/merge_weight.py | 88 +++++++++++++ .../minigpt4/paddle_minigpt4_instrction.md | 117 ++++++++++++++++++ paddlevlp/examples/minigpt4/run_predict.py | 68 ++++++++++ 4 files changed, 320 insertions(+) create mode 100644 paddlevlp/examples/minigpt4/README.md create mode 100644 paddlevlp/examples/minigpt4/merge_weight.py create mode 100644 paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md create mode 100644 paddlevlp/examples/minigpt4/run_predict.py diff --git a/paddlevlp/examples/minigpt4/README.md b/paddlevlp/examples/minigpt4/README.md new file mode 100644 index 00000000000000..48c9f73840762b --- /dev/null +++ b/paddlevlp/examples/minigpt4/README.md @@ -0,0 +1,47 @@ +# MiniGPT4 + +## 1. 
模型简介 + +MiniGPT4 是一个具有图像理解能力的开源模型,其基于 Vicuna 大语言模型 以及 BLIP-2 中的VIT和Qformer模块进行训练,使得MiniGPT4 拥有类似于GPT4的非凡能力,例如详细的图像描述生成和从手写草稿创建网站。 此外 MiniGPT4 还具备一些的其他新的功能,包括根据给定图像写故事和诗歌,为图像中显示的问题提供解决方案,教用户如何根据食物照片做饭等。下图展示了MiniGPT4的模型结构, 更多信息请参考[MiniGPT4](https://arxiv.org/abs/2304.10592)。 + +
+
+
+## 2. 获取MiniGPT4 权重以及相关配置
+这里可以分两步：1. 获取MiniGPT4权重；2. 获取相关配置，包括模型参数说明以及tokenizer相关文件等。
+### 2.1 获取MiniGPT4权重
+目前需要用户手动下载MiniGPT4权重并转换为相应的 Paddle 版权重，为方便转换，本项目提供了相应的操作说明和转换脚本，详情请参考[MiniGPT4 权重下载和转换说明](./paddle_minigpt4_instrction.md)。
+
+### 2.2 获取相关配置
+下载相关的配置文件，这里提供了两版配置文件，请根据你的需要，点击下载即可。
+| files Aligned with MiniGPT4-7B | files Aligned with MiniGPT4-13B |
+|:-------------------------------------:|:-----------------------------------:|
+| [Download](https://paddlenlp.bj.bcebos.com/models/community/minigpt4-7b/minigpt4_7b.tar.gz) | [Download](https://paddlenlp.bj.bcebos.com/models/community/minigpt4-13b/minigpt4_13b.tar.gz) |
+
+
+下载之后进行解压，请将其中相关文件放至与 MiniGPT4 权重相同的目录中。
+
+
+## 3. 模型预测
+在下载和转换好上述模型权重之后，可执行以下命令进行模型预测。其中参数 `pretrained_name_or_path` 用于指定 MiniGPT4 的保存目录。
+
+```
+python run_predict.py \
+    --pretrained_name_or_path "your minigpt4 path"
+```
+
+下面的示例展示了使用 MiniGPT4-7B 时的效果:
+
+输入图片: https://paddlenlp.bj.bcebos.com/data/images/mugs.png
+ +输入文本:“describe this image” + +输出: +``` +The image shows two mugs with cats on them, one is black and white and the other is blue and white. The mugs are sitting on a table with a book in the background. The mugs have a whimsical, cartoon-like appearance. The cats on the mugs are looking at each other with a playful expression. The overall mood of the image is lighthearted and fun.### +``` + + +## Reference +- [MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models](https://minigpt-4.github.io/) diff --git a/paddlevlp/examples/minigpt4/merge_weight.py b/paddlevlp/examples/minigpt4/merge_weight.py new file mode 100644 index 00000000000000..8f74d7c6a96052 --- /dev/null +++ b/paddlevlp/examples/minigpt4/merge_weight.py @@ -0,0 +1,88 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["FLAGS_use_cuda_managed_memory"] = "true" + +import paddle +import torch + +from paddlenlp.transformers import LlamaForCausalLM + + +def merge(args): + model_dict = {} + # load the first item: blip2-flan-t5-xxl + state_dict = paddle.load(args.blip2_path) + for n, p in state_dict.items(): + if n.startswith("vision_model") or n.startswith("qformer") or n == "query_tokens": + model_dict[n] = p + print("[1/3] load ViT, qformer and query_tokens from blip2-flan-t5-xxl done!") + + # load the second item: vicuna + llama_model = LlamaForCausalLM.from_pretrained(args.vicuna_path) + + for n, p in llama_model.named_parameters(): + new_name = "language_model." + n + model_dict[new_name] = p + print("[2/3] load vicuna(llama typel) done!") + + # load the third item: minigpt4 + minigpt4_state_dict = torch.load(args.minigpt4_path) + for n, p in minigpt4_state_dict["model"].items(): + if n.startswith("llama_model.model"): + new_name = n.replace("llama_model.model", "language_model.llama") + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + if n.startswith("llama_proj"): + new_name = n.replace("llama_proj", "language_projection") + if n.endswith("weight"): + new_p = paddle.to_tensor(p.cpu().numpy()).transpose([1, 0]) + else: + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + print("[3/3] load language_projection, some llama weights from minigpt4 done!") + + save_path = os.path.join(args.save_path, "model_state.pdparams") + paddle.save(model_dict, save_path) + print("The checkpoint of minigpt4 has been saved to :{}".format(save_path)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--blip2_path", default="/blip2/dirname", type=str, help="The dir name of blip2-flan-t5-xxl.") + parser.add_argument("--vicuna_path", default="/vicuna/dirname", type=str, help="The dir name of vicuna.") + parser.add_argument( + "--minigpt4_path", default="/minigpt4/prerained_minigpt4.pth", type=str, help="The checkpoint path of vicuna." 
+ ) + parser.add_argument("--save_path", default="/save/to/dirname", type=str, help="The saving path of minigpt4.") + args = parser.parse_args() + + args.blip2_path = os.path.join(args.blip2_path, "model_state.pdparams") + if not os.path.exists(args.blip2_path): + raise ValueError("Not found the file: {}".format(args.blip2_path)) + if not os.path.isdir(args.vicuna_path): + raise ValueError("It is not a directory: {}".format(args.vicuna_path)) + if not os.path.exists(args.minigpt4_path): + raise ValueError("Not found the file: {}".format(args.minigpt4_path)) + if not os.path.exists(args.save_path): + os.makedirs(args.save_path) + + merge(args) diff --git a/paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md b/paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md new file mode 100644 index 00000000000000..7b84aea48bd7c6 --- /dev/null +++ b/paddlevlp/examples/minigpt4/paddle_minigpt4_instrction.md @@ -0,0 +1,117 @@ +# 获取和转换 Paddle 版 MiniGPT4 权重 + +## 1. 准备 MiniGPT4 中所有模块的权重 + +你需要下载3个权重,以获取最终 MiniGPT4的权重,分别是: +- Pretrained MiniGPT-4 +- Vicuna Weight +- Blip2 Weight + +### 1.1 下载 MiniGPT4 的预训练权重 + +根据你准备的Vicuna模型版本,下载预训练的MiniGPT4 权重。 + +| Checkpoint Aligned with Vicuna 7B | Checkpoint Aligned with Vicuna 13B | +:-------------------------------------:|:-----------------------------------: +[Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) | [Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) + +### 1.2准备 ViT and Qformer 权重 +MiniGPT4中使用的ViT和Qformer Weight来自blip2-flan-t5-xxl,这个weight在PaddleNLP中进行了转换。 所以你可以从 PaddleNLP 下载它,你有两种下载方式进行下载: + +#### 1.2.1 通过 paddlenlp 方式加载 +直接通过paddlenlp的模型加载方法进行下载,下载后一般会存入 `PPNLP_HOME` 指定的目录。 + +```python +import os +os.environ["CUDA_VISIBLE_DEVICES"]="0" + +import paddle +from paddlenlp.transformers import Blip2Model, Blip2VisionModel, Blip2VisionConfig, Blip2QFormerConfig, Blip2QFormerModel + +Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xxl") +``` + +#### 1.2.2 直接点击下载 +可以直接进行点击下载: + +| blip2-flan-t5-xxl 权重 | 点击下载 | +:-------------------------------------:|:-----------------------------------: +| model_state.pdparams | [Download](https://paddlenlp.bj.bcebos.com/models/community/Salesforce/blip2-flan-t5-xxl/model_state.pdparams) | + +### 1.3 准备 Vicuna 权重 + +这里需要下载两个权重:Vicuna delta Weight和huggingface-formated Llama Weight。 然后你应该结合这两个重量来获得可以使用的Vicuna 权重。 + +#### 1.3.1 下载 Vicuna delta 权重 + +这里展示两种Vicuna delta 权重,请根据需要选择一种并点击下载。 + +| vicuna-7b-delta-v0 | vicuna-13b-delta-v0 | +:-------------------------------------:|:-----------------------------------: + [Download](https://huggingface.co/lmsys/vicuna-7b-delta-v0/tree/main) | [Download](https://huggingface.co/lmsys/vicuna-13b-delta-v0g) + +#### 1.3.2 根据以上选择的vicuna delta 权重,下载 相应的 llama 权重。 + +| llama-7b | llama-13b | +:-------------------------------------:|:-----------------------------------: + [Download](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main) | [Download](https://huggingface.co/decapoda-research/llama-13b-hf) + + +#### 1.3.3 结合上面的两个权重,得到可以使用的 vicuna 权重 +- 为组合如上两个权重,请安装以下工具: + +```shell +pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10 +``` +- 运行以下命令,获取最终可用的vicuna 权重 + +```shell +python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna-13b/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/ +``` + +## 2. 
将多个 pytorch 子权重文件合并为一个权重文件 + +Pytorch版的权重文件可能是由多个子权重文件组合而成,为使用PaddleNLP进行加载并自动转换为Paddle版,需要将其合并为一个文件: + +### 2.1 下载MiniGPT库 +在开始之前,请确保已经下载了 [MiniGPT4](https://github.com/Vision-CAIR/MiniGPT-4.git) 库: + +``` +git clone https://github.com/Vision-CAIR/MiniGPT-4.git +``` + +### 2.2 获取完整的 vicuna 权重 +进入到MiniGPT4文件夹,执行以下代码,获取完整的 vicuna 权重文件: +```python +import argparse +import os +os.environ["CUDA_VISIBLE_DEVICES"]="0" +os.environ["FLAGS_use_cuda_managed_memory"]="true" + +import torch +from minigpt4.models.modeling_llama import LlamaForCausalLM + +llama_model = LlamaForCausalLM.from_pretrained("/path/to/save/working/vicuna-13b/") +torch.save(llama_model.state_dict(), "/path/to/save/working/vicuna-13b/pytorch_model.bin") +``` + +## 3. 合并以上所有权重,获取最终的 Paddle 版 MiniGPT4 权重 +这里提供了一个合并以上权重的脚本,你可以通过设置相关权重路径 以获取最终的 MiniGPT4 权重。 + +```shell +python merge_weight.py \ + --blip2_path "your dir name of blip2" \ + --vicuna_path "your dir name of vicuna" \ + --minigpt4_path "your ckpt path of minigpt4" \ + --save_path "your dir name saving the final minigpt4" +``` + +**参数说明**: +- `blip2_path`: 存放 blip2 权重的目录名 +- `vicuna_path`: 存放 vicuna_path 权重的目录名 +- `minigpt4_path`: 存放 blip2 权重的文件地址,比如./prerained_minigpt4_7b.pth +- `save_path`: 保存 Paddle 版 MiniGPT3 权重的目录名 + +## 3. More Reference + +- [MiniGPT Official Site](https://github.com/Vision-CAIR/MiniGPT-4) diff --git a/paddlevlp/examples/minigpt4/run_predict.py b/paddlevlp/examples/minigpt4/run_predict.py new file mode 100644 index 00000000000000..4b36089f3c91a8 --- /dev/null +++ b/paddlevlp/examples/minigpt4/run_predict.py @@ -0,0 +1,68 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["FLAGS_use_cuda_managed_memory"] = "true" +import requests +from PIL import Image + +from paddlenlp.transformers import MiniGPT4ForConditionalGeneration, MiniGPT4Processor + + +def predict(args): + # load MiniGPT4 moel and processor + model = MiniGPT4ForConditionalGeneration.from_pretrained(args.pretrained_name_or_path) + model.eval() + processor = MiniGPT4Processor.from_pretrained(args.pretrained_name_or_path) + print("load processor and model done!") + + # prepare model inputs for MiniGPT4 + url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + image = Image.open(requests.get(url, stream=True).raw) + + text = "describe this image" + prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. 
Please answer my questions.###Human: ###Assistant:" + inputs = processor([image], text, prompt) + + # generate with MiniGPT4 + # breakpoint + generate_kwargs = { + "max_length": 300, + "num_beams": 1, + "top_p": 1.0, + "repetition_penalty": 1.0, + "length_penalty": 0, + "temperature": 1, + "decode_strategy": "greedy_search", + "eos_token_id": [[835], [2277, 29937]], + } + outputs = model.generate(**inputs, **generate_kwargs) + msg = processor.batch_decode(outputs[0]) + print("Inference result: ", msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_name_or_path", + default="your directory of minigpt4", + type=str, + help="The dir name of minigpt4 checkpoint.", + ) + args = parser.parse_args() + + predict(args) From 3d50d5396f62a1f8124af22730c3ec2420e52d96 Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Thu, 29 Jun 2023 11:45:53 +0000 Subject: [PATCH 05/10] [New Feature] drop some paddlenlp and add some files --- paddlevlp/activations.py | 174 +++++ paddlevlp/examples/minigpt4/run_predict.py | 3 +- paddlevlp/models/__init__.py | 2 + paddlevlp/models/minigpt4/configuration.py | 2 +- paddlevlp/models/minigpt4/modeling.py | 10 +- paddlevlp/processors/base_processing.py | 140 ++++ paddlevlp/processors/image_transforms.py | 656 ++++++++++++++++++ .../processors/minigpt4_image_processing.py | 9 +- paddlevlp/processors/minigpt4_processing.py | 7 +- paddlevlp/processors/utils.py | 1 - paddlevlp/utils/initializer.py | 421 +++++++++++ paddlevlp/utils/log.py | 2 +- paddlevlp/utils/parameters.py | 53 ++ 13 files changed, 1463 insertions(+), 17 deletions(-) create mode 100644 paddlevlp/activations.py create mode 100644 paddlevlp/processors/base_processing.py create mode 100644 paddlevlp/processors/image_transforms.py create mode 100644 paddlevlp/utils/initializer.py create mode 100644 paddlevlp/utils/parameters.py diff --git a/paddlevlp/activations.py b/paddlevlp/activations.py new file mode 100644 index 00000000000000..db1aecc829d96a --- /dev/null +++ b/paddlevlp/activations.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F +from paddle import Tensor, nn + + +class NewGELUActivation(nn.Layer): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Layer): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +class QuickGELUActivation(nn.Layer): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * F.sigmoid(1.702 * input) + + +class ClippedGELUActivation(nn.Layer): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as + it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + return paddle.clip(gelu(x), self.min, self.max) + + +class SiLUActivation(nn.Layer): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def forward(self, input: Tensor) -> Tensor: + return F.silu(input) + + +class MishActivation(nn.Layer): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def forward(self, input: Tensor) -> Tensor: + return F.mish(input) + + +class LinearActivation(nn.Layer): + """ + Applies the linear activation function, i.e. forwarding input directly to output. 
+ """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +class ClassInstantier(OrderedDict): + def __getitem__(self, key): + content = super().__getitem__(key) + cls, kwargs = content if isinstance(content, tuple) else (content, {}) + return cls(**kwargs) + + +ACT2CLS = { + "gelu": GELUActivation, + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), + "gelu_fast": FastGELUActivation, + "gelu_new": NewGELUActivation, + "gelu_python": (GELUActivation, {"use_gelu_python": True}), + "linear": LinearActivation, + "mish": MishActivation, + "quick_gelu": QuickGELUActivation, + "relu": nn.ReLU, + "relu6": nn.ReLU6, + "sigmoid": nn.Sigmoid, + "silu": SiLUActivation, + "swish": SiLUActivation, + "tanh": nn.Tanh, +} +ACT2FN = ClassInstantier(ACT2CLS) + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") \ No newline at end of file diff --git a/paddlevlp/examples/minigpt4/run_predict.py b/paddlevlp/examples/minigpt4/run_predict.py index 4b36089f3c91a8..cb9ac139002753 100644 --- a/paddlevlp/examples/minigpt4/run_predict.py +++ b/paddlevlp/examples/minigpt4/run_predict.py @@ -20,7 +20,7 @@ import requests from PIL import Image -from paddlenlp.transformers import MiniGPT4ForConditionalGeneration, MiniGPT4Processor +from paddlevlp import MiniGPT4ForConditionalGeneration, MiniGPT4Processor def predict(args): @@ -39,7 +39,6 @@ def predict(args): inputs = processor([image], text, prompt) # generate with MiniGPT4 - # breakpoint generate_kwargs = { "max_length": 300, "num_beams": 1, diff --git a/paddlevlp/models/__init__.py b/paddlevlp/models/__init__.py index 904dfbb7a6d3d2..77ef10b5801c9c 100644 --- a/paddlevlp/models/__init__.py +++ b/paddlevlp/models/__init__.py @@ -14,3 +14,5 @@ # limitations under the license. 
from .blip2.modeling import * +from .minigpt4.configuration import * +from .minigpt4.modeling import * \ No newline at end of file diff --git a/paddlevlp/models/minigpt4/configuration.py b/paddlevlp/models/minigpt4/configuration.py index 4f9a5ec08b782f..9ba18709ca2c60 100644 --- a/paddlevlp/models/minigpt4/configuration.py +++ b/paddlevlp/models/minigpt4/configuration.py @@ -17,7 +17,7 @@ import os from typing import Union -from paddlenlp.utils.log import logger +from ...utils.log import logger from paddlenlp.transformers.auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlenlp.transformers.llama.configuration import LlamaConfig diff --git a/paddlevlp/models/minigpt4/modeling.py b/paddlevlp/models/minigpt4/modeling.py index 4239675bb7aaab..ad647b3279459c 100644 --- a/paddlevlp/models/minigpt4/modeling.py +++ b/paddlevlp/models/minigpt4/modeling.py @@ -22,11 +22,6 @@ from paddle.distributed.fleet.utils import recompute from paddle.nn import CrossEntropyLoss -from paddlenlp.ops import transfer_param -from paddlenlp.utils.log import logger - -from paddlenlp.utils.initializer import normal_, ones_, zeros_ -from paddlenlp.transformers.activations import ACT2FN from paddlenlp.transformers.llama.modeling import LlamaForCausalLM from paddlenlp.transformers.model_outputs import ( BaseModelOutput, @@ -42,6 +37,11 @@ prune_linear_layer, ) +from ...utils.log import logger +from ...activations import ACT2FN +from ...utils.initializer import normal_, ones_, zeros_ +from ...utils.parameters import transfer_param + MiniGPT4_PRETRAINED_MODEL_ARCHIVE_LIST = [] from .configuration import MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig diff --git a/paddlevlp/processors/base_processing.py b/paddlevlp/processors/base_processing.py new file mode 100644 index 00000000000000..7c599e12b5b3ea --- /dev/null +++ b/paddlevlp/processors/base_processing.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Processing saving/loading class for common processors. +""" + +import os + +import paddlenlp.transformers +import paddlevlp.processors + + +class ProcessorMixin(object): + """ + This is a mixin used to provide saving/loading functionality for all processor classes. 
+ """ + + attributes = ["feature_extractor", "tokenizer"] + # Names need to be attr_class for attr in attributes + feature_extractor_class = None + tokenizer_class = None + _auto_class = None + + # args have to match the attributes class attribute + def __init__(self, *args, **kwargs): + # Sanitize args and kwargs + for key in kwargs: + if key not in self.attributes: + raise TypeError(f"Unexepcted keyword argument {key}.") + for arg, attribute_name in zip(args, self.attributes): + if attribute_name in kwargs: + raise TypeError(f"Got multiple values for argument {attribute_name}.") + else: + kwargs[attribute_name] = arg + + if len(kwargs) != len(self.attributes): + raise ValueError( + f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got " + f"{len(args)} arguments instead." + ) + + # Check each arg is of the proper class (this will also catch a user initializing in the wrong order) + for attribute_name, arg in kwargs.items(): + setattr(self, attribute_name, arg) + + def __repr__(self): + attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] + attributes_repr = "\n".join(attributes_repr) + return f"{self.__class__.__name__}:\n{attributes_repr}" + + def save_pretrained(self, save_directory, **kwargs): + """ + Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it + can be reloaded using the [`~ProcessorMixin.from_pretrained`] method. + + + + This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods + above for more information. + + + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will + be created if it does not exist). + kwargs: + Additional key word arguments. + """ + os.makedirs(save_directory, exist_ok=True) + + for attribute_name in self.attributes: + attribute = getattr(self, attribute_name) + # Include the processor class in the attribute config so this processor can then be reloaded with the + # `AutoProcessor` API. + if hasattr(attribute, "_set_processor_class"): + attribute._set_processor_class(self.__class__.__name__) + attribute.save_pretrained(save_directory) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate a processor associated with a pretrained model. + + + + This class method is simply calling the feature extractor + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the + methods above for more information. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the name of a community-contributed pretrained or built-in pretrained model. + - a path to a *directory* containing a feature extractor file saved using the + [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. + - a path or url to a saved feature extractor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + **kwargs + Additional keyword arguments passed along to both + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. 
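+
+        Example (a sketch with placeholder paths, using the MiniGPT4 subclass added elsewhere in this patch):
+
+        ```python
+        >>> from paddlevlp.processors import MiniGPT4Processor
+        >>> processor = MiniGPT4Processor.from_pretrained("model_name")          # placeholder name or local dir
+        >>> processor.save_pretrained("./my_processor_dir")                      # writes image processor + tokenizer files
+        >>> reloaded = MiniGPT4Processor.from_pretrained("./my_processor_dir")   # reload from the saved directory
+        ```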
+ """ + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + return cls(*args) + + @classmethod + def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + args = [] + for attribute_name in cls.attributes: + class_name = getattr(cls, f"{attribute_name}_class") + # attribute class in paddlevlp has higher priority, usually used by vision class + attribute_class = getattr(paddlevlp.processors, class_name, None) + if attribute_class is None: + attribute_class = getattr(paddlenlp.transformers, class_name) + args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + return args + + @property + def model_input_names(self): + first_attribute = getattr(self, self.attributes[0]) + return getattr(first_attribute, "model_input_names", None) \ No newline at end of file diff --git a/paddlevlp/processors/image_transforms.py b/paddlevlp/processors/image_transforms.py new file mode 100644 index 00000000000000..c090cc4758cb27 --- /dev/null +++ b/paddlevlp/processors/image_transforms.py @@ -0,0 +1,656 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np +import paddle +import PIL + +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_channel_dimension_axis, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, +) + +from paddlenlp.transformers.tokenizer_utils_base import ExplicitEnum, TensorType + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_channel_dimension_format( + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, +) -> np.ndarray: + """ + Converts `image` to the channel dimension format specified by `channel_dim`. + + Args: + image (`numpy.ndarray`): + The image to have its channel dimension set. + channel_dim (`ChannelDimension`): + The channel dimension format to use. + + Returns: + `np.ndarray`: The image with the channel dimension set to `channel_dim`. 
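+
+    Example (a small sketch on a dummy array):
+
+    ```python
+    >>> import numpy as np
+    >>> hwc = np.zeros((224, 224, 3), dtype=np.uint8)   # height, width, channels
+    >>> to_channel_dimension_format(hwc, ChannelDimension.FIRST).shape
+    (3, 224, 224)
+    ```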
+ """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if input_channel_dim is None: + input_channel_dim = infer_channel_dimension_format(image) + + target_channel_dim = ChannelDimension(channel_dim) + if input_channel_dim == target_channel_dim: + return image + + if target_channel_dim == ChannelDimension.FIRST: + image = image.transpose((2, 0, 1)) + elif target_channel_dim == ChannelDimension.LAST: + image = image.transpose((1, 2, 0)) + else: + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) + + return image + + +def rescale( + image: np.ndarray, scale: float, data_format: Optional[ChannelDimension] = None, dtype=np.float32 +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. + scale (`float`): + The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. + + Returns: + `np.ndarray`: The rescaled image. + """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) + return rescaled_image + + +def to_pil_image( + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": + """ + Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if + needed. + + Args: + image (`PIL.Image.Image` or `numpy.ndarray` or `paddle.Tensor`): + The image to convert to the `PIL.Image` format. + do_rescale (`bool`, *optional*): + Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default + to `True` if the image type is a floating type, `False` otherwise. + + Returns: + `PIL.Image.Image`: The converted image. + """ + if isinstance(image, PIL.Image.Image): + return image + + # Convert all tensors to numpy arrays before converting to PIL image + if is_paddle_tensor(image): + image = image.numpy() + elif not isinstance(image, np.ndarray): + raise ValueError("Input image type not supported: {}".format(type(image))) + + # If the channel as been moved to first dim, we put it back at the end. + image = to_channel_dimension_format(image, ChannelDimension.LAST) + + # If there is a single channel, we squeeze it, as otherwise PIL can't handle it. + image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image + + # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. 
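+    # If do_rescale is unset, rescale only floating point arrays, which are assumed to lie in [0, 1].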
+ do_rescale = isinstance(image.flat[0], (float, np.float32, np.float64)) if do_rescale is None else do_rescale + if do_rescale: + image = rescale(image, 255) + image = image.astype(np.uint8) + return PIL.Image.fromarray(image) + + +# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + + Returns: + `tuple`: The target (height, width) dimension of the output image after resizing. + """ + if isinstance(size, (tuple, list)): + if len(size) == 2: + return tuple(size) + elif len(size) == 1: + # Perform same logic as if size was an int + size = size[0] + else: + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") + + if default_to_square: + return (size, size) + + height, width = get_image_size(input_image) + short, long = (width, height) if width <= height else (height, width) + requested_new_short = size + + new_short, new_long = requested_new_short, int(requested_new_short * long / short) + + if max_size is not None: + if max_size <= requested_new_short: + raise ValueError( + f"max_size = {max_size} must be strictly greater than the requested " + f"size for the smaller edge size = {size}" + ) + if new_long > max_size: + new_short, new_long = int(max_size * new_short / new_long), max_size + + return (new_long, new_short) if width <= height else (new_short, new_long) + + +def resize( + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: + """ + Resizes `image` to `(height, width)` specified by `size` using the PIL library. 
+ + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to resize. + size (`Tuple[int, int]`): + The size to use for resizing the image. + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + reducing_gap (`int`, *optional*): + Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result to + the fair resampling. See corresponding Pillow documentation for more details. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + return_numpy (`bool`, *optional*, defaults to `True`): + Whether or not to return the resized image as a numpy array. If False a `PIL.Image.Image` object is + returned. + + Returns: + `np.ndarray`: The resized image. + """ + resample = resample if resample is not None else PILImageResampling.BILINEAR + + if not len(size) == 2: + raise ValueError("size must have 2 elements") + + # For all transformations, we want to keep the same data format as the input image unless otherwise specified. + # The resized image from PIL will always have channels last, so find the input format first. + data_format = infer_channel_dimension_format(image) if data_format is None else data_format + + # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use + # the pillow library to resize the image and then convert back to numpy + if not isinstance(image, PIL.Image.Image): + image = to_pil_image(image) + height, width = size + # PIL images are in the format (width, height) + resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) + + if return_numpy: + resized_image = np.array(resized_image) + # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image + # so we need to add it back if necessary. + resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image + # The image is always in channels last format after converting from a PIL image + resized_image = to_channel_dimension_format( + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) + return resized_image + + +def normalize( + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: + """ + Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. + + image = (image - mean) / std + + Args: + image (`np.ndarray`): + The image to normalize. + mean (`float` or `Iterable[float]`): + The mean to use for normalization. + std (`float` or `Iterable[float]`): + The standard deviation to use for normalization. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - + # casting to numpy array and dividing by 255. 
+ image = to_numpy_array(image) + image = rescale(image, scale=1 / 255) + + if not isinstance(image, np.ndarray): + raise ValueError("image must be a numpy array") + + input_data_format = infer_channel_dimension_format(image) + channel_axis = get_channel_dimension_axis(image) + num_channels = image.shape[channel_axis] + + if isinstance(mean, Iterable): + if len(mean) != num_channels: + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") + else: + mean = [mean] * num_channels + mean = np.array(mean, dtype=image.dtype) + + if isinstance(std, Iterable): + if len(std) != num_channels: + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") + else: + std = [std] * num_channels + std = np.array(std, dtype=image.dtype) + + if input_data_format == ChannelDimension.LAST: + image = (image - mean) / std + else: + image = ((image.T - mean) / std).T + + image = to_channel_dimension_format(image, data_format) if data_format is not None else image + return image + + +def center_crop( + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: + """ + Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to + the size given, it will be padded (so the returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + The image to crop. + size (`Tuple[int, int]`): + The target size for the cropped image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + return_numpy (`bool`, *optional*): + Whether or not to return the cropped image as a numpy array. Used for backwards compatibility with the + previous ImageFeatureExtractionMixin method. + - Unset: will return the same type as the input image. + - `True`: will return a numpy array. + - `False`: will return a `PIL.Image.Image` object. + Returns: + `np.ndarray`: The cropped image. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + image = to_numpy_array(image) + return_numpy = False if return_numpy is None else return_numpy + else: + return_numpy = True if return_numpy is None else return_numpy + + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if not isinstance(size, Iterable) or len(size) != 2: + raise ValueError("size must have 2 elements representing the height and width of the output image") + + input_data_format = infer_channel_dimension_format(image) + output_data_format = data_format if data_format is not None else input_data_format + + # We perform the crop in (C, H, W) format and then convert to the output format + image = to_channel_dimension_format(image, ChannelDimension.FIRST) + + orig_height, orig_width = get_image_size(image) + crop_height, crop_width = size + crop_height, crop_width = int(crop_height), int(crop_width) + + # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result. 
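+    # (orig - crop) // 2 keeps the crop centered; when the difference is odd, the extra pixel falls on the bottom/right.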
+ top = (orig_height - crop_height) // 2 + bottom = top + crop_height + # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result. + left = (orig_width - crop_width) // 2 + right = left + crop_width + + # Check if cropped area is within image boundaries + if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width: + image = image[..., top:bottom, left:right] + image = to_channel_dimension_format(image, output_data_format) + return image + + # Otherwise, we may need to pad if the image is too small. Oh joy... + new_height = max(crop_height, orig_height) + new_width = max(crop_width, orig_width) + new_shape = image.shape[:-2] + (new_height, new_width) + new_image = np.zeros_like(image, shape=new_shape) + + # If the image is too small, pad it with zeros + top_pad = (new_height - orig_height) // 2 + bottom_pad = top_pad + orig_height + left_pad = (new_width - orig_width) // 2 + right_pad = left_pad + orig_width + new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image + + top += top_pad + bottom += top_pad + left += left_pad + right += left_pad + + new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)] + new_image = to_channel_dimension_format(new_image, output_data_format) + + if not return_numpy: + new_image = to_pil_image(new_image) + + return new_image + + +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": + center_x, center_y, width, height = bboxes_center.unbind(-1) + bbox_corners = paddle.stack( + # top left x, top left y, bottom right x, bottom right y + [(center_x - 0.5 * width), (center_y - 0.5 * height), (center_x + 0.5 * width), (center_y + 0.5 * height)], + axis=-1, + ) + return bbox_corners + + +def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: + center_x, center_y, width, height = bboxes_center.T + bboxes_corners = np.stack( + # top left x, top left y, bottom right x, bottom right y + [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height], + axis=-1, + ) + return bboxes_corners + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(bboxes_center: TensorType) -> TensorType: + """ + Converts bounding boxes from center format to corners format. 
+ + center format: contains the coordinate for the center of the box and its width, height dimensions + (center_x, center_y, width, height) + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + """ + # Function is used during model forward pass, so we use the input framework if possible, without + # converting to numpy + if is_paddle_tensor(bboxes_center): + return _center_to_corners_format_paddle(bboxes_center) + elif isinstance(bboxes_center, np.ndarray): + return _center_to_corners_format_numpy(bboxes_center) + + raise ValueError(f"Unsupported input type {type(bboxes_center)}") + + +def _corners_to_center_format_paddle(bboxes_corners: "paddle.Tensor") -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) + b = [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ] + return paddle.stack(b, axis=-1) + + +def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.T + bboxes_center = np.stack( + [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ], + axis=-1, + ) + return bboxes_center + + +def corners_to_center_format(bboxes_corners: TensorType) -> TensorType: + """ + Converts bounding boxes from corners format to center format. + + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + center format: contains the coordinate for the center of the box and its the width, height dimensions + (center_x, center_y, width, height) + """ + # Inverse function accepts different input types so implemented here too + if is_paddle_tensor(bboxes_corners): + return _corners_to_center_format_paddle(bboxes_corners) + elif isinstance(bboxes_corners, np.ndarray): + return _corners_to_center_format_numpy(bboxes_corners) + + raise ValueError(f"Unsupported input type {type(bboxes_corners)}") + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. + """ + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + """ + Converts unique ID to RGB color. + """ + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class PaddingMode(ExplicitEnum): + """ + Enum class for the different padding modes to use when padding images. 
+ """ + + CONSTANT = "constant" + REFLECT = "reflect" + REPLICATE = "replicate" + SYMMETRIC = "symmetric" + + +def pad( + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> np.ndarray: + """ + Pads the `image` with the specified (height, width) `padding` and `mode`. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + def _expand_for_data_format(values): + """ + Convert values to be in the format expected by np.pad based on the data format. 
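+
+        For example, an int padding of 2 on a channels-last image becomes
+        ((2, 2), (2, 2), (0, 0)): the same pad before and after along height and
+        width, plus a zero pad for the channel axis.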
+ """ + if isinstance(values, (int, float)): + values = ((values, values), (values, values)) + elif isinstance(values, tuple) and len(values) == 1: + values = ((values[0], values[0]), (values[0], values[0])) + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): + values = (values, values) + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): + values = values + else: + raise ValueError(f"Unsupported format: {values}") + + # add 0 for channel dimension + values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) + + # Add additional padding if there's a batch dimension + values = (0, *values) if image.ndim == 4 else values + return values + + padding = _expand_for_data_format(padding) + + if mode == PaddingMode.CONSTANT: + constant_values = _expand_for_data_format(constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) + elif mode == PaddingMode.REFLECT: + image = np.pad(image, padding, mode="reflect") + elif mode == PaddingMode.REPLICATE: + image = np.pad(image, padding, mode="edge") + elif mode == PaddingMode.SYMMETRIC: + image = np.pad(image, padding, mode="symmetric") + else: + raise ValueError(f"Invalid padding mode: {mode}") + + image = to_channel_dimension_format(image, data_format) if data_format is not None else image + return image + + +def convert_to_rgb(image: ImageInput) -> ImageInput: + """ + Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image + as is. + + Args: + image (Image): + The image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + + image = image.convert("RGB") + return image \ No newline at end of file diff --git a/paddlevlp/processors/minigpt4_image_processing.py b/paddlevlp/processors/minigpt4_image_processing.py index 3a0b3302e9c799..08f70cd83d7f95 100644 --- a/paddlevlp/processors/minigpt4_image_processing.py +++ b/paddlevlp/processors/minigpt4_image_processing.py @@ -20,15 +20,17 @@ import numpy as np import PIL -from paddlenlp.transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from paddlenlp.transformers.image_transforms import ( +from paddlenlp.transformers.tokenizer_utils_base import TensorType + +from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from .image_transforms import ( convert_to_rgb, normalize, rescale, resize, to_channel_dimension_format, ) -from paddlenlp.transformers.image_utils import ( +from .image_utils import ( ChannelDimension, ImageInput, PILImageResampling, @@ -36,7 +38,6 @@ to_numpy_array, valid_images, ) -from paddlenlp.transformers.tokenizer_utils_base import TensorType __all__ = [ "MiniGPT4ImageProcessor", diff --git a/paddlevlp/processors/minigpt4_processing.py b/paddlevlp/processors/minigpt4_processing.py index f71acc7e4298e9..0b46d01ef468d2 100644 --- a/paddlevlp/processors/minigpt4_processing.py +++ b/paddlevlp/processors/minigpt4_processing.py @@ -23,11 +23,12 @@ import paddle from PIL import Image -from paddlenlp.transformers.image_processing_utils import BatchFeature -from paddlenlp.transformers.image_utils import ImageInput -from paddlenlp.transformers.processing_utils import ProcessorMixin from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding, TensorType, TextInput +from .image_processing_utils import BatchFeature +from .image_utils import ImageInput +from .base_processing import ProcessorMixin + 
__all__ = [ "MiniGPT4Processor", ] diff --git a/paddlevlp/processors/utils.py b/paddlevlp/processors/utils.py index 34dd36fe33fea3..896c4bcd24820b 100644 --- a/paddlevlp/processors/utils.py +++ b/paddlevlp/processors/utils.py @@ -14,7 +14,6 @@ from enum import Enum - class ExplicitEnum(Enum): """ Enum with more explicit error message for missing values. diff --git a/paddlevlp/utils/initializer.py b/paddlevlp/utils/initializer.py new file mode 100644 index 00000000000000..f963a6de0ae25f --- /dev/null +++ b/paddlevlp/utils/initializer.py @@ -0,0 +1,421 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. +""" + +import math +import warnings + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.fluid import core +from paddle.fluid.core import VarDesc +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.uniform_(min=a, max=b) + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.fill_(value) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.0): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. 
+ Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ["fan_in", "fan_out"] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == "fan_in" else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = ["linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", "conv_transpose2d", "conv_transpose3d"] + if nonlinearity in linear_fns or nonlinearity == "sigmoid": + return 1 + elif nonlinearity == "tanh": + return 5.0 / 3 + elif nonlinearity == "relu": + return math.sqrt(2.0) + elif nonlinearity == "leaky_relu": + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError("negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == "selu": + return 3.0 / 4 + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) + + +def _transform(t, device, dtype, blocking): + if device is None: + device = t.place + if dtype is None: + dtype = t.dtype + + if type(dtype) is not VarDesc.VarType: + dtype = convert_np_dtype_to_dtype_(dtype) + + # 1. gpu place need to determine whether the memory is sufficient for allocation: + if t.place.is_gpu_place(): + # for gpu, minimum memory allocation unit is 256 bytes. + size_dtype = core.size_of_dtype(dtype) + # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space. + # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. + waiting_alloc_memory = ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + gpu_memory_available = core.gpu_memory_available() + if gpu_memory_available < waiting_alloc_memory: + # Copy param / Tensor to cpu + t_used = t._copy_to(paddle.CPUPlace(), blocking) # k-v type will error + # Release mem of t + t.value().get_tensor()._clear() + else: + t_used = t + else: + t_used = t + + # 2. cast param / Tensor to dtype + if dtype is not None and dtype != t_used.dtype: + with paddle.fluid.framework._dygraph_place_guard(place=t_used.place): + t_casted = t_used.cast(dtype=dtype) + else: + t_casted = t_used + + # 3. Copy casted cpu param / Tensor to device + if device is not None and not t_casted.place._equals(device): + new_t = t_casted._copy_to(device, blocking) + else: + new_t = t_casted + + # 4. 
share Tensor to origin param / Tensor + dst_tensor = t.value().get_tensor() + src_tensor = new_t.value().get_tensor() + dst_tensor._share_data_with(src_tensor) + + return t + + +def to( + self, + device=None, + dtype=None, + blocking=None, + floating_only=True, +): + """ + Cast the parameters and buffers of Layer by the give device, dtype and blocking. + + Parameters: + device(str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional): The device of the Layer which want to be stored. + If None, the device is the same with the original Tensor. If device is string, it can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the + index of the GPUs or XPUs. Default: None. + + dtype(str|numpy.dtype|paddle.dtype|None, optional): The type of the data. If None, the dtype is the same with the original Tensor. Default: None. + + blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be + asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. + + floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking. + + Returns: + self + + """ + + if device is None and dtype is None and blocking is None: + return self + + if device is not None: + if isinstance(device, str): + device = paddle.device._convert_to_place(device) + elif isinstance( + device, + ( + core.CPUPlace, + core.CUDAPlace, + core.CUDAPinnedPlace, + core.XPUPlace, + ), + ): + pass + else: + raise ValueError( + "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " + + type(device).__name__ + ) + + if blocking is None: + blocking = True + else: + assert isinstance(blocking, bool), "blocking value error, must be the True, False or None" + + def transform(t, device, dtype, blocking): + if floating_only and (not paddle.is_floating_point(t)): + return t + return _transform(t, device, dtype, blocking) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + transform(self, device, dtype, blocking) + + return self \ No newline at end of file diff --git a/paddlevlp/utils/log.py b/paddlevlp/utils/log.py index 78d2d824b99a14..cce25443e414f3 100644 --- a/paddlevlp/utils/log.py +++ b/paddlevlp/utils/log.py @@ -42,7 +42,7 @@ class Logger(object): """ def __init__(self, name: str = None): - name = "PaddleNLP" if not name else name + name = "PaddleMIX" if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): diff --git a/paddlevlp/utils/parameters.py b/paddlevlp/utils/parameters.py new file mode 100644 index 00000000000000..168e029791da21 --- /dev/null +++ b/paddlevlp/utils/parameters.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
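+# transfer_param casts an existing parameter (or bias) to the target dtype, float16 by default,
+# optionally carrying its current values over when restore_data is True.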
+ +import paddle + + +def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): + param_shape = p.shape + # Allow CPU/GPU and float16/float32 transfer + # NOTE: str(p.place) differs between paddle develop and 2.2 + if str(p.dtype)[-len(dtype) :] == dtype and ("gpu" in str(p.place).lower() or "cuda" in str(p.place).lower()): + return p + if restore_data: + if ( + getattr(paddle.fluid.framework, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework, "_dygraph_tracer_", None) is not None + ) or ( + hasattr(paddle.fluid.framework, "global_var") + and getattr(paddle.fluid.framework.global_var, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework.global_var, "_dygraph_tracer_", None) is not None + ): + param_data = p.numpy() + new_p = paddle.create_parameter(shape=param_shape, dtype=dtype, is_bias=is_bias) + new_p.set_value(param_data.astype(dtype)) + return new_p + elif paddle.in_dynamic_mode(): + param_data = p.numpy() + # Creating parameters with Assign initializer is too slow. Maybe we + # can cast to fp16 directly and get a tensor, while we do it more + # elaborately to get a ParamBase. Also note `VarBase.set_value` + # enforce the same dtype and can not be used directly. + new_p = type(p)(shape=param_shape, dtype=dtype, is_bias=is_bias) + new_p.value().get_tensor().set(param_data.astype(dtype), paddle.framework._current_expected_place()) + return new_p + else: + param_data = np.array(paddle.static.global_scope().find_var(p.name).get_tensor()) + return paddle.create_parameter( + shape=param_shape, + dtype=dtype, + is_bias=is_bias, + default_initializer=paddle.nn.initializer.Assign(param_data) if restore_data else None, + ) \ No newline at end of file From 9d94a9db8adc29960ace23bbaed33e17553bde1a Mon Sep 17 00:00:00 2001 From: LokeZhou Date: Mon, 3 Jul 2023 07:41:49 +0000 Subject: [PATCH 06/10] add groundingdino --- paddlevlp/examples/groundingdino/README.md | 25 + paddlevlp/examples/groundingdino/__init__.py | 13 + .../examples/groundingdino/run_predict.py | 124 ++ paddlevlp/models/groundingdino/__init__.py | 14 + .../models/groundingdino/backbone/__init__.py | 1 + .../models/groundingdino/backbone/backbone.py | 94 ++ .../backbone/position_encoding.py | 182 +++ .../backbone/swin_transformer.py | 897 ++++++++++++++ paddlevlp/models/groundingdino/bert_model.py | 715 +++++++++++ paddlevlp/models/groundingdino/bertwarper.py | 277 +++++ .../models/groundingdino/configuration.py | 168 +++ paddlevlp/models/groundingdino/csrc/README.md | 85 ++ .../csrc/ms_deformable_attn_op.cc | 65 + .../csrc/ms_deformable_attn_op.cu | 1073 +++++++++++++++++ .../csrc/setup_ms_deformable_attn_op.py | 7 + .../csrc/test_ms_deformable_attn_op.py | 140 +++ .../models/groundingdino/fuse_modules.py | 312 +++++ paddlevlp/models/groundingdino/layers.py | 256 ++++ paddlevlp/models/groundingdino/modeling.py | 285 +++++ .../models/groundingdino/ms_deform_attn.py | 210 ++++ paddlevlp/models/groundingdino/transformer.py | 970 +++++++++++++++ .../groundingdino/transformer_vanilla.py | 122 ++ paddlevlp/models/groundingdino/utils.py | 270 +++++ paddlevlp/processors/__init__.py | 1 + .../processors/groundingdino_processing.py | 365 ++++++ paddlevlp/processors/utils.py | 9 + 26 files changed, 6680 insertions(+) create mode 100644 paddlevlp/examples/groundingdino/README.md create mode 100644 paddlevlp/examples/groundingdino/__init__.py create mode 100644 paddlevlp/examples/groundingdino/run_predict.py create mode 100644 paddlevlp/models/groundingdino/__init__.py create mode 100644 
paddlevlp/models/groundingdino/backbone/__init__.py create mode 100644 paddlevlp/models/groundingdino/backbone/backbone.py create mode 100644 paddlevlp/models/groundingdino/backbone/position_encoding.py create mode 100644 paddlevlp/models/groundingdino/backbone/swin_transformer.py create mode 100644 paddlevlp/models/groundingdino/bert_model.py create mode 100644 paddlevlp/models/groundingdino/bertwarper.py create mode 100644 paddlevlp/models/groundingdino/configuration.py create mode 100644 paddlevlp/models/groundingdino/csrc/README.md create mode 100644 paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc create mode 100644 paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu create mode 100644 paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py create mode 100644 paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py create mode 100644 paddlevlp/models/groundingdino/fuse_modules.py create mode 100644 paddlevlp/models/groundingdino/layers.py create mode 100644 paddlevlp/models/groundingdino/modeling.py create mode 100644 paddlevlp/models/groundingdino/ms_deform_attn.py create mode 100644 paddlevlp/models/groundingdino/transformer.py create mode 100644 paddlevlp/models/groundingdino/transformer_vanilla.py create mode 100644 paddlevlp/models/groundingdino/utils.py create mode 100644 paddlevlp/processors/groundingdino_processing.py diff --git a/paddlevlp/examples/groundingdino/README.md b/paddlevlp/examples/groundingdino/README.md new file mode 100644 index 00000000000000..d2a004578e15a7 --- /dev/null +++ b/paddlevlp/examples/groundingdino/README.md @@ -0,0 +1,25 @@ +# Grounding DINO + +## 1. 模型简介 + +Paddle implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. + + +## 2. Demo + +## 2.1 prepare +```bash +#Multi-scale deformable attention custom OP compilation +cd /paddlevlp/models/groundingdino/csrc/ +python setup_ms_deformable_attn_op.py install + +``` +## 2.2 dynamic inference +```bash +python3.8 run_predict.py -dt groundingdino-swint-ogc +-i image_you_want_to_detect.jpg \ +-o "dir you want to save the output" \ +-t "Detect Cat" +``` + + diff --git a/paddlevlp/examples/groundingdino/__init__.py b/paddlevlp/examples/groundingdino/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/examples/groundingdino/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
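For readers who prefer calling the API directly, the following is a condensed sketch of the programmatic flow that the `run_predict.py` script below implements; the class and checkpoint names come from this patch, while the image path, text prompt, and threshold are placeholders:

```python
import paddle
import paddle.nn.functional as F
from PIL import Image

from paddlevlp.models.groundingdino.modeling import GroundingDinoModel
from paddlevlp.processors.groundingdino_processing import GroudingDinoProcessor

# The processor wraps a BERT tokenizer plus the Grounding DINO image transforms.
processor = GroudingDinoProcessor.from_pretrained("bert-base-uncased")
model = GroundingDinoModel.from_pretrained("groundingdino-swint-ogc")

image = Image.open("cat.jpg").convert("RGB")  # placeholder image path
image_tensor, mask, tokenized = processor(images=image, text="Detect Cat")

with paddle.no_grad():
    outputs = model(
        image_tensor,
        mask,
        input_ids=tokenized["input_ids"],
        attention_mask=tokenized["attention_mask"],
        text_self_attention_masks=tokenized["text_self_attention_masks"],
        position_ids=tokenized["position_ids"],
    )

# Keep the queries whose strongest token logit clears the box threshold.
logits = F.sigmoid(outputs["pred_logits"])[0]  # (num_queries, 256)
boxes = outputs["pred_boxes"][0]               # (num_queries, 4), cxcywh in [0, 1]
keep = logits.max(axis=1) > 0.3
print(boxes[keep])
```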
diff --git a/paddlevlp/examples/groundingdino/run_predict.py b/paddlevlp/examples/groundingdino/run_predict.py new file mode 100644 index 00000000000000..f461caac41cf3d --- /dev/null +++ b/paddlevlp/examples/groundingdino/run_predict.py @@ -0,0 +1,124 @@ +import argparse +import os +import numpy as np +import paddle +import paddle.nn.functional as F + +from paddlevlp.processors.groundingdino_processing import GroudingDinoProcessor +from paddlevlp.models.groundingdino.modeling import GroundingDinoModel +from PIL import Image, ImageDraw, ImageFont + + +def plot_boxes_to_image(image_pil, tgt): + H, W = tgt["size"] + boxes = tgt["boxes"] + labels = tgt["labels"] + assert len(boxes) == len(labels), "boxes and labels must have same length" + + draw = ImageDraw.Draw(image_pil) + mask = Image.new("L", image_pil.size, 0) + mask_draw = ImageDraw.Draw(mask) + + # draw boxes and masks + for box, label in zip(boxes, labels): + # from 0..1 to 0..W, 0..H + box = box * paddle.to_tensor([W, H, W, H]) + # from xywh to xyxy + box[:2] -= box[2:] / 2 + box[2:] += box[:2] + # random color + color = tuple(np.random.randint(0, 255, size=3).tolist()) + # draw + x0, y0, x1, y1 = box.numpy() + x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) + + draw.rectangle([x0, y0, x1, y1], outline=color, width=6) + # draw.text((x0, y0), str(label), fill=color) + + font = ImageFont.load_default() + if hasattr(font, "getbbox"): + bbox = draw.textbbox((x0, y0), str(label), font) + else: + w, h = draw.textsize(str(label), font) + bbox = (x0, y0, w + x0, y0 + h) + # bbox = draw.textbbox((x0, y0), str(label)) + draw.rectangle(bbox, fill=color) + draw.text((x0, y0), str(label), fill="white") + + mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6) + + return image_pil, mask + +def main(): + parser = argparse.ArgumentParser("Grounding DINO example", add_help=True) + parser.add_argument("--dino_type", "-dt", type=str, default="groundingdino-swint-ogc", help="dino type") + parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file") + parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt") + parser.add_argument( + "--output_dir", "-o", type=str, default="outputs", help="output directory" + ) + + parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold") + parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold") + parser.add_argument( + "--visual", + type=eval, + default=True, + ) + + + args = parser.parse_args() + + + #bulid processor + processor = GroudingDinoProcessor.from_pretrained( + 'bert-base-uncased' + ) + #bulid model + print(f'dino_model {args.dino_type}') + dino_model = GroundingDinoModel.from_pretrained(args.dino_type) + + #read image + image_pil = Image.open(args.image_path).convert("RGB") + #preprocess image text_prompt + image_tensor,mask,tokenized_out = processor(images=image_pil,text=args.text_prompt) + + with paddle.no_grad(): + outputs = dino_model(image_tensor,mask, input_ids=tokenized_out['input_ids'], + attention_mask=tokenized_out['attention_mask'],text_self_attention_masks=tokenized_out['text_self_attention_masks'], + position_ids=tokenized_out['position_ids']) + + logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) + boxes = outputs["pred_boxes"][0] # (nq, 4) + + # filter output + logits_filt = logits.clone() + boxes_filt = boxes.clone() + filt_mask = logits_filt.max(axis=1) > args.box_threshold + logits_filt = logits_filt[filt_mask] # num_filt, 256 + boxes_filt = 
boxes_filt[filt_mask] # num_filt, 4 + + # build pred + pred_phrases = [] + for logit, box in zip(logits_filt, boxes_filt): + pred_phrase = processor.decode(logit > args.text_threshold) + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") + + + size = image_pil.size + pred_dict = { + "boxes": boxes_filt, + "size": [size[1], size[0]], # H,W + "labels": pred_phrases, + } + print("output:",pred_dict) + + if args.visual: + # make dir + os.makedirs(args.output_dir, exist_ok=True) + image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0] + image_with_box.save(os.path.join(args.output_dir, "pred.jpg")) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/__init__.py b/paddlevlp/models/groundingdino/__init__.py new file mode 100644 index 00000000000000..d1ff79f33aafb8 --- /dev/null +++ b/paddlevlp/models/groundingdino/__init__.py @@ -0,0 +1,14 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + diff --git a/paddlevlp/models/groundingdino/backbone/__init__.py b/paddlevlp/models/groundingdino/backbone/__init__.py new file mode 100644 index 00000000000000..76e4b272b479a2 --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/__init__.py @@ -0,0 +1 @@ +from .backbone import build_backbone diff --git a/paddlevlp/models/groundingdino/backbone/backbone.py b/paddlevlp/models/groundingdino/backbone/backbone.py new file mode 100644 index 00000000000000..397a1fc36b234f --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/backbone.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Backbone modules. 
+""" + +from typing import Dict, List, Optional +from collections import OrderedDict + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +from .position_encoding import build_position_encoding +from .swin_transformer import SwinTransformerModel + + +class Joiner(nn.Sequential): + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + + def forward(self, x:paddle.Tensor,m:paddle.take): + xs,masks = self[0](x,m) + pos = [] + for mask in masks: + pos.append(self[1](mask).astype(x.dtype)) + return xs, masks,pos + + +def build_backbone(args): + """ + Useful args: + - backbone: backbone name + - lr_backbone: + - dilation + - return_interm_indices: available: [0,1,2,3], [1,2,3], [3] + - backbone_freeze_keywords: + - use_checkpoint: for swin only for now + + """ + position_embedding = build_position_encoding(args) + train_backbone = True + if not train_backbone: + raise ValueError("Please set lr_backbone > 0") + return_interm_indices = args.return_interm_indices + assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]] + args.backbone_freeze_keywords + use_checkpoint = getattr(args, "use_checkpoint", False) + + if args.backbone in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ]: + pretrain_img_size = int(args.backbone.split("_")[-2]) + backbone = SwinTransformerModel.from_pretrained( + args.backbone, + pretrain_img_size=pretrain_img_size, + out_indices=tuple(return_interm_indices), + dilation=False, + use_checkpoint=use_checkpoint, + ) + + bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :] + else: + raise NotImplementedError("Unknown backbone {}".format(args.backbone)) + + assert len(bb_num_channels) == len( + return_interm_indices + ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}" + + model = Joiner(backbone, position_embedding) + model.num_channels = bb_num_channels + assert isinstance( + bb_num_channels, List + ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels)) + + return model diff --git a/paddlevlp/models/groundingdino/backbone/position_encoding.py b/paddlevlp/models/groundingdino/backbone/position_encoding.py new file mode 100644 index 00000000000000..821b0fcc161a6b --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/position_encoding.py @@ -0,0 +1,182 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Various positional encodings for the transformer. 
+""" +import math +from matplotlib.pyplot import axis + +import paddle +import paddle.nn as nn +from paddlenlp.utils.initializer import uniform_ + + + +class PositionEmbeddingSine(nn.Layer): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, mask:paddle.Tensor): + + assert mask is not None + not_mask = ~mask + y_embed = not_mask.astype(paddle.float32).cumsum(1) + x_embed = not_mask.astype(paddle.float32).cumsum(2) + if self.normalize: + eps = 1e-6 + # if os.environ.get("SHILONG_AMP", None) == '1': + # eps = 1e-4 + # else: + # eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype(paddle.float32x) + dim_t = self.temperature ** (dim_t / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = paddle.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos_y = paddle.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) + return pos + + +class PositionEmbeddingSineHW(nn.Layer): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__( + self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None + ): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperatureH = temperatureH + self.temperatureW = temperatureW + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, mask:paddle.Tensor): + + assert mask is not None + not_mask = ~mask + y_embed = not_mask.astype(paddle.float32).cumsum(1) + x_embed = not_mask.astype(paddle.float32).cumsum(2) + + # import ipdb; ipdb.set_trace() + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_tx = paddle.arange(self.num_pos_feats) + dim_tx = self.temperatureW ** (2 * (paddle.floor_divide(dim_tx, paddle.to_tensor(2))) / self.num_pos_feats) + pos_x = x_embed[:, :, :, None] / dim_tx + + dim_ty = paddle.arange(self.num_pos_feats) + dim_ty = self.temperatureH ** (2 * (paddle.floor_divide(dim_ty, paddle.to_tensor(2))) / self.num_pos_feats) + pos_y = y_embed[:, :, :, None] / dim_ty + + pos_x = paddle.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos_y = paddle.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) + + # import ipdb; ipdb.set_trace() + + return pos + + +class PositionEmbeddingLearned(nn.Layer): + """ + Absolute pos embedding, learned. + """ + + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + uniform_(self.row_embed.weight) + uniform_(self.col_embed.weight) + + def forward(self, x: paddle.Tensor): + + h, w = x.shape[-2:] + i = paddle.arange(w) + j = paddle.arange(h) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + pos = ( + paddle.concat( + [ + x_emb.unsqueeze(0).tile([h, 1, 1]), + y_emb.unsqueeze(1).tile([1, w, 1]), + ], + axis=-1, + ) + .transpose([2, 0, 1]) + .unsqueeze(0) + .tile([x.shape[0], 1, 1, 1]) + ) + return pos + + +def build_position_encoding(args): + N_steps = args.hidden_dim // 2 + if args.position_embedding in ("v2", "sine"): + # TODO find a better way of exposing other arguments + position_embedding = PositionEmbeddingSineHW( + N_steps, + temperatureH=args.pe_temperatureH, + temperatureW=args.pe_temperatureW, + normalize=True, + ) + elif args.position_embedding in ("v3", "learned"): + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f"not supported {args.position_embedding}") + + return position_embedding diff --git a/paddlevlp/models/groundingdino/backbone/swin_transformer.py b/paddlevlp/models/groundingdino/backbone/swin_transformer.py new file mode 100644 index 00000000000000..cd636f1b7965d6 --- /dev/null +++ b/paddlevlp/models/groundingdino/backbone/swin_transformer.py @@ -0,0 +1,897 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn.initializer import Constant +from ..layers import DropPath, to_2tuple +trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) + +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +""" swin_transformer model configuration""" +__all__ = ["SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION", "SwinTransformerConfig", "SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP"] + + +SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION = { + "swin_T_224_1k": { + "in_chans": 3, + "embed_dim": 96, + "depths": [2, 2, 6, 2], + "num_heads": [3, 6, 12, 24], + "window_size": 7, + "pretrain_img_size": 224, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False, + + + }, + "swin_B_224_22k": { + "in_chans": 3, + "embed_dim": 128, + "depths": [2, 2, 18, 2], + "num_heads": [4, 8, 16, 32], + "window_size": 7, + "pretrain_img_size": 224, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False + }, + "swin_B_384_22k": { + "in_chans": 3, + "embed_dim": 128, + "depths": [2, 2, 18, 2], + "num_heads": [4, 8, 16, 32], + "window_size": 12, + "pretrain_img_size": 384, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint":False + }, + "swin_L_224_22k": { + "in_chans": 3, + "embed_dim": 192, + "depths": [2, 2, 18, 2], + "num_heads": [6, 12, 24, 48], + "window_size": 7, + "pretrain_img_size": 224, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False + }, + "swin_L_384_22k":{ + "in_chans": 3, + "embed_dim": 192, + "depths": [2, 2, 18, 2], + "num_heads": [6, 12, 24, 48], + "window_size": 12, + "pretrain_img_size": 384, + "patch_size": 4, + "out_indices": (0, 1, 2, 3), + "mlp_ratio": 4.0, + "qkv_bias": True, + "qk_scale": None, + "drop_rate": 0.0, + "attn_drop_rate": 0.0, + "drop_path_rate": 0.2, + "norm_layer": "LayerNorm", + "ape": False, + "patch_norm": True, + "frozen_stages": -1, + "dilation": False, + "use_checkpoint": False + }, + +} + 
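+# Illustrative usage sketch: the entries above are assumed to mirror the keyword arguments of
+# `SwinTransformerConfig` (defined below), so a named variant can be built directly from its
+# configuration dict, e.g.
+#
+#     cfg = SwinTransformerConfig(**SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION["swin_T_224_1k"])
+#     backbone = SwinTransformerModel(cfg)
+#
+# `SwinTransformerModel.from_pretrained("swin_T_224_1k")` is expected to build the same config and
+# additionally fetch the weights listed in SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP below.
+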
+SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP = {
+    "model_state": {
+        "swin_T_224_1k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams",
+        "swin_B_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams",
+        "swin_B_384_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams",
+        "swin_L_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams",
+        "swin_L_384_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams",
+    }
+}
+
+
+class SwinTransformerConfig(PretrainedConfig):
+
+    model_type = "swintransformer"
+    pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION
+
+    def __init__(
+        self,
+        in_chans=3,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        pretrain_img_size=224,
+        patch_size=4,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.2,
+        norm_layer=nn.LayerNorm,
+        ape=False,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        dilation=False,
+        use_checkpoint=False
+    ):
+        super().__init__()
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.depths = depths
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.pretrain_img_size = pretrain_img_size
+        self.patch_size = patch_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.norm_layer = norm_layer
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.dilation = dilation
+        self.use_checkpoint = use_checkpoint
+
+class SwinTransformerPretrainedModel(PretrainedModel):
+    """
+    See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details.
+ """ + + model_config_file = "config.json" + config_class = SwinTransformerConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "swintransformer" + + pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP + + +class Mlp(nn.Layer): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape([B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape([B, H // window_size, W // window_size, window_size, window_size, -1]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) + return x + + +class WindowAttention(nn.Layer): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = self.create_parameter( + shape=[(2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads], + dtype=paddle.float32, + default_initializer=Constant(0.) 
+ ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape([B_, N, 3, self.num_heads, C // self.num_heads]) + .transpose([2, 0, 3, 1, 4]) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + index = self.relative_position_index.flatten() + + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([-1, nW, self.num_heads, N, N]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Layer): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.reshape([B, H, W, C]) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + pad_list = paddle.zeros([4],dtype="int32") + pad_list[1] = pad_r + pad_list[3] = pad_b + x = F.pad(x,pad_list,data_format='NHWC') + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll(x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, C] + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape([-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll(shifted_x, shifts=(self.shift_size, self.shift_size), axis=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :] + + x = x.reshape([B, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Layer): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.reshape([B, H, W, C]) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + pad_list = paddle.zeros([4],dtype="int32") + pad_list[1] = H % 2 + pad_list[3] = W % 2 + x = F.pad(x, pad_list) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C + + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Layer): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = paddle.ceil(paddle.to_tensor(H / self.window_size)).astype("int32") * self.window_size + Wp = paddle.ceil(paddle.to_tensor(W / self.window_size)).astype("int32") * self.window_size + img_mask = paddle.zeros((1,Hp,Wp,1), dtype=paddle.float32) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape([-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = -100.0 * paddle.ones_like(attn_mask) * (attn_mask != 0).astype(paddle.float32) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = recompute(blk, x, attn_mask, **{"preserve_rng_state": True}) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Layer): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.shape + if W % self.patch_size[1] != 0: + pad_list = paddle.zeros([4],dtype="int32") + pad_list[1] = self.patch_size[1] - W % self.patch_size[1] + x = F.pad(x, pad_list) + if H % self.patch_size[0] != 0: + pad_list = paddle.zeros([4],dtype="int32") + pad_list[3] = self.patch_size[0] - H % self.patch_size[0] + x = F.pad(x, pad_list) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.shape[2:] + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) + + return x + +@register_base_model +class SwinTransformerModel(SwinTransformerPretrainedModel): + """Swin Transformer backbone. 
+    A Paddle implementation of `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
+        https://arxiv.org/pdf/2103.14030
+    """
+
+    def __init__(self, config: SwinTransformerConfig):
+        super(SwinTransformerModel, self).__init__(config)
+
+        self.pretrain_img_size = config.pretrain_img_size
+        self.num_layers = len(config.depths)
+        self.in_chans = config.in_chans
+        self.embed_dim = config.embed_dim
+        self.ape = config.ape
+        self.patch_norm = config.patch_norm
+        self.norm_layer = nn.LayerNorm
+        self.out_indices = config.out_indices
+        self.frozen_stages = config.frozen_stages
+        self.dilation = config.dilation
+        self.patch_size = config.patch_size
+        self.drop_path_rate = config.drop_path_rate
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=self.patch_size,
+            in_chans=self.in_chans,
+            embed_dim=self.embed_dim,
+            norm_layer=self.norm_layer if self.patch_norm else None,
+        )
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(self.pretrain_img_size)
+            patch_size = to_2tuple(self.patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1],
+            ]
+
+            self.absolute_pos_embed = self.create_parameter(
+                shape=[1, self.embed_dim, patches_resolution[0], patches_resolution[1]],
+                dtype=paddle.float32,
+                default_initializer=Constant(0.)
+            )
+            trunc_normal_(self.absolute_pos_embed)
+
+        self.pos_drop = nn.Dropout(p=config.drop_rate)
+
+        # stochastic depth
+        dpr = [
+            x.item() for x in paddle.linspace(0, config.drop_path_rate, sum(config.depths))
+        ]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.LayerList()
+        # prepare downsample list
+        downsamplelist = [PatchMerging for i in range(self.num_layers)]
+        downsamplelist[-1] = None
+        num_features = [int(self.embed_dim * 2**i) for i in range(self.num_layers)]
+        if self.dilation:
+            downsamplelist[-2] = None
+            num_features[-1] = int(self.embed_dim * 2 ** (self.num_layers - 1)) // 2
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=num_features[i_layer],
+                depth=config.depths[i_layer],
+                num_heads=config.num_heads[i_layer],
+                window_size=config.window_size,
+                mlp_ratio=config.mlp_ratio,
+                qkv_bias=config.qkv_bias,
+                qk_scale=config.qk_scale,
+                drop=config.drop_rate,
+                attn_drop=config.attn_drop_rate,
+                drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+                norm_layer=self.norm_layer,
+                downsample=downsamplelist[i_layer],
+                use_checkpoint=config.use_checkpoint,
+            )
+            self.layers.append(layer)
+
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in self.out_indices:
+            layer = self.norm_layer(num_features[i_layer])
+            layer_name = f"norm{i_layer}"
+            self.add_sublayer(layer_name, layer)
+
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.stop_gradient = True
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.stop_gradient = True
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.stop_gradient = True
+
+    def forward_raw(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+
+        Wh, Ww = x.shape[2:4]
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
+            )
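+            # Resizing the learned embedding this way lets inputs run at resolutions other than
+            # the pretraining grid of pretrain_img_size // patch_size positions per side.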
+ x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C + else: + x = x.flatten(2).transpose([0, 2, 1]) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # outs: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + return tuple(outs) + + def forward_with_mask(self, x:paddle.Tensor, m:paddle.Tensor): + + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.shape[2], x.shape[3] + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C + else: + x = x.flatten(2).transpose([0, 2, 1]) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) + outs.append(out) + + feat_dict = [] + mask_dict = [] + for idx, out_i in enumerate(outs): + assert m is not None + mask = F.interpolate(m[None].cast(paddle.float32), size=out_i.shape[-2:]).cast(paddle.bool)[0] + feat_dict.append(out_i) + mask_dict.append(mask) + + return feat_dict,mask_dict + + def forward(self, x:paddle.Tensor, m=None): + if m is not None: + return self.forward_with_mask(x,m) + else: + return self.forward_raw(x) + + diff --git a/paddlevlp/models/groundingdino/bert_model.py b/paddlevlp/models/groundingdino/bert_model.py new file mode 100644 index 00000000000000..e0cbf877fba3a9 --- /dev/null +++ b/paddlevlp/models/groundingdino/bert_model.py @@ -0,0 +1,715 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlenlp.taskflow.utils import pad_batch_data +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import math +import numpy as np +import warnings +from paddlenlp.transformers.bert.modeling import BaseModelOutputWithPoolingAndCrossAttentions + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + self.act = nn.functional.gelu + + def forward(self, input): + return self.act(input) + + +class BertSelfAttention(nn.Layer): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # 'absolute' + self.clamp_min_for_underflow = clamp_min_for_underflow + self.clamp_max_for_overflow = clamp_max_for_overflow + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = tuple(x.shape[:-1]) + (self.num_attention_heads, self.attention_head_size) + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: # here + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + # return query_layer,key_layer + if self.is_decoder: # False + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + # return attention_scores + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if self.clamp_min_for_underflow: + attention_scores = paddle.clip(attention_scores, min=-50000) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attention_scores = paddle.clip(attention_scores, max=50000) # Do not increase 50000, data type half has quite limited range + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # if math.isnan(attention_probs.sum().item()): + # for i in range(attention_probs.size(1)): + # for j in range(attention_probs.size(2)): + # if math.isnan(attention_probs[0, i, j].sum().item()): + # print(i, j) + # pdb.set_trace() + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = tuple(context_layer.shape[:-2]) + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) # diff 7.2274e-06 + hidden_states = self.dropout(hidden_states) # diff 4.22e-05 + # hidden_states + input_tensor diff : 7.22e-6 + hidden_states = self.LayerNorm(hidden_states + input_tensor) #diff 1.087e-05 + return hidden_states + + +class BertAttention(nn.Layer): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): + super().__init__() + self.self = BertSelfAttention(config, clamp_min_for_underflow, clamp_max_for_overflow) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) #pass + # return self_outputs + attention_output = self.output(self_outputs[0], hidden_states) + # print(attention_output.shape, self_outputs[0].shape, len(self_outputs)) + # attention_output 1.087e-05, self_outputs 1.31e-06 , hidden_states 1.33e-08 + # return attention_output, self_outputs, hidden_states + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = GELUActivation() + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +class BertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + 
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", paddle.arange(config.max_position_embeddings).reshape((1, -1))) + self.register_buffer( + "token_type_ids", paddle.zeros(self.position_ids.shape, dtype=paddle.int64), persistable=False + ) + + def forward( + self, + input_ids = None, + token_type_ids = None, + position_ids = None, + inputs_embeds = None, + past_key_values_length = 0, + ): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([input_shape[0], seq_length]) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + # return inputs_embeds + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + +class BertLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask = None, + head_mask = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_value = None, + output_attentions = False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) 
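+        # self_attention_outputs layout: index 0 holds the attended hidden states, the attention
+        # probabilities follow when output_attentions=True, and the decoder key/value cache (if any)
+        # is appended as the last element.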
+ # return self_attention_outputs + attention_output = self_attention_outputs[0] + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = self.feed_forward_chunk(attention_output) + # return layer_output, attention_output + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + +class BertEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask = None, + head_mask = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_values = None, + use_cache = None, + output_attentions = False, + output_hidden_states = False, + return_dict = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + # return layer_outputs + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + 
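+        # Return either a plain tuple (None entries dropped) or a
+        # BaseModelOutputWithPoolingAndCrossAttentions for attribute-style access.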
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            pooler_output=None,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class BertPooler(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class BertModel(nn.Layer):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder, the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__()
+        self.config = config
+
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.pooler = BertPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        # self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_extended_attention_mask(
+        self, attention_mask: paddle.Tensor, input_shape: Tuple[int], device: str = None, dtype=None
+    ) -> paddle.Tensor:
+        if dtype is None:
+            dtype = np.float32
+
+        if not (attention_mask.dim() == 2 and self.config.is_decoder):
+            # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
+            if device is not None:
+                warnings.warn(
+                    "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+                )
+            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+            # ourselves in which case we just need to make it broadcastable to all heads.
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = paddle.cast(extended_attention_mask, dtype=dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * np.finfo(dtype).min + return extended_attention_mask + + def get_head_mask( + self, head_mask, num_hidden_layers, is_attention_chunked = False + ): + head_mask = [None] * num_hidden_layers + return head_mask + + def forward( + self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_values = None, + use_cache = None, + output_attentions = None, + output_hidden_states = None, + return_dict = None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([batch_size, seq_length]) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + # return embedding_output + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # return encoder_outputs + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if 
not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + +class language_model(nn.Layer): + def __init__(self, cfg, bert_config): + super().__init__() + self.cfg = cfg + self.bert_name = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE + print("LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) + bert_config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT + + # bert_config.attention_probs_dropout_prob = 0.0 + # bert_config.hidden_dropout_prob = 0.0 + + self.model = BertModel(bert_config) + self.language_dim = 768 + self.num_layers = cfg.MODEL.LANGUAGE_BACKBONE.N_LAYERS + + def forward(self, x): + input = x["input_ids"] + mask = x["attention_mask"] + + if self.cfg.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS: # true + # with padding, always 256 + outputs = self.model( + input_ids=input, + attention_mask=mask, + output_hidden_states=True, + ) + # outputs has 13 layers, 1 input layer and 12 hidden layers + encoded_layers = outputs.hidden_states[1:] + features = None + features = paddle.stack(encoded_layers[-self.num_layers:], 1).mean(1) + + # language embedding has shape [len(phrase), seq_len, language_dim] + features = features / self.num_layers + + embedded = paddle.cast(features * mask.unsqueeze(-1), paddle.float32) + aggregate = embedded.sum(1) / (paddle.cast(mask.sum(-1).unsqueeze(-1),paddle.float32)) + + ret = { + "aggregate": aggregate, + "embedded": embedded, + "masks": mask, + "hidden": encoded_layers[-1] + } + return ret \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/bertwarper.py b/paddlevlp/models/groundingdino/bertwarper.py new file mode 100644 index 00000000000000..d4c75bccdbe339 --- /dev/null +++ b/paddlevlp/models/groundingdino/bertwarper.py @@ -0,0 +1,277 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .bert_model import BertModel +from paddlenlp.transformers.model_outputs import BaseModelOutputWithPoolingAndCrossAttentions + + +class BertModelWarper(nn.Layer): + def __init__(self, bert_model): + super().__init__() + bert_model = BertModel(bert_model.config) + + self.config = bert_model.config + self.embeddings = bert_model.embeddings + self.encoder = bert_model.encoder + self.pooler = bert_model.pooler + + self.get_extended_attention_mask = bert_model.get_extended_attention_mask + # self.invert_attention_mask = bert_model.invert_attention_mask + self.get_head_mask = bert_model.get_head_mask + self.use_return_dict = True + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = paddle.ones( + ((batch_size, seq_length + past_key_values_length)) + ) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + # if self.config.is_decoder and encoder_hidden_states is not None: + # encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + # encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + # if encoder_attention_mask is None: + # encoder_attention_mask = paddle.ones(encoder_hidden_shape, device=device) + # encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + # else: + # encoder_extended_attention_mask = None + encoder_extended_attention_mask = None + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + 
return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class TextEncoderShell(nn.Layer): + def __init__(self, text_encoder): + super().__init__() + self.text_encoder = text_encoder + self.config = self.text_encoder.config + + def forward(self, **kw): + # feed into text encoder + return self.text_encoder(**kw) + + +def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool) + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = paddle.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + paddle.eye(num_token, dtype=paddle.bool).unsqueeze(0).tile([bs, 1, 1]) + ) + position_ids = paddle.zeros((bs, num_token)) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + + previous_col = col + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.cast(paddle.int64) + + +def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool) + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = paddle.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) + ) + position_ids = paddle.zeros((bs, num_token)) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros([num_token,]).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + # cate_to_token_mask_list = [ + # paddle.stack(cate_to_token_mask_listi, axis=0) + # for cate_to_token_mask_listi in cate_to_token_mask_list + # ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list diff --git a/paddlevlp/models/groundingdino/configuration.py b/paddlevlp/models/groundingdino/configuration.py new file mode 100644 index 00000000000000..d39c42461b99d0 --- /dev/null +++ b/paddlevlp/models/groundingdino/configuration.py @@ -0,0 +1,168 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
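+
+# Example (illustrative sketch, not part of the public API): build the released
+# "groundingdino-swint-ogc" configuration and override individual fields before
+# constructing the model:
+#
+#     config = GroundingDinoConfig(**GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION["groundingdino-swint-ogc"])
+#     config.use_checkpoint = True  # e.g. turn on checkpointing to save memory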
+ +""" GroundingDino model configuration""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION", "GroundingDinoConfig", "GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP"] + +GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION = { + "groundingdino-swint-ogc": { + "modelname" : "groundingdino", + "backbone" : "swin_T_224_1k", + "position_embedding" : "sine", + "pe_temperatureH" : 20, + "pe_temperatureW" : 20, + "return_interm_indices" : [1, 2, 3], + "backbone_freeze_keywords" : None, + "enc_layers" : 6, + "dec_layers" : 6, + "pre_norm" : False, + "dim_feedforward" : 2048, + "hidden_dim" : 256, + "dropout" : 0.0, + "nheads" : 8, + "num_queries" : 900, + "query_dim" : 4, + "num_patterns" : 0, + "num_feature_levels" : 4, + "enc_n_points" : 4, + "dec_n_points" : 4, + "two_stage_type" : "standard", + "two_stage_bbox_embed_share" : False, + "two_stage_class_embed_share" : False, + "transformer_activation" : "relu", + "dec_pred_bbox_embed_share" : True, + "dn_box_noise_scale" : 1.0, + "dn_label_noise_ratio" : 0.5, + "dn_label_coef" : 1.0, + "dn_bbox_coef" : 1.0, + "embed_init_tgt" :True, + "dn_labelbook_size" : 2000, + "max_text_len" : 256, + "text_encoder_type" : "bert-base-uncased", + "use_text_enhancer" : True, + "use_fusion_layer" : True, + "use_checkpoint" : False, + "use_transformer_ckpt" : False, + "use_text_cross_attention" : True, + "text_dropout" : 0.0, + "fusion_dropout" : 0.0, + "fusion_droppath" : 0.1, + "sub_sentence_present" : True + }, +} + +GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "groundingdino-swint-ogc": "https://bj.bcebos.com/v1/paddledet/models/groundingdino_swint_ogc.pdparams", + } +} + + +class GroundingDinoConfig(PretrainedConfig): + + model_type = "groundingdino" + pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + modelname = "groundingdino", + backbone = "swin_T_224_1k", + position_embedding = "sine", + pe_temperatureH = 20, + pe_temperatureW = 20, + return_interm_indices = [1, 2, 3], + backbone_freeze_keywords = None, + enc_layers = 6, + dec_layers = 6, + pre_norm = False, + dim_feedforward = 2048, + hidden_dim = 256, + dropout = 0.0, + nheads = 8, + num_queries = 900, + query_dim = 4, + num_patterns = 0, + num_feature_levels = 4, + enc_n_points = 4, + dec_n_points = 4, + two_stage_type = "standard", + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, + transformer_activation = "relu", + dec_pred_bbox_embed_share = True, + dn_box_noise_scale = 1.0, + dn_label_noise_ratio = 0.5, + dn_label_coef = 1.0, + dn_bbox_coef = 1.0, + embed_init_tgt = True, + dn_labelbook_size = 2000, + max_text_len = 256, + text_encoder_type = "bert-base-uncased", + use_text_enhancer = True, + use_fusion_layer = True, + use_checkpoint = False, + use_transformer_ckpt = False, + use_text_cross_attention = True, + text_dropout = 0.0, + fusion_dropout = 0.0, + fusion_droppath = 0.1, + sub_sentence_present = True + ): + super().__init__() + self.modelname = modelname + self.backbone = backbone + self.position_embedding = position_embedding + self.pe_temperatureH = pe_temperatureH + self.pe_temperatureW = pe_temperatureW + self.return_interm_indices = return_interm_indices + self.backbone_freeze_keywords = backbone_freeze_keywords + self.enc_layers = enc_layers + self.dec_layers = dec_layers + self.pre_norm = pre_norm + self.dim_feedforward = dim_feedforward + self.hidden_dim = hidden_dim + self.dropout = dropout + self.nheads = 
nheads
+        self.num_queries = num_queries
+        self.query_dim = query_dim
+        self.num_patterns = num_patterns
+        self.num_feature_levels = num_feature_levels
+        self.enc_n_points = enc_n_points
+        self.dec_n_points = dec_n_points
+        self.two_stage_type = two_stage_type
+        self.two_stage_bbox_embed_share = two_stage_bbox_embed_share
+        self.two_stage_class_embed_share = two_stage_class_embed_share
+        self.transformer_activation = transformer_activation
+        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
+        self.dn_box_noise_scale = dn_box_noise_scale
+        self.dn_label_noise_ratio = dn_label_noise_ratio
+        self.dn_label_coef = dn_label_coef
+        self.dn_bbox_coef = dn_bbox_coef
+        self.embed_init_tgt = embed_init_tgt
+        self.dn_labelbook_size = dn_labelbook_size
+        self.max_text_len = max_text_len
+        self.text_encoder_type = text_encoder_type
+        self.use_text_enhancer = use_text_enhancer
+        self.use_fusion_layer = use_fusion_layer
+        self.use_checkpoint = use_checkpoint
+        self.use_transformer_ckpt = use_transformer_ckpt
+        self.use_text_cross_attention = use_text_cross_attention
+        self.text_dropout = text_dropout
+        self.fusion_dropout = fusion_dropout
+        self.fusion_droppath = fusion_droppath
+        self.sub_sentence_present = sub_sentence_present
diff --git a/paddlevlp/models/groundingdino/csrc/README.md b/paddlevlp/models/groundingdino/csrc/README.md
new file mode 100644
index 00000000000000..290926d56a3ae2
--- /dev/null
+++ b/paddlevlp/models/groundingdino/csrc/README.md
@@ -0,0 +1,85 @@
+# Building the multi-scale deformable attention custom op
+This custom op is implemented following the PaddlePaddle guide on [custom external operators](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html).
+
+## 1. Requirements
+- Paddle >= 2.3.2
+- gcc 8.2
+
+## 2. Installation
+Build and install the op from the current directory:
+```
+cd paddlevlp/models/groundingdino/csrc/
+python setup_ms_deformable_attn_op.py install
+```
+
+After compilation the op is ready to use. A usage example of `ms_deformable_attn`:
+```
+import paddle
+
+# import the custom op
+from deformable_detr_ops import ms_deformable_attn
+
+# build fake input tensors
+bs, n_heads, c = 2, 8, 8
+query_length, n_levels, n_points = 2, 2, 2
+spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
+level_start_index = paddle.concat((paddle.to_tensor(
+    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
+value_length = sum([(H * W).item() for H, W in spatial_shapes])
+
+def get_test_tensors(channels):
+    value = paddle.rand(
+        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
+    sampling_locations = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points, 2],
+        dtype=paddle.float32)
+    attention_weights = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points],
+        dtype=paddle.float32) + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
+        -2, keepdim=True)
+    return [value, sampling_locations, attention_weights]
+
+value, sampling_locations, attention_weights = get_test_tensors(c)
+
+output = ms_deformable_attn(value,
+                            spatial_shapes,
+                            level_start_index,
+                            sampling_locations,
+                            attention_weights)
+```
+
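+As a quick sanity check (a sketch that assumes the op compiled successfully and a GPU is available), the returned tensor should have shape `[bs, query_length, n_heads * c]`, which matches the op's registered InferShape rule:
+```
+assert output.shape == [bs, query_length, n_heads * c]  # [2, 2, 64] for the fake inputs above
+```
+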
+## 3. Unit test
+Run the unit test to verify that the custom op works correctly:
+```
+python test_ms_deformable_attn_op.py
+```
+On success, the following is printed:
+```
+*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
+*tensor1 True check_gradient_numerical(D=30)
+*tensor2 True check_gradient_numerical(D=30)
+*tensor3 True check_gradient_numerical(D=30)
+*tensor1 True check_gradient_numerical(D=32)
+*tensor2 True check_gradient_numerical(D=32)
+*tensor3 True check_gradient_numerical(D=32)
+*tensor1 True check_gradient_numerical(D=64)
+*tensor2 True check_gradient_numerical(D=64)
+*tensor3 True check_gradient_numerical(D=64)
+*tensor1 True check_gradient_numerical(D=71)
+*tensor2 True check_gradient_numerical(D=71)
+*tensor3 True check_gradient_numerical(D=71)
+*tensor1 True check_gradient_numerical(D=128)
+*tensor2 True check_gradient_numerical(D=128)
+*tensor3 True check_gradient_numerical(D=128)
+*tensor1 True check_gradient_numerical(D=1024)
+*tensor2 True check_gradient_numerical(D=1024)
+*tensor3 True check_gradient_numerical(D=1024)
+*tensor1 True check_gradient_numerical(D=1025)
+*tensor2 True check_gradient_numerical(D=1025)
+*tensor3 True check_gradient_numerical(D=1025)
+*tensor1 True check_gradient_numerical(D=2048)
+*tensor2 True check_gradient_numerical(D=2048)
+*tensor3 True check_gradient_numerical(D=2048)
+*tensor1 True check_gradient_numerical(D=3096)
+*tensor2 True check_gradient_numerical(D=3096)
+*tensor3 True check_gradient_numerical(D=3096)
+```
diff --git a/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc
new file mode 100644
index 00000000000000..d1758adbcd9951
--- /dev/null
+++ b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/extension.h" + +#include + +// declare GPU implementation +std::vector +MSDeformableAttnCUDAForward(const paddle::Tensor &value, + const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights); + +std::vector MSDeformableAttnCUDABackward( + const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out); + +//// CPU not implemented + +std::vector> +MSDeformableAttnInferShape(std::vector value_shape, + std::vector value_spatial_shapes_shape, + std::vector value_level_start_index_shape, + std::vector sampling_locations_shape, + std::vector attention_weights_shape) { + return {{value_shape[0], sampling_locations_shape[1], + value_shape[2] * value_shape[3]}}; +} + +std::vector +MSDeformableAttnInferDtype(paddle::DataType value_dtype, + paddle::DataType value_spatial_shapes_dtype, + paddle::DataType value_level_start_index_dtype, + paddle::DataType sampling_locations_dtype, + paddle::DataType attention_weights_dtype) { + return {value_dtype}; +} + +PD_BUILD_OP(ms_deformable_attn) + .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations", + "AttentionWeights"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype)); + +PD_BUILD_GRAD_OP(ms_deformable_attn) + .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations", + "AttentionWeights", paddle::Grad("Out")}) + .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"), + paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"), + paddle::Grad("AttentionWeights")}) + .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward)); diff --git a/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu new file mode 100644 index 00000000000000..d5a8d16181bb53 --- /dev/null +++ b/paddlevlp/models/groundingdino/csrc/ms_deformable_attn_op.cu @@ -0,0 +1,1073 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/extension.h" + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) { + return (N + num_threads - 1) / num_threads; +} + +// forward bilinear +template +__device__ data_t deformable_attn_bilinear_forward( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +// forward kernel +template +__global__ void deformable_attn_cuda_kernel_forward( + const int n, const data_t *data_value, const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, const data_t *data_sampling_loc, + const data_t *data_attn_weight, const int batch_size, + const int value_length, const int num_heads, const int channels, + const int num_levels, const int query_length, const int num_points, + data_t *output_data_ptr) { + CUDA_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + data_t *data_ptr = output_data_ptr + index; + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + data_t col = 0; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset + + level_start_id * qid_stride); + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = 
data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + col += deformable_attn_bilinear_forward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, + h_im, w_im, m_col, c_col) * + weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_ptr = col; + } +} + +#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +// forward +std::vector +MSDeformableAttnCUDAForward(const paddle::Tensor &value, + const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights) { + + CHECK_INPUT_GPU(value); + CHECK_INPUT_GPU(value_spatial_shapes); + CHECK_INPUT_GPU(value_level_start_index); + CHECK_INPUT_GPU(sampling_locations); + CHECK_INPUT_GPU(attention_weights); + + const int batch_size = value.shape()[0]; + const int value_length = value.shape()[1]; + const int num_heads = value.shape()[2]; + const int channels = value.shape()[3]; + + const int num_levels = value_spatial_shapes.shape()[0]; + const int query_length = sampling_locations.shape()[1]; + const int num_points = sampling_locations.shape()[4]; + + auto output = paddle::full({batch_size, query_length, num_heads * channels}, + 0, value.dtype(), paddle::GPUPlace()); + + const int num_kernels = batch_size * query_length * num_heads * channels; + deformable_attn_cuda_kernel_forward + <<>>(num_kernels, value.data(), + value_spatial_shapes.data(), + value_level_start_index.data(), + sampling_locations.data(), + attention_weights.data(), batch_size, + value_length, num_heads, channels, num_levels, + query_length, num_points, output.data()); + return {output}; +} + +// backward bilinear +template +__device__ void deformable_attn_bilinear_backward( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c, const data_t &top_grad, + const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * 
top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void deformable_attn_bilinear_backward_gm( + const data_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c, const data_t &top_grad, + const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + +// backward kernels +// channels > 1024 +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t 
*data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; 
+ data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_gm( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// channels <= 1024 +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = 
_temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockSize; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= 
query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % 
query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const 
data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// backward branch +template +void deformable_attn_cuda_backward( + cudaStream_t stream, const data_t *grad_out, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int num_threads = + (channels > CUDA_NUM_THREADS) ? 
CUDA_NUM_THREADS : channels; + const int num_kernels = batch_size * query_length * num_heads * channels; + const int num_actual_kernels = + batch_size * query_length * num_heads * channels; + if (channels > 1024) { + if ((channels & 1023) == 0) { + deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_gm + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + } + } else { + switch (channels) { + case 1: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 2: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 4: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 8: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 16: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 32: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 64: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 128: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, 
data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 256: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 512: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 1024: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + default: + if (channels < 64) { + deformable_attn_cuda_kernel_backward_shm_reduce_v1 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_shm_reduce_v2 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } + } + } +} + +// backward +std::vector MSDeformableAttnCUDABackward( + const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) { + + CHECK_INPUT_GPU(value); + CHECK_INPUT_GPU(value_spatial_shapes); + CHECK_INPUT_GPU(value_level_start_index); + CHECK_INPUT_GPU(sampling_locations); + CHECK_INPUT_GPU(attention_weights); + CHECK_INPUT_GPU(grad_out); + + const int batch_size = value.shape()[0]; + const int value_length = value.shape()[1]; + const int num_heads = value.shape()[2]; + const int channels = value.shape()[3]; + + const int num_levels = value_spatial_shapes.shape()[0]; + const int query_length = sampling_locations.shape()[1]; + const int num_points = sampling_locations.shape()[4]; + + auto grad_value = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_spatial_shapes = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_level_start_index = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_sampling_locations = + paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(), + paddle::GPUPlace()); + auto grad_attention_weights = + paddle::full(attention_weights.shape(), 0, attention_weights.dtype(), + paddle::GPUPlace()); + + deformable_attn_cuda_backward( + value.stream(), grad_out.data(), value.data(), + value_spatial_shapes.data(), + 
value_level_start_index.data(), sampling_locations.data(), + attention_weights.data(), batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, grad_value.data(), + grad_sampling_locations.data(), + grad_attention_weights.data()); + + return {grad_value, grad_spatial_shapes, grad_level_start_index, + grad_sampling_locations, grad_attention_weights}; +} diff --git a/paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py b/paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py new file mode 100644 index 00000000000000..7c3c386677e5d5 --- /dev/null +++ b/paddlevlp/models/groundingdino/csrc/setup_ms_deformable_attn_op.py @@ -0,0 +1,7 @@ +from paddle.utils.cpp_extension import CUDAExtension, setup + +if __name__ == "__main__": + setup( + name='deformable_detr_ops', + ext_modules=CUDAExtension( + sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu'])) diff --git a/paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py b/paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py new file mode 100644 index 00000000000000..94a05737cbcd6d --- /dev/null +++ b/paddlevlp/models/groundingdino/csrc/test_ms_deformable_attn_op.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
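+
+# A minimal, self-contained usage sketch for the custom op exercised below
+# (illustrative only; it assumes the extension was already built from the
+# sources listed in setup_ms_deformable_attn_op.py, e.g. with
+# `python setup_ms_deformable_attn_op.py install`):
+#
+#   import paddle
+#   from deformable_detr_ops import ms_deformable_attn
+#
+#   # value:              [bs, value_length, n_heads, c]
+#   # spatial_shapes:     [n_levels, 2] (int64), value_length == sum(H_l * W_l)
+#   # level_start_index:  [n_levels]   (int64)
+#   # sampling_locations: [bs, query_length, n_heads, n_levels, n_points, 2] in [0, 1]
+#   # attention_weights:  [bs, query_length, n_heads, n_levels, n_points]
+#   spatial_shapes = paddle.to_tensor([[6, 4], [3, 2]], dtype=paddle.int64)
+#   level_start_index = paddle.to_tensor([0, 24], dtype=paddle.int64)
+#   value = paddle.rand([2, 30, 8, 16])
+#   sampling_locations = paddle.rand([2, 2, 8, 2, 2, 2])
+#   attention_weights = paddle.rand([2, 2, 8, 2, 2])
+#   attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+#   out = ms_deformable_attn(value, spatial_shapes, level_start_index,
+#                            sampling_locations, attention_weights)  # [2, 2, 8 * 16]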
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import os +import sys +import random +import numpy as np +import paddle +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.modeling.transformers.utils import deformable_attention_core_func +ms_deform_attn_core_paddle = deformable_attention_core_func + +try: + gpu_index = int(sys.argv[1]) +except: + gpu_index = 0 +print(f'Use gpu {gpu_index} to test...') +paddle.set_device(f'gpu:{gpu_index}') + +try: + from deformable_detr_ops import ms_deformable_attn +except Exception as e: + print('import deformable_detr_ops error', e) + sys.exit(-1) + +paddle.seed(1) +random.seed(1) +np.random.seed(1) + +bs, n_heads, c = 2, 8, 8 +query_length, n_levels, n_points = 2, 2, 2 +spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) +level_start_index = paddle.concat((paddle.to_tensor( + [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) +value_length = sum([(H * W).item() for H, W in spatial_shapes]) + + +def get_test_tensors(channels): + value = paddle.rand( + [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 + sampling_locations = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points, 2], + dtype=paddle.float32) + attention_weights = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points], + dtype=paddle.float32) + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum( + -2, keepdim=True) + + return [value, sampling_locations, attention_weights] + + +@paddle.no_grad() +def check_forward_equal_with_paddle_float(): + value, sampling_locations, attention_weights = get_test_tensors(c) + + output_paddle = ms_deform_attn_core_paddle( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights).detach().cpu() + output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index, + sampling_locations, + attention_weights).detach().cpu() + fwdok = paddle.allclose( + output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() + max_abs_err = (output_cuda - output_paddle).abs().max().item() + max_rel_err = ( + (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() + + print( + f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}' + ) + + +def check_gradient_numerical(channels=4): + value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors( + channels) + value_paddle.stop_gradient = False + sampling_locations_paddle.stop_gradient = False + attention_weights_paddle.stop_gradient = False + + value_cuda = value_paddle.detach().clone() + sampling_locations_cuda = sampling_locations_paddle.detach().clone() + attention_weights_cuda = attention_weights_paddle.detach().clone() + value_cuda.stop_gradient = False + sampling_locations_cuda.stop_gradient = False + attention_weights_cuda.stop_gradient = False + + output_paddle = ms_deform_attn_core_paddle( + value_paddle, spatial_shapes, level_start_index, + sampling_locations_paddle, attention_weights_paddle) + output_paddle.sum().backward() + + output_cuda = ms_deformable_attn(value_cuda, spatial_shapes, + level_start_index, sampling_locations_cuda, + attention_weights_cuda) + output_cuda.sum().backward() + + res = paddle.allclose( + value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() + 
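+    # Note: the CUDA backward accumulates gradients in parallel (see the
+    # shared-memory reductions in ms_deformable_attn_op.cu), so bitwise
+    # equality with the pure-Paddle reference is not expected; the loose
+    # rtol/atol above only require the two backward paths to agree up to
+    # float32 accumulation-order noise.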
print(f'*tensor1 {res} check_gradient_numerical(D={channels})') + + res = paddle.allclose( + sampling_locations_paddle.grad, + sampling_locations_cuda.grad, + rtol=1e-2, + atol=1e-3).item() + print(f'*tensor2 {res} check_gradient_numerical(D={channels})') + + res = paddle.allclose( + attention_weights_paddle.grad, + attention_weights_cuda.grad, + rtol=1e-2, + atol=1e-3).item() + print(f'*tensor3 {res} check_gradient_numerical(D={channels})') + + +if __name__ == '__main__': + check_forward_equal_with_paddle_float() + + for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]: + check_gradient_numerical(channels) diff --git a/paddlevlp/models/groundingdino/fuse_modules.py b/paddlevlp/models/groundingdino/fuse_modules.py new file mode 100644 index 00000000000000..0dc731cfa66e7d --- /dev/null +++ b/paddlevlp/models/groundingdino/fuse_modules.py @@ -0,0 +1,312 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant +from .layers import DropPath +from paddlenlp.utils.initializer import constant_,xavier_uniform_ +from .utils import masked_fill + + +class FeatureResizer(nn.Layer): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). 
+ """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def l1norm(X, dim, eps=1e-8): + """L1-normalize columns of X""" + norm = paddle.abs(X).sum(axis=dim, keepdim=True) + eps + X = paddle.divide(X, norm) + return X + + +def l2norm(X, dim, eps=1e-8): + """L2-normalize columns of X""" + norm = paddle.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps + X = paddle.divide(X, norm) + return X + + +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): + """ + query: (n_context, queryL, d) + context: (n_context, sourceL, d) + """ + batch_size_q, queryL = query.shape[:2] + batch_size, sourceL = context.shape[:2] + + # Get attention + # --> (batch, d, queryL) + queryT = query.transpose([0, 2, 1]) + + # (batch, sourceL, d)(batch, d, queryL) + # --> (batch, sourceL, queryL) + attn = paddle.bmm(context, queryT) + if raw_feature_norm == "softmax": + # --> (batch*sourceL, queryL) + attn = attn.reshape([batch_size * sourceL, queryL]) + attn = nn.Softmax()(attn) + # --> (batch, sourceL, queryL) + attn = attn.reshape(batch_size, sourceL, queryL) + elif raw_feature_norm == "l2norm": + attn = l2norm(attn, 2) + elif raw_feature_norm == "clipped_l2norm": + attn = nn.LeakyReLU(0.1)(attn) + attn = l2norm(attn, 2) + else: + raise ValueError("unknown first norm type:", raw_feature_norm) + # --> (batch, queryL, sourceL) + attn = attn.transpose([0, 2, 1]) + # --> (batch*queryL, sourceL) + attn = attn.reshape([batch_size * queryL, sourceL]) + attn = nn.Softmax()(attn * smooth) + # --> (batch, queryL, sourceL) + attn = attn.reshape([batch_size, queryL, sourceL]) + # --> (batch, sourceL, queryL) + attnT = attn.transpose([0, 2, 1]) + + # --> (batch, d, sourceL) + contextT = context.transpose([0, 2, 1]) + # (batch x d x sourceL)(batch x sourceL x queryL) + # --> (batch, d, queryL) + weightedContext = paddle.bmm(contextT, attnT) + # --> (batch, queryL, d) + weightedContext = weightedContext.transpose([0, 2, 1]) + + return weightedContext, attnT + + +class BiMultiHeadAttention(nn.Layer): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.l_proj = nn.Linear(self.l_dim, self.embed_dim) + self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) + + self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) + self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor, seq_len, bsz): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def _reset_parameters(self): + xavier_uniform_(self.v_proj.weight) + constant_(self.v_proj.bias) + xavier_uniform_(self.l_proj.weight) + constant_(self.l_proj.bias) + xavier_uniform_(self.values_v_proj.weight) + constant_(self.values_v_proj.bias) + xavier_uniform_(self.values_l_proj.weight) + constant_(self.values_l_proj.bias) + xavier_uniform_(self.out_v_proj.weight) + constant_(self.out_v_proj.bias) + xavier_uniform_(self.out_l_proj.weight) + constant_(self.out_l_proj.bias) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + """_summary_ + + Args: + v (_type_): bs, n_img, dim + l (_type_): bs, n_text, dim + attention_mask_v (_type_, optional): _description_. bs, n_img + attention_mask_l (_type_, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + + bsz, tgt_len, _ = v.shape + + query_states = self.v_proj(v) * self.scale + key_states = self._shape(self.l_proj(l), -1, bsz) + value_v_states = self._shape(self.values_v_proj(v), -1, bsz) + value_l_states = self._shape(self.values_l_proj(l), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) + key_states = key_states.reshape(proj_shape) + value_v_states = value_v_states.reshape(proj_shape) + value_l_states = value_l_states.reshape(proj_shape) + + src_len = key_states.shape[1] + attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1])) # bs*nhead, nimg, ntxt + + if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.shape}" + ) + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + attn_weights = paddle.clip( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights = paddle.clip( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose([0, 2, 1]) + attn_weights_l = attn_weights_T - paddle.max(attn_weights_T, axis=-1, keepdim=True) + if self.clamp_min_for_underflow: + attn_weights_l = paddle.clip( + attn_weights_l, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights_l = paddle.clip( + attn_weights_l, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if attention_mask_v is not None: + + attention_mask_v = ( + attention_mask_v[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights_l = masked_fill(attn_weights_l, attention_mask_v == 1., float("-inf")) + + 
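+        # If a vision mask was supplied above, the masked image positions now
+        # hold -inf logits, so the softmax below assigns them zero attention
+        # weight in the text-to-image direction.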
attn_weights_l = F.softmax(attn_weights_l, axis=-1) + + # mask language for vision + if attention_mask_l is not None: + attention_mask_l = ( + attention_mask_l[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights = masked_fill(attn_weights, attention_mask_l == 1., float("-inf")) + + attn_weights_v = F.softmax(attn_weights, axis=-1) + + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = paddle.bmm(attn_probs_v, value_l_states) + attn_output_l = paddle.bmm(attn_probs_l, value_v_states) + + if attn_output_v.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: + raise ValueError( + f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.shape}" + ) + + if attn_output_l.shape != [bsz * self.num_heads, src_len, self.head_dim]: + raise ValueError( + f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.shape}" + ) + + attn_output_v = attn_output_v.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output_v = attn_output_v.transpose([0, 2, 1, 3]) + attn_output_v = attn_output_v.reshape([bsz, tgt_len, self.embed_dim]) + + attn_output_l = attn_output_l.reshape([bsz, self.num_heads, src_len, self.head_dim]) + attn_output_l = attn_output_l.transpose([0, 2, 1, 3]) + attn_output_l = attn_output_l.reshape([bsz, src_len, self.embed_dim]) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +# Bi-Direction MHA (text->image, image->text) +class BiAttentionBlock(nn.Layer): + def __init__( + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super(BiAttentionBlock, self).__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = self.create_parameter( + shape=[v_dim], attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) + self.gamma_l = self.create_parameter( + shape=[l_dim], attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + v = self.layer_norm_v(v) + l = self.layer_norm_l(l) + delta_v, delta_l = self.attn( + v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l + ) + # v, l = v + delta_v, l + delta_l + v = v + self.drop_path(self.gamma_v * delta_v) + l = l + self.drop_path(self.gamma_l * delta_l) + return v, l + diff --git a/paddlevlp/models/groundingdino/layers.py b/paddlevlp/models/groundingdino/layers.py new file mode 100644 index 00000000000000..fe7b072affc41b --- /dev/null +++ b/paddlevlp/models/groundingdino/layers.py @@ -0,0 +1,256 @@ +import paddle +import paddle.nn as nn +import 
paddle.nn.functional as F +from paddlenlp.utils.initializer import constant_,xavier_uniform_ + +from itertools import repeat +import collections.abc + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return x + return tuple(repeat(x, n)) + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +def _convert_attention_mask(attn_mask, dtype): + """ + Convert the attention mask to the target dtype we expect. + Parameters: + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + dtype (VarType): The target type of `attn_mask` we expect. + Returns: + Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. + """ + return nn.layer.transformer._convert_attention_mask(attn_mask, dtype) + + +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in value. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + + Examples: + + .. 
code-block:: python + + import paddle + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=False): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim: + self.in_proj_weight = self.create_parameter( + shape=[embed_dim, 3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=False) + self.in_proj_bias = self.create_parameter( + shape=[3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=True) + else: + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.k_proj = nn.Linear(self.kdim, embed_dim) + self.v_proj = nn.Linear(self.vdim, embed_dim) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + self._type_list = ('q_proj', 'k_proj', 'v_proj') + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + constant_(p) + + + def compute_qkv(self, tensor, index): + if self._qkv_same_embed_dim: + tensor = F.linear( + x=tensor, + weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) + * self.embed_dim], + bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * + self.embed_dim] + if self.in_proj_bias is not None else None) + else: + tensor = getattr(self, self._type_list[index])(tensor) + tensor = tensor.reshape( + [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + return tensor + + + def forward(self, query, key=None, value=None, attn_mask=None): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. 
When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + q, k, v = (self.compute_qkv(t, i) + for i, t in enumerate([query, key, value])) + + # scale dot product attention + product = paddle.matmul(x=q, y=k, transpose_y=True) + scaling = float(self.head_dim)**-0.5 + product = product * scaling + + if attn_mask is not None: + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + weights = F.softmax(product) + if self.dropout: + weights = F.dropout( + weights, + self.dropout, + training=self.training, + mode="upscale_in_train") + + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + return out if len(outs) == 1 else tuple(outs) + + +def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = paddle.bernoulli(paddle.full(shape, keep_prob, dtype=x.dtype)) + if keep_prob > 0.0 and scale_by_keep: + random_tensor = paddle.divide(random_tensor, paddle.to_tensor(keep_prob)) + return x * random_tensor + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/modeling.py b/paddlevlp/models/groundingdino/modeling.py new file mode 100644 index 00000000000000..11f5fcf76559cc --- /dev/null +++ b/paddlevlp/models/groundingdino/modeling.py @@ -0,0 +1,285 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import List + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Layer + + +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model +from paddlenlp.utils.initializer import constant_,xavier_uniform_ +from paddlenlp.transformers import AutoTokenizer, BertModel, RobertaModel + + +from .utils import MLP, ContrastiveEmbed,inverse_sigmoid + +from .bertwarper import ( + BertModelWarper, + generate_masks_with_special_tokens, + generate_masks_with_special_tokens_and_transfer_map, +) + +from .configuration import ( + GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION, + GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP, + GroundingDinoConfig, +) +from .backbone import build_backbone +from .transformer import build_transformer + + +__all__ = [ + "GroundingDinoModel", + "GroundingDinoPretrainedModel", +] + + +class GroundingDinoPretrainedModel(PretrainedModel): + """ + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = "config.json" + config_class = GroundingDinoConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "groundding" + + pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP + +@register_base_model +class GroundingDinoModel(GroundingDinoPretrainedModel): + """ + Args: + config (:class:`GroundingDinoConfig`): + An instance of BertConfig used to construct BertModel. 
+ """ + + def __init__(self, config: GroundingDinoConfig): + super(GroundingDinoModel, self).__init__(config) + + self.query_dim = config.query_dim + self.backbone = build_backbone(config) + self.transformer = build_transformer(config) + self.hidden_dim = hidden_dim = self.transformer.d_model + self.num_feature_levels = config.num_feature_levels + self.nheads = config.nheads + self.max_text_len = config.max_text_len + self.sub_sentence_present = config.sub_sentence_present + + # bert + if config.text_encoder_type == "bert-base-uncased": + self.bert = BertModel.from_pretrained(config.text_encoder_type) + elif config.text_encoder_type == "roberta-base": + self.bert = RobertaModel.from_pretrained(config.text_encoder_type) + else: + raise ValueError("Unknown text_encoder_type {}".format(config.text_encoder_type)) + self.bert.pooler.dense.weight.stop_gradient = True + self.bert.pooler.dense.bias.stop_gradient = True + self.bert = BertModelWarper(bert_model=self.bert) + + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias_attr=True) + constant_(self.feat_map.bias, 0) + xavier_uniform_(self.feat_map.weight) + + + # prepare input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(self.backbone.num_channels) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = self.backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2D(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2D(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = hidden_dim + self.input_proj = nn.LayerList(input_proj_list) + else: + assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" 
+ self.input_proj = nn.LayerList( + [ + nn.Sequential( + nn.Conv2D(self.backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) + + # prepare class & box embed + _class_embed = ContrastiveEmbed() + + _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + constant_(_bbox_embed.layers[-1].weight, 0) + constant_(_bbox_embed.layers[-1].bias, 0) + + if config.dec_pred_bbox_embed_share: + box_embed_layerlist = [_bbox_embed for i in range(self.transformer.num_decoder_layers)] + else: + box_embed_layerlist = [ + copy.deepcopy(_bbox_embed) for i in range(self.transformer.num_decoder_layers) + ] + class_embed_layerlist = [_class_embed for i in range(self.transformer.num_decoder_layers)] + self.bbox_embed = nn.LayerList(box_embed_layerlist) + self.class_embed = nn.LayerList(class_embed_layerlist) + self.transformer.decoder.bbox_embed = self.bbox_embed + self.transformer.decoder.class_embed = self.class_embed + + # two stage + assert config.two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + config.two_stage_type + ) + if config.two_stage_type != "no": + if config.two_stage_bbox_embed_share: + assert config.dec_pred_bbox_embed_share + self.transformer.enc_out_bbox_embed = _bbox_embed + else: + self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed) + + if config.two_stage_class_embed_share: + assert config.dec_pred_bbox_embed_share + self.transformer.enc_out_class_embed = _class_embed + else: + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) + + self.refpoint_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + # init input_proj + for proj in self.input_proj: + xavier_uniform_(proj[0].weight, gain=1) + constant_(proj[0].bias, 0) + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) + + + def forward( + self, + x: paddle.Tensor, + m: paddle.Tensor, + input_ids:paddle.Tensor, + attention_mask:paddle.Tensor, + text_self_attention_masks:paddle.Tensor, + position_ids:paddle.Tensor = None, + targets: List = None + + ): + + tokenized = { + "input_ids": input_ids, + "attention_mask":attention_mask, + } + + # extract text embeddings + if self.sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + else: + # import ipdb; ipdb.set_trace() + tokenized_for_encoder = tokenized + + bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 + + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized["attention_mask"].cast(paddle.bool) # bs, 195 + # text_token_mask: True for nomask, False for mask + # text_self_attention_masks: True for nomask, False for mask + + if encoded_text.shape[1] > self.max_text_len: + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + + text_dict = { + "encoded_text": encoded_text, # bs, 195, d_model + "text_token_mask": text_token_mask, # bs, 195 + "position_ids": position_ids, # bs, 195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 + } + + features,feat_masks,poss = self.backbone(x,m) + + + srcs = [] + masks = [] + for l, 
src in enumerate(features): + # src, mask = feat.decompose() + srcs.append(self.input_proj[l](src)) + masks.append(feat_masks[l]) + # assert mask is not None + + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + # src = self.input_proj[l](features[-1].tensors) + src = self.input_proj[l](features[-1]) + else: + src = self.input_proj[l](srcs[-1]) + # m = samples.mask + mask = F.interpolate(m[None].cast(paddle.float32), size=src.shape[-2:]).cast(paddle.bool)[0] + # pos_l = self.backbone[1](NestedTensor(src, mask)).cast(src.dtype) + pos_l = self.backbone[1](mask).cast(src.dtype) + srcs.append(src) + masks.append(mask) + poss.append(pos_l) + + input_query_bbox = input_query_label = attn_mask = dn_meta = None + hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( + srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict + ) + + # deformable-detr-like anchor update + outputs_coord_list = [] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig) + layer_outputs_unsig = F.sigmoid(layer_outputs_unsig) + outputs_coord_list.append(layer_outputs_unsig) + outputs_coord_list = paddle.stack(outputs_coord_list) + + # output + outputs_class = paddle.stack( + [ + layer_cls_embed(layer_hs, text_dict) + for layer_cls_embed, layer_hs in zip(self.class_embed, hs) + ] + ) + + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]} + + return out \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/ms_deform_attn.py b/paddlevlp/models/groundingdino/ms_deform_attn.py new file mode 100644 index 00000000000000..6b0a43c37fc938 --- /dev/null +++ b/paddlevlp/models/groundingdino/ms_deform_attn.py @@ -0,0 +1,210 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
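+
+# Note on the sampling convention shared by the CUDA op (csrc/ms_deformable_attn_op.cu)
+# and the pure-Paddle fallback below: a sampling location `loc` in [0, 1] on an
+# H x W feature level is read at pixel coordinate
+#
+#     loc * H - 0.5                                   (CUDA kernel)
+#
+# while the fallback first maps it to grid = 2 * loc - 1 and calls
+# F.grid_sample(..., align_corners=False), which resolves to
+#
+#     ((grid + 1) * H - 1) / 2 = loc * H - 0.5        (Paddle fallback)
+#
+# so the two paths sample identical positions and differ only by floating-point
+# accumulation order, which is what csrc/test_ms_deformable_attn_op.py checks.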
+ +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddlenlp.utils.initializer import constant_,xavier_uniform_ + + + +# helpers +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +def deformable_attention_core_func(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights): + """ + Args: + value (Tensor): [bs, value_length, n_head, c] + value_spatial_shapes (Tensor): [n_levels, 2] + value_level_start_index (Tensor): [n_levels] + sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] + attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, _, n_head, c = value.shape + _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape + + value_list = value.split( + value_spatial_shapes.prod(1).split(n_levels), axis=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (h, w) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[level].flatten(2).transpose( + [0, 2, 1]).reshape([bs * n_head, c, h, w]) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose( + [0, 2, 1, 3, 4]).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( + [bs * n_head, 1, Len_q, n_levels * n_points]) + output = (paddle.stack( + sampling_value_list, axis=-2).flatten(-2) * + attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) + + return output.transpose([0, 2, 1]) + +class MSDeformableAttention(nn.Layer): + def __init__(self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + lr_mult=0.1, + batch_first=False): + """ + Multi-Scale Deformable Attention Module + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.total_points = num_heads * num_levels * num_points + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear( + embed_dim, + self.total_points * 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + try: + # use cuda op + from deformable_detr_ops import ms_deformable_attn + except: + # use paddle func + ms_deformable_attn = deformable_attention_core_func + self.ms_deformable_attn_core = ms_deformable_attn + self.batch_first = batch_first + + self._reset_parameters() + + def _reset_parameters(self): + # sampling_offsets + constant_(self.sampling_offsets.weight) + thetas = paddle.arange( + self.num_heads, + dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) + 
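+        # The bias built below starts each head at a distinct direction on the
+        # unit circle (angles spaced 2*pi/num_heads apart); after the max-abs
+        # normalization and the 1..num_points scaling, point p of every head is
+        # initialized p "steps" out along that head's direction, giving the
+        # deformable attention a spread-out sampling pattern before training.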
grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) + grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( + [1, self.num_levels, self.num_points, 1]) + scaling = paddle.arange( + 1, self.num_points + 1, + dtype=paddle.float32).reshape([1, 1, -1, 1]) + grid_init *= scaling + self.sampling_offsets.bias.set_value(grid_init.flatten()) + # attention_weights + constant_(self.attention_weights.weight) + constant_(self.attention_weights.bias) + # proj + xavier_uniform_(self.value_proj.weight) + constant_(self.value_proj.bias) + xavier_uniform_(self.output_proj.weight) + constant_(self.output_proj.bias) + + def forward(self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] + value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + assert int(value_spatial_shapes.prod(1).sum()) == Len_v + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.transpose([1, 0, 2]) + value = value.permute([1, 0, 2]) + + value = self.value_proj(value) + if value_mask is not None: + value_mask = (~value_mask).astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = value_spatial_shapes.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2]) + sampling_locations = reference_points.reshape([ + bs, Len_q, 1, self.num_levels, 1, 2 + ]) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * + 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". + format(reference_points.shape[-1])) + + + output = self.ms_deformable_attn_core( + value, value_spatial_shapes.astype('int64'), value_level_start_index.astype('int64'), + sampling_locations, attention_weights) + output = self.output_proj(output) + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + output = output.transpose([1, 0, 2]) + + return output \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/transformer.py b/paddlevlp/models/groundingdino/transformer.py new file mode 100644 index 00000000000000..697e6a90626eb9 --- /dev/null +++ b/paddlevlp/models/groundingdino/transformer.py @@ -0,0 +1,970 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute + +from .utils import inverse_sigmoid +from paddlenlp.utils.initializer import constant_,xavier_uniform_,normal_ +from .layers import MultiHeadAttention + +from .fuse_modules import BiAttentionBlock +from .ms_deform_attn import MSDeformableAttention as MSDeformAttn +from .transformer_vanilla import TransformerEncoderLayer +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, +) + + +class Transformer(nn.Layer): + def __init__( + self, + d_model=256, + nhead=8, + num_queries=300, + num_encoder_layers=6, + num_unicoder_layers=0, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.0, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + query_dim=4, + num_patterns=0, + # for deformable encoder + num_feature_levels=1, + enc_n_points=4, + dec_n_points=4, + # init query + learnable_tgt_init=False, + # two stage + two_stage_type="no", # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1'] + embed_init_tgt=False, + # for text + use_text_enhancer=False, + use_fusion_layer=False, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=False, + text_dropout=0.1, + fusion_dropout=0.1, + fusion_droppath=0.0, + ): + super().__init__() + self.num_feature_levels = num_feature_levels + self.num_encoder_layers = num_encoder_layers + self.num_unicoder_layers = num_unicoder_layers + self.num_decoder_layers = num_decoder_layers + self.num_queries = num_queries + assert query_dim == 4 + + # choose encoder layer type + encoder_layer = DeformableTransformerEncoderLayer( + d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points + ) + + if use_text_enhancer: + text_enhance_layer = TransformerEncoderLayer( + d_model=d_model, + nhead=nhead // 2, + dim_feedforward=dim_feedforward // 2, + dropout=text_dropout, + ) + else: + text_enhance_layer = None + + if use_fusion_layer: + feature_fusion_layer = BiAttentionBlock( + v_dim=d_model, + l_dim=d_model, + embed_dim=dim_feedforward // 2, + num_heads=nhead // 2, + dropout=fusion_dropout, + drop_path=fusion_droppath, + ) + else: + feature_fusion_layer = None + + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + assert encoder_norm is None + self.encoder = TransformerEncoder( + encoder_layer, + num_encoder_layers, + d_model=d_model, + num_queries=num_queries, + text_enhance_layer=text_enhance_layer, + feature_fusion_layer=feature_fusion_layer, + use_checkpoint=use_checkpoint, + use_transformer_ckpt=use_transformer_ckpt, + ) + + # choose decoder layer type + decoder_layer = DeformableTransformerDecoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + dec_n_points, + 
use_text_cross_attention=use_text_cross_attention, + ) + + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + d_model=d_model, + query_dim=query_dim, + num_feature_levels=num_feature_levels, + ) + + self.d_model = d_model + self.nhead = nhead + self.dec_layers = num_decoder_layers + self.num_queries = num_queries # useful for single stage model only + self.num_patterns = num_patterns + if not isinstance(num_patterns, int): + Warning("num_patterns should be int but {}".format(type(num_patterns))) + self.num_patterns = 0 + + if num_feature_levels > 1: + if self.num_encoder_layers > 0: + self.level_embed = self.create_parameter(shape=[num_feature_levels, d_model]) + # self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + else: + self.level_embed = None + + self.learnable_tgt_init = learnable_tgt_init + assert learnable_tgt_init, "why not learnable_tgt_init" + self.embed_init_tgt = embed_init_tgt + if (two_stage_type != "no" and embed_init_tgt) or (two_stage_type == "no"): + self.tgt_embed = nn.Embedding(self.num_queries, d_model) + normal_(self.tgt_embed.weight) + else: + self.tgt_embed = None + + # for two stage + self.two_stage_type = two_stage_type + assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + two_stage_type + ) + if two_stage_type == "standard": + # anchor selection at the output of encoder + self.enc_output = nn.Linear(d_model, d_model) + self.enc_output_norm = nn.LayerNorm(d_model) + self.two_stage_wh_embedding = None + + if two_stage_type == "no": + self.init_ref_points(num_queries) # init self.refpoint_embed + + self.enc_out_class_embed = None + self.enc_out_bbox_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + for m in self.sublayers(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + if self.num_feature_levels > 1 and self.level_embed is not None: + normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = paddle.sum(~mask[:, :, 0], 1) + valid_W = paddle.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.cast(paddle.float32) / H + valid_ratio_w = valid_W.cast(paddle.float32) / W + valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, 4) + + def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, text_dict=None): + """ + Input: + - srcs: List of multi features [bs, ci, hi, wi] + - masks: List of multi masks [bs, hi, wi] + - refpoint_embed: [bs, num_dn, 4]. None in infer + - pos_embeds: List of multi pos embeds [bs, ci, hi, wi] + - tgt: [bs, num_dn, d_model]. 
None in infer + + """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shapes.append(paddle.to_tensor([h, w])) + + src = src.flatten(2).transpose([0, 2, 1]) # bs, hw, c + mask = mask.cast(paddle.float32).flatten(1).cast(paddle.bool) # bs, hw + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) # bs, hw, c + if self.num_feature_levels > 1 and self.level_embed is not None: + lvl_pos_embed = pos_embed + self.level_embed[lvl].reshape([1, 1, -1]) + else: + lvl_pos_embed = pos_embed + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + src_flatten = paddle.concat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = paddle.concat(mask_flatten, 1) # bs, \sum{hxw} + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c + + spatial_shapes = paddle.to_tensor( + paddle.stack(spatial_shapes), dtype=paddle.int32 + ) + + level_start_index = paddle.concat( + (paddle.zeros([1], dtype=spatial_shapes.dtype), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = paddle.stack([self.get_valid_ratio(m) for m in masks], 1) + + # two stage + enc_topk_proposals = enc_refpoint_embed = None + + ######################################################### + # Begin Encoder + ######################################################### + memory, memory_text = self.encoder( + src_flatten, + pos=lvl_pos_embed_flatten, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + key_padding_mask=mask_flatten, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . 
False means use the token; True means pad the token + position_ids=text_dict["position_ids"], + text_self_attention_masks=text_dict["text_self_attention_masks"], + ) + ######################################################### + # End Encoder + # - memory: bs, \sum{hw}, c + # - mask_flatten: bs, \sum{hw} + # - lvl_pos_embed_flatten: bs, \sum{hw}, c + # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + ######################################################### + text_dict["encoded_text"] = memory_text + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if memory.isnan().any() | memory.isinf().any(): + # import ipdb; ipdb.set_trace() + + + if self.two_stage_type == "standard": + output_memory, output_proposals = gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes + ) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + + if text_dict is not None: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict) + else: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory) + + topk_logits = enc_outputs_class_unselected.max(-1) + enc_outputs_coord_unselected = ( + self.enc_out_bbox_embed(output_memory) + output_proposals + ) # (bs, \sum{hw}, 4) unsigmoid + topk = self.num_queries + + topk_proposals = paddle.topk(topk_logits, topk, axis=1)[1] # bs, nq + + topk_ind = topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, 4]) + + # gather boxes + refpoint_embed_undetach = paddle.take_along_axis( + arr=enc_outputs_coord_unselected, + axis=1, + indices=topk_ind) + + refpoint_embed_ = refpoint_embed_undetach.detach() + init_box_proposal = F.sigmoid(paddle.take_along_axis( + arr=output_proposals, + axis=1, + indices=topk_ind)) + + tgt_undetach = paddle.take_along_axis(arr=output_memory, axis=1,indices=topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, self.d_model])) + + # gather tgt + # tgt_undetach = paddle.gather_nd(output_memory, topk_ind) + if self.embed_init_tgt: + tgt_ = ( + self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) + ) # nq, bs, d_model + else: + tgt_ = tgt_undetach.detach() + + if refpoint_embed is not None: + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) + tgt = paddle.concat([tgt, tgt_], axis=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + elif self.two_stage_type == "no": + tgt_ = ( + self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) + ) # nq, bs, d_model + refpoint_embed_ = ( + self.refpoint_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) + ) # nq, bs, 4 + + if refpoint_embed is not None: + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) + tgt = paddle.concat([tgt, tgt_], axis=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + if self.num_patterns > 0: + tgt_embed = tgt.tile([1, self.num_patterns, 1]) + refpoint_embed = refpoint_embed.tile([1, self.num_patterns, 1]) + tgt_pat = self.patterns.weight[None, :, :].repeat_interleave( + self.num_queries, 1 + ) # 1, n_q*n_pat, d_model + tgt = tgt_embed + tgt_pat + + init_box_proposal = F.sigmoid(refpoint_embed_) + + else: + raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type)) + ######################################################### + # End preparing tgt + # - tgt: bs, NQ, d_model + # - refpoint_embed(unsigmoid): bs, NQ, d_model + 
######################################################### + + ######################################################### + # Begin Decoder + ######################################################### + hs, references = self.decoder( + tgt=tgt, + memory=memory, + memory_key_padding_mask=mask_flatten, + pos=lvl_pos_embed_flatten, + refpoints_unsigmoid=refpoint_embed, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=attn_mask, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . False means use the token; True means pad the token + ) + ######################################################### + # End Decoder + # hs: n_dec, bs, nq, d_model + # references: n_dec+1, bs, nq, query_dim + ######################################################### + + ######################################################### + # Begin postprocess + ######################################################### + if self.two_stage_type == "standard": + hs_enc = tgt_undetach.unsqueeze(0) + ref_enc = F.sigmoid(refpoint_embed_undetach).unsqueeze(0) + else: + hs_enc = ref_enc = None + ######################################################### + # End postprocess + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None + # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None + ######################################################### + + return hs, references, hs_enc, ref_enc, init_box_proposal + # hs: (n_dec, bs, nq, d_model) + # references: sigmoid coordinates. (n_dec+1, bs, bq, 4) + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None + # ref_enc: sigmoid coordinates. \ + # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None + + +class TransformerEncoder(nn.Layer): + def __init__( + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): + """_summary_ + + Args: + encoder_layer (_type_): _description_ + num_layers (_type_): _description_ + norm (_type_, optional): _description_. Defaults to None. + d_model (int, optional): _description_. Defaults to 256. + num_queries (int, optional): _description_. Defaults to 300. + enc_layer_share (bool, optional): _description_. Defaults to False. 
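+            text_enhance_layer (_type_, optional): text self-attention layer applied at every encoder stage. Defaults to None.
+            feature_fusion_layer (_type_, optional): image-text feature fusion layer applied at every encoder stage. Defaults to None.
+            use_checkpoint (bool, optional): recompute the fusion layers to save memory. Defaults to False.
+            use_transformer_ckpt (bool, optional): recompute the deformable encoder layers to save memory. Defaults to False.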
+ + """ + super().__init__() + # prepare layers + self.layers = [] + self.text_layers = [] + self.fusion_layers = [] + if num_layers > 0: + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) + + if text_enhance_layer is not None: + self.text_layers = _get_clones( + text_enhance_layer, num_layers, layer_share=enc_layer_share + ) + if feature_fusion_layer is not None: + self.fusion_layers = _get_clones( + feature_fusion_layer, num_layers, layer_share=enc_layer_share + ) + else: + self.layers = [] + del encoder_layer + + if text_enhance_layer is not None: + self.text_layers = [] + del text_enhance_layer + if feature_fusion_layer is not None: + self.fusion_layers = [] + del feature_fusion_layer + + self.query_scale = None + self.num_queries = num_queries + self.num_layers = num_layers + self.d_model = d_model + + self.use_checkpoint = False + self.use_transformer_ckpt = False + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = paddle.meshgrid( + paddle.linspace(0.5, H_ - 0.5, H_, dtype=paddle.float32), + paddle.linspace(0.5, W_ - 0.5, W_, dtype=paddle.float32), + ) + ref_y = ref_y.reshape([-1,])[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape([-1,])[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = paddle.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = paddle.concat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + # for images + src: paddle.Tensor, + pos: paddle.Tensor, + spatial_shapes: paddle.Tensor, + level_start_index: paddle.Tensor, + valid_ratios: paddle.Tensor, + key_padding_mask: paddle.Tensor, + # for texts + memory_text: paddle.Tensor = None, + text_attention_mask: paddle.Tensor = None, + pos_text: paddle.Tensor = None, + text_self_attention_masks: paddle.Tensor = None, + position_ids: paddle.Tensor = None, + ): + """ + Input: + - src: [bs, sum(hi*wi), 256] + - pos: pos embed for src. [bs, sum(hi*wi), 256] + - spatial_shapes: h,w of each level [num_level, 2] + - level_start_index: [num_level] start point of level in sum(hi*wi). 
+ - valid_ratios: [bs, num_level, 2] + - key_padding_mask: [bs, sum(hi*wi)] + + - memory_text: bs, n_text, 256 + - text_attention_mask: bs, n_text + False for no padding; True for padding + - pos_text: bs, n_text, 256 + + - position_ids: bs, n_text + Intermedia: + - reference_points: [bs, sum(hi*wi), num_level, 2] + Outpus: + - output: [bs, sum(hi*wi), 256] + """ + + output = src + + # preparation and reshape + if self.num_layers > 0: + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios + ) + + if self.text_layers: + # generate pos_text + bs, n_text, text_dim = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + paddle.arange(n_text) + .cast(paddle.float32) + .unsqueeze(0) + .unsqueeze(-1) + .tile([bs, 1, 1]) + ) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_sine_pos_embed( + position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + # main process + for layer_id, layer in enumerate(self.layers): + # if output.isnan().any() or memory_text.isnan().any(): + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + if self.fusion_layers: + if self.use_checkpoint: + output, memory_text = recompute( + self.fusion_layers[layer_id], + output, + memory_text, + key_padding_mask, + text_attention_mask, + **{"preserve_rng_state": True} + ) + else: + output, memory_text = self.fusion_layers[layer_id]( + v=output, + l=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + + if self.text_layers: + memory_text = self.text_layers[layer_id]( + src=memory_text, + src_mask=text_self_attention_masks, # note we use ~ for mask here + src_key_padding_mask=text_attention_mask, + pos=(pos_text if pos_text is not None else None), + ) + + # main process + if self.use_transformer_ckpt: + output = recompute( + layer, + output, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask, + **{"preserve_rng_state": True} + ) + else: + output = layer( + src=output, + pos=pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + + return output, memory_text + + +class TransformerDecoder(nn.Layer): + def __init__( + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones(decoder_layer, num_layers) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, "support return_intermediate only" + self.query_dim = query_dim + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) + self.query_pos_sine_scale = None + + self.query_scale = None + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + + self.ref_anchor_head = None + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[paddle.Tensor] = None, + memory_mask: Optional[paddle.Tensor] = None, + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + memory_key_padding_mask: Optional[paddle.Tensor] = None, + pos: Optional[paddle.Tensor] = None, + refpoints_unsigmoid: Optional[paddle.Tensor] = None, # 
num_queries, bs, 2 + # for memory + level_start_index: Optional[paddle.Tensor] = None, # num_levels + spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[paddle.Tensor] = None, + # for text + memory_text: Optional[paddle.Tensor] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): + """ + Input: + - tgt: nq, bs, d_model + - memory: hw, bs, d_model + - pos: hw, bs, d_model + - refpoints_unsigmoid: nq, bs, 2/4 + - valid_ratios/spatial_shapes: bs, nlevel, 2 + """ + output = tgt + + intermediate = [] + reference_points = F.sigmoid(refpoints_unsigmoid) + ref_points = [reference_points] + + for layer_id, layer in enumerate(self.layers): + + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] + * paddle.concat([valid_ratios, valid_ratios], -1)[None, :] + ) # nq, bs, nlevel, 4 + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position( + reference_points_input[:, :, 0, :] + ) # nq, bs, 256*2 + + # conditional query + raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if query_pos.isnan().any() | query_pos.isinf().any(): + # import ipdb; ipdb.set_trace() + + # main process + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + memory_text=memory_text, + text_attention_mask=text_attention_mask, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask, + ) + + if (output.isnan().any() | output.isinf().any()) and paddle.in_dynamic_mode(): + print(f"output layer_id {layer_id} is nan") + try: + num_nan = output.isnan().sum().item() + num_inf = output.isinf().sum().item() + print(f"num_nan {num_nan}, num_inf {num_inf}") + except Exception as e: + print(e) + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # import ipdb; ipdb.set_trace() + + # iter update + if self.bbox_embed is not None: + # box_holder = self.bbox_embed(output) + # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) + # new_reference_points = box_holder[..., :self.query_dim].sigmoid() + + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = F.sigmoid(outputs_unsig) + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [ + [itm_out for itm_out in intermediate], + [itm_refpoint for itm_refpoint in ref_points], + ] + + +class DeformableTransformerEncoderLayer(nn.Layer): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): + super().__init__() + + # self attention + self.self_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True + ) + self.dropout1 = nn.Dropout(dropout) + 
self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward( + self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None + ): + # self attention + # import ipdb; ipdb.set_trace() + + src2 = self.self_attn( + query=self.with_pos_embed(src, pos), + reference_points=reference_points, + value=src, + value_spatial_shapes=spatial_shapes, + value_level_start_index=level_start_index, + value_mask=key_padding_mask, + ) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerDecoderLayer(nn.Layer): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): + super().__init__() + + # cross attention + self.cross_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True + ) + self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm1 = nn.LayerNorm(d_model) + + # cross attention text + if use_text_cross_attention: + self.ca_text = MultiHeadAttention(d_model, n_heads, dropout=dropout) + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.catext_norm = nn.LayerNorm(d_model) + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) + self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_proj = None + self.use_text_feat_guide = use_text_feat_guide + assert not use_text_feat_guide + self.use_text_cross_attention = use_text_cross_attention + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + # with paddle.amp.auto_cast(enable=False): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward( + self, + # for tgt + tgt: Optional[paddle.Tensor], # nq, bs, d_model + tgt_query_pos: Optional[paddle.Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[paddle.Tensor] = None, # pos for query. 
Sine(pos) + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + tgt_reference_points: Optional[paddle.Tensor] = None, # nq, bs, 4 + memory_text: Optional[paddle.Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[paddle.Tensor] = None, # bs, num_token + # for memory + memory: Optional[paddle.Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[paddle.Tensor] = None, + memory_level_start_index: Optional[paddle.Tensor] = None, # num_levels + memory_spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[paddle.Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[paddle.Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[paddle.Tensor] = None, # mask used for cross-attention + ): + """ + Input: + - tgt/tgt_query_pos: nq, bs, d_model + - + """ + assert cross_attn_mask is None + + # self attention + if self.self_attn is not None: + # import ipdb; ipdb.set_trace() + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask if self_attn_mask is None else ~self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + if self.use_text_cross_attention: + tgt2 = self.ca_text( + self.with_pos_embed(tgt, tgt_query_pos), + memory_text, + memory_text, + attn_mask=~text_attention_mask, + )[0] + tgt = tgt + self.catext_dropout(tgt2) + tgt = self.catext_norm(tgt) + + tgt2 = self.cross_attn( + query=self.with_pos_embed(tgt, tgt_query_pos), + reference_points=tgt_reference_points, + value=memory, + value_spatial_shapes=memory_spatial_shapes, + value_level_start_index=memory_level_start_index, + value_mask=memory_key_padding_mask, + ) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + num_queries=args.num_queries, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + query_dim=args.query_dim, + activation=args.transformer_activation, + num_patterns=args.num_patterns, + num_feature_levels=args.num_feature_levels, + enc_n_points=args.enc_n_points, + dec_n_points=args.dec_n_points, + learnable_tgt_init=True, + # two stage + two_stage_type=args.two_stage_type, # ['no', 'standard', 'early'] + embed_init_tgt=args.embed_init_tgt, + use_text_enhancer=args.use_text_enhancer, + use_fusion_layer=args.use_fusion_layer, + use_checkpoint=args.use_checkpoint, + use_transformer_ckpt=args.use_transformer_ckpt, + use_text_cross_attention=args.use_text_cross_attention, + text_dropout=args.text_dropout, + fusion_dropout=args.fusion_dropout, + fusion_droppath=args.fusion_droppath, + ) diff --git a/paddlevlp/models/groundingdino/transformer_vanilla.py b/paddlevlp/models/groundingdino/transformer_vanilla.py new file mode 100644 index 00000000000000..a671c0e87c54c7 --- /dev/null +++ b/paddlevlp/models/groundingdino/transformer_vanilla.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import paddle +from paddle import Tensor, nn +import paddle.nn.functional as F +from .layers import MultiHeadAttention + + +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + sigmoid_focal_loss, +) + + +class TextTransformer(nn.Layer): + def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): + super().__init__() + self.num_layers = num_layers + self.d_model = d_model + self.nheads = nheads + self.dim_feedforward = dim_feedforward + self.norm = None + + single_encoder_layer = TransformerEncoderLayer( + d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout + ) + self.layers = _get_clones(single_encoder_layer, num_layers) + + def forward(self, memory_text: paddle.Tensor, text_attention_mask: paddle.Tensor): + """ + + Args: + text_attention_mask: bs, num_token + memory_text: bs, num_token, d_model + + Raises: + RuntimeError: _description_ + + Returns: + output: bs, num_token, d_model + """ + + output = memory_text + + for layer in self.layers: + output = layer(output, src_key_padding_mask=text_attention_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Layer): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = MultiHeadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + self.nhead = nhead + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + # repeat attn mask + if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: + # bs, num_q, num_k + src_mask = src_mask.tile([self.nhead, 1, 1]) + + q = k = self.with_pos_embed(src, pos) + + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0] + + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src diff --git a/paddlevlp/models/groundingdino/utils.py b/paddlevlp/models/groundingdino/utils.py new file mode 100644 index 00000000000000..b55987720b5e4a --- /dev/null +++ b/paddlevlp/models/groundingdino/utils.py @@ -0,0 +1,270 @@ +import copy +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) 
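+    # emulate torch-style masked_fill: take `value` where mask is True, keep x elsewhere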
+ return paddle.where(mask, y, x) + +def inverse_sigmoid(x, eps=1e-3): + x = x.clip(min=0, max=1) + x1 = x.clip(min=eps) + x2 = (1 - x).clip(min=eps) + return paddle.log(x1 / x2) + +def _get_clones(module, N, layer_share=False): + + if layer_share: + return nn.LayerList([module for i in range(N)]) + else: + return nn.LayerList([copy.deepcopy(module) for i in range(N)]) + + +def get_sine_pos_embed( + pos_tensor: paddle.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (paddle.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. + """ + scale = 2 * math.pi + dim_t = paddle.arange(num_pos_feats) + dim_t = temperature ** (2. * paddle.floor_divide(dim_t, paddle.to_tensor(2)) / num_pos_feats) + + def sine_func(x: paddle.Tensor): + sin_x = x * scale / dim_t + sin_x = paddle.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), axis=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], axis=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = paddle.concat(pos_res, axis=-1) + return pos_res + + +def gen_encoder_output_proposals( + memory: paddle.Tensor, memory_padding_mask: paddle.Tensor, spatial_shapes: paddle.Tensor, learnedwh=None +): + """ + Input: + - memory: bs, \sum{hw}, d_model + - memory_padding_mask: bs, \sum{hw} + - spatial_shapes: nlevel, 2 + - learnedwh: 2 + Output: + - output_memory: bs, \sum{hw}, d_model + - output_proposals: bs, \sum{hw}, 4 + """ + N_, S_, C_ = memory.shape + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].reshape([N_, H_, W_, 1]) + valid_H = paddle.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = paddle.sum(~mask_flatten_[:, 0, :, 0], 1) + + # import ipdb; ipdb.set_trace() + + grid_y, grid_x = paddle.meshgrid( + paddle.linspace(0, H_ - 1, H_, dtype=paddle.float32), + paddle.linspace(0, W_ - 1, W_, dtype=paddle.float32), + ) + grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = paddle.concat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).reshape([N_, 1, 1, 2]) + grid = (grid.unsqueeze(0).tile([N_, 1, 1, 1]) + 0.5) / scale + + if learnedwh is not None: + # import ipdb; ipdb.set_trace() + wh = paddle.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) + else: + wh = paddle.ones_like(grid) * 0.05 * (2.0**lvl) + + # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) + # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + # wh = torch.ones_like(grid) / scale + proposal = paddle.concat((grid, wh), -1).reshape([N_, -1, 4]) + proposals.append(proposal) + _cur += H_ * W_ + # import ipdb; ipdb.set_trace() + output_proposals = paddle.concat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( + -1, keepdim=True + ) + output_proposals = paddle.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = masked_fill(output_proposals, memory_padding_mask.unsqueeze(-1), 
float("inf")) + output_proposals = masked_fill(output_proposals, ~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = masked_fill(output_memory, memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) + + # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) + # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) + + return output_memory, output_proposals + + +class RandomBoxPerturber: + def __init__( + self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2 + ) -> None: + self.noise_scale = paddle.to_tensor( + [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale] + ) + + def __call__(self, refanchors: paddle.Tensor) -> paddle.Tensor: + nq, bs, query_dim = refanchors.shape + + noise_raw = paddle.rand(shape=refanchors.shape, dtype=refanchors.dtype) + noise_scale = self.noise_scale[:query_dim] + + new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) + return new_refanchors.clip(0, 1) + + +def sigmoid_focal_loss( + inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. 
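+        no_reduction: (optional) If True, skip the final reduction and return the
+                 per-element loss. Default = False.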
+ Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if no_reduction: + return loss + + return loss.mean(1).sum() / num_boxes + + +class MLP(nn.Layer): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.LayerList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_activation_fn(activation, d_model=256, batch_dim=0): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + if activation == "prelu": + return nn.PReLU() + if activation == "selu": + return F.selu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = paddle.arange(128) + dim_t = 10000 ** (2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = paddle.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), axis=3).flatten(2) + pos_y = paddle.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), axis=3).flatten(2) + if pos_tensor.shape[-1] == 2: + pos = paddle.concat((pos_y, pos_x), aixs=2) + elif pos_tensor.shape[-1] == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = paddle.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), axis=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = paddle.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), axis=3).flatten(2) + + pos = paddle.concat((pos_y, pos_x, pos_w, pos_h), axis=2) + else: + raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.shape[-1])) + return pos + + +class ContrastiveEmbed(nn.Layer): + def __init__(self, max_text_len=256): + """ + Args: + max_text_len: max length of text. + """ + super().__init__() + self.max_text_len = max_text_len + + def forward(self, x, text_dict): + """_summary_ + + Args: + x (_type_): _description_ + text_dict (_type_): _description_ + { + 'encoded_text': encoded_text, # bs, 195, d_model + 'text_token_mask': text_token_mask, # bs, 195 + # True for used tokens. 
False for padding tokens + } + Returns: + _type_: _description_ + """ + assert isinstance(text_dict, dict) + + y = text_dict["encoded_text"] + text_token_mask = text_dict["text_token_mask"] + + res = x @ y.transpose([0, 2, 1]) + masked_fill(res, ~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_res = paddle.full((*res.shape[:-1], self.max_text_len), float("-inf")) + new_res[..., : res.shape[-1]] = res + + return new_res diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py index 04006999f0b629..e3a4f252ceed86 100644 --- a/paddlevlp/processors/__init__.py +++ b/paddlevlp/processors/__init__.py @@ -16,3 +16,4 @@ from .blip_processing import * from .minigpt4_processing import * from .minigpt4_image_processing import * +from .groundingdino_processing import * diff --git a/paddlevlp/processors/groundingdino_processing.py b/paddlevlp/processors/groundingdino_processing.py new file mode 100644 index 00000000000000..eba2e15f0558e3 --- /dev/null +++ b/paddlevlp/processors/groundingdino_processing.py @@ -0,0 +1,365 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for GroundingDino. +""" + +import re +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import paddle +import paddle.vision.transforms as T +from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, + TensorType, TextInput) + +from .base_processing import ProcessorMixin + +from .image_utils import (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD,valid_images) +from .processing_utils import (BaseImageProcessor, BaseTextProcessor) +from paddlenlp.taskflow.utils import pad_batch_data +from .utils import _max_by_axis + +__all__ = [ + "GroudingDinoProcessor", + "GroudingDinoImageProcessor", + "GroudingDinoTextProcessor", +] + + +class GroudingDinoProcessor(ProcessorMixin): + + attributes = ["image_processor", "text_processor", "tokenizer"] + image_processor_class = "GroudingDinoImageProcessor" + text_processor_class = "GroudingDinoTextProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, text_processor, tokenizer): + super().__init__(image_processor, text_processor, tokenizer) + + def __call__( + self, + images=None, + text: str = None, + **kwargs, + ) : + + if images is None or text is None: + raise ValueError("You have to specify either images and text.") + + self.prompt = self.text_processor.pre_caption(text) + input_ids = self.tokenizer([self.prompt]).input_ids + specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + tokenized_out = self.text_processor(input_ids, specical_tokens) + + image_tensor,mask = self.image_processor(images) + + return image_tensor,mask,tokenized_out + + def decode(self, posmap): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
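+
+        Here `posmap` is expected to be a 1-D boolean mask over the tokens of the prompt
+        passed to `__call__`; the selected tokens are decoded back into a phrase.
+        Typical use (illustrative, thresholding one row of the predicted token logits):
+
+            pred_phrase = processor.decode(logit > text_threshold)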
+ """ + assert isinstance(posmap, paddle.Tensor), "posmap must be paddle.Tensor" + tokenized = self.tokenizer(self.prompt) + if posmap.dim() == 1: + non_zero_idx = posmap.nonzero(as_tuple=True)[0].squeeze(-1).tolist() + token_ids = [tokenized["input_ids"][i] for i in non_zero_idx] + return self.tokenizer.decode(token_ids) + else: + raise NotImplementedError("posmap must be 1-dim") + + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +class GroudingDinoTextProcessor(BaseTextProcessor): + r""" + Constructs a GroudingDino text processor. + """ + + def __init__( + self, + max_words: int = 256, + **kwargs, + ): + super().__init__(**kwargs) + + self.max_words = max_words + self.caption = None + + + def __call__( + self, + input_ids, + special_tokens_list, + **kwargs, + ): + """ + Preprocess the text with tokenization. + """ + tokenized_out = {} + input_ids = pad_batch_data(input_ids) + input_ids = paddle.to_tensor(input_ids, dtype = paddle.int64).squeeze(-1) + tokenized_out['input_ids'] = input_ids + tokenized_out['attention_mask'] = paddle.cast(input_ids != 0, paddle.int64) + + ( + text_self_attention_masks, + position_ids, + cate_to_token_mask_list, + ) = self.generate_masks_with_special_tokens_and_transfer_map(tokenized_out,special_tokens_list) + + if text_self_attention_masks.shape[1] > self.max_words: + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_words, : self.max_words + ] + position_ids = position_ids[:, : self.max_words] + tokenized_out["input_ids"] = tokenized_out["input_ids"][:, : self.max_words] + tokenized_out["attention_mask"] = tokenized_out["attention_mask"][:, : self.max_words] + tokenized_out['position_ids'] = position_ids + tokenized_out['text_self_attention_masks'] =text_self_attention_masks + + return tokenized_out + + def pre_caption(self, caption: str) -> str: + """ + Preprocess the text before tokenization. + """ + caption = caption.strip() + if not caption.endswith("."): + caption = caption + "." + self.caption = caption + return caption + + def generate_masks_with_special_tokens_and_transfer_map(self,tokenized,special_tokens_list): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool) + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = paddle.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) + ) + position_ids = paddle.zeros((bs, num_token)) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros([num_token,]).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list + +class GroudingDinoImageProcessor(BaseImageProcessor): + r""" + Constructs a GroudingDino image processor. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: List[int] = None, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else 800 + + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = ( + image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + ) + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_nested = do_nested + + + def resize(self, + image, + target=None, + size=None, + max_size=1333): + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = T.resize(image, size) + + if target is None: + return rescaled_image + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * paddle.to_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height] + ) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = paddle.to_tensor([h, w]) + + if "masks" in target: + target["masks"] = ( + interpolate(target["masks"][:, None].cast(paddle.float32), size, 
mode="nearest")[:, 0] > 0.5 + ) + + return rescaled_image, target + + def nested_tensor_from_tensor_list(self,tensor_list: List[paddle.Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + tensor = paddle.zeros(batch_shape, dtype=dtype) + mask = paddle.ones((b, h, w), dtype=paddle.bool) + for i in range(b): + img = tensor_list[i] + tensor[i, :img.shape[0], :img.shape[1], :img.shape[2]] = img + mask[i, :img.shape[1], :img.shape[2]] = False + else: + raise ValueError("not supported") + return tensor, mask + + + def preprocess( + self, + images, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = None, + **kwargs, + ): + """ + Preprocess an image or batch of images. + + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_nested = do_nested if do_nested is not None else self.do_nested + + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size = size if size is not None else self.size + + if not isinstance(images, (list, tuple)): + images = [images] + + if isinstance(images[0], str): + images = [load_image(image) for image in images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "paddle.Tensor." + ) + + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError( + "Image mean and std must be specified if do_normalize is True." 
+ ) + + if do_resize: + images = [ + T.to_tensor(self.resize(image=image, size=size)) + for image in images + ] + + if do_normalize: + images = T.normalize(images, mean=image_mean, std=image_std) + + if do_nested: + tensors, masks = self.nested_tensor_from_tensor_list(images) + + return tensors, masks + + diff --git a/paddlevlp/processors/utils.py b/paddlevlp/processors/utils.py index 896c4bcd24820b..d340dacbbcec90 100644 --- a/paddlevlp/processors/utils.py +++ b/paddlevlp/processors/utils.py @@ -24,3 +24,12 @@ def _missing_(cls, value): raise ValueError( f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" ) + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes From 776103061f9263ac8f8bbf6719708a215dd926c5 Mon Sep 17 00:00:00 2001 From: LokeZhou Date: Mon, 3 Jul 2023 12:45:12 +0000 Subject: [PATCH 07/10] fix pr comment --- paddlevlp/examples/groundingdino/README.md | 7 +- .../examples/groundingdino/run_predict.py | 91 +++++++--- paddlevlp/models/groundingdino/__init__.py | 12 +- .../models/groundingdino/backbone/backbone.py | 3 +- .../backbone/position_encoding.py | 88 ---------- .../backbone/swin_transformer.py | 165 +++--------------- paddlevlp/models/groundingdino/bert_model.py | 85 ++------- paddlevlp/models/groundingdino/bertwarper.py | 1 - .../models/groundingdino/configuration.py | 87 +++------ .../models/groundingdino/fuse_modules.py | 1 - paddlevlp/models/groundingdino/modeling.py | 10 +- paddlevlp/models/groundingdino/transformer.py | 25 +-- paddlevlp/models/groundingdino/utils.py | 10 +- 13 files changed, 152 insertions(+), 433 deletions(-) diff --git a/paddlevlp/examples/groundingdino/README.md b/paddlevlp/examples/groundingdino/README.md index d2a004578e15a7..58ccf1541bd7e8 100644 --- a/paddlevlp/examples/groundingdino/README.md +++ b/paddlevlp/examples/groundingdino/README.md @@ -16,10 +16,9 @@ python setup_ms_deformable_attn_op.py install ``` ## 2.2 dynamic inference ```bash -python3.8 run_predict.py -dt groundingdino-swint-ogc --i image_you_want_to_detect.jpg \ --o "dir you want to save the output" \ --t "Detect Cat" +python3.8 run_predict.py +--input_imag image_you_want_to_detect.jpg \ +--prompt "cat" \ ``` diff --git a/paddlevlp/examples/groundingdino/run_predict.py b/paddlevlp/examples/groundingdino/run_predict.py index f461caac41cf3d..6953f1927c0cba 100644 --- a/paddlevlp/examples/groundingdino/run_predict.py +++ b/paddlevlp/examples/groundingdino/run_predict.py @@ -1,4 +1,4 @@ -import argparse +from dataclasses import dataclass, field import os import numpy as np import paddle @@ -7,6 +7,8 @@ from paddlevlp.processors.groundingdino_processing import GroudingDinoProcessor from paddlevlp.models.groundingdino.modeling import GroundingDinoModel from PIL import Image, ImageDraw, ImageFont +from paddlenlp.trainer import PdArgumentParser +from paddlevlp.utils.log import logger def plot_boxes_to_image(image_pil, tgt): @@ -49,39 +51,74 @@ def plot_boxes_to_image(image_pil, tgt): return image_pil, mask -def main(): - parser = argparse.ArgumentParser("Grounding DINO example", add_help=True) - parser.add_argument("--dino_type", "-dt", type=str, default="groundingdino-swint-ogc", help="dino type") - parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file") - parser.add_argument("--text_prompt", "-t", type=str, required=True, 
help="text prompt") - parser.add_argument( - "--output_dir", "-o", type=str, default="outputs", help="output directory" +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + input_image: str = field( + metadata={"help": "The name of input image."} + ) + prompt: str = field( + default=None, metadata={"help": "The prompt of the image to be generated."} + ) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + default="GroundingDino/groundingdino-swint-ogc", + metadata={"help": "Path to pretrained model or model identifier"}, ) - - parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold") - parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold") - parser.add_argument( - "--visual", - type=eval, + box_threshold: float = field( + default=0.3, + metadata={ + "help": "box threshold." + }, + ) + text_threshold: float = field( + default=0.25, + metadata={ + "help": "text threshold." + }, + ) + output_dir: str = field( + default="output", + metadata={ + "help": "output directory." + }, + ) + visual: bool = field( default=True, + metadata={ + "help": "save visual image." + }, ) - - - args = parser.parse_args() +def main(): + parser = PdArgumentParser((ModelArguments, DataArguments)) + model_args, data_args = parser.parse_args_into_dataclasses() #bulid processor processor = GroudingDinoProcessor.from_pretrained( 'bert-base-uncased' ) #bulid model - print(f'dino_model {args.dino_type}') - dino_model = GroundingDinoModel.from_pretrained(args.dino_type) + logger.info("dino_model: {}".format(model_args.model_name_or_path)) + dino_model = GroundingDinoModel.from_pretrained(model_args.model_name_or_path) #read image - image_pil = Image.open(args.image_path).convert("RGB") + image_pil = Image.open(data_args.input_image).convert("RGB") #preprocess image text_prompt - image_tensor,mask,tokenized_out = processor(images=image_pil,text=args.text_prompt) + image_tensor,mask,tokenized_out = processor(images=image_pil,text=data_args.prompt) with paddle.no_grad(): outputs = dino_model(image_tensor,mask, input_ids=tokenized_out['input_ids'], @@ -94,14 +131,14 @@ def main(): # filter output logits_filt = logits.clone() boxes_filt = boxes.clone() - filt_mask = logits_filt.max(axis=1) > args.box_threshold + filt_mask = logits_filt.max(axis=1) > model_args.box_threshold logits_filt = logits_filt[filt_mask] # num_filt, 256 boxes_filt = boxes_filt[filt_mask] # num_filt, 4 # build pred pred_phrases = [] for logit, box in zip(logits_filt, boxes_filt): - pred_phrase = processor.decode(logit > args.text_threshold) + pred_phrase = processor.decode(logit > model_args.text_threshold) pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") @@ -111,13 +148,13 @@ def main(): "size": [size[1], size[0]], # H,W "labels": pred_phrases, } - print("output:",pred_dict) + logger.info("output{}".format(pred_dict)) - if args.visual: + if model_args.visual: # make dir - os.makedirs(args.output_dir, exist_ok=True) + os.makedirs(model_args.output_dir, exist_ok=True) image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0] - image_with_box.save(os.path.join(args.output_dir, "pred.jpg")) + 
image_with_box.save(os.path.join(model_args.output_dir, "pred.jpg")) if __name__ == "__main__": diff --git a/paddlevlp/models/groundingdino/__init__.py b/paddlevlp/models/groundingdino/__init__.py index d1ff79f33aafb8..2b7440cf1041e4 100644 --- a/paddlevlp/models/groundingdino/__init__.py +++ b/paddlevlp/models/groundingdino/__init__.py @@ -1,14 +1,8 @@ # ------------------------------------------------------------------------ # Grounding DINO -# url: https://github.com/IDEA-Research/GroundingDINO -# Copyright (c) 2023 IDEA. All Rights Reserved. +# url: https://github.com/LokeZhou/PPGroundingDINO +# Copyright (c) 2023 PaddlePaddle. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ -# Conditional DETR -# Copyright (c) 2021 Microsoft. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------ -# Copied from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -# ------------------------------------------------------------------------ + diff --git a/paddlevlp/models/groundingdino/backbone/backbone.py b/paddlevlp/models/groundingdino/backbone/backbone.py index 397a1fc36b234f..e76785f1de57e9 100644 --- a/paddlevlp/models/groundingdino/backbone/backbone.py +++ b/paddlevlp/models/groundingdino/backbone/backbone.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -70,7 +69,7 @@ def build_backbone(args): ]: pretrain_img_size = int(args.backbone.split("_")[-2]) backbone = SwinTransformerModel.from_pretrained( - args.backbone, + "Swintransformer/"+args.backbone, pretrain_img_size=pretrain_img_size, out_indices=tuple(return_interm_indices), dilation=False, diff --git a/paddlevlp/models/groundingdino/backbone/position_encoding.py b/paddlevlp/models/groundingdino/backbone/position_encoding.py index 821b0fcc161a6b..f87d671d723e85 100644 --- a/paddlevlp/models/groundingdino/backbone/position_encoding.py +++ b/paddlevlp/models/groundingdino/backbone/position_encoding.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,55 +22,6 @@ import paddle.nn as nn from paddlenlp.utils.initializer import uniform_ - - -class PositionEmbeddingSine(nn.Layer): - """ - This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. 
- """ - - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale - - def forward(self, mask:paddle.Tensor): - - assert mask is not None - not_mask = ~mask - y_embed = not_mask.astype(paddle.float32).cumsum(1) - x_embed = not_mask.astype(paddle.float32).cumsum(2) - if self.normalize: - eps = 1e-6 - # if os.environ.get("SHILONG_AMP", None) == '1': - # eps = 1e-4 - # else: - # eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype(paddle.float32x) - dim_t = self.temperature ** (dim_t / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = paddle.stack( - (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4 - ).flatten(3) - pos_y = paddle.stack( - (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4 - ).flatten(3) - pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) - return pos - - class PositionEmbeddingSineHW(nn.Layer): """ This is a more standard version of the position embedding, very similar to the one @@ -122,47 +72,9 @@ def forward(self, mask:paddle.Tensor): ).flatten(3) pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) - # import ipdb; ipdb.set_trace() - return pos -class PositionEmbeddingLearned(nn.Layer): - """ - Absolute pos embedding, learned. - """ - - def __init__(self, num_pos_feats=256): - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - self.reset_parameters() - - def reset_parameters(self): - uniform_(self.row_embed.weight) - uniform_(self.col_embed.weight) - - def forward(self, x: paddle.Tensor): - - h, w = x.shape[-2:] - i = paddle.arange(w) - j = paddle.arange(h) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - pos = ( - paddle.concat( - [ - x_emb.unsqueeze(0).tile([h, 1, 1]), - y_emb.unsqueeze(1).tile([1, w, 1]), - ], - axis=-1, - ) - .transpose([2, 0, 1]) - .unsqueeze(0) - .tile([x.shape[0], 1, 1, 1]) - ) - return pos - def build_position_encoding(args): N_steps = args.hidden_dim // 2 diff --git a/paddlevlp/models/groundingdino/backbone/swin_transformer.py b/paddlevlp/models/groundingdino/backbone/swin_transformer.py index cd636f1b7965d6..191bd6c1977d0b 100644 --- a/paddlevlp/models/groundingdino/backbone/swin_transformer.py +++ b/paddlevlp/models/groundingdino/backbone/swin_transformer.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import numpy as np +from typing import Union import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -26,140 +27,12 @@ from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model """ swin_transformer model configuration""" -__all__ = ["SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION", "SwinTransformerConfig", "SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP"] - - -SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION = { - "swin_T_224_1k": { - "in_chans": 3, - "embed_dim": 96, - "depths": [2, 2, 6, 2], - "num_heads": [3, 6, 12, 24], - "window_size": 7, - "pretrain_img_size": 224, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False, - - - }, - "swin_B_224_22k": { - "in_chans": 3, - "embed_dim": 128, - "depths": [2, 2, 18, 2], - "num_heads": [4, 8, 16, 32], - "window_size": 7, - "pretrain_img_size": 224, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False - }, - "swin_B_384_22k": { - "in_chans": 3, - "embed_dim": 128, - "depths": [2, 2, 18, 2], - "num_heads": [4, 8, 16, 32], - "window_size": 12, - "pretrain_img_size": 384, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint":False - }, - "swin_L_224_22k": { - "in_chans": 3, - "embed_dim": 192, - "depths": [2, 2, 18, 2], - "num_heads": [6, 12, 24, 48], - "window_size": 7, - "pretrain_img_size": 224, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False - }, - "swin_L_384_22k":{ - "in_chans": 3, - "embed_dim": 192, - "depths": [2, 2, 18, 2], - "num_heads": [6, 12, 24, 48], - "window_size": 12, - "pretrain_img_size": 384, - "patch_size": 4, - "out_indices": (0, 1, 2, 3), - "mlp_ratio": 4.0, - "qkv_bias": True, - "qk_scale": None, - "drop_rate": 0.0, - "attn_drop_rate": 0.0, - "drop_path_rate": 0.2, - "norm_layer": "LayerNorm", - "ape": False, - "patch_norm": True, - "frozen_stages": -1, - "dilation": False, - "use_checkpoint": False - }, - -} - -SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP = { - "model_state": { - "swin_T_224_1k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams", - "swin_B_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams", - "swin_B_384_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams", - "swin_L_224_22k": "https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams", - "swin_L_384_22k": 
"https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams", - } -} +__all__ = ["SwinTransformerConfig"] class SwinTransformerConfig(PretrainedConfig): model_type = "swintransformer" - pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION def __init__( self, @@ -182,9 +55,12 @@ def __init__( out_indices=(0, 1, 2, 3), frozen_stages=-1, dilation=False, - use_checkpoint=False + use_checkpoint=False, + **kwargs, ): - super().__init__() + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + self.in_chans = in_chans self.embed_dim = embed_dim self.depths = depths @@ -206,6 +82,27 @@ def __init__( self.dilation = dilation self.use_checkpoint = use_checkpoint + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + class SwinTransformerPretrainedModel(PretrainedModel): """ See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. @@ -216,10 +113,6 @@ class SwinTransformerPretrainedModel(PretrainedModel): resource_files_names = {"model_state": "model_state.pdparams"} base_model_prefix = "swintransformer" - pretrained_init_configuration = SWIN_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION - pretrained_resource_files_map = SWIN_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP - - class Mlp(nn.Layer): """Multilayer perceptron.""" diff --git a/paddlevlp/models/groundingdino/bert_model.py b/paddlevlp/models/groundingdino/bert_model.py index e0cbf877fba3a9..339b8edf11f657 100644 --- a/paddlevlp/models/groundingdino/bert_model.py +++ b/paddlevlp/models/groundingdino/bert_model.py @@ -67,9 +67,6 @@ def forward( mixed_query_layer = self.query(hidden_states) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None if is_cross_attention and past_key_value is not None: @@ -93,13 +90,6 @@ def forward( query_layer = self.transpose_for_scores(mixed_query_layer) # return query_layer,key_layer if self.is_decoder: # False - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. 
@@ -108,29 +98,20 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if self.clamp_min_for_underflow: - attention_scores = paddle.clip(attention_scores, min=-50000) # Do not increase -50000, data type half has quite limited range + attention_scores = paddle.clip(attention_scores, min=-50000) if self.clamp_max_for_overflow: - attention_scores = paddle.clip(attention_scores, max=50000) # Do not increase 50000, data type half has quite limited range + attention_scores = paddle.clip(attention_scores, max=50000) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask - # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) - # if math.isnan(attention_probs.sum().item()): - # for i in range(attention_probs.size(1)): - # for j in range(attention_probs.size(2)): - # if math.isnan(attention_probs[0, i, j].sum().item()): - # print(i, j) - # pdb.set_trace() - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) - # Mask heads if we want to + if head_mask is not None: attention_probs = attention_probs * head_mask @@ -155,10 +136,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) # diff 7.2274e-06 - hidden_states = self.dropout(hidden_states) # diff 4.22e-05 - # hidden_states + input_tensor diff : 7.22e-6 - hidden_states = self.LayerNorm(hidden_states + input_tensor) #diff 1.087e-05 + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states @@ -190,9 +170,6 @@ def forward( ) #pass # return self_outputs attention_output = self.output(self_outputs[0], hidden_states) - # print(attention_output.shape, self_outputs[0].shape, len(self_outputs)) - # attention_output 1.087e-05, self_outputs 1.31e-06 , hidden_states 1.33e-08 - # return attention_output, self_outputs, hidden_states outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -234,11 +211,9 @@ def __init__(self, config): self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer("position_ids", paddle.arange(config.max_position_embeddings).reshape((1, -1))) self.register_buffer( @@ -263,9 +238,6 @@ def forward( if position_ids is None: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without 
passing token_type_ids, solves - # issue #5664 if token_type_ids is None: if hasattr(self, "token_type_ids"): buffered_token_type_ids = self.token_type_ids[:, :seq_length] @@ -312,7 +284,7 @@ def forward( past_key_value = None, output_attentions = False, ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, @@ -338,7 +310,6 @@ def forward( " by setting `config.add_cross_attention=True`" ) - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None cross_attention_outputs = self.crossattention( attention_output, @@ -350,17 +321,15 @@ def forward( output_attentions, ) attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] - # add cross-attn cache to positions 3,4 of present_key_value tuple cross_attn_present_key_value = cross_attention_outputs[-1] present_key_value = present_key_value + cross_attn_present_key_value layer_output = self.feed_forward_chunk(attention_output) - # return layer_output, attention_output + outputs = (layer_output,) + outputs - # if decoder, return the attn key/values as the last output if self.is_decoder: outputs = outputs + (present_key_value,) @@ -452,8 +421,7 @@ def __init__(self, config): self.activation = nn.Tanh() def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. + first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) @@ -481,8 +449,6 @@ def __init__(self, config, add_pooling_layer=True): self.encoder = BertEncoder(config) self.pooler = BertPooler(config) if add_pooling_layer else None - # Initialize weights and apply final processing - # self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -505,30 +471,22 @@ def get_extended_attention_mask( dtype = np.float32 if not (attention_mask.dim() == 2 and self.config.is_decoder): - # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: warnings.warn( "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning ) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ) - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. extended_attention_mask = paddle.cast(extended_attention_mask, dtype=dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * np.finfo(dtype).min return extended_attention_mask @@ -611,12 +569,8 @@ def forward( else: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) @@ -626,11 +580,6 @@ def forward( else: encoder_extended_attention_mask = None - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( @@ -677,8 +626,6 @@ def __init__(self, cfg, bert_config): print("LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) bert_config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT - # bert_config.attention_probs_dropout_prob = 0.0 - # bert_config.hidden_dropout_prob = 0.0 self.model = BertModel(bert_config) self.language_dim = 768 diff --git a/paddlevlp/models/groundingdino/bertwarper.py b/paddlevlp/models/groundingdino/bertwarper.py index d4c75bccdbe339..09904a2cd16e4f 100644 --- a/paddlevlp/models/groundingdino/bertwarper.py +++ b/paddlevlp/models/groundingdino/bertwarper.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
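The bert_model.py changes above also strip the long explanation from get_extended_attention_mask. The idea it implements: a 0/1 padding mask is broadcast to [batch, 1, 1, seq_len] (or [batch, 1, from_seq, to_seq] for a 3D mask) and converted into an additive bias that is 0 for visible tokens and the dtype's most negative value for padded ones, so those positions vanish after the softmax. A compact standalone sketch of that conversion, assuming the same numpy dtype handling as in the diff.

import numpy as np

import paddle


def extend_attention_mask(attention_mask, dtype=np.float32):
    if attention_mask.dim() == 3:  # [batch, from_seq, to_seq]
        extended = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:  # [batch, seq_len]
        extended = attention_mask[:, None, None, :]
    else:
        raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape})")
    extended = paddle.cast(extended, dtype=dtype)
    # 1 -> 0.0 (attend), 0 -> most negative representable value (effectively removed)
    return (1.0 - extended) * np.finfo(dtype).min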
diff --git a/paddlevlp/models/groundingdino/configuration.py b/paddlevlp/models/groundingdino/configuration.py index d39c42461b99d0..257539e2c4a8a8 100644 --- a/paddlevlp/models/groundingdino/configuration.py +++ b/paddlevlp/models/groundingdino/configuration.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,69 +13,16 @@ # limitations under the License. """ GroundingDino model configuration""" - +import os +from typing import Union from paddlenlp.transformers.configuration_utils import PretrainedConfig -__all__ = ["GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION", "GroundingDinoConfig", "GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP"] - -GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION = { - "groundingdino-swint-ogc": { - "modelname" : "groundingdino", - "backbone" : "swin_T_224_1k", - "position_embedding" : "sine", - "pe_temperatureH" : 20, - "pe_temperatureW" : 20, - "return_interm_indices" : [1, 2, 3], - "backbone_freeze_keywords" : None, - "enc_layers" : 6, - "dec_layers" : 6, - "pre_norm" : False, - "dim_feedforward" : 2048, - "hidden_dim" : 256, - "dropout" : 0.0, - "nheads" : 8, - "num_queries" : 900, - "query_dim" : 4, - "num_patterns" : 0, - "num_feature_levels" : 4, - "enc_n_points" : 4, - "dec_n_points" : 4, - "two_stage_type" : "standard", - "two_stage_bbox_embed_share" : False, - "two_stage_class_embed_share" : False, - "transformer_activation" : "relu", - "dec_pred_bbox_embed_share" : True, - "dn_box_noise_scale" : 1.0, - "dn_label_noise_ratio" : 0.5, - "dn_label_coef" : 1.0, - "dn_bbox_coef" : 1.0, - "embed_init_tgt" :True, - "dn_labelbook_size" : 2000, - "max_text_len" : 256, - "text_encoder_type" : "bert-base-uncased", - "use_text_enhancer" : True, - "use_fusion_layer" : True, - "use_checkpoint" : False, - "use_transformer_ckpt" : False, - "use_text_cross_attention" : True, - "text_dropout" : 0.0, - "fusion_dropout" : 0.0, - "fusion_droppath" : 0.1, - "sub_sentence_present" : True - }, -} - -GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP = { - "model_state": { - "groundingdino-swint-ogc": "https://bj.bcebos.com/v1/paddledet/models/groundingdino_swint_ogc.pdparams", - } -} +__all__ = ["GroundingDinoConfig"] class GroundingDinoConfig(PretrainedConfig): model_type = "groundingdino" - pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION def __init__( self, @@ -121,9 +67,11 @@ def __init__( text_dropout = 0.0, fusion_dropout = 0.0, fusion_droppath = 0.1, - sub_sentence_present = True + sub_sentence_present = True, + **kwargs, ): - super().__init__() + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) self.modelname = modelname self.backbone = backbone self.position_embedding = position_embedding @@ -166,3 +114,24 @@ def __init__( self.fusion_dropout = fusion_dropout self.fusion_droppath = fusion_dropout self.sub_sentence_present = sub_sentence_present + + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs + ) + + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): + logger.warning( + f"You are using a model of type 
{config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) \ No newline at end of file diff --git a/paddlevlp/models/groundingdino/fuse_modules.py b/paddlevlp/models/groundingdino/fuse_modules.py index 0dc731cfa66e7d..2a3feb0b0844dd 100644 --- a/paddlevlp/models/groundingdino/fuse_modules.py +++ b/paddlevlp/models/groundingdino/fuse_modules.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlevlp/models/groundingdino/modeling.py b/paddlevlp/models/groundingdino/modeling.py index 11f5fcf76559cc..7504d9e7575056 100644 --- a/paddlevlp/models/groundingdino/modeling.py +++ b/paddlevlp/models/groundingdino/modeling.py @@ -35,11 +35,7 @@ generate_masks_with_special_tokens_and_transfer_map, ) -from .configuration import ( - GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION, - GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP, - GroundingDinoConfig, -) +from .configuration import GroundingDinoConfig from .backbone import build_backbone from .transformer import build_transformer @@ -58,10 +54,8 @@ class GroundingDinoPretrainedModel(PretrainedModel): model_config_file = "config.json" config_class = GroundingDinoConfig resource_files_names = {"model_state": "model_state.pdparams"} - base_model_prefix = "groundding" + base_model_prefix = "grounddingDino" - pretrained_init_configuration = GROUNDINGDINO_PRETRAINED_INIT_CONFIGURATION - pretrained_resource_files_map = GROUNDINGDINO_PRETRAINED_RESOURCE_FILES_MAP @register_base_model class GroundingDinoModel(GroundingDinoPretrainedModel): diff --git a/paddlevlp/models/groundingdino/transformer.py b/paddlevlp/models/groundingdino/transformer.py index 697e6a90626eb9..034d836d2c5c10 100644 --- a/paddlevlp/models/groundingdino/transformer.py +++ b/paddlevlp/models/groundingdino/transformer.py @@ -60,7 +60,7 @@ def __init__( # init query learnable_tgt_init=False, # two stage - two_stage_type="no", # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1'] + two_stage_type="no", embed_init_tgt=False, # for text use_text_enhancer=False, @@ -155,7 +155,6 @@ def __init__( if num_feature_levels > 1: if self.num_encoder_layers > 0: self.level_embed = self.create_parameter(shape=[num_feature_levels, d_model]) - # self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) else: self.level_embed = None @@ -278,9 +277,6 @@ def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) ######################################################### text_dict["encoded_text"] = memory_text - # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': - # if memory.isnan().any() | memory.isinf().any(): - # import ipdb; ipdb.set_trace() if self.two_stage_type == "standard": @@ -318,8 +314,6 @@ def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, tgt_undetach = paddle.take_along_axis(arr=output_memory, axis=1,indices=topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, self.d_model])) - # gather tgt - # tgt_undetach = paddle.gather_nd(output_memory, topk_ind) if self.embed_init_tgt: tgt_ = ( self.tgt_embed.weight[:, None, 
:].tile([1, bs, 1]).transpose([1, 0, 2]) @@ -550,9 +544,6 @@ def forward( # main process for layer_id, layer in enumerate(self.layers): - # if output.isnan().any() or memory_text.isnan().any(): - # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': - # import ipdb; ipdb.set_trace() if self.fusion_layers: if self.use_checkpoint: output, memory_text = recompute( @@ -689,10 +680,7 @@ def forward( raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 pos_scale = self.query_scale(output) if self.query_scale is not None else 1 query_pos = pos_scale * raw_query_pos - # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': - # if query_pos.isnan().any() | query_pos.isinf().any(): - # import ipdb; ipdb.set_trace() - + # main process output = layer( tgt=output, @@ -719,14 +707,10 @@ def forward( print(f"num_nan {num_nan}, num_inf {num_inf}") except Exception as e: print(e) - # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': - # import ipdb; ipdb.set_trace() + # iter update if self.bbox_embed is not None: - # box_holder = self.bbox_embed(output) - # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) - # new_reference_points = box_holder[..., :self.query_dim].sigmoid() reference_before_sigmoid = inverse_sigmoid(reference_points) delta_unsig = self.bbox_embed[layer_id](output) @@ -790,8 +774,7 @@ def forward_ffn(self, src): def forward( self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None ): - # self attention - # import ipdb; ipdb.set_trace() + src2 = self.self_attn( query=self.with_pos_embed(src, pos), diff --git a/paddlevlp/models/groundingdino/utils.py b/paddlevlp/models/groundingdino/utils.py index b55987720b5e4a..4f75874a47cb3d 100644 --- a/paddlevlp/models/groundingdino/utils.py +++ b/paddlevlp/models/groundingdino/utils.py @@ -94,13 +94,10 @@ def gen_encoder_output_proposals( else: wh = paddle.ones_like(grid) * 0.05 * (2.0**lvl) - # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) - # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale - # wh = torch.ones_like(grid) / scale proposal = paddle.concat((grid, wh), -1).reshape([N_, -1, 4]) proposals.append(proposal) _cur += H_ * W_ - # import ipdb; ipdb.set_trace() + output_proposals = paddle.concat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( -1, keepdim=True @@ -113,8 +110,6 @@ def gen_encoder_output_proposals( output_memory = masked_fill(output_memory, memory_padding_mask.unsqueeze(-1), float(0)) output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) - # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) - # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) return output_memory, output_proposals @@ -204,8 +199,7 @@ def _get_activation_fn(activation, d_model=256, batch_dim=0): def gen_sineembed_for_position(pos_tensor): - # n_query, bs, _ = pos_tensor.size() - # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi dim_t = paddle.arange(128) dim_t = 10000 ** (2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) From 0592532ea6ba14dbc79d0d86f13484f665aee36e Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Tue, 4 Jul 2023 10:19:42 +0000 Subject: [PATCH 08/10] [New Feature] add visualglm --- paddlevlp/models/__init__.py | 4 +- paddlevlp/models/visualglm/__init__.py | 13 + paddlevlp/models/visualglm/configuration.py 
| 338 ++++ paddlevlp/models/visualglm/modeling.py | 1550 +++++++++++++++++ paddlevlp/processors/__init__.py | 2 + .../processors/visualglm_image_processing.py | 285 +++ paddlevlp/processors/visualglm_processing.py | 223 +++ 7 files changed, 2414 insertions(+), 1 deletion(-) create mode 100644 paddlevlp/models/visualglm/__init__.py create mode 100644 paddlevlp/models/visualglm/configuration.py create mode 100644 paddlevlp/models/visualglm/modeling.py create mode 100644 paddlevlp/processors/visualglm_image_processing.py create mode 100644 paddlevlp/processors/visualglm_processing.py diff --git a/paddlevlp/models/__init__.py b/paddlevlp/models/__init__.py index 77ef10b5801c9c..967c36e525d711 100644 --- a/paddlevlp/models/__init__.py +++ b/paddlevlp/models/__init__.py @@ -15,4 +15,6 @@ from .blip2.modeling import * from .minigpt4.configuration import * -from .minigpt4.modeling import * \ No newline at end of file +from .minigpt4.modeling import * +from .visualglm.configuration import * +from .visualglm.modeling import * \ No newline at end of file diff --git a/paddlevlp/models/visualglm/__init__.py b/paddlevlp/models/visualglm/__init__.py new file mode 100644 index 00000000000000..595add0aed9e11 --- /dev/null +++ b/paddlevlp/models/visualglm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlevlp/models/visualglm/configuration.py b/paddlevlp/models/visualglm/configuration.py new file mode 100644 index 00000000000000..36dae15687da6b --- /dev/null +++ b/paddlevlp/models/visualglm/configuration.py @@ -0,0 +1,338 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" VisualGLM model configuration """ +import copy +import os +from typing import Union + +from ...utils.log import logger +from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["VisualGLMVisionConfig", "VisualGLMQFormerConfig", "VisualGLMConfig"] + + +class VisualGLMVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VisualGLMVisionModel`]. It is used to instantiate a + VisualGLM vision encoder according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import VisualGLMVisionConfig, VisualGLMVisionModel + >>> # Initializing a VisualGLMVisionConfig + >>> configuration = VisualGLMVisionConfig() + >>> # Initializing a VisualGLMVisionModel (with random weights) from the configuration above. 
+ >>> model = VisualGLMVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "visualglm_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + # get the vision config dict if we are loading from VisualGLMConfig + if config_dict.get("model_type") == "visualglm": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class VisualGLMQFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VisualGLMQFormerModel`]. It is used to instantiate a + VisualGLM Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`VisualGLMQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. 
+ max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. + Examples: + ```python + >>> from paddlenlp.transformers import VisualGLMQFormerConfig, VisualGLMQFormerModel + >>> # Initializing a VisualGLM configuration + >>> configuration = VisualGLMQFormerConfig() + >>> # Initializing a model (with random weights) from the configuration above + >>> model = VisualGLMQFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "visualglm_qformer_model" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from VisualGLMConfig + if config_dict.get("model_type") == "visualglm": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != 
cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class VisualGLMConfig(PretrainedConfig): + r""" + [`VisualGLMConfig`] is the configuration class to store the configuration of a [`VisualGLMForConditionalGeneration`]. It is + used to instantiate a VisualGLM model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`VisualGLMVisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`VisualGLMQFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... VisualGLMVisionConfig, + ... VisualGLMQFormerConfig, + ... ChatGLMConfig, + ... VisualGLMConfig, + ... VisualGLMForConditionalGeneration, + ... ) + >>> # Initializing a VisualGLMConfig configuration + >>> configuration = VisualGLMConfig() + >>> # Initializing a VisualGLMForConditionalGeneration (with random weights) from the configuration above + >>> model = VisualGLMForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a VisualGLMConfig from a VisualGLMVisionConfig, VisualGLMQFormerConfig and any PretrainedConfig + >>> # Initializing VisualGLM vision, VisualGLM Q-Former and language model configurations + >>> vision_config = VisualGLMVisionConfig() + >>> qformer_config = VisualGLMQFormerConfig() + >>> text_config = ChatGLMConfig() + >>> config = VisualGLMConfig.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "visualglm" + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the VisualGLMVisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the VisualGLMQFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`ChatGLMConfig`).") + self.vision_config = VisualGLMVisionConfig(**vision_config) + self.qformer_config = VisualGLMQFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "chatglm" + + if text_model_type == "chatglm": + self.text_config = ChatGLMConfig(**text_config) + else: + raise ValueError("Only chatglm accepted for model_type, but accepted {}.".format(text_model_type)) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: VisualGLMVisionConfig, + qformer_config: VisualGLMQFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`VisualGLMConfig`] (or a derived class) from a vision model, Q-Former and language model + configurations. + Returns: + [`VisualGLM`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/paddlevlp/models/visualglm/modeling.py b/paddlevlp/models/visualglm/modeling.py new file mode 100644 index 00000000000000..bd585984fcaafb --- /dev/null +++ b/paddlevlp/models/visualglm/modeling.py @@ -0,0 +1,1550 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
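The visualglm/configuration.py file added in this patch composes three sub-configurations and ties the Q-Former's cross-attention width to the vision encoder's hidden size. A short usage sketch follows; the paddlevlp import path is an assumption based on the new module layout (the in-file docstring examples show paddlenlp paths instead).

from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig

from paddlevlp.models.visualglm.configuration import (
    VisualGLMConfig,
    VisualGLMQFormerConfig,
    VisualGLMVisionConfig,
)

vision_config = VisualGLMVisionConfig()    # ViT-style image encoder settings
qformer_config = VisualGLMQFormerConfig()  # Q-Former bridge settings
text_config = ChatGLMConfig()              # ChatGLM language model settings

config = VisualGLMConfig.from_vision_qformer_text_configs(
    vision_config, qformer_config, text_config, num_query_tokens=32
)

# The composite config nests the three parts and aligns the Q-Former with the vision tower.
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size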
+ +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig +from paddlenlp.transformers.chatglm.modeling import ChatGLMForConditionalGeneration +from paddlenlp.transformers.model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from paddlenlp.transformers.model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) + +from ...utils.log import logger +from ...activations import ACT2FN +from ...utils.initializer import normal_, ones_, zeros_ + +from .configuration import ( + VisualGLMConfig, + VisualGLMQFormerConfig, + VisualGLMVisionConfig, +) + +VisualGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +__all__ = [ + "VisualGLMModel", + "VisualGLMPretrainedModel", + "VisualGLMQFormerModel", + "VisualGLMVisionModel", + "VisualGLMForConditionalGeneration", +] + + +def Parameter(tensor, dtype="float16"): + tensor = paddle.cast(tensor, dtype) + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +@dataclass +class VisualGLMForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`VisualGLMForConditionalGeneration`]. + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class VisualGLMPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = VisualGLMConfig + base_model_prefix = "visualglm" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, VisualGLMVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, VisualGLMEncoder): + module.gradient_checkpointing = value + + +class VisualGLMVisionEmbeddings(nn.Layer): + def __init__(self, config: VisualGLMVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + self.in_channels = config.num_channels + + self.patch_embedding = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.class_embedding = Parameter(paddle.randn([1, 1, self.embed_dim]), dtype=self.patch_embedding.weight.dtype) + self.position_embedding = Parameter( + paddle.randn([1, self.num_positions, self.embed_dim]), dtype=self.patch_embedding.weight.dtype + ) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +class VisualGLMAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + v_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias, dtype=self.qkv.weight.dtype) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +class VisualGLMMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class VisualGLMEncoderLayer(nn.Layer): + def __init__(self, config: VisualGLMConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = VisualGLMAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = VisualGLMMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class VisualGLMEncoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`VisualGLMEncoderLayer`]. + Args: + config (`VisualGLMConfig`): + The corresponding vision configuration for the `VisualGLMEncoder`. 
+ """ + + def __init__(self, config: VisualGLMConfig): + super().__init__() + self.config = config + self.layers = nn.LayerList([VisualGLMEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class VisualGLMVisionModel(VisualGLMPretrainedModel): + main_input_name = "pixel_values" + config_class = VisualGLMVisionConfig + + def __init__(self, config: VisualGLMVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = VisualGLMVisionEmbeddings(config) + self.encoder = VisualGLMEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, 
epsilon=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class VisualGLMQFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + 
encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class VisualGLMQFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class VisualGLMQFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = VisualGLMQFormerMultiHeadAttention(config, is_cross_attention) + self.output = VisualGLMQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class VisualGLMQFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class VisualGLMQFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + # hidden_states = self.LayerNorm() + return hidden_states + + +class VisualGLMQFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.attention = VisualGLMQFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = VisualGLMQFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = VisualGLMQFormerIntermediate(config) + self.output_query = VisualGLMQFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + hidden_states = self.input_layernorm(hidden_states) + self_attention_outputs = self.attention( + hidden_states, # 1, 32, 768 + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = 
self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class VisualGLMQFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [VisualGLMQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class VisualGLMQFormerModel(VisualGLMPretrainedModel): + """ + Querying Transformer (Q-Former), used in VisualGLM. 
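+
+ A stack of [`VisualGLMQFormerLayer`] blocks in which the input `query_embeds` self-attend and, in every
+ `cross_attention_frequency`-th layer, cross-attend to the image features passed as `encoder_hidden_states`
+ (produced by [`VisualGLMVisionModel`]). The resulting query states are later projected into the ChatGLM
+ embedding space by `VisualGLMModel.language_projection`.
+
+ Example (an illustrative sketch only; "model_name" is a placeholder checkpoint and the 257-token feature
+ length is an assumed ViT sequence length, not a value taken from a released config):
+ ```python
+ >>> import paddle
+ >>> from paddlenlp.transformers import VisualGLMModel
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> # dummy image features: [batch_size, num_patches, vision hidden size]
+ >>> image_embeds = paddle.randn([1, 257, model.config.vision_config.hidden_size])
+ >>> query_tokens = model.query_tokens.expand([1, -1, -1])
+ >>> outputs = model.qformer(query_embeds=query_tokens, encoder_hidden_states=image_embeds, return_dict=True)
+ >>> print(outputs.last_hidden_state.shape) # [1, num_query_tokens, qformer hidden size]
+ ```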
+ """ + + def __init__(self, config: VisualGLMQFormerConfig): + super().__init__(config) + self.config = config + + self.final_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = VisualGLMQFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.config.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. 
If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.dropout(query_embeds) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.final_layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + 
return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class VisualGLMModel(VisualGLMPretrainedModel): + config_class = VisualGLMConfig + main_input_name = "pixel_values" + + def __init__(self, config: VisualGLMConfig): + super().__init__(config) + + self.vision_model = VisualGLMVisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), dtype=self.config.dtype + ) + self.qformer = VisualGLMQFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForConditionalGeneration(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import ChatGLMTokenizer, VisualGLMModel + >>> tokenizer = ChatGLMTokenizer.from_pretrained("model_name") + >>> tokenizer.pad_token = tokenizer.eos_token + >>> model = VisualGLMModel.from_pretrained("model_name") + >>> model.eval() + >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples:
+ ```python
+ >>> import paddle
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMModel
+ >>> processor = VisualGLMProcessor.from_pretrained("model_name")
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor.process_images(images=image, return_tensors="pd")
+ >>> image_outputs = model.get_image_features(**inputs)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+ vision_outputs = self.vision_model(
+ pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ return vision_outputs
+
+ def get_qformer_features(
+ self,
+ pixel_values: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs
+ ):
+ r"""
+ Returns:
+ vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`):
+ The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
+ contains the image features, the pooled image features and the hidden states if
+ `output_hidden_states=True`.
+ Examples:
+ ```python
+ >>> import paddle
+ >>> from PIL import Image
+ >>> import requests
+ >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMModel
+ >>> processor = VisualGLMProcessor.from_pretrained("model_name")
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor.process_images(images=image, return_tensors="pd")
+ >>> qformer_outputs = model.get_qformer_features(**inputs)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # step 1: forward the images through the vision encoder,
+ # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+ pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype)
+ vision_outputs = self.vision_model(
+ pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ image_embeds = vision_outputs[0]
+ image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+ query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+ query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype)
+ image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype)
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=True,
+ )
+
+ return query_outputs
+
+ def forward(
+ self,
+ pixel_values: paddle.Tensor, # processed image
+ first_input_ids: paddle.Tensor,
+ second_input_ids: paddle.Tensor,
+ first_attention_mask: Optional[paddle.Tensor] = None,
+ second_attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ labels: Optional[paddle.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, VisualGLMForConditionalGenerationModelOutput]:
+ r"""
+ Returns:
+ Examples:
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> import paddle
+ >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMModel
+ >>> processor = VisualGLMProcessor.from_pretrained("model_name")
+ >>> model = VisualGLMModel.from_pretrained("model_name")
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> text = "describe this image"
+ >>> prompt = "###Human: ###Assistant:"
+ >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd")
+ >>> outputs = model(**inputs)
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # step 1: forward the images through the vision encoder,
+ # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+ vision_outputs = self.vision_model(pixel_values, return_dict=True)
+ image_embeds = vision_outputs.last_hidden_state
+ image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64")
+
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+ query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1])
+ query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype)
+ image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype)
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ return_dict=True,
+ )
+ query_output = query_outputs.last_hidden_state
+
+ # step 3: use the language model, conditioned on the text and image
+ language_model_inputs = self.language_projection(query_output)
+ language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64")
+
+ first_embeds = self.language_model.chatglm.transformer.word_embeddings(first_input_ids)
+ second_embeds = self.language_model.chatglm.transformer.word_embeddings(second_input_ids)
+ language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype)
+ inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1)
+
+ if first_attention_mask is None:
+ first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64")
+ if second_attention_mask is None:
+ second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64")
+ attention_mask = paddle.concat(
+ [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1
+ )
+
+ outputs = self.language_model(
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ logits = outputs.logits if return_dict else
outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return VisualGLMForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class ChatGLMForConditionalGenerationWithImage(ChatGLMForConditionalGeneration): + def __init__(self, config: ChatGLMConfig): + super(ChatGLMForConditionalGenerationWithImage, self).__init__(config) + self.config = config + + def forward( + self, + image_features: paddle.Tensor, + input_ids: paddle.Tensor, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + pre_image_length: Optional[int] = None, + cache: Optional[Tuple[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None and cache is None and image_features is not None: + pre_ids, pad_ids, post_ids = paddle.split(input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1) + pre_txt_emb = self.chatglm.transformer.word_embeddings(pre_ids) + post_txt_emb = self.chatglm.transformer.word_embeddings(post_ids) + inputs_embeds = paddle.concat([pre_txt_emb, image_features, post_txt_emb], axis=1) + + outputs = super().forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + cache=cache, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + return_dict=return_dict, + ) + + return outputs + + +class VisualGLMForConditionalGeneration(VisualGLMPretrainedModel): + config_class = VisualGLMConfig + main_input_name = "pixel_values" + + def __init__(self, config: VisualGLMConfig): + super().__init__(config) + self.config = config + self.vision_model = VisualGLMVisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), dtype=self.config.dtype + ) + self.qformer = VisualGLMQFormerModel(config.qformer_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForConditionalGenerationWithImage(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def encode_images( + self, + pixel_values: paddle.Tensor, # processed image + ): + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], 
dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: mapping query_output into language_model space + language_model_inputs = self.language_projection(query_output) + + return language_model_inputs + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + pre_image_length: int, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMForConditionalGeneration + >>> processor = VisualGLMProcessor.from_pretrained("model_name") + >>> model = VisualGLMForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + """ + + image_features = self.encode_images(pixel_values) + + outputs = self.language_model.generate( + input_ids=input_ids, + image_features=image_features, + pre_image_length=pre_image_length, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs diff --git a/paddlevlp/processors/__init__.py b/paddlevlp/processors/__init__.py index 04006999f0b629..a481ea97ee0bb0 100644 --- a/paddlevlp/processors/__init__.py +++ b/paddlevlp/processors/__init__.py @@ -16,3 +16,5 @@ from .blip_processing import * from .minigpt4_processing import * from .minigpt4_image_processing import * +from .visualglm_processing import * +from .visualglm_image_processing import * \ No newline at end of file diff --git a/paddlevlp/processors/visualglm_image_processing.py b/paddlevlp/processors/visualglm_image_processing.py new file mode 100644 index 00000000000000..920caf1df2c128 --- /dev/null +++ b/paddlevlp/processors/visualglm_image_processing.py @@ -0,0 +1,285 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for VisualGLM."""
+
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+
+from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from .image_transforms import (
+ convert_to_rgb,
+ normalize,
+ rescale,
+ resize,
+ to_channel_dimension_format,
+)
+from .image_utils import (
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ is_batched,
+ to_numpy_array,
+ valid_images,
+)
+
+from paddlenlp.transformers.tokenizer_utils_base import TensorType
+
+__all__ = [
+ "VisualGLMImageProcessor",
+]
+
+
+class VisualGLMImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a VisualGLM image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+ method.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+ overridden by the `resample` parameter in the `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+ overridden by the `rescale_factor` parameter in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`, the OpenAI CLIP mean):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`, the OpenAI CLIP std):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + default_image_mean = [0.48145466, 0.4578275, 0.40821073] + default_image_std = [0.26862954, 0.26130258, 0.27577711] + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else default_image_mean + self.image_std = image_std if image_std is not None else default_image_std + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
+ """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` while preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlevlp/processors/visualglm_processing.py b/paddlevlp/processors/visualglm_processing.py new file mode 100644 index 00000000000000..60eb8ba0c2ed2f --- /dev/null +++ b/paddlevlp/processors/visualglm_processing.py @@ -0,0 +1,223 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for VisualGLM. 
+""" + +import re +from typing import List, Optional, Union + +import numpy as np +import paddle +from PIL import Image + +from .image_processing_utils import BatchFeature +from .image_utils import ImageInput +from .base_processing import ProcessorMixin +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding, TensorType, TextInput + +__all__ = [ + "VisualGLMProcessor", +] + + +class VisualGLMProcessor(ProcessorMixin): + r""" + Constructs a VisualGLM processor which wraps a VisualGLM image processor and an llama tokenizer into a single processor. + [`VisualGLMProcessor`] offers all the functionalities of [`VisualGLMImageProcessor`] and [`LlamaTokenizer`]. See the docstring + of [`~VisualGLMImageProcessor.__call__`] and [`~LlamaTokenizer.decode`] for more information. + + Args: + image_processor (`VisualGLMImageProcessor`): + An instance of [`VisualGLMImageProcessor`]. The image processor is a required input. + tokenizer (`LlamaTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + + >>> import paddle + >>> from paddlenlp.transformers import VisualGLMProcessor + + >>> # load processor + >>> minigpt4_13b_path = "model_name" + >>> processor = VisualGLMProcessor.from_pretrained(minigpt4_13b_path) + >>> print("load processor and model done!") + + >>> # prepare model inputs for VisualGLM + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:" + >>> res = processor([image], text, prompt) + ```""" + attributes = ["image_processor", "tokenizer"] + image_processor_class = "VisualGLMImageProcessor" + tokenizer_class = "ChatGLMTokenizer" + + def __init__(self, image_processor, tokenizer): + tokenizer.return_token_type_ids = False + tokenizer.model_input_names = ["input_ids", "attention_mask"] + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self.default_prompt = "" + self.image_tag = "" + self.num_query_tokens = 32 + + def process_images( + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`VisualGLMImageProcessor.__call__`] method to prepare image(s) for the model. + Please refer to the docstring of the method for more information. 
+ """ + if not images: + raise ValueError("You have to input correct images.") + + if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): + images = [images] + + processed_images = self.image_processor(images, return_tensors=return_tensors) + + return processed_images + + def process_texts( + self, + texts: Union[TextInput, List[TextInput]], + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchEncoding: + if not texts: + raise ValueError("You have to input correct texts.") + + if isinstance(texts, TextInput): + texts = [texts] + + processed_texts = self.tokenizer(text=texts, return_tensors=return_tensors, **kwargs) + return BatchEncoding(processed_texts) + + def build_inputs_with_image( + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = None, + ): + # construct prompt with inputs + if image is not None: + prompt = self.default_prompt + else: + prompt = "" + for old_query, response in history: + prompt += "问:{}\n答:{}\n".format(old_query, response) + prompt += "问:{}\n答:".format(query) + + if image is not None: + image_start_position = prompt.rfind(self.image_tag) + image_end_position = image_start_position + len(self.image_tag) + first_text_input = self.tokenizer.encode(prompt[:image_start_position], add_special_tokens=False) + image_input = [self.tokenizer.unk_token_id] * self.num_query_tokens + second_text_input = self.tokenizer.encode(prompt[image_end_position:], add_special_tokens=False) + all_input_ids = first_text_input["input_ids"] + image_input + second_text_input["input_ids"] + all_input_ids = self.tokenizer.build_inputs_with_special_tokens(all_input_ids) + + # processing image + processed_image = self.process_images(image) + + inputs = { + "input_ids": paddle.to_tensor(all_input_ids, dtype="int64").unsqueeze(0), + "pre_image_length": len(first_text_input["input_ids"]), + "pixel_values": processed_image["pixel_values"], + } + else: + inputs = self.tokenizer([prompt], return_tensors="pd") + inputs["pre_image_length"] = 0 + + return inputs + + def __call__( + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = [], + **kwargs, + ): + if image is None: + raise ValueError("Image should not be None.") + if query is None: + raise ValueError("Query should not be None.") + if not isinstance(query, str): + raise TypeError("A string type of query is expected, but acceived {}.".format(type(query))) + if not isinstance(history, list): + raise TypeError( + "A list type of history is expected with each item [query, response] in it, but acceived {}.".format( + type(history) + ) + ) + + inputs = self.build_inputs_with_image(image, query, history=history) + + return inputs + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + def process_response(self, response): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + def get_responses(self, *args, **kwargs): + processed_responses = [] + responses = self.batch_decode(*args, **kwargs) + + for response in responses: + response = self.process_response(response) + processed_responses.append(response) + + return processed_responses + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) From 1bcb2707f391d77200332552448b1986d474cf0b Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Wed, 5 Jul 2023 03:19:01 +0000 Subject: [PATCH 09/10] update examples for visualglm --- paddlevlp/examples/visualglm/README.md | 29 +++++++++------------ paddlevlp/examples/visualglm/run_predict.py | 1 + 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/paddlevlp/examples/visualglm/README.md b/paddlevlp/examples/visualglm/README.md index 5c767ecacf40f5..a0c0047224cf32 100644 --- a/paddlevlp/examples/visualglm/README.md +++ b/paddlevlp/examples/visualglm/README.md @@ -12,39 +12,34 @@ VisualGLM-6B 依靠来自于 CogView 数据集的30M高质量中文图文对, ``` python run_predict.py \ - -- pretrained_name_or_path "your minigpt4 path" + -- pretrained_name_or_path "THUDM/visualglm-6b" ``` 下图这个示例展示了在使用visualglm-6b时的效果: -输入图片:
+输入图片:
输入文本:“写诗描述一下这个场景” 输出: ``` -两个杯子,黑白相间, -一个放在桌子上,另一个放在咖啡杯上。 -它们静静地坐着, -仿佛在讲述着什么故事。 一只猫和另一只猫, -彼此相依相伴, -似乎有着某种神秘的联系。 -它们的黑白对比, -仿佛是一幅美丽的画, -让人不禁沉醉其中。 这两只杯子, -是一份温馨的礼物, -代表着爱和情感的温度。 -它们在桌面上静静等待着, -期待着主人的到来, -让它们成为彼此的依靠。 +泰坦尼克号,浪漫而美丽。 +男女主角手牵手,共舞于船头。 +夕阳余晖洒落,风景如画。 +他们的身影如此优美,令人陶醉。 海水翻涌,波涛汹涌。 +船上的人们,沉浸在这美妙的时刻中。 +爱情的力量,让他们更加坚定。 +他们在大海上翱翔,享受着彼此的温暖。 电影的结束,意味着爱情的开始。 +他们将永远铭记这段美好的日子。 +在回忆里,他们会珍惜这份爱。 ``` 输入文本:“这部电影的导演是谁?” 输出: ``` -电影《猫与杯》由韩国著名导演李在均执导。 +这部电影的导演是詹姆斯·卡梅隆(James Cameron)。 ``` ## 3. License 说明 diff --git a/paddlevlp/examples/visualglm/run_predict.py b/paddlevlp/examples/visualglm/run_predict.py index f12c32c593f2a9..460dcda8f8aa24 100644 --- a/paddlevlp/examples/visualglm/run_predict.py +++ b/paddlevlp/examples/visualglm/run_predict.py @@ -31,6 +31,7 @@ def predict(args): print("load processor and model done!") url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + url = "https://paddlenlp.bj.bcebos.com/data/images/titanic.jpeg" image = Image.open(requests.get(url, stream=True).raw) generate_kwargs = { "max_length":1024, From a7014f8c609c0881a5193a54c4df5be91afcc26d Mon Sep 17 00:00:00 2001 From: Milen <1649759610@qq.com> Date: Wed, 5 Jul 2023 03:53:57 +0000 Subject: [PATCH 10/10] fix license link --- paddlevlp/examples/visualglm/README.md | 2 +- paddlevlp/examples/visualglm/run_predict.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddlevlp/examples/visualglm/README.md b/paddlevlp/examples/visualglm/README.md index a0c0047224cf32..81c2321b304bed 100644 --- a/paddlevlp/examples/visualglm/README.md +++ b/paddlevlp/examples/visualglm/README.md @@ -43,7 +43,7 @@ python run_predict.py \ ``` ## 3. License 说明 -VisualGLM-6B模型权重使用需要遵循清华大学发布的[Model License](./MODEL_LICENSE.txt)。 +VisualGLM-6B模型权重使用需要遵循清华大学发布的[Model License](https://github.com/THUDM/VisualGLM-6B/blob/main/MODEL_LICENSE.txt)。 ## Reference diff --git a/paddlevlp/examples/visualglm/run_predict.py b/paddlevlp/examples/visualglm/run_predict.py index 460dcda8f8aa24..560ea5bc559828 100644 --- a/paddlevlp/examples/visualglm/run_predict.py +++ b/paddlevlp/examples/visualglm/run_predict.py @@ -30,7 +30,6 @@ def predict(args): processor = VisualGLMProcessor.from_pretrained(args.pretrained_name_or_path) print("load processor and model done!") - url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" url = "https://paddlenlp.bj.bcebos.com/data/images/titanic.jpeg" image = Image.open(requests.get(url, stream=True).raw) generate_kwargs = {